In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [None]:
# --- Configuration: fill these in before running the rest of the notebook ---
# The three lists are iterated in parallel, so they must have the same length
# and the same order (entry i of each list belongs to the same sample).
amp1_list = []  # amp1 file-name prefixes (the part before _R1); "_2.clonotypes.IGH.txt" is appended
amp2_list = []  # amp2 file-name prefixes (the part before _R1); "_IGH.*" / "_IGKL.*" suffixes are appended

samples = []  # sample names, used as the prefix of each output file
# Directory holding the amp1_list and amp2_list files. File names are appended
# by plain string concatenation, so the value must end with a path separator.
path = "./"

In [None]:
# Normalise the three inputs to plain lists so that any iterable (list, tuple,
# NumPy array, pandas Series, ...) can be supplied in the configuration cell.
amp1_list, amp2_list, samples = list(amp1_list), list(amp2_list), list(samples)

# The processing loop zips the three lists together; zip() silently stops at
# the shortest one, so a length mismatch would skip samples without any error.
if not (len(amp1_list) == len(amp2_list) == len(samples)):
    raise ValueError(
        "amp1_list, amp2_list and samples must have the same length "
        "(got {}, {} and {})".format(len(amp1_list), len(amp2_list), len(samples))
    )

In [None]:
# For every sample: link IGH clonotypes to IGK/IGL clonotypes through the read
# IDs they share, collapse pairs that have identical amino-acid CDR3s, compute
# pairing scores and write one tab-separated table per sample to
# path + sample + '_IGH-fusion_result.collapsed.txt'.

# A collapsed pair is kept only if BOTH summed clone counts exceed this value.
MIN_CLONE_COUNT = 2

# Columns that are summed when pairs are collapsed by CDR3aa ...
SUM_COLS = ["IGH_cloneCount", "IGKL_cloneCount",
            "IGH_cloneFraction", "IGKL_cloneFraction",
            "match"]
# ... and columns whose distinct values are collected and joined with ';'.
UNIQUE_COLS = ["IGH_cloneId", "IGKL_cloneId",
               "IGKL_chains",
               "IGH_bestVHit", "IGH_bestDHit", "IGH_bestJHit",
               "IGKL_bestVHit", "IGKL_bestJHit",
               "IGH_nSeqCDR3", "IGH_targetSequences",
               "IGKL_nSeqCDR3", "IGKL_targetSequences"]
AGG_SPEC = {**{c: "sum" for c in SUM_COLS}, **{c: "unique" for c in UNIQUE_COLS}}


def _join_unique(values):
    """Render a collection of unique values as one ';'-separated string."""
    return ";".join(map(str, values))


def _sym_norm(x):
    """Return 100**x for x <= 1 and 100**(1/x) otherwise.

    For x >= 0 the result is 100 at x == 1 and falls towards 1 as x moves
    away from 1 in either direction.
    """
    return 100**x if x <= 1 else 100**(1/x)


for amp1, amp2, sample in tqdm_notebook(zip(amp1_list, amp2_list, samples), total=len(samples)):

    # ---- Load the input tables (tab-separated) ----
    amp1H_file = pd.read_csv(path + amp1 + "_2.clonotypes.IGH.txt", sep="\t")

    amp2H_file = pd.read_csv(path + amp2 + "_IGH.alignments.txt", sep="\t")
    amp2Hc_file = pd.read_csv(path + amp2 + "_IGH.clonotypes.IGH.txt", sep="\t")
    amp2KL_file = pd.read_csv(path + amp2 + "_IGKL.alignments.txt", sep="\t")
    amp2KL_file_c = pd.concat([pd.read_csv(path + amp2 + "_IGKL.clonotypes.IGK.txt", sep="\t"),
                               pd.read_csv(path + amp2 + "_IGKL.clonotypes.IGL.txt", sep="\t")])

    # ---- IGH side ----
    # Clone id/count/fraction come from the amp2 clonotype table, so the amp1
    # copies are removed to avoid clashing column names in the merge below.
    amp1H_file = amp1H_file.drop(columns=["bestCHit", "cloneCount", "cloneFraction", "cloneId"])
    amp2H_file = amp2H_file[["cloneId", "readId", "nSeqCDR3", "bestJHit"]]  # without V/D matches
    amp2Hc_file = amp2Hc_file[["cloneId", "cloneCount", "cloneFraction"]]
    # Inner merge: reads whose cloneId has no row in the clonotype table are dropped.
    amp2H_file = amp2H_file.merge(amp2Hc_file, on=["cloneId"])

    # FIX: the amp1 table was loaded but never joined, so the V/D hits,
    # aaSeqCDR3 and targetSequences needed further down were missing and the
    # CDR3aa aggregation raised KeyError.
    # NOTE(review): the key (nSeqCDR3 + bestJHit) is inferred from the columns
    # the two tables still share after the drop above -- confirm it is the
    # intended one.
    H_file = amp2H_file.merge(amp1H_file, on=["nSeqCDR3", "bestJHit"]).add_prefix("IGH_")

    # ---- IGK/IGL side ----
    # cloneId == -1 rows are excluded before the per-read table is annotated.
    amp2KL_file = amp2KL_file.loc[amp2KL_file["cloneId"] != -1, ["cloneId", "readId", "nSeqCDR3"]]
    amp2KL_file_c = amp2KL_file_c.drop(columns=["bestDHit", "bestCHit"])
    KL_file = amp2KL_file.merge(amp2KL_file_c, on=["cloneId", "nSeqCDR3"]).add_prefix("IGKL_")

    # ---- Pair the two sides through shared read IDs ----
    df_total = H_file.merge(KL_file, left_on="IGH_readId", right_on="IGKL_readId")
    print(len(df_total["IGH_readId"].unique()))  # number of distinct reads present on both sides
    df_total = df_total.drop(columns=["IGH_readId", "IGKL_readId"])

    IGH_cols = [c for c in df_total.columns if "IGH" in c]
    IGKL_cols = [c for c in df_total.columns if "IGKL" in c]

    # Count how many reads support each distinct (IGH row, IGKL row) combination.
    # NOTE(review): groupby drops rows whose key columns contain NaN (pandas
    # default dropna=True), so pairs with e.g. a missing IGH_bestDHit vanish
    # here -- confirm that this is intended.
    df_total = df_total.groupby(IGH_cols + IGKL_cols).size().reset_index(name='match')
    df_total['IGKL_chains'] = df_total['IGKL_bestJHit'].str[:3]

    # ---- Merging by the identical CDR3aa ----
    df_total = df_total.groupby(["IGH_aaSeqCDR3", "IGKL_aaSeqCDR3"]).agg(AGG_SPEC)
    # "unique" leaves an array in every cell; flatten each to a ';'-joined string.
    for col in UNIQUE_COLS:
        df_total[col] = df_total[col].apply(_join_unique)
    df_total = df_total.reset_index()

    # ---- Filter clones with a low count ----
    # .copy() so the column assignments below act on an independent frame
    # (avoids SettingWithCopyWarning).
    df_total = df_total.loc[(df_total["IGH_cloneCount"] > MIN_CLONE_COUNT)
                            & (df_total["IGKL_cloneCount"] > MIN_CLONE_COUNT)].copy()

    # Fraction of each side's summed clone count that is supported by paired reads.
    df_total["IGH_matchFrac"] = df_total["match"] / df_total["IGH_cloneCount"]
    df_total["IGKL_matchFrac"] = df_total["match"] / df_total["IGKL_cloneCount"]

    df_total = df_total[["IGH_cloneId", "IGH_cloneCount", "IGH_cloneFraction", "IGH_bestVHit", "IGH_bestDHit", "IGH_bestJHit", "IGH_nSeqCDR3", "IGH_aaSeqCDR3", "IGH_targetSequences",
                         "IGKL_cloneId", "IGKL_chains", "IGKL_cloneCount", "IGKL_cloneFraction", "IGKL_bestVHit", "IGKL_bestJHit", "IGKL_nSeqCDR3", "IGKL_aaSeqCDR3", "IGKL_targetSequences", "IGH_matchFrac",  "IGKL_matchFrac", "match"]]

    df_total = df_total.rename(columns={"IGKL_cloneCount": "IGKL_count",
                                        "IGH_cloneFraction": "IGH_freq",
                                        "IGKL_cloneFraction": "IGKL_freq"})

    # ---- Pairing scores ----
    df_total["IGKL_sym"] = df_total["IGH_freq"] / (df_total["IGKL_freq"] * df_total["IGKL_matchFrac"])
    df_total["IGH_sym"] = df_total["IGH_freq"] * df_total["IGH_matchFrac"] / df_total["IGKL_freq"]

    df_total["IGKL_symNorm"] = df_total["IGKL_sym"].apply(_sym_norm)
    df_total["IGH_symNorm"] = df_total["IGH_sym"].apply(_sym_norm)

    df_total["signal"] = df_total["IGKL_matchFrac"] / df_total["IGH_freq"]

    # Harmonic mean of the two normalised symmetry scores.
    df_total["symNorm_HMean"] = 2*df_total["IGKL_symNorm"]*df_total["IGH_symNorm"]/(df_total["IGKL_symNorm"] + df_total["IGH_symNorm"])

    # The clone-id columns are ';'-joined strings at this point, so this sort is lexicographic.
    df_total = df_total.sort_values(by=["IGH_cloneId", "IGKL_cloneId"]).reset_index(drop=True)
    df_total.to_csv(path + sample + '_IGH-fusion_result.collapsed.txt', sep="\t", index=False)

In [None]:
# Display the table left in `df_total` by the loop above, i.e. the result for
# the last sample that was processed (earlier samples are only in their files).
df_total