In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

Merging together all the samples of a patient and create the input file of hilary

In [None]:


def applyParallel(frame, func, silent=False, threads=None):
    """
    Parallel computing across rows of a Dataframe
    """
    max_threads = cpu_count()
    if threads is None: 
        threads = max_threads
    elif threads > max_threads: 
        threads = max_threads
    
    _dfGrouped = frame.groupby(level=0)
    with Pool(threads) as p:
        ret_list = list(tqdm(p.imap(func, _dfGrouped), total=len(_dfGrouped), disable=silent))
        
    return pd.concat(ret_list)


def hilary_parse(args):
    """
    Parsing a single row of a frame for Hilary formatting
    """
    row = args[1].loc[args[0]]
    d = dict()
    try:
        v_st, v_end = int(row.v_germline_start), int(row.v_germline_end)
        j_len = int(row.j_sequence_end - row.j_sequence_start)
        j_st = row.germline_alignment.rfind('N')+1
        d['v_sequence_alignment'] = row.sequence_alignment[v_st-1 : v_end]
        d['j_sequence_alignment'] = row.sequence_alignment[j_st : j_st + j_len]
        d['v_germline_alignment'] = row.germline_alignment[v_st-1 : v_end]
        d['j_germline_alignment'] = row.germline_alignment[j_st : j_st + j_len]
        return pd.DataFrame(d, index=[args[0]])
    except:
        print(args[0], row)
        return pd.DataFrame(index=[args[0]])

    
def hilary_preprocess(f):
    """
    Formatting the data as hilary wants (parallel computation)
    """
    df = applyParallel(f, hilary_parse)
    df['junction'] = df.index.map(f.junction)
    df['v_call'] = df.index.map(f.v_call)
    df['j_call'] = df.index.map(f.j_call)
    return df

In [3]:
metadata = pd.read_csv('metadata/metadata.tsv', sep='\t', index_col=0)
patients = set(metadata.patient)
patients

{'AT', 'D01', 'IM', 'IZ', 'MRK', 'MT'}

In [8]:
for pat in patients:
    pat_ids = metadata[metadata.patient == pat].index
    merged_fr = pd.DataFrame()
    for _id in pat_ids:
        
        if metadata.loc[_id].n_repls > 1:
            for r in range(1, metadata.loc[_id].n_repls + 1):
                fr = pd.read_csv("sequences/replicates/" + _id + '_r' + str(r) + '.tsv', sep='\t', index_col=0)
                fr = fr[fr.germline_alignment.notna()]
                fr = hilary_preprocess(fr)
                fr['sample'] = _id + '_r' + str(r)
                fr.index = fr.index.astype(str) + '_' + fr['sample']
                merged_fr = pd.concat((merged_fr, fr))
        else:
            fr = pd.read_csv("sequences/" + _id + '.tsv', sep='\t', index_col=0)
            fr = fr[fr.germline_alignment.notna()]
            fr = hilary_preprocess(fr)
            fr['sample'] = _id
            fr.index = fr.index.astype(str) + '_' + fr['sample']
            merged_fr = pd.concat((merged_fr, fr))
                
    print(pat)
    merged_fr.index.name = "sequence_id"
    #merged_fr.index = "id_" + merged_fr.index.astype(str)
    #merged_fr.index = fr['sequence_id']
    merged_fr.to_csv('lineages/seqs_in/' + pat + '.tsv', sep='\t')

100%|██████████| 26205/26205 [00:09<00:00, 2794.78it/s]
100%|██████████| 26424/26424 [00:09<00:00, 2837.01it/s]
100%|██████████| 3642/3642 [00:01<00:00, 2937.51it/s]
100%|██████████| 7935/7935 [00:02<00:00, 2849.68it/s]
100%|██████████| 15873/15873 [00:05<00:00, 2793.40it/s]
100%|██████████| 19948/19948 [00:07<00:00, 2819.11it/s]
100%|██████████| 4116/4116 [00:01<00:00, 2909.62it/s]
100%|██████████| 2608/2608 [00:00<00:00, 2790.93it/s]
100%|██████████| 2466/2466 [00:00<00:00, 2806.37it/s]


D01


100%|██████████| 29150/29150 [00:10<00:00, 2831.81it/s]
100%|██████████| 7356/7356 [00:02<00:00, 2803.17it/s]
100%|██████████| 21321/21321 [00:07<00:00, 2784.86it/s]
100%|██████████| 19736/19736 [00:06<00:00, 2834.45it/s]
100%|██████████| 4530/4530 [00:01<00:00, 2772.54it/s]
100%|██████████| 16875/16875 [00:06<00:00, 2798.12it/s]
100%|██████████| 7676/7676 [00:02<00:00, 2825.31it/s]
100%|██████████| 1561/1561 [00:00<00:00, 2815.26it/s]
100%|██████████| 4005/4005 [00:01<00:00, 2898.33it/s]


AT


100%|██████████| 17234/17234 [00:06<00:00, 2725.54it/s]
100%|██████████| 17988/17988 [00:06<00:00, 2666.09it/s]
100%|██████████| 2084/2084 [00:00<00:00, 3123.98it/s]
100%|██████████| 1928/1928 [00:00<00:00, 3100.23it/s]


MT


100%|██████████| 58672/58672 [00:21<00:00, 2739.27it/s]
100%|██████████| 431/431 [00:00<00:00, 2841.73it/s]
100%|██████████| 23764/23764 [00:07<00:00, 2994.83it/s]
100%|██████████| 34528/34528 [00:12<00:00, 2862.85it/s]
100%|██████████| 6145/6145 [00:02<00:00, 2776.48it/s]
100%|██████████| 5222/5222 [00:01<00:00, 2870.38it/s]
100%|██████████| 17528/17528 [00:06<00:00, 2829.39it/s]
100%|██████████| 16692/16692 [00:05<00:00, 2878.10it/s]
100%|██████████| 2722/2722 [00:00<00:00, 2736.70it/s]
100%|██████████| 2794/2794 [00:01<00:00, 2712.29it/s]


IM


100%|██████████| 44143/44143 [00:15<00:00, 2885.11it/s]
100%|██████████| 5175/5175 [00:01<00:00, 2964.92it/s]
100%|██████████| 28363/28363 [00:09<00:00, 2893.30it/s]
100%|██████████| 31490/31490 [00:11<00:00, 2855.37it/s]
100%|██████████| 2003/2003 [00:00<00:00, 2690.13it/s]
100%|██████████| 5418/5418 [00:01<00:00, 2838.52it/s]
100%|██████████| 30771/30771 [00:10<00:00, 2842.73it/s]
100%|██████████| 32853/32853 [00:11<00:00, 2824.72it/s]
100%|██████████| 3933/3933 [00:01<00:00, 3104.92it/s]
100%|██████████| 4795/4795 [00:01<00:00, 3056.62it/s]


IZ


100%|██████████| 49504/49504 [00:16<00:00, 2975.13it/s]
100%|██████████| 573/573 [00:00<00:00, 3334.33it/s]
100%|██████████| 31750/31750 [00:10<00:00, 2985.15it/s]
100%|██████████| 21099/21099 [00:06<00:00, 3054.97it/s]
100%|██████████| 5131/5131 [00:01<00:00, 3091.81it/s]
100%|██████████| 2964/2964 [00:00<00:00, 3086.46it/s]
100%|██████████| 4995/4995 [00:01<00:00, 3050.36it/s]
100%|██████████| 2084/2084 [00:00<00:00, 3268.19it/s]
100%|██████████| 1928/1928 [00:00<00:00, 3178.64it/s]
100%|██████████| 157/157 [00:00<00:00, 3582.16it/s]
100%|██████████| 122/122 [00:00<00:00, 5027.81it/s]


MRK
