In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, "../func_py/")
import data_utils as ut

### Here we map the families that hilary inferred at the patient level back onto the individual samples of each patient

In [2]:
# Metadata table, indexed by the first column of the TSV. Later cells iterate
# over this index as sample names and read the `patient` and `repl_count`
# columns from it.
metadata = pd.read_csv(
    'metadata/metadata.tsv',
    sep='\t',
    index_col=0,
)

In [6]:
# Patient whose hilary family assignments are handled by the cells below.
pat = 2


def _read_hilary_families(sample_tag, id_column):
    """Load one hilary output table and shorten its index ids.

    Reads ``lineages/hilary_out/inferred_full_method_<sample_tag>.tsv`` as a
    tab-separated table with ``id_column`` as index, then reduces every index
    entry to its first two '-'-separated fields joined by '-'.

    Returns the resulting DataFrame.
    """
    table = pd.read_csv(
        'lineages/hilary_out/inferred_full_method_' + sample_tag + '.tsv',
        sep='\t',
        index_col=id_column,
    )
    # Split once and reuse the pieces for both fields.
    id_fields = table.index.str.split('-')
    table.index = id_fields.str[0] + '-' + id_fields.str[1]
    return table


# Families from the heavy-chain run, indexed by the shortened `sequence_id`.
famh_sample = 'pat' + str(pat) + '_hilary_heavy'
f_famh = _read_hilary_families(famh_sample, 'sequence_id')

# Families from the paired run, stored in two files with suffixes 'h' and 'l'
# (presumably heavy and light chain -- TODO confirm), both indexed by the
# shortened `old_seq_id`.
famp_sample = 'pat' + str(pat) + '_hilary_pairs_'
f_famph = _read_hilary_families(famp_sample + 'h', 'old_seq_id')
f_fampl = _read_hilary_families(famp_sample + 'l', 'old_seq_id')

# One id -> family lookup covering the entries of both paired tables.
famp_map = pd.concat([f_famph.family, f_fampl.family])

## Mapping families from hilary frames to sample frames

In [8]:
def _pct_assigned(families):
    """Return the percentage of non-missing entries in ``families``.

    ``families`` is a pandas Series. An empty selection yields NaN directly
    instead of going through a division by zero (which only produced a numpy
    RuntimeWarning before).
    """
    if len(families) == 0:
        return float('nan')
    return families.notna().sum() / len(families) * 100


# Only the sample names (the metadata index) are needed here, so iterate over
# the index rather than materialising every row with iterrows().
for samp in metadata[metadata.patient == pat].index:
    # The sample table is read from and written back to the same file.
    samp_path = 'sequences/' + samp + '.tsv'
    f = pd.read_csv(samp_path, sep='\t', index_col=0, low_memory=False)
    print(samp)

    # Mapping families of heavy sequences.
    # NOTE: the column name spelling ('familiy_*') is kept as is because it is
    # written to disk and read back under that name by the replicate mapping.
    f['familiy_heavy'] = f.pat_heavy_id.map(f_famh.family)
    heavy_families = f.loc[f.chain == 'H', 'familiy_heavy']
    print('check percentage assigned heavy families', _pct_assigned(heavy_families))

    # Mapping families of paired sequences
    f['familiy_pairs'] = f.pat_pairs_id.map(famp_map)
    pair_families = f.loc[f.paired_seq.notna(), 'familiy_pairs']
    print('check percentage assigned pair families', _pct_assigned(pair_families))

    # Overwrite the sample file with the two family columns added.
    f.to_csv(samp_path, sep='\t')
    print()

pat2_t1_mc
check percentage assigned heavy families 99.96082949308756
check percentage assigned pair families 99.95696703675016

pat2_t2_mc
check percentage assigned heavy families 99.96275720803203
check percentage assigned pair families 99.96262660238442

pat2_t3_mc
check percentage assigned heavy families 99.98562633255875
check percentage assigned pair families 99.98656556006688

pat2_t1_pc
check percentage assigned heavy families 100.0
check percentage assigned pair families 100.0

pat2_t2_pc
check percentage assigned heavy families 100.0
check percentage assigned pair families 100.0

pat2_t3_pc
check percentage assigned heavy families 99.97851079832384
check percentage assigned pair families 99.9881390107935

pat2_t4_pc
check percentage assigned heavy families 100.0
check percentage assigned pair families 100.0



## Mapping families from sample frames to replicate frames

In [9]:
# Propagate the family columns of each sample table to its replicate tables.
# Only the sample names (the metadata index) are needed from the filtered
# metadata, so iterate over the index rather than using iterrows().
for samp in metadata[metadata.patient == pat].index:

    f_samp = pd.read_csv('sequences/' + samp + '.tsv', sep='\t', index_col=0, low_memory=False)
    print(samp)

    # Replicate files are numbered from 1 to `repl_count`.
    for r in range(metadata.loc[samp, 'repl_count']):

        # The replicate table is read from and written back to the same file.
        repl_path = 'sequences/replicates/' + samp + '_r' + str(r + 1) + '.tsv'
        f_r = pd.read_csv(repl_path, sep='\t', index_col=0, low_memory=False)

        # (family column, rows counted in the check, message printed with it)
        checks = (
            ('familiy_heavy', f_r.chain == 'H', 'check percentage assigned heavy families'),
            ('familiy_pairs', f_r.paired_seq.notna(), 'check percentage assigned pair families'),
        )
        for column, selected, message in checks:
            # Look the family up in the sample table through `sample_id`.
            f_r[column] = f_r.sample_id.map(f_samp[column])
            assigned = f_r.loc[selected, column]
            # Report NaN for an empty selection instead of dividing by zero.
            if len(assigned) == 0:
                pct = float('nan')
            else:
                pct = assigned.notna().sum() / len(assigned) * 100
            print(message, pct)

        # Overwrite the replicate file with the two family columns added.
        f_r.to_csv(repl_path, sep='\t')

pat2_t1_mc
check percentage assigned heavy families 99.95506122907538
check percentage assigned pair families 99.94531784005468
check percentage assigned heavy families 99.98765889176849
check percentage assigned pair families 99.98482779547868
check percentage assigned heavy families 99.94984954864594
check percentage assigned pair families 99.95323460639128
check percentage assigned heavy families 99.94445678738059
check percentage assigned pair families 99.94563740146779
check percentage assigned heavy families 99.97064866451424
check percentage assigned pair families 99.96237772761475
pat2_t2_mc
check percentage assigned heavy families 99.94044073853485
check percentage assigned pair families 99.94679907785068
check percentage assigned heavy families 99.98514777959305
check percentage assigned pair families 99.98222222222222
check percentage assigned heavy families 99.96305191206355
check percentage assigned pair families 99.95513683266039
check percentage assigned heavy families 9