# Collect all simulated data into one df

In [1]:
# Imports
import os

import pandas as pd

os.chdir('path_to_your_dir/generation_prob')
import gen_prob_distributions as gpd

# Set directory
os.chdir('path_to_your_dir')

In [2]:
# Simulated data
simulations = ['simulation1','simulation2','simulation3',
               'simulation4','simulation5','simulation6',
               'simulation7','simulation8']

all_data = pd.DataFrame()

for simulation in simulations:
    
    # Read in simulated TCR data
    data = pd.read_csv(('./results/ligo_simulations_one_seed/'+simulation+'/results/inst1/exported_dataset/airr/batch1.tsv'),sep='\t')
    
    # Remove duplicated CDR3 beta sequence-signal combinations
    data = data.drop_duplicates(subset=['junction_aa','signals_aggregated'])
    
    # Collect data from all simulations in one df
    data['simulation'] = simulation
    all_data = pd.concat([all_data,data])
    
# Add epitope column to the table    
all_data['epitope'] = all_data['simulation']

In [3]:
all_data = all_data[['v_call', 'j_call',
                     'junction_aa','epitope']]


In [4]:
all_data

Unnamed: 0,v_call,j_call,junction_aa,epitope
0,TRBV7-3*01,TRBJ1-1*01,CASSTWTGGKSEAFF,simulation1
1,TRBV10-1*02,TRBJ2-7*01,CASSGSKTGEPHEQYF,simulation1
2,TRBV6-6*01,TRBJ2-1*01,CASSYWAGETHEQFF,simulation1
3,TRBV6-6*02,TRBJ1-4*01,CASILWTGEHNEKLFF,simulation1
4,TRBV9*01,TRBJ2-7*01,CASSHWTGENYEQYF,simulation1
...,...,...,...,...
2995,TRBV6-5*01,TRBJ2-7*01,CASKPPGLAGGSYEQYF,simulation8
2996,TRBV10-1*02,TRBJ2-7*01,CASRALAGGGYEQYF,simulation8
2997,TRBV18*01,TRBJ2-7*01,CASSPGLAGAPHEQYF,simulation8
2998,TRBV4-2*01,TRBJ2-7*01,CASSRLAGGCYEQYF,simulation8


In [5]:
# Calculate p gen
all_data['p_gen'] = all_data['junction_aa'].apply(lambda x: gpd.calculate_p_gen(x))
print(all_data.shape)
all_data = all_data[all_data['p_gen']!=0]
print(all_data.shape)

(21576, 5)
(21576, 5)


In [6]:
all_data.to_csv('./notebooks/paper/generation_prob/simulated_repertoires/all_data.csv', 
            index=False)