# Generation probability distribution in true repertoires

In [1]:
# Imports
import os
import re
import numpy as np
import pandas as pd

# Change into the folder that holds the local gen_prob_distributions module
# before importing it. Replace 'path_to_your_dir' with your own location.
# NOTE(review): presumably the import relies on the current working directory
# being importable (and possibly on files relative to it) — confirm.
os.chdir('path_to_your_dir/generation_prob')
import gen_prob_distributions as gpd

# Set the working directory: the relative './data' and './results' paths used
# in the following cells are resolved against it.
# NOTE(review): if 'path_to_your_dir' is filled in as a relative path, this call
# is resolved from inside generation_prob/ (set by the chdir above) — an
# absolute path avoids that.
os.chdir('path_to_your_dir')


## Read TCRex data

In [2]:
# Load every epitope-specific TCR parsed by TCRex and keep a single row per
# CDR3 beta sequence-epitope combination.
# The epitope column must not hold comma-joined epitopes: each TCR-epitope
# pair is expected on its own row.
# NOTE(review): the file carries a .tsv extension but is parsed with
# read_csv's default comma separator — confirm the file is comma-delimited.
data = (
    pd.read_csv('./data/final/all_tcrs.tsv')
    .drop_duplicates(subset=['junction_aa', 'epitope'])
)
data

Unnamed: 0,v_call,junction_aa,j_call,epitope
0,TRBV07-06,CASSLARGVLMNTEAFF,TRBJ01-01,TVYDPLQPELDSFK
1,TRBV10-02,CASSKGSTEAFF,TRBJ01-01,TVYDPLQPELDSFK
2,TRBV27,CASSLMGGSSYEQYF,TRBJ02-07,TVYDPLQPELDSFK
3,TRBV07-02,CASSLVLASYEQYF,TRBJ02-07,TVYDPLQPELDSFK
4,TRBV04-01,CASSLMAGPGNIQYF,TRBJ02-04,TVYDPLQPELDSFK
...,...,...,...,...
44192,TRBV04-02,CASSQDSGQIDTGELFF,TRBJ02-02,ALSKGVHFV
44193,TRBV27,CASSLSGGWAGGLEQYF,TRBJ02-07,ALSKGVHFV
44194,TRBV27,CASSLSGTYYEQYF,TRBJ02-07,ALSKGVHFV
44195,TRBV27,CASSISVYSPLHF,TRBJ01-06,ALSKGVHFV


In [3]:
# Count the TCRs per epitope, order the epitopes from most to least data,
# and restrict the data to the 8 epitopes at the top of that ranking.
sizes = (
    data.groupby('epitope')
    .size()
    .reset_index()
    .sort_values(by=0, ascending=False)
)
epitopes = sizes.head(8)['epitope'].tolist()
data = data[data['epitope'].isin(epitopes)]

In [4]:
# Compute the generation probability (p_gen) of every CDR3 sequence by calling
# gpd.calculate_p_gen on each junction_aa value, then drop the rows whose
# p_gen is exactly 0.
# The column is added with .assign(), which returns a new DataFrame; writing
# data['p_gen'] = ... directly onto a frame obtained by boolean filtering
# raises pandas' SettingWithCopyWarning.
data = data.assign(p_gen=data['junction_aa'].apply(gpd.calculate_p_gen))
print(data.shape)  # shape before removing p_gen == 0

data = data[data['p_gen'] != 0]
print(data.shape)  # shape after removing p_gen == 0

(22646, 5)
(22588, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
# Keep only the TCR annotation columns and write the filtered set to disk
# without the index.
# NOTE(review): this selection drops the p_gen column computed earlier, so
# float_format only matters if one of the remaining columns is numeric —
# confirm that saving without p_gen is intended.
output_columns = ['v_call', 'junction_aa', 'j_call', 'epitope']
data = data[output_columns]

output_path = './results/generation_prob/true_repertoires/tcrex_pgen_data.csv'
data.to_csv(output_path, index=False, float_format='%.8f')