# Summarize collected data

In [1]:
# Imports
import os

#import matplotlib.pyplot as plt
import pandas as pd
import plotnine as pn


# Set the working directory so that the relative './data/...' and './results/...'
# paths used in the cells below resolve; replace the placeholder with your own path
os.chdir('path_to_your_dir')

In [2]:
# Read performance table of TCRex models
data = pd.read_csv('./data/parsed/tcrex_models.csv')

# Source metric column -> label of the combined "mean ± std" column
metric_labels = {'accuracy': 'Balanced accuracy',
                 'average_precision': 'Average precision',
                 'roc_auc': 'ROC AUC'}

# Round every metric and its standard deviation to two decimals
data = data.round({column: 2
                   for metric in metric_labels
                   for column in (metric, metric + '_std')})

# Combine each metric with its standard deviation.
# Values are formatted with a fixed two decimals: a plain str() conversion
# drops trailing zeros and yields uneven entries such as "0.9 ± 0.01".
for metric, label in metric_labels.items():
    data[label] = (data[metric].map('{:.2f}'.format)
                   + ' ± '
                   + data[metric + '_std'].map('{:.2f}'.format))

# Parse activity column: only the flag 't' counts as active
data['Active'] = data['is_active'].eq('t').map({True: 'Yes', False: 'No'})

# Parse column names
data = data.rename(columns={'epitope': 'Epitope',
                            'category_main': 'Viral/Cancer',
                            'category_sub': 'Origin',
                            'nr_target_sequences': 'Number of TCR sequences'})

# Select columns
data = data[['Epitope', 'Viral/Cancer', 'Origin',
             'Number of TCR sequences', 'Balanced accuracy',
             'Average precision', 'ROC AUC', 'Active']]
# Sort by origin
data = data.sort_values(by=['Viral/Cancer', 'Origin'], ascending=True)

In [3]:
# Relabel the 'HomoSapiens' placeholder: per column, the value that replaces it
homo_sapiens_relabel = {'Origin': 'Unknown', 'Viral/Cancer': 'Cancer'}
for column, new_label in homo_sapiens_relabel.items():
    data[column] = data[column].replace({'HomoSapiens': new_label})

# Show the row of epitope NLSALGIFST to inspect the result of the relabelling
data.loc[data['Epitope'] == 'NLSALGIFST']

Unnamed: 0,Epitope,Viral/Cancer,Origin,Number of TCR sequences,Balanced accuracy,Average precision,ROC AUC,Active
84,NLSALGIFST,Cancer,Unknown,111,0.51 ± 0.01,0.19 ± 0.03,0.66 ± 0.04,No


## Export table

In [4]:
# Display the formatted table (long frames are truncated in the rendered output)
data

Unnamed: 0,Epitope,Viral/Cancer,Origin,Number of TCR sequences,Balanced accuracy,Average precision,ROC AUC,Active
4,EAAGIGILTV,Cancer,Melanoma,266,0.69 ± 0.03,0.74 ± 0.01,0.9 ± 0.01,Yes
37,ELAGIGILTV,Cancer,Melanoma,1035,0.54 ± 0.0,0.37 ± 0.03,0.72 ± 0.01,Yes
73,AMFWSVPTV,Cancer,Melanoma,82,0.59 ± 0.04,0.48 ± 0.12,0.78 ± 0.04,Yes
74,FLYNLLTRV,Cancer,Melanoma,61,0.63 ± 0.08,0.59 ± 0.1,0.87 ± 0.03,Yes
96,LLLGIGILV,Cancer,Multiple Myeloma,233,0.58 ± 0.01,0.44 ± 0.06,0.77 ± 0.04,Yes
...,...,...,...,...,...,...,...,...
124,FVDGVPFVV,Viral,SARS-CoV-2,2420,0.54 ± 0.0,0.37 ± 0.01,0.77 ± 0.01,Yes
125,HTTDPSFLGRY,Viral,SARS-CoV-2,5000,0.7 ± 0.01,0.7 ± 0.02,0.9 ± 0.01,Yes
119,ILIEGIFFV,Viral,VZV,111,0.63 ± 0.03,0.55 ± 0.09,0.82 ± 0.04,Yes
121,ALSQYHVYV,Viral,VZV,69,0.62 ± 0.03,0.51 ± 0.06,0.74 ± 0.04,Yes


In [5]:
# Save the table to a tab-separated file (without the index column)
data.to_csv('./results/data_description/data_description.tsv',sep='\t', index=False)

## Export info as a figure

In [6]:
# Add column for groupby function on nr of epitopes:
# every row is one epitope, so summing this column counts epitopes
data['Number epitopes'] = 1

# Sum Nr of sequences and epitopes for each origin.
# The numeric columns are selected explicitly: since pandas 2.0 `sum()` no
# longer drops non-numeric columns, so summing the whole frame would also
# try to aggregate the text columns (Epitope, ROC AUC, ...).
summary = (data.groupby('Origin')[['Number of TCR sequences', 'Number epitopes']]
               .sum()
               .reset_index())
summary

Unnamed: 0,Origin,Number of TCR sequences,Number epitopes
0,CMV,5656,8
1,DENV1,165,1
2,DENV2,60,1
3,DENV3/4,158,1
4,EBV,1687,6
5,HCV,484,5
6,HIV,2477,20
7,HSV2,63,1
8,HTLV1,131,1
9,Influenza,5536,3


In [7]:
# Add Viral/Cancer label: use the first label listed in `data` for each origin
first_label_per_origin = (data.drop_duplicates(subset='Origin')
                              .set_index('Origin')['Viral/Cancer']
                              .to_dict())
summary['Viral/Cancer'] = [first_label_per_origin[origin]
                           for origin in summary['Origin']]

# Order the rows by class, then by ascending number of sequences
summary = summary.sort_values(by=['Viral/Cancer', 'Number of TCR sequences'])
summary

Unnamed: 0,Origin,Number of TCR sequences,Number epitopes,Viral/Cancer
13,Unknown,111,1,Cancer
11,Multiple Myeloma,233,1,Cancer
10,Melanoma,1444,4,Cancer
2,DENV2,60,1,Viral
7,HSV2,63,1,Viral
8,HTLV1,131,1,Viral
3,DENV3/4,158,1,Viral
1,DENV1,165,1,Viral
14,VZV,180,2,Viral
15,YellowFeverVirus,474,1,Viral


In [8]:
# Order plot according to viral/cancer class and size
# https://plotnine.readthedocs.io/en/stable/tutorials/miscellaneous-order-plot-series.html
# The current row order of `summary` becomes the category order of 'Origin'
sorted_data = summary['Origin'].tolist()
origin_as_category = summary['Origin'].astype("category")
summary = summary.assign(
    Origin=origin_as_category.cat.reorder_categories(sorted_data)
)

In [9]:
# Bar chart of the number of TCR sequences per origin, filled by viral/cancer
# class and annotated with the number of epitopes of each origin
plot = (pn.ggplot(summary, pn.aes('Origin', 'Number of TCR sequences', fill='Viral/Cancer')) +
        pn.geom_col() +
        pn.coord_flip() +
        # nudge_y offsets the text labels by 300 along the sequence-count axis
        pn.geom_text(pn.aes(label='Number epitopes'), nudge_y=300) +
        # Palette taken from
        # https://davidmathlogic.com/colorblind/#%23FFC20A-%230C7BDC
        pn.scale_fill_manual(values=('#FFC20A', '#0C7BDC')) +
        # theme_bw() is a complete theme and replaces every theme setting added
        # before it, so it has to precede the individual overrides
        pn.theme_bw() +
        pn.theme(legend_title=pn.element_blank(), figure_size=(12, 8)))





In [10]:
# Save the plot to a PNG file at 600 dpi
plot.save('./results/data_description/data_overview.png', dpi=600)

