In [86]:
#Allows relative imports
import os
import sys
import pandas as pd 

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
#imports 
from src.preprocessing import *
from src.models import *
from src.train_eval_helpers import *
from src.plots import *
import torch
import torch.nn as nn
import torch.nn.functional as F
%load_ext autoreload
%autoreload 2
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi']= 300
import seaborn as sns
sns.set_style('darkgrid')

#checking gpu status
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using : {}".format(device))
else:
    device = torch.device('cpu')
    print("Using : {}".format(device))

RANGE = range(12,17)
TRAINDIR = '../TrainingData/'
SAMPLEDIR = '../SampleData/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using : cuda


In [29]:
train_tumor = pd.read_csv('../TrainingData/TumorCDR3.txt', header = None)\
                .rename(columns={0:'amino_acid'})
train_normal = pd.read_csv('../TrainingData/NormalCDR3.txt', header = None)\
                .rename(columns={0:'amino_acid'})
test_tumor = pd.read_csv('../TrainingData/TumorCDR3_test.txt', header = None)\
                .rename(columns={0:'amino_acid'})
test_normal = pd.read_csv('../TrainingData/NormalCDR3_test.txt', header = None)\
                .rename(columns={0:'amino_acid'})

train_tumor['type']="train_tumor"
train_normal['type'] = "train_normal"
test_tumor['type']="test_tumor"
test_normal['type'] = "test_normal"

train_tumor_aa = train_tumor.amino_acid.values 
train_normal_aa = train_normal.amino_acid.values
test_tumor_aa = test_tumor.amino_acid.values 
test_normal_aa = test_normal.amino_acid.values 
            
test_tumor['in_train'] = test_tumor.apply(lambda x: True if x['amino_acid'] in train_tumor_aa else False, axis=1)
test_normal['in_train'] = test_normal.apply(lambda x: True if x['amino_acid'] in train_normal_aa else False, axis=1)
print("tumor seqs in both train and test : ",len(test_tumor.query('in_train==1')))
print("Normal seqs in both train and test : ",len(test_normal.query('in_train==1')))

tumor seqs in both train and test :  10000
Normal seqs in both train and test :  0


In [87]:
train_df = pd.concat([train_tumor, train_normal], ignore_index=True)
test_df = pd.concat([test_tumor, test_normal], ignore_index=True)

#Cancer patient level TCRs
cancer_sample_df = pd.DataFrame(columns=['amino_acid','file'])
for file in os.listdir(SAMPLEDIR+'Cancer'):
    if '.md' in file :continue
    temp = pd.read_csv(os.path.join(SAMPLEDIR+'Cancer',file),
                       sep='\t',usecols=['aminoAcid'])
    temp.columns = ['amino_acid']
    temp['file'] = 'BR'+file.split('BR')[1].split('.tsv')[0]
    cancer_sample_df = cancer_sample_df.append(temp, ignore_index=True)

#Control patient level TCRs
control_sample_df = pd.DataFrame(columns=['amino_acid','file'])
for file in os.listdir(SAMPLEDIR+'Control'):
    if '.md' in file :continue
    temp = pd.read_csv(os.path.join(SAMPLEDIR+'Control',file),
                       sep='\t',usecols=['aminoAcid'])
    temp.columns = ['amino_acid']
    temp['file'] = 'HIP'+file.split('HIP')[1].split('.tsv')[0]
    control_sample_df = control_sample_df.append(temp, ignore_index=True)

In [89]:
cancer_sample_df

Unnamed: 0,amino_acid,file
0,CASSLELGALGGNTIYF,BR01B
1,CASSLELGALGGNTIYF,BR01B
2,CASGTSGFTDTQYF,BR01B
3,CAWRTSGLTDTQYF,BR01B
4,CSVGGGGFNEKLFF,BR01B
...,...,...
3937,CASSDGFEAFF,BR26B
3938,CASSISGVEQYF,BR26B
3939,CASSISGVEQFF,BR26B
3940,CAWRAGVGGYTF,BR26B


In [90]:
duplicates = pd.DataFrame(columns=[''])
for x in cancer_sample_df.amino_acid.values:
    if len(train_df.query('@x in amino_acid'))>0:
        tmp = cancer_sample_df.query('amino_acid == @x').copy()
        tmp['type']='cancer'
        duplicates = duplicates.append(cancer_sample_df.query('amino_acid == @x'), 
                                       ignore_index =True)
for y in control_sample_df.amino_acid.values:
    if len(train_df.query('@y in amino_acid'))>0:
        tmp = control_sample_df.query('amino_acid == @y').copy()
        tmp['type']='control'
        duplicates = duplicates.append(control_sample_df.query('amino_acid == @y'),
                                       ignore_index=True)

KeyboardInterrupt: 

In [92]:
def get_type(row):
    if row['file'].startswith('BR'):return "cancer"
    elif row['file'].startswith('HIP'):return "control"        

duplicates['type']= duplicates.apply(get_type, axis=1)
duplicates.drop(columns='', inplace=True)
duplicates['len']= duplicates.apply(lambda x: len(x['amino_acid']), axis=1)
duplicates.query('len>=12 & len <=16').head()
len(duplicates.amino_acid.unique())

488

In [93]:
duplicates

Unnamed: 0,amino_acid,file,type,len
0,CASSLVSANYGYTF,BR01B,cancer,14
1,CASSLVSANYGYTF,BR01B,cancer,14
2,CASSLVSANYGYTF,BR01B,cancer,14
3,CASSLVSANYGYTF,BR01B,cancer,14
4,CASSLGQGGYEQYF,BR01B,cancer,14
...,...,...,...,...
1484,CASSEGGSGANVLTF,HIP09022,control,15
1485,CASSEGGSGANVLTF,HIP09122,control,15
1486,CASSEGGSGANVLTF,HIP09366,control,15
1487,CASSEGGSGANVLTF,HIP09559,control,15


In [56]:
test_aa = test_df.amino_acid.values
train_aa = train_df.amino_acid.values
duplicates_aa = duplicates.amino_acid.values


train_df['both'] = train_df.apply(lambda x: True if x['amino_acid'] in test_aa else False, axis=1)
train_df['patients'] = train_df.apply(lambda x: True if x['amino_acid'] in duplicates_aa else False, axis=1)
train_df['len']= train_df.apply(lambda x: len(x['amino_acid']), axis=1)

test_df['both'] = test_df.apply(lambda x: True if x['amino_acid'] in train_aa else False, axis=1)
#test_df['patients'] = test_df.apply(lambda x: True if x['amino_acids'] in duplicates_aa else False, axis=1)
test_df['len']= test_df.apply(lambda x: len(x['amino_acid']), axis=1)


In [143]:
train_filter = train_df.query('both==False and patients ==False and len>=12 and len<=16')
#and file.str.startswith("Tumor")')
train_tumor = train_filter.query('file.str.startswith("Tumor")')
train_control = train_filter.query('file.str.startswith("Normal")')

train_tumor[['amino_acids']].to_csv('Tumor_CDR3s_filtered.txt',
                                   encoding='utf8',index=False)
train_control[['amino_acids']].to_csv('Normal_CDR3s_filtered.txt',
                                     encoding='utf8',index=False)


In [77]:
mine = read_adaptive_tsv('../training_data_new/new_data/ostmeyer_breast/breast_pbmc/BR05B.tsv')
theirs = pd.read_csv('../../DeepCAT/SampleData/Cancer/TestReal-BR05B.tsv_ClusteredCDR3s_7.5.txt',
                header=0, sep='\t')

Currently reading :  ../training_data_new/new_data/ostmeyer_breast/breast_pbmc/BR05B.tsv

In [84]:
theirs

Unnamed: 0,aminoAcid,vMaxResolved,frequencyCount....,Group
0,CASSPGGVSTDTQYF,TCRBV18-01*01,0.000029,1
1,CASSPGGTSTDTQYF,TCRBV18-01*01,0.000029,1
2,CASSETRGGTDTQYF,TCRBV06-01*01,0.000931,2
3,CASSETRGGTDTQYF,TCRBV06-01*01,0.000058,2
4,CASSWTSGDYNEQFF,TCRBV05-01*01,0.000873,3
...,...,...,...,...
237,CASSPLAPNYGYTF,TCRBV11-02*02,0.000029,109
238,CASSYAGDYGYTF,TCRBV06-06,0.000029,110
239,CASSYAGDYGYTF,TCRBV06-06,0.000029,110
240,CASSRRTNEQFF,TCRBV28-01*01,0.000029,111


In [78]:
print("their version of reading the .tsv file (pre-parsed in SampleData)")
display(theirs.sort_values('frequencyCount....', ascending=False).head(5))
print("My version of reading the .tsv file")
display(mine.sort_values('frequency',ascending=False)[['amino_acid','v_resolved','frequency']].head(5))

their version of reading the .tsv file (pre-parsed in SampleData)


Unnamed: 0,aminoAcid,vMaxResolved,frequencyCount....,Group
65,CARSLGLYEQYF,TCRBV05-03,0.002676,28
71,CASSFNGLEETQYF,TCRBV05-01*01,0.002356,31
2,CASSETRGGTDTQYF,TCRBV06-01*01,0.000931,2
4,CASSWTSGDYNEQFF,TCRBV05-01*01,0.000873,3
152,CASSPGHEQYF,TCRBV07-06*01,0.000814,67


My version of reading the .tsv file


Unnamed: 0,amino_acid,v_resolved,frequency
3703,CAWQNKERGANVLTF,TCRBV30-01*01,0.00605
20053,CARSLGLYEQYF,TCRBV05-03,0.002676
11139,CASSKGELADTQYF,TCRBV06-04,0.002501
18427,CASSFNGLEETQYF,TCRBV05-01*01,0.002356
11028,CASSGTSGSTDTQYF,TCRBV06-04,0.002123


In [79]:
theirs.query('aminoAcid == "CAWQNKERGANVLTF"')

Unnamed: 0,aminoAcid,vMaxResolved,frequencyCount....,Group


In [80]:
mine

Unnamed: 0,amino_acid,v_gene,v_resolved,frequency,len
3703,CAWQNKERGANVLTF,TCRBV30-01,TCRBV30-01*01,0.006050,15
20053,CARSLGLYEQYF,TCRBV05-03,TCRBV05-03,0.002676,12
11139,CASSKGELADTQYF,TCRBV06-04,TCRBV06-04,0.002501,14
18427,CASSFNGLEETQYF,TCRBV05-01,TCRBV05-01*01,0.002356,14
11028,CASSGTSGSTDTQYF,TCRBV06-04,TCRBV06-04,0.002123,15
...,...,...,...,...,...
3241,CSVEIGDSGYTF,TCRBV29-01,TCRBV29-01*01,0.000029,12
3242,CSVAFDRDYTF,TCRBV29-01,TCRBV29-01*01,0.000029,11
3245,CSVEHYVPNEQFF,TCRBV29-01,TCRBV29-01*01,0.000029,13
3246,CSVGGAGGPYEQYF,TCRBV29-01,TCRBV29-01*01,0.000029,14


In [85]:
mine.to_csv('mine_test.tsv', index=False, header=True, sep='\t',
            columns = ['amino_acid','v_resolved','frequency'])