In [4]:
#Allows relative imports
import os
import sys
import pandas as pd 

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
#imports 
from src.preprocessing import *
from src.models import *
from src.train_eval_helpers import *
from src.plots import *
import torch
import torch.nn as nn
import torch.nn.functional as F
%load_ext autoreload
%autoreload 2
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi']= 300
import seaborn as sns
sns.set_style('darkgrid')

#checking gpu status
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using : {}".format(device))
else:
    device = torch.device('cpu')
    print("Using : {}".format(device))

RANGE = range(12,17)
TRAINDIR = '../TrainingData/'
SAMPLEDIR = '../SampleData/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using : cuda


In [40]:
#Cancer patient level TCRs
cancer_sample_df = pd.DataFrame(columns=['amino_acids','file'])
for file in os.listdir(SAMPLEDIR+'Cancer'):
    if '.md' in file :continue
    temp = pd.read_csv(os.path.join(SAMPLEDIR+'Cancer',file),
                       sep='\t',usecols=['aminoAcid'])
    temp.columns = ['amino_acids']
    temp['file'] = 'BR'+file.split('BR')[1].split('.tsv')[0]
    cancer_sample_df = cancer_sample_df.append(temp, ignore_index=True)

#Control patient level TCRs
control_sample_df = pd.DataFrame(columns=['amino_acids','file'])
for file in os.listdir(SAMPLEDIR+'Control'):
    if '.md' in file :continue
    temp = pd.read_csv(os.path.join(SAMPLEDIR+'Control',file),
                       sep='\t',usecols=['aminoAcid'])
    temp.columns = ['amino_acids']
    temp['file'] = 'HIP'+file.split('HIP')[1].split('.tsv')[0]
    control_sample_df = control_sample_df.append(temp, ignore_index=True)

In [57]:
def get_type(row):
    if row['file'].startswith('BR'):return "cancer"
    elif row['file'].startswith('HIP'):return "control"    

In [102]:
duplicates = pd.DataFrame(columns=[''])
for x in cancer_sample_df.amino_acids.values:
    if len(train_df[train_df.amino_acids.str.contains(x)])>0:
        duplicates = duplicates.append(cancer_sample_df.query('amino_acids == @x'), 
                                       ignore_index =True)
for y in control_sample_df.amino_acids.values:
    if len(train_df[train_df.amino_acids.str.contains(y)])>0:
        duplicates = duplicates.append(control_sample_df.query('amino_acids == @y'),
                                       ignore_index=True)
duplicates['type']= duplicates.apply(get_type, axis=1)
duplicates.drop(columns='', inplace=True)
duplicates['len']= duplicates.apply(lambda x: len(x['amino_acids']), axis=1)
duplicates.query('len>=12 & len <=16')

Unnamed: 0,amino_acids,file,type,len
0,CASSLVSANYGYTF,BR01B,cancer,14
1,CASSLVSANYGYTF,BR01B,cancer,14
2,CASSLVSANYGYTF,BR01B,cancer,14
3,CASSLVSANYGYTF,BR01B,cancer,14
4,CASSLGQGGYEQYF,BR01B,cancer,14
...,...,...,...,...
8621,CASSLGNSPLHF,HIP10377,control,12
8622,CASSRTGTYEQYF,HIP10377,control,13
8623,CASSPLGTEAFF,HIP09041,control,12
8624,CASSPLGTEAFF,HIP09041,control,12


In [107]:
len(duplicates.amino_acids.unique())

1558

In [125]:
test_aa = test_df.amino_acids.values
train_aa = train_df.amino_acids.values
duplicates_aa = duplicates.amino_acids.values


train_df['both'] = train_df.apply(lambda x: True if x['amino_acids'] in test_aa else False, axis=1)
train_df['patients'] = train_df.apply(lambda x: True if x['amino_acids'] in duplicates_aa else False, axis=1)
train_df['len']= train_df.apply(lambda x: len(x['amino_acids']), axis=1)

test_df['both'] = test_df.apply(lambda x: True if x['amino_acids'] in train_aa else False, axis=1)
#test_df['patients'] = test_df.apply(lambda x: True if x['amino_acids'] in duplicates_aa else False, axis=1)
test_df['len']= test_df.apply(lambda x: len(x['amino_acids']), axis=1)


In [143]:
train_filter = train_df.query('both==False and patients ==False and len>=12 and len<=16')
#and file.str.startswith("Tumor")')
train_tumor = train_filter.query('file.str.startswith("Tumor")')
train_control = train_filter.query('file.str.startswith("Normal")')

train_tumor[['amino_acids']].to_csv('Tumor_CDR3s_filtered.txt',
                                   encoding='utf8',index=False)
train_control[['amino_acids']].to_csv('Normal_CDR3s_filtered.txt',
                                     encoding='utf8',index=False)
