In [1]:
from __future__ import print_function, division
#Allows relative imports
import os, sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
#imports from files
from src.preprocessing import *
from src.VAE_train import *
from vae_cel.vae_cel import *
from src.loss_metrics import *
from src.pickling import *
from src.datasets import *

import pandas as pd 
import numpy as np
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

#checking gpu status
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using : {}".format(device))
else:
    device = torch.device('cpu')
    print("Using : {}".format(device))
    
#Plot and stuff
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi']= 200
sns.set_style('darkgrid')

torch.cuda.empty_cache()
# Ignore warnings)
import warnings
warnings.filterwarnings("ignore")
    
%load_ext autoreload
%autoreload 2

Using : cuda


In [2]:
db = pd.read_csv('../training_data_new/db/db_TRB.csv')\
       .query('cdr3_TRB.str.len()>10 and cdr3_TRB.str.len() <= 23')\
       .query('cdr3_TRB.str.endswith("F") and cdr3_TRB.str.startswith("C")')
#, nrows = 100)
em = pd.read_csv('../training_data_new/emerson_raw/batch1/emerson_batch1_626hla_parsed_tagged.tsv', 
                 sep='\t')\
       .query('amino_acid.str.len()>10 and amino_acid.str.len()<=23')\
       .query('amino_acid.str.endswith("F") and amino_acid.str.startswith("C")')
em = em[['amino_acid','v_family','j_family']]
em = em.sample(2*len(db))

In [12]:
db = pd.concat([db2, db1.sample(frac=0.2)], ignore_index=True)
db

Unnamed: 0,amino_acid,v_family,j_family,db
0,CASSRDRGNQETQYF,27,2,naive_db
1,CASSETVDGPRAEAFF,6,1,naive_db
2,CAISERVESETQYF,10,2,naive_db
3,CASSSGVTGNTIYF,18,1,naive_db
4,CASSPPGGGSSYEQYF,18,2,naive_db
...,...,...,...,...
4032793,CSARGTGGAYGYTF,20,1,naive_db
4032794,CSVDSGQGQPQHF,29,1,naive_db
4032795,CSASPPTIAGGPSPYNEQFF,20,2,naive_db
4032796,CSAMGLGEASYEQYF,20,2,naive_db


In [14]:
em['V'] = em['v_family'].apply(lambda x : int(x.split('V')[1]))
em['J'] = em['j_family'].apply(lambda x : int(x.split('J')[1]))
em = em.drop(columns = ['v_family','j_family']).rename(columns = {'V':'v_family', 'J':'j_family'})
db = db[['cdr3_TRB', 'TRBV','TRBJ']]
db['V'] = db['TRBV'].apply(lambda x: x.split('V')[1])
db['V'] = db['V'].apply(lambda x : int(x.split('-')[0]) if '-' in x else int(x))
db['J'] = db['TRBJ'].apply(lambda x: x.split('J')[1])
db['J'] = db['J'].apply(lambda x : int(x.split('-')[0]) if '-' in x else int(x))
db = db.drop(columns=['TRBV','TRBJ']).rename(columns = {'V':'v_family', 'J':'j_family'})
em['db'] = 'emerson_b1'
db['db'] = 'naive_db'
db = db.rename(columns = {'cdr3_TRB':'amino_acid'})

In [19]:
db.to_csv('../training_data_new/mixed_vj_dataset/naive_vj_parsed.tsv',
         sep = '\t', header = True, index = True)
em.to_csv('../training_data_new/mixed_vj_dataset/emerson_b1_vj_parsed.tsv',
         sep = '\t', header = True, index = True)
total = pd.concat([db, em], ignore_index = True)

In [29]:
train = total.sample(frac=.75)
test = total.loc[total.index.difference(train.index)]
train = train.sample(4000000)
test = test.sample(1000000)

In [31]:
for df in [train, test]:
    df['v_family'] -= 1
    df['j_family'] -= 1

In [32]:
train.to_csv('../training_data_new/mixed_vj_dataset/mixed_vj_train.csv', header=True,
            index=True)
test.to_csv('../training_data_new/mixed_vj_dataset/mixed_vj_test.csv', header=True,
           index=True)

### Recleaning Emerson repertoires for DeepRC (13.04.21)

In [10]:
train = pd.read_csv('../training_data_new/emerson_raw/batch1/emerson_batch1_train.tsv',
                   sep = '\t', nrows =3)
train.columns

Index(['amino_acid', 'frequency', 'rel_freq', 'v_family', 'v_gene', 'd_family',
       'd_gene', 'j_family', 'j_gene', 'filename', 'pred_cmv', 'true_cmv',
       'age', 'sex', 'race', 'hla_a1', 'hla_a2', 'hla_b1', 'hla_b2', 'len'],
      dtype='object')

#### directly encode V/J genes and then save with a smaller subset of columns to reduce size


In [4]:
for fn in ['../training_data_new/emerson_raw/batch1/emerson_batch1_test.tsv']:
    df = pd.read_csv(fn, sep = '\t', usecols = ['amino_acid','frequency','v_family','j_family',
                                               'filename', 'hla_a1','hla_a2','hla_b1','hla_b2'])
    
    df['V'] = df['v_family'].apply(lambda x : int(x.split('V')[1]))
    df['J'] = df['j_family'].apply(lambda x : int(x.split('J')[1]))
    df = df.drop(columns = ['v_family','j_family'])\
           .rename(columns = {'V':'v_family', 'J':'j_family'})
    df['v_family'] -= 1
    df['j_family'] -= 1
    df = df[['filename','amino_acid','frequency','v_family','j_family',
             'hla_a1','hla_a2','hla_b1','hla_b2']]
    df.to_csv(fn, sep = '\t', header=True, index=True)
    del df

In [9]:
pd.read_csv('../training_data_new/emerson_raw/batch1/emerson_batch1_train.tsv', sep ='\t', usecols =['filename', 'amino_acid','frequency','v_family','j_family',
                                'hla_a1','hla_a2','hla_b1','hla_b2'], nrows = 2)

Unnamed: 0,filename,amino_acid,frequency,v_family,j_family,hla_a1,hla_a2,hla_b1,hla_b2
0,HIP00734.tsv,CASSLQGATEAFF,0.014656,5,0,A02,Unknown,B41,B44
1,HIP00734.tsv,CASSNPGTGGGGYTF,0.007007,5,0,A02,Unknown,B41,B44


In [12]:
fn.split('.tsv')[0]+'.csv'

'../training_data_new/emerson_raw/batch1/emerson_batch1_test.csv'

In [14]:
for fn in ['../training_data_new/emerson_raw/batch1/emerson_batch1_train.tsv',
           '../training_data_new/emerson_raw/batch1/emerson_batch1_test.tsv']:
    df = pd.read_csv(fn, sep ='\t', usecols= ['filename', 'amino_acid','frequency','v_family','j_family',
                                'hla_a1','hla_a2','hla_b1','hla_b2'])
    fn = fn.split('.tsv')[0]+'.csv'
    df.to_csv(fn, header = True, index = False)
    del df 


### Checking dataset object

In [49]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from datetime import datetime as dt 

class xd(Dataset):
    """
    Dataset class from the Emerson study (batch1), should facilitate how I handle classes etc.
    path should contain all the necessary data (top10k_train.csv, top10k_test.csv, sample_tags)
    """
    
    def __init__(self, path, max_len, istrain=True, top_k = 10000,
                 allele = 'A', pos_class = 'A01'):
        
        if allele != 'A' and allele != 'B':
            raise Exception('Allele must be "A" or "B"!')
            
        if istrain == True : 
            fn = 'emerson_batch1_train.csv'
            which = 'train'
        elif istrain == False : 
            fn = 'emerson_batch1_test.csv'
            which = 'test'
            
        #Reading the sample tags with the associated split-df
        tags = pd.read_csv('../training_data_new/emerson_raw/batch1/emerson_batch1_sampletags.tsv', 
                  sep = '\t')\
                 .query('dataset == @which')\
                 .reset_index()

        #Getting the one vs rest label (must be defined at initialization)
        if allele == 'A' : columns = ['hla_a1', 'hla_a2']
        elif allele == 'B' : columns = ['hla_b1', 'hla_b2'] 
        # 1 if either of hla_x1 or hla_x2 is of that label 
        tags['class_label'] = tags.apply(lambda x: 1 if (x[columns[0]] == pos_class or x[columns[1]] == pos_class) else 0, axis = 1)
        
        #Also this is the actual iterable (i.e. one sample = one patient)
        #saving values
        self.patients = tags.filename.values
        self.labels = tags.class_label.values 
        self.len = len(tags)        
        
        #Reading the top_K most frequent (grouped by patient) values from DF 
        df = get_patient_topk_sequences(pd.read_csv(os.path.join(path, fn))\
                                          .query('amino_acid.str.len() <= @max_len'), 
                                        top_k)
        
        #Loading the values from DF and saving to attribute
        self.seq_filename = df.filename.values 
        self.values = df[['amino_acid', 'v_family','j_family']].values
        self.frequency = df.frequency.values
        self.n_per_bag = np.array([len(df.query('filename == @x')) for x in tags.filename])

    def __getitem__(self, index):
        # 1 index = 1 patient
        patient = self.patients[index]
        target = self.labels[index]
        n_per_bag = self.n_per_bag[index]
        #print(patient, type(patient))
        #Getting the sequences associated with the patients
        if type(patient) == str:
            indices = np.where(self.seq_filename == patient)
        else:
            indices = np.empty(0,dtype=np.int64)
            for p in patient:
                indices = np.append(indices, np.where(self.seq_filename == p))
        values = self.values[indices] 
        
        #An input to DeepRC should be ((x_tuple), n_per_bag) , where x_tuple is batch_aa_vj(values)
        #But should probly do the encoding in DeepRC or in the network
        return values, n_per_bag, target
    
    def __len__(self):
        return self.len
     

In [38]:
#Time it takes to load a given dataset (with top_k sequences)
df.sort_values('top_k')

Unnamed: 0,train,top_k,time_minute,time_seconds
0,False,5000,0.0,7.408663
0,True,5000,1.0,44.91153
0,False,10000,0.0,8.032257
0,True,10000,2.0,22.483328
0,False,15000,0.0,8.42307
0,True,15000,3.0,7.144351
0,False,20000,0.0,9.396336
0,True,20000,4.0,16.154159


In [37]:
df= pd.DataFrame(columns = ['train','top_k','time_minute','time_seconds'])
for t in [False, True]:
    for top_k in [5000, 10000, 15000]:
        print(f'FOR TRAIN = {t}, TOP_K = {top_k}:')
        start_time = dt.now()
        dataset = EmersonRepertoire_Dataset(path = '../training_data_new/emerson_raw/batch1/',
                                            max_len = 23, istrain=t,
                                            top_k = top_k, allele = 'A', pos_class = 'A01')
        del dataset
        end_time = dt.now()       
        elapsed = divmod((end_time-start_time).total_seconds(), 60)
        print(f"\nTime elapsed for train == {t}, top_k = {top_k}\n\t{elapsed[0]} minutes\n\t{elapsed[1]} seconds\n\n\t###############\n")
        df = df.append(pd.DataFrame([[t, top_k, elapsed[0], elapsed[1]]], columns = df.columns))

FOR TRAIN = False, TOP_K = 20000:

Time elapsed reading tags : 
	0.0 minutes
	0.01097 seconds

Time elapsed getting/saving label : 
	0.0 minutes
	0.002992 seconds

Time elapsed getting Top K : 
	0.0 minutes
	6.479115 seconds

Time elapsed saving values (seq/V/J/freq/n_per_bag) : 
	0.0 minutes
	2.826464 seconds

Time elapsed for train == False, top_k = 20000
	0.0 minutes
	9.396336 seconds

	###############

FOR TRAIN = True, TOP_K = 20000:

Time elapsed reading tags : 
	0.0 minutes
	0.007978 seconds

Time elapsed getting/saving label : 
	0.0 minutes
	0.012966 seconds

Time elapsed getting Top K : 
	1.0 minutes
	3.9994200000000006 seconds

Time elapsed saving values (seq/V/J/freq/n_per_bag) : 
	3.0 minutes
	11.190397999999988 seconds

Time elapsed for train == True, top_k = 20000
	4.0 minutes
	16.154158999999993 seconds

	###############



### Split/Batching ? 

In [1]:
from __future__ import print_function, division
#Allows relative imports
import os, sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
#imports from files
from src.preprocessing import *
from src.VAE_train import *
from src.vautoencoders import *
from src.loss_metrics import *
from src.pickling import *
from src.datasets import *

import pandas as pd 
import numpy as np
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

#checking gpu status
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using : {}".format(device))
else:
    device = torch.device('cpu')
    print("Using : {}".format(device))
    
#Plot and stuff
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi']= 200
sns.set_style('darkgrid')

torch.cuda.empty_cache()
# Ignore warnings)
import warnings
warnings.filterwarnings("ignore")
    
%load_ext autoreload
%autoreload 2

Using : cuda


In [2]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from datetime import datetime as dt 

class xd(Dataset):
    """
    Dataset class from the Emerson study (batch1), should facilitate how I handle classes etc.
    path should contain all the necessary data (top10k_train.csv, top10k_test.csv, sample_tags)
    """
    
    def __init__(self, path, max_len, istrain=True, top_k = 10000,
                 allele = 'A', pos_class = 'A01'):
        
        if allele != 'A' and allele != 'B':
            raise Exception('Allele must be "A" or "B"!')
            
        if istrain == True : 
            fn = 'emerson_batch1_train.csv'
            which = 'train'
        elif istrain == False : 
            fn = 'emerson_batch1_test.csv'
            which = 'test'
            
        #Reading the sample tags with the associated split-df
        tags = pd.read_csv('../training_data_new/emerson_raw/batch1/emerson_batch1_sampletags.tsv', 
                  sep = '\t')\
                 .query('dataset == @which')\
                 .reset_index()

        #Getting the one vs rest label (must be defined at initialization)
        if allele == 'A' : columns = ['hla_a1', 'hla_a2']
        elif allele == 'B' : columns = ['hla_b1', 'hla_b2'] 
        # 1 if either of hla_x1 or hla_x2 is of that label 
        tags['class_label'] = tags.apply(lambda x: 1 if (x[columns[0]] == pos_class or x[columns[1]] == pos_class) else 0, axis = 1)
        
        #Also this is the actual iterable (i.e. one sample = one patient)
        #saving values
        self.patients = tags.filename.values
        self.labels = tags.class_label.values 
        self.len = len(tags)        
        
        #Reading the top_K most frequent (grouped by patient) values from DF 
        df = get_patient_topk_sequences(pd.read_csv(os.path.join(path, fn))\
                                          .query('amino_acid.str.len() <= @max_len'), 
                                        top_k)
        
        #Loading the values from DF and saving to attribute
        self.seq_filename = df.filename.values 
        self.values = df[['amino_acid', 'v_family','j_family']].values
        self.frequency = df.frequency.values
        self.n_per_bag = np.array([len(df.query('filename == @x')) for x in tags.filename])

    def __getitem__(self, index):
        # 1 index = 1 patient
        patient = self.patients[index]
        target = self.labels[index]
        n_per_bag = self.n_per_bag[index]
        #print(patient, type(patient))
        #Getting the sequences associated with the patients
        if type(patient) == str:
            indices = np.where(self.seq_filename == patient)
        else:
            indices = np.empty(0,dtype=np.int64)
            for p in patient:
                indices = np.append(indices, np.where(self.seq_filename == p))
        
        values = self.values[indices] 
        #An input to DeepRC should be ((x_tuple), n_per_bag) , where x_tuple is batch_aa_vj(values)
        #But should probly do the encoding in DeepRC or in the network
        return values, n_per_bag, target
    
    def __len__(self):
        return self.len
        
from torch.utils.data import Dataset, DataLoader, random_split
dataset = xd(path = '../training_data_new/emerson_raw/batch1/',
                                    max_len = 23, istrain= False,
                                    top_k = 1000, allele = 'A', pos_class = 'A01')
len(dataset.frequency)


62000

In [10]:
xd = dataset[[0,5]][0]

<__main__.xd at 0x1d3aeb67940>

In [3]:
train_len = int(.7*len(dataset))
val_len = len(dataset)-train_len
train,val = random_split(dataset, [train_len, val_len])


In [37]:
train.indices

[6,
 39,
 40,
 49,
 32,
 22,
 60,
 51,
 59,
 44,
 21,
 18,
 35,
 45,
 9,
 0,
 7,
 53,
 43,
 12,
 15,
 31,
 10,
 29,
 42,
 58,
 19,
 47,
 17,
 24,
 38,
 50,
 16,
 4,
 5,
 1,
 54,
 34,
 3,
 57,
 14,
 36,
 41]

In [38]:
for b in BatchSampler(RandomSampler(train), batch_size = 4, drop_last = False):
    print(b)

[22, 12, 19, 36]
[2, 7, 23, 28]
[9, 10, 42, 35]
[14, 18, 29, 24]
[38, 0, 5, 16]
[6, 3, 21, 1]
[33, 20, 26, 4]
[30, 34, 25, 27]
[8, 31, 17, 13]
[15, 37, 39, 11]
[32, 41, 40]


In [32]:
b

[2, 38]

In [27]:
c,d,e = dataset[[b]]
f,g,h = dataset[b]
c==f

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       ...,
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])