In [None]:
from __future__ import print_function, division
#Allows relative imports
import os, sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
#imports from files
from src.preprocessing import *
from src.VAE_train import *
from vae_cel.vae_cel import *
from vae_cel.vae_cel_train import *
from vae_cel.DeepRC_VAE import *
from src.embedding_visualisation import * 
from src.loss_metrics import *
from src.pickling import *
from src.datasets import *

import pandas as pd 
import numpy as np
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using : {}".format(device))
    
#Plot and stuff
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi']= 200
sns.set_style('darkgrid')

torch.cuda.empty_cache()
# Ignore warnings)
import warnings
warnings.filterwarnings("ignore")
    
%load_ext autoreload
%autoreload 2

#### plotting loss & rechecking VAE

In [None]:
# Load the loss histories saved by the first (hyperbolic-annealing) run and by the
# continued-training run. Each object is indexed with 'train' and 'val' below.
start = load_pkl('../output/HyperbolicTuning/LOWERBETA_gamma7_wd1e-3_ad3097/VAE_tune-weighted0.5_latent100_Pad-before_Annealing-hyper-gamma7.0_losses.pkl')
end = load_pkl('../output/HyperbolicContinueTraining/LOWERBETA_DirectlyMax/VAE_tune-weighted0.5_latent100_Pad-before_Annealing-None-gamma1.0_losses.pkl')
# Append the continued run's losses after the first run's losses so both are drawn as one curve.
start['train'] += end['train']
start['val'] += end['val']

plt.plot(start['train'], 'r-',lw=.8, label = 'Train losses')
plt.plot(start['val'], 'b-',lw=.8, label = 'Val losses')
plt.xlabel('epochs')
plt.ylabel('loss')
# Dashed vertical marker at x=50 for the point where training was resumed.
# NOTE(review): 50 is hard-coded — presumably the number of epochs of the first run; confirm.
plt.axvline(50, color='k',linestyle = '--', 
            linewidth=0.5, label = 'Resumed training here')
plt.legend()

In [None]:
from src.torch_util import *
# Build a VAE_cel with a 100-dimensional latent space and load the checkpoint written by
# the continued-training run into it (load_model comes from the project's star imports).
model = VAE_cel(latent_dim = 100, aa_dim = 25)
model = load_model(model, '../output/HyperbolicContinueTraining/LOWERBETA_DirectlyMax/BEST_VAE_tune-weighted0.5_latent100_Pad-before_Annealing-None-gamma1.0.pth.tar')
# Move the model to the selected device; the trailing ';' hides the module repr in the output.
model.to(device);

In [None]:
# Read the test split, keeping only the sequence / V-family / J-family columns and the
# rows whose amino-acid sequence is between 10 and 23 characters long.
test_dataset = pd.read_csv('../training_data_new/mixed_vj_dataset/mixed_vj_test.csv', usecols = ['amino_acid', 'v_family', 'j_family']).query('amino_acid.str.len() <= 23 and amino_acid.str.len() >=10')
# Evaluate the loaded model on the test set and show the resulting frame.
# NOTE(review): the meaning of the positional arguments (2**14, 23, 0.5, 'before', True, False, 'cuda')
# is defined by test_model in the project sources and is not visible here — confirm.
# NOTE(review): 'cuda' is hard-coded although `device` is selected at the top of the notebook.
df = test_model(model, VAELoss_cel(beta = 1), test_dataset.values, 2**14, 23, 0.5, 'before', True, False, 'cuda')
display(df)
#model with 25% hamming seq loss
test_decode(model, test_dataset.values, 10, 23, 0.5, 'before', True)

In [None]:
# Plot the latent columns (names containing 'z_') in groups of five, coloured by epitope.
# NOTE(review): top5_merged, chunks and tqdm are not defined in the visible cells — presumably
# they come from the star imports or an earlier session; confirm on a fresh kernel.
for cols in tqdm(list(chunks([x for x in top5_merged.columns if 'z_' in x], 5))):
    plot_latent(top5_merged, cols, hue='antigen_epitope')

## LDA/PCA

_see https://www.youtube.com/watch?v=9IDXYHhAfGA,_

_https://sebastianraschka.com/Articles/2014_python_lda.html,_

_https://arxiv.org/pdf/2101.06772.pdf_

### LDA class definition

In [None]:
class LDA:
    """Linear Discriminant Analysis via the eigen-decomposition of inv(S_W) @ S_B.

    Attributes
    ----------
    n_components : number of discriminant directions to keep.
    linear_discriminants : array of shape (n_components, n_features) after `fit`
        (one eigenvector per ROW); None before `fit`.
    eig_vals : the matching eigenvalues, ordered by decreasing absolute value;
        None before `fit`.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        # Fixed: the attribute was initialised under a misspelled name
        # ('linear_discriminiants'), so the real attribute did not exist before fit().
        self.linear_discriminants = None
        self.eig_vals = None

    def fit(self, X, y):
        """Compute the discriminant directions from samples X and class labels y.

        X is indexed as (n_samples, n_features); y holds one label per sample.
        The within-class scatter S_W must be invertible (np.linalg.inv is used).
        np.linalg.eig may return complex arrays; they are stored unchanged.
        """
        n_features = X.shape[1]
        class_labels = np.unique(y)

        # Mean of every feature over all samples
        mean_overall = np.mean(X, axis=0)
        # Within-class (S_W) and between-class (S_B) scatter matrices, both square
        S_W = np.zeros((n_features, n_features))
        S_B = np.zeros((n_features, n_features))

        for c in class_labels:
            # Samples and feature means of the current class
            X_c = X[y == c]
            mean_c = np.mean(X_c, axis=0)
            # (n_features, n_c) @ (n_c, n_features), accumulated over classes
            S_W += (X_c - mean_c).T.dot(X_c - mean_c)

            # Class-size-weighted outer product of the mean offset (column vector)
            n_c = X_c.shape[0]
            mean_diff = (mean_c - mean_overall).reshape(n_features, 1)
            S_B += n_c * (mean_diff).dot(mean_diff.T)

        # Eigen-decomposition of inv(S_W) @ S_B
        A = np.linalg.inv(S_W).dot(S_B)
        eigenvalues, eigenvectors = np.linalg.eig(A)
        # np.linalg.eig returns eigenvectors as columns; transpose so each row is one
        eigenvectors = eigenvectors.T
        # Order by decreasing |eigenvalue|
        idxs = np.argsort(abs(eigenvalues))[::-1]
        eigenvalues = eigenvalues[idxs]
        eigenvectors = eigenvectors[idxs]
        # Keep the leading n_components directions and their eigenvalues
        self.linear_discriminants = eigenvectors[0:self.n_components]
        self.eig_vals = eigenvalues[0:self.n_components]

    def transform(self, X):
        """Project X onto the fitted directions; returns (n_samples, n_components)."""
        return np.dot(X, self.linear_discriminants.T)

### encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Sort by epitope so that rows of the same class are contiguous
df = top5_merged.sort_values('antigen_epitope')
# The latent features are the columns whose name contains 'z_'
features = [x for x in df.columns if 'z_' in x]
X = df[features].values
y = df['antigen_epitope'].values

# Encode the epitope strings as integer class ids
enc = LabelEncoder()
label_encoder = enc.fit(y)
y = label_encoder.transform(y)

# Map integer id -> epitope string.
# NOTE(review): this relies on the order of unique() over the sorted frame matching the
# encoder's class order; label_encoder.classes_ would state the mapping directly.
label_dict = dict((x,y) for x,y in zip(range(len(df.antigen_epitope.unique())),
                                       df.antigen_epitope.unique()))
label_dict

In [None]:
# Fit a 5-component LDA on the latent features and project the data onto it
lda = LDA(5)
lda.fit(X,y)
X_lda = lda.transform(X)
# LDA.fit stores one eigenvector per ROW of linear_discriminants, so eigenvalue i pairs
# with row i (the previous `[:, i]` picked feature column i of every eigenvector instead).
eig_pairs = [(np.abs(lda.eig_vals[i]), lda.linear_discriminants[i]) for i in range(len(lda.eig_vals))]
print('Variance explained:\n')
# Shares are relative to the sum of the retained eigenvalues only
eigv_sum = sum(pair[0] for pair in eig_pairs)
for i, pair in enumerate(eig_pairs):
    print('eigenvalue {0:}: {1:.2%}'.format(i+1, (pair[0]/eigv_sum).real))

In [None]:
from sklearn.decomposition import PCA

# Metadata columns carried along with the projections
meta_cols = ['cdr3', 'v_segm', 'j_segm', 'species', 'mhc_a',
             'mhc_b', 'mhc_class', 'antigen_epitope', 'antigen_gene',
             'antigen_species']

# One column per LDA component, rows in the same order as X / y / df
lda_df = pd.DataFrame(data = X_lda,
                      columns = ['LDA_'+str(i) for i in range(1, X_lda.shape[1]+1)])
lda_df['epitope'] = [label_dict[c] for c in y]
# X and y were taken positionally from the epitope-sorted `df`, which still carries its
# pre-sort index labels, whereas lda_df has a fresh 0..n-1 index. Merging on the raw
# index would attach metadata from the wrong rows, so df's index is reset first to make
# the index merge positional.
lda_df = lda_df.merge(df[meta_cols].reset_index(drop=True),
                      left_index = True, right_index = True)

# 10-component PCA of the same features; `tmp` also has a 0..n-1 index, so the join is positional
pca = PCA(10)
X_pca = pca.fit_transform(X)
tmp = pd.DataFrame(data=X_pca, columns = ['PCA_'+str(i) for i in range(1,X_pca.shape[1]+1)])
merged = lda_df.join(tmp)

In [None]:
# Names of the LDA projection columns
lda_cols = [x for x in lda_df.columns if 'LDA' in x]
#sns.histplot(data= lda_df, x = cols, hue # 'antigen_epitope')
# Names of the PCA projection columns
pca_cols = [x for x in merged.columns if 'PCA' in x]
# All projection columns to plot; the cell displays how many there are
cols = lda_cols + pca_cols
len(cols)

In [None]:
# 5x3 grid of axes: one histogram (with KDE) per projection column, coloured by epitope.
# zip() stops at the shorter sequence, so at most 15 columns are drawn.
f,a = plt.subplots(5,3, figsize=( 18,30))
sns.set_palette('gist_ncar')
for ax, col in zip(a.ravel(), cols):
    sns.histplot(data = merged, x=col, ax=ax, hue = 'antigen_epitope', kde = True)

# Top5 nn embedding Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score
# The five epitopes with the most entries.
# NOTE(review): `vdjdb` is not defined in the visible cells — confirm where it is loaded.
top5_epitopes = vdjdb.groupby('antigen_epitope').agg(count=('gene','count')).sort_values('count',ascending=False).head().index
top5 = vdjdb.query('antigen_epitope in @top5_epitopes and cdr3.str.len() <= 23').reset_index()
# Feature frame for the classifiers. `.copy()` makes it independent of `top5`, so the
# column assignments below do not write into a slice (SettingWithCopyWarning).
top5_nn = top5[['cdr3','v','j','mhc_a','mhc_class','antigen_epitope','antigen_gene']].copy()
# Integer after 'A*' and before the first ':'; split(':', 1)[0] returns the whole string
# when there is no ':', so one expression covers both forms.
top5_nn['mhc_a_main'] = top5_nn['mhc_a'].apply(lambda x : int(x.split('A*')[1].split(':',1)[0]))
# Embed every (cdr3, v, j) triple with the VAE and attach the z_0..z_99 columns by index
top_nn_emb = get_embedding_df(model, top5_nn[['cdr3','v','j']].values, device)
top5_nn = top5_nn.merge(top_nn_emb[['z_'+str(i) for i in range(100)]], left_index = True, right_index = True).sort_values('antigen_epitope')

In [None]:
# Preview the feature frame, then list the distinct MHC-A alleles it contains
display(top5_nn.head())
top5_nn.mhc_a.unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the epitope and the MHC-A allele as new columns of top5_nn.
# NOTE(review): the same encoder object is re-fitted for the second column, so after this
# cell `enc` can only inverse-transform MHC-A labels, not epitope labels.
enc = LabelEncoder()
top5_nn['epitope_label'] = enc.fit_transform(top5_nn['antigen_epitope'].values)
top5_nn['mhc_a_label'] = enc.fit_transform(top5_nn['mhc_a'].values)
top5_nn.head(3)

In [None]:
class MLP(nn.Module):
    """Fully connected classifier over 100-dimensional inputs that returns raw logits.

    Parameters
    ----------
    n_layers : number of extra (n_hidden -> n_hidden) blocks after the input stack.
    n_hidden : width of the hidden blocks.
    n_output : number of output logits.
    activation : activation module instance, shared by every layer.
    p_drop : dropout probability (used only when `dropout` is True).
    dropout : when True an nn.Dropout(p_drop) is used, otherwise nn.Identity.
    """

    def __init__(self, n_layers, n_hidden, n_output,
                 activation = nn.SELU(), p_drop = 0.5, dropout = True):
        super().__init__()
        # A single drop module is reused at every position where dropout is applied
        self.drop = nn.Dropout(p_drop) if dropout == True else nn.Identity()

        # Fixed input stack: 100 -> 200 -> 150 -> n_hidden, batch-norm on the first two
        stem = [nn.Linear(100, 200), nn.BatchNorm1d(200), activation, self.drop,
                nn.Linear(200, 150), nn.BatchNorm1d(150), activation, self.drop,
                nn.Linear(150, n_hidden), activation, self.drop]
        self.input_layers = nn.Sequential(*stem)

        # n_layers repetitions of Linear -> activation -> drop
        trunk = []
        for _ in range(n_layers):
            trunk.extend([nn.Linear(n_hidden, n_hidden), activation, self.drop])
        self.hidden_layers = nn.Sequential(*trunk)

        self.output = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return the logits for a batch x; no softmax is applied here."""
        hidden = self.hidden_layers(self.input_layers(x))
        return self.output(hidden)

In [None]:
# Scratch check: display the class name of a loss instance
# (the grid-search cells compare such names as strings).
nn.BCELoss().__class__.__name__

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score

def train_model(model, X_train, y_train, batch_size, criterion, optimizer):
    """Run one training epoch over randomly ordered mini-batches.

    The model is put in train mode, every batch takes one optimizer step, and the
    return value is the sum of the batch losses divided by
    ceil(len(X_train) / batch_size).
    """
    model.train()
    n_batches = math.ceil(len(X_train) / batch_size)
    batches = BatchSampler(RandomSampler(X_train), batch_size = batch_size,
                           drop_last = False)
    running_loss = 0
    for idx in batches:
        batch_loss = criterion(model(X_train[idx]), y_train[idx])
        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / n_batches

def val_model(model, X_val, y_val, batch_size, criterion):
    """Compute the mean per-batch validation loss.

    The model is put in eval mode and the forward passes run without gradient
    tracking. Batches are drawn in random order. The return value is the sum of the
    batch losses divided by ceil(len(X_val) / batch_size).
    """
    model.eval()
    n_batches = math.ceil(len(X_val) / batch_size)
    batches = BatchSampler(RandomSampler(X_val), batch_size = batch_size,
                           drop_last = False)
    running_loss = 0
    for idx in batches:
        with torch.no_grad():
            logits = model(X_val[idx])
        running_loss += criterion(logits, y_val[idx]).item()

    return running_loss / n_batches
        
def train_clf(model, data:tuple, loss_fct = nn.CrossEntropyLoss, 
              optimizer_module = torch.optim.AdamW,
              lr=5e-5, wd=1e-4, nb_epochs=1000, 
              batch_size=2**9, loss_weight = None, device='cuda'):
    """Train a classifier, track losses and metrics, and plot the curves.

    Parameters
    ----------
    model : the network to train; it is NOT moved to `device` here.
    data : (X_train, X_val, y_train, y_val) tensors; they are not moved either.
    loss_fct : loss class, instantiated as loss_fct(weight=...).
    optimizer_module : optimizer class, instantiated with lr and weight_decay=wd.
    nb_epochs, batch_size : training length and mini-batch size.
    loss_weight : optional per-class weights (any array-like; flattened to 1-D).
        When None, uniform weights are used for max(label) + 1 classes.
    device : device on which the loss-weight tensor is created.

    Returns
    -------
    dict with keys 'train' and 'val' (one loss per epoch run) and 'roc' and 'acc'
    (validation ROC AUC / accuracy, recorded every 50th epoch and at the last epoch).

    Training stops early when the validation loss exceeds 1 after epoch 100.
    """
    X_train, X_val, y_train, y_val = data

    if loss_weight is not None:
        weights = loss_weight
    else:
        n_classes = torch.cat((y_train, y_val), dim=0).max().item() + 1
        weights = torch.ones((n_classes,))
    # Fixed: honour `device` (was hard-coded to 'cuda') and flatten, so a (C, 1) array
    # such as a DataFrame's .values becomes the 1-D weight vector the loss expects.
    weights = torch.as_tensor(weights, dtype=torch.float, device=device).flatten()
    criterion = loss_fct(weight=weights)
    optimizer = optimizer_module(model.parameters(), lr = lr, weight_decay= wd)

    train_losses = []
    val_losses = []
    rocs = []
    accs = []
    eval_epochs = []  # epochs at which rocs / accs were recorded (x-axis for plotting)
    for e in tqdm(range(nb_epochs),
                  position = 0, leave = False):
        train_loss = train_model(model, X_train, y_train, batch_size, criterion, optimizer)
        val_loss = val_model(model, X_val, y_val, batch_size, criterion)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        # Fixed: this used to call eval() on the global `mlp` rather than on `model`
        model.eval()
        # Early stop for runs whose validation loss is still above 1 after 100 epochs
        if val_loss > 1 and e > 100:
            break
        if e%50 == 0 or e==nb_epochs-1:
            y_true = y_val.cpu().numpy()
            with torch.no_grad():
                score = F.softmax(model(X_val), dim =1 ).cpu()
                preds = torch.argmax(score,dim=1).numpy()
            if y_val.max() >1:
                # More than two classes: one-vs-one, macro-averaged AUC
                multi = 'ovo'
                average = 'macro'
            else:
                # Binary case: AUC is computed from the positive-class probability
                multi = 'ovr'
                average = 'macro'
                score = score[:,1]
            roc_auc = roc_auc_score(y_true, score.numpy(), average = average,
                                    multi_class = multi)
            rocs.append(roc_auc)
            acc = accuracy_score(y_true, preds)
            accs.append(acc)
            eval_epochs.append(e)
            print(f'Epoch:{e};\tTrain: {train_loss:.3e}\tVal: {val_loss:.3e}'\
                  f'\n\t\tROC AUC :{roc_auc:.3f}, \taccuracy: {acc:.3f}')

    losses = {'train': train_losses,
              'val' : val_losses,
              'roc': rocs,
              'acc': accs}

    plt.plot(train_losses, color = 'b', ls = '-', label = 'train')
    plt.plot(val_losses, color = 'r', ls = '-', label = 'val')
    # Fixed: the metrics are plotted against the epochs at which they were actually
    # recorded. Rebuilding the x-axis with arange + append produced one entry too many
    # when (nb_epochs - 1) was a multiple of 50, and could not be used after an early stop.
    if eval_epochs:
        plt.plot(eval_epochs, rocs, color = 'g', ls = '-.', marker = 'o', markersize = 5, label = 'ROC AUC')
        plt.plot(eval_epochs, accs, color = 'm', ls = '-.', marker = 'x', markersize = 5, label = 'Accuracy')

    plt.legend()
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.show()
    return losses

### Multi-class classification (class-weighted loss)

In [None]:
# Latent features and integer epitope labels, created on the device selected at the top
# of the notebook (was hard-coded to 'cuda').
feature_cols = ['z_'+str(i) for i in range(100)]
X = torch.tensor(top5_nn[feature_cols].values, device = device)
y = torch.tensor(top5_nn['epitope_label'].values, device = device).long()
# NOTE(review): no random_state is passed, so the 70/30 split differs between runs.
X_train, X_val, y_train, y_val = train_test_split(X,y, train_size = 0.7, shuffle=True)

mlp = MLP(10, 300, activation = nn.ReLU(), n_output = len(y.unique()))
mlp.to(device);

nb_epochs = 1000
batch_size = 2**9
#total len = 27700
# Inverse class frequency, one weight per epitope in groupby (sorted) order. ravel() turns
# the (n_classes, 1) array returned by .values into the 1-D vector a loss weight needs.
class_counts = top5_nn.groupby('antigen_epitope').agg(count=('cdr3','count')).values.ravel()
weights = 1/(class_counts/len(top5_nn))

# Trailing ';' keeps the returned dict of per-epoch losses out of the cell output
train_clf(model = mlp, data = (X_train,X_val,y_train, y_val), lr = 3e-5, wd = 1e-4,
          nb_epochs = nb_epochs, batch_size = batch_size, loss_weight = weights,
          device = device);

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

In [None]:
# Validation ROC AUC (one-vs-one) of the trained multi-class MLP
mlp.eval()
with torch.no_grad():
    out = F.softmax(mlp(X_val), dim =1).cpu()  # class probabilities
    preds = torch.argmax(out,dim=1).numpy()  # predicted class ids, reused in the next cell
print(f'roc_auc : {roc_auc_score(y_val.cpu().numpy(), out.numpy(), multi_class ="ovo")}')

In [None]:
# Macro-averaged precision of the predictions computed in the previous cell
precision_score(y_val.cpu().numpy(), preds, average='macro')

### OvR 

In [None]:
# Summary statistics of the epitope string lengths
top5_nn.antigen_epitope.str.len().describe()

In [None]:
# Non-null count of every column, per epitope (shows the class imbalance)
top5_nn.groupby('antigen_epitope').count()

##### Doing for GILGFVFTL

In [None]:
def ovr_xy_split_class(df, col = 'antigen_epitope', pos_label = 'GILGFVFTL', train_size=.7,
                       device = 'cuda'):
    """Build a one-vs-rest train/validation split from the latent columns of `df`.

    Parameters
    ----------
    df : frame holding the columns z_0..z_99 and the label column `col`.
    col : name of the label column. It is the SECOND positional parameter, so the
        positive label has to be passed by keyword (`pos_label=...`).
    pos_label : value of `col` treated as the positive class (1); all others are 0.
    train_size : fraction of rows put in the training split.
    device : device on which the tensors are created (default keeps the old 'cuda').

    Returns
    -------
    (X_train, X_val, y_train, y_val) tensors.

    Side effect: a 'positive_class' column is written into `df`.
    """
    # Fixed: the function used to ignore `df` and read / write the global top5_nn
    df['positive_class'] = (df[col] == pos_label).astype(int)
    feature_cols = ['z_'+str(i) for i in range(100)]
    X = torch.tensor(df[feature_cols].values, device = device)
    y = torch.tensor(df['positive_class'].values, device = device).long()
    X_train, X_val, y_train, y_val = train_test_split(X,y, train_size = train_size, shuffle=True)
    return X_train, X_val, y_train, y_val

def get_min(array, x_epochs):
    """Return (epoch, value) for the smallest entry of `array`.

    `x_epochs` is indexed at the same position as the minimum of `array`.
    """
    best = np.argmin(array)
    return x_epochs[best], array[best]

def get_max(array, x_epochs):
    """Return (epoch, value) for the largest entry of `array`.

    `x_epochs` is indexed at the same position as the maximum of `array`.
    """
    best = np.argmax(array)
    return x_epochs[best], array[best]

In [None]:
class MLP(nn.Module):
    """Fully connected classifier over 100-dimensional inputs with a ReLU on the output.

    This cell redefines the MLP class declared earlier in the notebook: dropout is
    now switched off with p_drop <= 0 instead of a separate flag, and the output
    goes through a ReLU.

    Parameters
    ----------
    n_layers : number of extra (n_hidden -> n_hidden) blocks after the input stack.
    n_hidden : width of the hidden blocks.
    n_output : number of outputs.
    activation : activation module instance, shared by every layer.
    p_drop : dropout probability; nn.Identity is used when it is not > 0.
    """

    # Fixed: the ':' that ends the signature was inside the trailing comment,
    # which made the whole cell a SyntaxError.
    def __init__(self, n_layers, n_hidden, n_output, 
                 activation = nn.SELU(), p_drop = 0.5):
        super(MLP, self).__init__()
        if p_drop >0 :
            self.drop = nn.Dropout(p_drop)
        else: 
            self.drop = nn.Identity()

        # Fixed input stack: 100 -> 200 -> 150 -> n_hidden, batch-norm on the first two
        self.input_layers = nn.Sequential(nn.Linear(100, 200),
                                          nn.BatchNorm1d(200),
                                          activation,
                                          self.drop,
                                          nn.Linear(200,150),
                                          nn.BatchNorm1d(150),
                                          activation,
                                          self.drop,
                                          nn.Linear(150,n_hidden),
                                          activation,
                                          self.drop)
        # n_layers repetitions of Linear -> activation -> drop
        layers = []
        for i in range(n_layers):
            layers.append(nn.Linear(n_hidden, n_hidden))
            layers.append(activation)
            layers.append(self.drop)

        self.hidden_layers = nn.Sequential(*layers)
        self.output = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return relu(output(...)) for a batch x; no softmax is applied here."""
        x = self.input_layers(x)
        x = self.hidden_layers(x)
        x = self.output(x)
        # NOTE(review): the ReLU clamps the values handed to the loss at >= 0 — kept as in
        # the original experiment; confirm it is intended before cross-entropy.
        x = F.relu(x)
        return x 

In [None]:
# The distinct epitopes iterated over by the one-vs-rest loops below
top5_nn.antigen_epitope.unique()

In [None]:
# One-vs-rest: train one binary MLP (2 hidden blocks of width 300) per epitope
epi_ls = {}
for epi in tqdm(top5_nn.antigen_epitope.unique()):
    # Fixed: the epitope must go in as pos_label. Passed positionally it landed in the
    # `col` parameter and was looked up as a column name.
    X_train, X_val, y_train, y_val = ovr_xy_split_class(top5_nn, pos_label = epi, train_size = .667)
    # Share of rows belonging to the positive class
    cb = len(top5_nn.query('antigen_epitope==@epi'))/len(top5_nn)
    print(f'FOR CURRENT EPITOPE : {epi}, % of positive class = {cb:.2%}')
    
    mlp = MLP(2,300 , n_output = 2)
    mlp.to(device);
    losses = train_clf(model = mlp, data = (X_train,X_val,y_train, y_val), lr = 6.67e-5, wd = 1e-2,
              nb_epochs = 750, batch_size = 2**10, loss_weight = None)
    epi_ls[epi] = losses

In [None]:
# One-vs-rest: train one binary MLP (4 hidden blocks of width 50) per epitope
torch.manual_seed(20)
epi_ls = {}
for epi in tqdm(top5_nn.antigen_epitope.unique()):
    # Fixed: the epitope must go in as pos_label. Passed positionally it landed in the
    # `col` parameter and was looked up as a column name.
    X_train, X_val, y_train, y_val = ovr_xy_split_class(top5_nn, pos_label = epi, train_size = .667)
    # Share of rows belonging to the positive class
    cb = len(top5_nn.query('antigen_epitope==@epi'))/len(top5_nn)
    print(f'FOR CURRENT EPITOPE : {epi}, % of positive class = {cb:.2%}')
    
    mlp = MLP(4, 50 , n_output = 2)# nobatchnorm, but with ReLU
    mlp.to(device);
    
    losses = train_clf(model = mlp, data = (X_train,X_val,y_train, y_val), lr = 6.67e-5, wd = 1e-2,
                  nb_epochs = 750, batch_size = 2**10, loss_weight = None)
    epi_ls[epi] = losses

#### gridsearch for MLP1

In [None]:
# Grid search over MLP hyper-parameters (one-vs-rest classifier for a single epitope)
nb_epochs = 500
losses = {}
epi = 'GILGFVFTL'
# Epochs at which train_clf records ROC AUC / accuracy: every 50th epoch plus the last one
x_epochs = np.arange(0, nb_epochs, 50)
x_epochs = np.append(x_epochs, nb_epochs-1)
# Fixed: the epitope is passed as pos_label (positionally it landed in `col`)
X_train, X_val, y_train, y_val = ovr_xy_split_class(top5_nn, pos_label = epi, train_size = .667)
# Fixed: Tensor.to() returns a new tensor, so the results have to be kept
X_train, X_val, y_train, y_val = (t.to(device) for t in (X_train, X_val, y_train, y_val))

result_cols = ['name','best_roc','roc_epoch','best_val','val_epoch']
df_mlp1 = pd.DataFrame(columns = result_cols)
rows = []  # one result row per configuration
ls = {}    # configuration name -> dict of curves returned by train_clf
for criterion in [nn.CrossEntropyLoss]:
    criterion_name = criterion.__name__
    for hidden in tqdm([25,50,75, 100], leave = False):
        for n_layers in tqdm([1,2,3,4], leave = False):
            for act in tqdm([nn.SELU(), nn.ReLU(), nn.Sigmoid(), nn.Tanh(), nn.Softmax(dim=1)],
                            leave = False):
                for p_drop in tqdm([0, 0.5], leave = False):
                    for lr in tqdm([1e-5, 1e-4], leave = False):
                        for wd in tqdm([1e-2, 5e-7], leave = False):
                            name = '_'.join([act.__class__.__name__,
                                             criterion_name,
                                             'hidden'+str(hidden),
                                             'layers'+str(n_layers),
                                             'drop'+str(p_drop),
                                             'lr'+str(lr),
                                             'wd'+str(wd)])
                            if criterion_name == 'BCEWithLogitsLoss':
                                # Single-logit output with column-shaped targets
                                outdim = 1
                                y_val = y_val.view(-1,1)
                                y_train = y_train.view(-1,1)
                            else : 
                                outdim = 2
                            mlp = MLP(n_layers, hidden, outdim, act, p_drop)
                            print(f'FOR {name}')
                            mlp.to(device)
                            losses = train_clf(model = mlp, data = (X_train,X_val,y_train, y_val), 
                                               loss_fct = criterion,
                                               lr = lr, wd = wd, nb_epochs = nb_epochs,
                                               batch_size = 2**12, loss_weight = None)
                            ls[name]= losses
                            roc_epoch, best_roc = get_max(losses['roc'], x_epochs)
                            val_epoch, best_val = get_min(losses['val'], range(nb_epochs))
                            rows.append([name, best_roc, roc_epoch, best_val, val_epoch])
                            # DataFrame.append was removed in pandas 2.0. The frame is rebuilt
                            # from the row list each time so partial results survive an interrupt.
                            df_mlp1 = pd.DataFrame(rows, columns = result_cols)

In [None]:
# Grid-search results, best ROC AUC first
df_mlp1.sort_values('best_roc', ascending = False)

In [None]:
# Second MLP grid search (wider hidden layers); its results are added to df_mlp1 and ls
# from the previous grid-search cell.
# NOTE(review): re-running this cell adds the same configurations to df_mlp1 again.
nb_epochs = 500
losses = {}
epi = 'GILGFVFTL'
# Epochs at which train_clf records ROC AUC / accuracy: every 50th epoch plus the last one
x_epochs = np.arange(0, nb_epochs, 50)
x_epochs = np.append(x_epochs, nb_epochs-1)
# Fixed: the epitope is passed as pos_label (positionally it landed in `col`)
X_train, X_val, y_train, y_val = ovr_xy_split_class(top5_nn, pos_label = epi, train_size = .667)
# Fixed: Tensor.to() returns a new tensor, so the results have to be kept
X_train, X_val, y_train, y_val = (t.to(device) for t in (X_train, X_val, y_train, y_val))

result_cols = ['name','best_roc','roc_epoch','best_val','val_epoch']
df_mlp1_prev = df_mlp1  # results accumulated by the first grid search
rows = []               # one result row per configuration of this grid
# Fixed: `ls` is no longer reset here. df_mlp1 holds the names of BOTH grids and the
# plotting cell looks each name up in `ls`, so the first grid's curves must stay available.
for criterion in [nn.CrossEntropyLoss]:
    criterion_name = criterion.__name__
    for hidden in tqdm([150,200,250,300], leave = False):
        for n_layers in tqdm([2,3,4], leave = False):
            for act in tqdm([nn.SELU(), nn.ReLU(), nn.Sigmoid()], leave = False):
                for p_drop in tqdm([0, 0.5], leave = False):
                    for lr in tqdm([1e-5, 1e-4], leave = False):
                        for wd in tqdm([1e-2, 5e-7], leave = False):
                            name = '_'.join([act.__class__.__name__,
                                             criterion_name,
                                             'hidden'+str(hidden),
                                             'layers'+str(n_layers),
                                             'drop'+str(p_drop),
                                             'lr'+str(lr),
                                             'wd'+str(wd)])
                            if criterion_name == 'BCEWithLogitsLoss':
                                # Single-logit output with column-shaped targets
                                outdim = 1
                                y_val = y_val.view(-1,1)
                                y_train = y_train.view(-1,1)
                            else : 
                                outdim = 2
                            mlp = MLP(n_layers, hidden, outdim, act, p_drop)
                            print(f'FOR {name}')
                            mlp.to(device)
                            losses = train_clf(model = mlp, data = (X_train,X_val,y_train, y_val), 
                                               loss_fct = criterion,
                                               lr = lr, wd = wd, nb_epochs = nb_epochs,
                                               batch_size = 2**12, loss_weight = None)
                            ls[name]= losses
                            roc_epoch, best_roc = get_max(losses['roc'], x_epochs)
                            val_epoch, best_val = get_min(losses['val'], range(nb_epochs))
                            rows.append([name, best_roc, roc_epoch, best_val, val_epoch])
                            # DataFrame.append was removed in pandas 2.0. The frame is rebuilt
                            # from the earlier results plus this grid's rows each time, so
                            # partial results survive an interrupt.
                            df_mlp1 = pd.concat([df_mlp1_prev,
                                                 pd.DataFrame(rows, columns = result_cols)],
                                                ignore_index=True)

In [None]:
# Epochs at which ROC AUC / accuracy were recorded (every 50th epoch plus the last one)
x = np.arange(0, nb_epochs, 50 )
x = np.append(x, nb_epochs-1)

# Training curves of the three configurations with the best ROC AUC
for k1 in df_mlp1.sort_values('best_roc', ascending = False)['name'].values[0:3]:
    curves = ls[k1]
    plt.figure(figsize=(10,7))
    plt.plot(curves['train'], 'b-.' ,label = 'train')
    plt.plot(curves['val'], 'r-.' ,label = 'val')
    
    # A run that stopped early recorded fewer metric points, so x is trimmed to match
    plt.plot(x[:len(curves['roc'])], curves['roc'], c='g', ls = '-', lw = 1, marker = 'o' ,label = 'roc')
    plt.plot(x[:len(curves['acc'])], curves['acc'], c='m', ls = '-', lw = 1, marker = 'o' ,label = 'acc')
    # Labels were set but the legend was never drawn
    plt.legend()
    plt.title(k1)
    

In [None]:
# Five best configurations by ROC AUC
df_mlp1.sort_values('best_roc', ascending = False).head()


### 1D CNN

In [None]:
class OD_CNN(nn.Module):
    """1-D CNN binary classifier over a flat feature vector.

    Two Conv1d + MaxPool1d(2) stages (kernel sizes 51 and 22) are followed by three
    fully connected layers and a 2-unit output. For inputs of length 100 the
    flattened convolution output has 4 * n_kernels values per sample, which is the
    size `fc1` expects.

    Parameters
    ----------
    n_kernels : number of channels of the first convolution (the second has twice as many).
    act : activation module used after both pooling stages and the FC layers.
    p_drop : dropout probability; nn.Identity is used when it equals 0.
    final : module applied to the 2-unit output.
    """

    def __init__(self, n_kernels, act = nn.SELU(), p_drop = .3, final=nn.ReLU()):
        super().__init__()
        self.n_kernels = n_kernels
        self.drop = nn.Identity() if p_drop == 0 else nn.Dropout(p_drop)

        # Convolutional feature extractor (the same pooling module serves both stages)
        self.conv1 = nn.Conv1d(1, n_kernels, kernel_size = 51)
        self.maxpool = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(n_kernels, 2 * n_kernels, kernel_size = 22)

        # Classifier head: 4n -> 2n -> n -> 64 -> 2
        self.fc1 = nn.Linear(4 * n_kernels, 2 * n_kernels)
        self.bn1 = nn.BatchNorm1d(2 * n_kernels)
        self.fc2 = nn.Linear(2 * n_kernels, n_kernels)
        self.bn2 = nn.BatchNorm1d(n_kernels)
        self.fc3 = nn.Linear(n_kernels, 64)
        self.output = nn.Linear(64, 2)
        self.act = act
        self.final = final

    def forward(self, x):
        """Return final(output(...)) for a batch x of shape (batch, length)."""
        # Add the channel axis, then conv -> pool -> activation, twice
        feats = self.conv1(x.unsqueeze(1))
        feats = self.act(self.maxpool(feats))
        feats = self.conv2(feats)
        feats = self.act(self.maxpool(feats))
        # Flatten to (batch, 4 * n_kernels) for the fully connected head
        flat = feats.view(-1, 4 * self.n_kernels)
        # FC -> activation -> dropout -> batch-norm, twice
        hidden = self.bn1(self.drop(self.act(self.fc1(flat))))
        hidden = self.bn2(self.drop(self.act(self.fc2(hidden))))
        hidden = self.act(self.fc3(hidden))
        return self.final(self.output(hidden))

In [None]:
# One-vs-rest: train one binary 1-D CNN (ReLU activations, ReLU output) per epitope
torch.manual_seed(20)
cnn_losses = {}
for epi in tqdm(top5_nn.antigen_epitope.unique()):
    # Fixed: the epitope must go in as pos_label. Passed positionally it landed in the
    # `col` parameter and was looked up as a column name.
    X_train, X_val, y_train, y_val = ovr_xy_split_class(top5_nn, pos_label = epi, train_size = .667)
    # Share of rows belonging to the positive class
    cb = len(top5_nn.query('antigen_epitope==@epi'))/len(top5_nn)
    print(f'FOR CURRENT EPITOPE : {epi}, % of positive class = {cb:.2%}')
    cnn = OD_CNN(128, act = nn.ReLU(), p_drop=.25, final = nn.ReLU())
    cnn.to(device);
    
    losses = train_clf(model = cnn, data = (X_train,X_val,y_train, y_val), 
                       lr = 6.67e-5, wd = 1e-3, nb_epochs = 750, 
                       batch_size = 2**10, loss_weight = None)
    cnn_losses[epi] = losses

In [None]:
# One-vs-rest: train one binary 1-D CNN (SELU activations, Tanh output) per epitope
torch.manual_seed(20)
cnn_losses = {}
for epi in tqdm(top5_nn.antigen_epitope.unique()):
    # Fixed: the epitope must go in as pos_label. Passed positionally it landed in the
    # `col` parameter and was looked up as a column name.
    X_train, X_val, y_train, y_val = ovr_xy_split_class(top5_nn, pos_label = epi, train_size = .667)
    # Share of rows belonging to the positive class
    cb = len(top5_nn.query('antigen_epitope==@epi'))/len(top5_nn)
    print(f'FOR CURRENT EPITOPE : {epi}, % of positive class = {cb:.2%}')
    cnn = OD_CNN(128, act = nn.SELU(), p_drop=.25, final = nn.Tanh())
    cnn.to(device);
    
    losses = train_clf(model = cnn, data = (X_train,X_val,y_train, y_val), 
                       lr = 1e-4, wd = 1e-3, nb_epochs = 500,
                       batch_size = 2**10, loss_weight = None)
    cnn_losses[epi] = losses

##### gridsearch cnn


In [None]:
# Grid search over 1-D CNN hyper-parameters (one-vs-rest classifier for a single epitope)
nb_epochs = 500
losses = {}
epi = 'GILGFVFTL'
# Epochs at which train_clf records ROC AUC / accuracy: every 50th epoch plus the last one.
# Fixed: the last epoch was appended to `x` (a leftover from a plotting cell) instead of x_epochs.
x_epochs = np.arange(0, nb_epochs, 50)
x_epochs = np.append(x_epochs, nb_epochs-1)
# Fixed: the epitope is passed as pos_label (positionally it landed in `col`)
X_train, X_val, y_train, y_val = ovr_xy_split_class(top5_nn, pos_label = epi, train_size = .667)
# Fixed: Tensor.to() returns a new tensor, so the results have to be kept
X_train, X_val, y_train, y_val = (t.to(device) for t in (X_train, X_val, y_train, y_val))

result_cols = ['name','best_roc','roc_epoch','best_val','val_epoch']
# NOTE(review): `df` is also the name of the epitope-sorted frame used in the LDA section;
# it is overwritten here.
df = pd.DataFrame(columns = result_cols)
rows = []  # one result row per configuration
l = {}     # configuration name -> dict of curves returned by train_clf
for nk in tqdm([128, 72], leave = False):
    for act in tqdm([nn.SELU(), nn.ReLU(), nn.Sigmoid(), nn.Tanh(), nn.Softmax(dim=1)],
                    leave = False):
        for p_drop in tqdm([0, 0.4], leave = False):
            for lr in tqdm([1e-5], leave = False):
                for wd in tqdm([1e-3, 5e-7], leave = False):
                    name = '_'.join([act.__class__.__name__,
                                     'nk'+str(nk),
                                     'drop'+str(p_drop),
                                     'lr'+str(lr),
                                     'wd'+str(wd)])
                    # The same module serves as hidden activation and as output activation
                    cnn = OD_CNN(nk, act = act, p_drop=p_drop, final = act)
                    print(f'FOR {name}')
                    cnn.to(device)
                    losses = train_clf(model = cnn, data = (X_train,X_val,y_train, y_val), 
                                       lr = lr, wd = wd, nb_epochs = nb_epochs,
                                       batch_size = 2**10, loss_weight = None)
                    l[name]= losses
                    roc_epoch, best_roc = get_max(losses['roc'], x_epochs)
                    val_epoch, best_val = get_min(losses['val'], range(nb_epochs))
                    rows.append([name, best_roc, roc_epoch, best_val, val_epoch])
                    # DataFrame.append was removed in pandas 2.0. The frame is rebuilt from
                    # the row list each time so partial results survive an interrupt.
                    df = pd.DataFrame(rows, columns = result_cols)

In [None]:
# CNN grid-search results, best ROC AUC first
df.sort_values('best_roc', ascending=False)

### MLP2

In [None]:
class MLP2(nn.Module):
    """Funnel-shaped fully connected classifier over 100-dimensional inputs; returns logits.

    The input stack narrows 100 -> 75 -> 50 -> 25, each step being
    Linear -> activation -> BatchNorm1d -> drop. It is followed by a 25 -> n_hidden
    layer, `n_layers` blocks of width n_hidden and the output layer.

    Parameters
    ----------
    n_layers : number of (n_hidden -> n_hidden) blocks.
    n_hidden : width of the hidden blocks.
    n_output : number of output logits.
    activation : activation module instance, shared by every layer.
    p_drop : dropout probability; nn.Identity is used when it is not > 0.
    """

    def __init__(self, n_layers, n_hidden, n_output,
                 activation = nn.SELU(), p_drop = 0.5):
        super().__init__()
        # A single drop module is reused at every position where dropout is applied
        self.drop = nn.Dropout(p_drop) if p_drop > 0 else nn.Identity()

        # Funnel: 100 -> 75 -> 50 -> 25
        funnel = []
        for n_in, n_out in [(100, 75), (75, 50), (50, 25)]:
            funnel.extend([nn.Linear(n_in, n_out), activation,
                           nn.BatchNorm1d(n_out), self.drop])
        self.input_layers = nn.Sequential(*funnel)

        # 25 -> n_hidden, then n_layers repetitions of Linear -> activation -> drop
        trunk = [nn.Linear(25, n_hidden), activation]
        for _ in range(n_layers):
            trunk.extend([nn.Linear(n_hidden, n_hidden), activation, self.drop])
        self.hidden_layers = nn.Sequential(*trunk)

        self.output = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return the logits for a batch x; no output activation is applied."""
        hidden = self.hidden_layers(self.input_layers(x))
        return self.output(hidden)

In [None]:
# Grid search over MLP2 hyper-parameters (one-vs-rest classifier for a single epitope)
nb_epochs = 500
losses = {}
epi = 'GILGFVFTL'
# Epochs at which train_clf records ROC AUC / accuracy: every 50th epoch plus the last one.
# Fixed: the last epoch was appended to `x` (a leftover from a plotting cell) instead of x_epochs.
x_epochs = np.arange(0, nb_epochs, 50)
x_epochs = np.append(x_epochs, nb_epochs-1)
# Fixed: the epitope is passed as pos_label (positionally it landed in `col`)
X_train, X_val, y_train, y_val = ovr_xy_split_class(top5_nn, pos_label = epi, train_size = .667)
# Fixed: Tensor.to() returns a new tensor, so the results have to be kept
X_train, X_val, y_train, y_val = (t.to(device) for t in (X_train, X_val, y_train, y_val))

result_cols = ['name','best_roc','roc_epoch','best_val','val_epoch']
df_mlp2 = pd.DataFrame(columns = result_cols)
rows = []  # one result row per configuration
ls2 = {}   # configuration name -> dict of curves returned by train_clf
for hidden in tqdm([30,50,70], leave = False):
    for n_layers in tqdm([1,2,3], leave = False):
        for act in tqdm([nn.SELU(), nn.ReLU(), nn.Sigmoid(), nn.Tanh()], leave = False):
            for p_drop in tqdm([0, 0.5], leave = False):
                for lr in tqdm([1e-5,3e-4], leave = False):
                    for wd in tqdm([1e-2, 5e-7], leave = False):
                        name = '_'.join([act.__class__.__name__,
                                         'hidden'+str(hidden),
                                         'layers'+str(n_layers),
                                         'drop'+str(p_drop),
                                         'lr'+str(lr),
                                         'wd'+str(wd)])

                        mlp = MLP2(n_layers, hidden, 2, act, p_drop)
                        print(f'FOR {name}')
                        mlp.to(device)
                        losses = train_clf(model = mlp, data = (X_train,X_val,y_train, y_val), 
                                           lr = lr, wd = wd, nb_epochs = nb_epochs,
                                           batch_size = 2**12, loss_weight = None)
                        ls2[name]= losses
                        roc_epoch, best_roc = get_max(losses['roc'], x_epochs)
                        val_epoch, best_val = get_min(losses['val'], range(nb_epochs))
                        rows.append([name, best_roc, roc_epoch, best_val, val_epoch])
                        # DataFrame.append was removed in pandas 2.0. The frame is rebuilt from
                        # the row list each time so partial results survive an interrupt.
                        df_mlp2 = pd.DataFrame(rows, columns = result_cols)

df_mlp2.sort_values('best_roc', ascending=False)

In [None]:
# Write both grid-search result tables to CSV.
# exist_ok=True lets the cell be re-run: without it os.makedirs raises FileExistsError
# once the directory exists.
os.makedirs('../output/mlp_nn/', exist_ok=True)
df_mlp1.to_csv('../output/mlp_nn/df_mlp1.csv', header=True,index=False)
df_mlp2.to_csv('../output/mlp_nn/df_mlp2.csv', header=True,index=False)

In [None]:
# Ten best configurations of each MLP grid search, by ROC AUC
display(df_mlp1.sort_values('best_roc', ascending=False).head(10))
display(df_mlp2.sort_values('best_roc', ascending=False).head(10))


In [None]:
# Epochs at which ROC AUC / accuracy were recorded (every 50th epoch plus the last one)
x = np.arange(0, nb_epochs, 50 )
x = np.append(x, nb_epochs-1)

# Training curves of the five MLP2 configurations with the best ROC AUC
for k1 in df_mlp2.sort_values('best_roc', ascending=False).head()['name'].values:
    curves = ls2[k1]
    plt.figure(figsize=(10,7))
    plt.plot(curves['train'], 'b-.' ,label = 'train')
    plt.plot(curves['val'], 'r-.' ,label = 'val')
    
    # A run that stopped early recorded fewer metric points, so x is trimmed to match
    plt.plot(x[:len(curves['roc'])], curves['roc'], c='g', ls = '-', lw = 1, marker = 'o' ,label = 'roc')
    plt.plot(x[:len(curves['acc'])], curves['acc'], c='m', ls = '-', lw = 1, marker = 'o' ,label = 'acc')
    plt.legend()
    plt.title(k1)
    