# Train DL

> A collection of deep learning training and prediction tools built on fastai

## Setup

In [None]:
#| default_exp dl

In [None]:
#| export
import pandas as pd, numpy as np
import fastcore.all as fc,torch.nn.init as init
from torch.utils.data import WeightedRandomSampler
from fastai.vision.all import *

# katlas
from katlas.core import Data
from katlas.feature import *
from katlas.train import *

# sklearn
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr,pearsonr

## Utils

In [None]:
#| export
def seed_everything(seed=123):
    "Seed the Python, NumPy and PyTorch random generators and put cuDNN in deterministic mode."
    # Export the hash seed through the environment
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every generator the notebook relies on
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Deterministic kernels on, autotuner off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
seed_everything()

In [None]:
#| export
# Default device: MPS first, then CUDA, otherwise CPU
if torch.backends.mps.is_available(): def_device = 'mps'
elif torch.cuda.is_available(): def_device = 'cuda'
else: def_device = 'cpu'

In [None]:
def_device

'cpu'

## Load Data

In [None]:
# read training data; reset_index() turns the stored index into a regular first column
df = pd.read_parquet('https://github.com/sky1ove/katlas_raw/raw/refs/heads/main/nbs/raw/combine_t5_kd.parquet').reset_index()

# read the kinase metadata used to build the splits
info_df = Data.get_kinase_info().query('pseudo!="1"') # keep only rows whose `pseudo` is not "1" (non-pseudo kinases)

# attach the metadata to each training row (merge with pandas defaults: inner join on the shared columns)
info = df[['kinase']].merge(info_df)
# NOTE(review): not the last expression of the cell, so this preview is not displayed
info.head()

# cross-validation splits; the first one is reused by the single-split demos below
splits = get_splits(info,stratified='group')
split0 = splits[0]


# feature columns are the ones prefixed with 'T5_';
# targets are the remaining columns except the first one
# (presumably the identifier column produced by reset_index — TODO confirm)
feat_col = df.columns[df.columns.str.startswith('T5_')]
target_col = df.columns[~df.columns.isin(feat_col)][1:]

StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
# kinase group in train set: 9
# kinase group in test set: 9
---------------------------
# kinase in train set: 312
---------------------------
# kinase in test set: 78
---------------------------
test set: ['EPHA3' 'FES' 'FLT3' 'FYN' 'EPHB1' 'EPHB3' 'FER' 'EPHB4' 'FLT4' 'FGFR1'
 'EPHA5' 'TEK' 'DDR2' 'ZAP70' 'LIMK1' 'ULK3' 'JAK1' 'WEE1' 'TESK1'
 'MAP2K3' 'AMPKA2' 'ATM' 'CAMK1D' 'CAMK2D' 'CAMK4' 'CAMKK1' 'CK1D' 'CK1E'
 'DYRK2' 'DYRK4' 'HGK' 'IKKE' 'JNK2' 'JNK3' 'KHS1' 'MAPKAPK5' 'MEK2'
 'MSK2' 'NDR1' 'NEK6' 'NEK9' 'NIM1' 'NLK' 'OSR1' 'P38A' 'P38B' 'P90RSK'
 'PAK1' 'PERK' 'PKCH' 'PKCI' 'PKN1' 'ROCK2' 'RSK2' 'SIK' 'STLK3' 'TAK1'
 'TSSK1' 'ALPHAK3' 'BMPR2' 'CDK10' 'CDK13' 'CDK14' 'CDKL5' 'GCN2' 'GRK4'
 'IRE1' 'KHS2' 'MASTL' 'MLK4' 'MNK1' 'MRCKA' 'PRPK' 'QSK' 'SMMLCK' 'SSTK'
 'ULK2' 'VRK1']


## Dataset

In [None]:
#| export
class GeneralDataset:
    def __init__(self, 
                 df, # a dataframe of values
                 feat_col, # feature columns
                 target_col=None # Will return test set for prediction if target col is None
                ):
        "A general dataset that can be applied to any dataframe"

        # Without target columns the dataset is in prediction mode and yields features only
        self.test = target_col is None

        self.X = df[feat_col].values
        self.y = None if self.test else df[target_col].values

        self.len = df.shape[0]

    def __len__(self):
        "Number of rows in the dataframe."
        return self.len

    @staticmethod
    def _to_float_tensor(values):
        "Copy a numpy row, slice or scalar into a float32 tensor."
        # `torch.Tensor(x)` does not convert a scalar by value (it is rejected or read as a size),
        # which broke datasets built with a single target column name.
        # `torch.tensor` handles scalars and arrays alike and always copies.
        return torch.tensor(np.asarray(values), dtype=torch.float32)

    def __getitem__(self, index):
        "Return `X` in prediction mode, otherwise the pair `(X, y)`, as float32 tensors."
        X = self._to_float_tensor(self.X[index])
        if self.test:
            return X
        return X, self._to_float_tensor(self.y[index])

In [None]:
# dataset
ds = GeneralDataset(df,feat_col,target_col)

In [None]:
len(ds)

390

In [None]:
dl = DataLoader(ds, batch_size=64, shuffle=True)

In [None]:
#| export
def get_sampler(info,col):
    "Build a `WeightedRandomSampler` that draws rows from rarer categories of `info[col]` more often."
    # rows per category
    freq = info[col].value_counts()

    # one weight per row: the inverse of the size of the row's category
    inv_freq = 1. / freq[info[col]]

    # floor the weights at 0.01
    row_weights = torch.clamp_min(torch.from_numpy(inv_freq.to_numpy()), 0.01)

    # one draw per row, with replacement
    return WeightedRandomSampler(row_weights, len(row_weights), replacement=True)

In [None]:
sampler = get_sampler(info,'subfamily')

In [None]:
# dataloader
dl = DataLoader(ds, batch_size=64, sampler=sampler)

In [None]:
xb,yb = next(iter(dl))

xb.shape,yb.shape

(torch.Size([64, 1024]), torch.Size([64, 210]))

## Models

### MLP

In [None]:
#| export
def MLP_1(num_features, 
          num_targets,
          hidden_units = [512, 218],
          dp = 0.2):
    "Fully connected net: one Linear -> BatchNorm -> Dropout -> PReLU group per hidden width, then a Linear output layer."
    # Consecutive pairs of widths (input -> h0 -> h1 -> ...) define the hidden groups
    widths = [num_features, *hidden_units]

    layers = []
    for n_in, n_out in zip(widths[:-1], widths[1:]):
        layers += [
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.Dropout(dp),
            nn.PReLU(),
        ]

    # Output projection from the last hidden width
    layers.append(nn.Linear(hidden_units[-1], num_targets))

    return nn.Sequential(*layers)

In [None]:
n_feature = len(feat_col)
n_target = len(target_col)

In [None]:
model = MLP_1(n_feature, n_target)

In [None]:
model(xb)

tensor([[-0.1115, -0.3755, -0.3818,  ..., -0.1483, -0.0387, -0.1111],
        [ 0.8555,  0.9352, -0.9642,  ..., -0.4723,  0.7757, -0.0121],
        [ 0.3422,  0.3537, -0.1441,  ...,  0.5467, -0.4535,  0.2103],
        ...,
        [-0.4287,  0.6751,  0.1797,  ...,  0.0192,  0.0692, -0.0573],
        [-0.0206, -0.1953,  0.7445,  ..., -0.2206, -0.1188,  0.4579],
        [ 0.2342, -0.0243,  0.4630,  ...,  0.8393,  0.5747, -0.6881]],
       grad_fn=<AddmmBackward0>)

### CNN1D

***Version 1***

In [None]:
#| export
class CNN1D_1(Module):
    "Small 1D CNN: two conv + max-pool stages over the feature vector, then a two-layer dense head."
    
    def __init__(self, 
                 num_features, # length of the input vector; determines the input size of the first dense layer
                 num_targets): # number of outputs

        self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.flatten = Flatten()

        # The convs keep the length (kernel 3, padding 1); each pool floors it to half.
        # Flooring once over `num_features/4` gives the wrong size when the length is not a multiple of 4.
        pooled_len = num_features // 2 // 2
        self.fc1 = nn.Linear(in_features=8 * pooled_len, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_targets)

    def forward(self, x):
        x = x.unsqueeze(1) # need shape (bs, 1, num_features) for CNN
        x = self.pool1(nn.functional.relu(self.conv1(x)))
        x = self.pool2(nn.functional.relu(self.conv2(x)))
        x = self.flatten(x) # (bs, 8 * pooled_len)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [None]:
model = CNN1D_1(n_feature, n_target)

In [None]:
model(xb)

tensor([[ 0.0193,  0.0690,  0.0138,  ..., -0.0428, -0.0026,  0.0840],
        [ 0.0203,  0.0693,  0.0136,  ..., -0.0422, -0.0023,  0.0846],
        [ 0.0198,  0.0703,  0.0148,  ..., -0.0424, -0.0029,  0.0839],
        ...,
        [ 0.0197,  0.0694,  0.0147,  ..., -0.0429, -0.0019,  0.0841],
        [ 0.0193,  0.0687,  0.0146,  ..., -0.0429, -0.0017,  0.0843],
        [ 0.0191,  0.0692,  0.0148,  ..., -0.0425, -0.0028,  0.0834]],
       grad_fn=<AddmmBackward0>)

***Version 2***

In [None]:
#| export
def init_weights(m, leaky=0.):
    "Apply Kaiming-normal initialisation to the weight of any Conv1d/2d/3d module; other modules are left untouched."
    # `leaky` is forwarded as the `a` argument of `kaiming_normal_`
    # NOTE(review): for convs wrapped with `nn.utils.weight_norm`, confirm that writing to
    # `m.weight` survives the re-parametrisation
    conv_types = (nn.Conv1d, nn.Conv2d, nn.Conv3d)
    if not isinstance(m, conv_types):
        return
    init.kaiming_normal_(m.weight, a=leaky)

In [None]:
#| export
def lin_wn(ni,nf,dp=0.1,act=nn.SiLU):
    "BatchNorm1d -> Dropout -> weight-normalised Linear, followed by `act()` unless `act` is falsy."
    block = [
        nn.BatchNorm1d(ni),
        nn.Dropout(dp),
        nn.utils.weight_norm(nn.Linear(ni, nf)),
    ]
    if act:
        block.append(act())
    return nn.Sequential(*block)

In [None]:
#| export
def conv_wn(ni, nf, ks=3, stride=1, padding=1, dp=0.1,act=nn.ReLU):
    "BatchNorm1d -> Dropout -> weight-normalised Conv1d, followed by `act()` unless `act` is falsy."
    block = [
        nn.BatchNorm1d(ni),
        nn.Dropout(dp),
        nn.utils.weight_norm(nn.Conv1d(ni, nf, ks, stride, padding)),
    ]
    if act:
        block.append(act())
    return nn.Sequential(*block)

In [None]:
#| export
class CNN1D_2(nn.Module):
    "1D CNN: a linear layer widens the input, the result is reshaped to (channels, length), and two conv stages are combined by an element-wise product before the output head."
    
    def __init__(self, ni, nf, amp_scale = 16):
        # ni: number of input features, nf: number of outputs
        # amp_scale: sequence length after the reshape; the linear layer outputs cha_1*amp_scale values
        super().__init__()

        # channel counts of the reshape output and of the two conv stages
        cha_1,cha_2,cha_3 = 256,512,512
        hidden_size = cha_1*amp_scale

        # target length of the adaptive average pool: amp_scale//2
        cha_po_1 = hidden_size//(cha_1*2)
        # input size of the final linear layer: cha_3 channels x (amp_scale//4) positions;
        # this has to equal the flattened output of the head's max-pool
        cha_po_2 = (hidden_size//(cha_1*4)) * cha_3
        
        self.lin = lin_wn(ni,hidden_size)
        
        # bs, cha_1, amp_scale (bs, 256, 16 with the defaults)
        self.view = View(-1,cha_1,amp_scale)
        
        # first stage: widen the channels, shrink the length to cha_po_1, then one more conv
        self.conv1 = nn.Sequential(
            conv_wn(cha_1, cha_2, ks=5, stride=1, padding=2, dp=0.1),
            nn.AdaptiveAvgPool1d(output_size = cha_po_1),
            conv_wn(cha_2, cha_2, ks=3, stride=1, padding=1, dp=0.1))
        
        # second stage: kernel/padding pairs keep the length, so its output can be multiplied
        # element-wise with the first stage's output (cha_2 == cha_3 is required for that)
        self.conv2 = nn.Sequential(
            conv_wn(cha_2, cha_2, ks=3, stride=1, padding=1, dp=0.3),
            conv_wn(cha_2, cha_3, ks=5, stride=1, padding=2, dp=0.2))
        
        # head: max-pool, flatten, then a linear layer without activation producing nf outputs
        self.head = nn.Sequential(
            nn.MaxPool1d(kernel_size=4, stride=2, padding=1),
            nn.Flatten(),
            lin_wn(cha_po_2,nf,act=None) )


    def forward(self, x):
        # amplify features to cha_1*amp_scale (4096 with the defaults)
        x = self.lin(x)
        
        # reshape to bs,cha_1,amp_scale (bs,256,16 with the defaults) for conv1d
        x = self.view(x) 

        x = self.conv1(x)
        
        x_s = x  # for skip connection (multiply)
        x = self.conv2(x)
        x = x * x_s

        # Final block
        x = self.head(x)

        return x

In [None]:
model = CNN1D_2(n_feature,n_target).apply(init_weights)



In [None]:
model(xb)

tensor([[-0.5740,  0.0151, -0.0819,  ...,  0.2636,  0.3405, -0.1404],
        [-0.6800,  0.5530, -0.0958,  ..., -0.3752, -0.6124,  0.7171],
        [ 0.4427, -0.3204, -0.3243,  ..., -0.2290,  0.1070,  0.1504],
        ...,
        [-0.3660, -0.2667, -0.6036,  ..., -0.3130,  0.5462, -0.0055],
        [ 0.4511,  0.6824,  0.8659,  ..., -0.0171,  0.2362, -0.3475],
        [-0.0746, -0.1699,  0.6895,  ...,  1.1522, -0.3472,  0.6422]],
       grad_fn=<AddmmBackward0>)

## DL Trainer

In [None]:
#| export
def train_dl(df, 
             feat_col, 
             target_col,
             split, # tuple of numpy array for split index
             model_func, # function to get pytorch model
             n_epoch = 4, # number of epochs
             bs = 32, # batch size
             lr = 1e-2, # will be useless if lr_find is True
             loss = mse, # loss function
             save = None, # models/{save}.pth
             sampler = None, # optional sampler, applied to the training rows only
             lr_find=False, # if true, will use lr from lr_find
            ):
    """A DL trainer.

    Trains `model_func()` on the rows `split[0]` of `df`, validates on the rows `split[1]`,
    and returns `(target, pred)`: two dataframes indexed like the validation rows with
    `target_col` as columns.
    """
    
    train = df.loc[split[0]]
    valid = df.loc[split[1]]
    
    train_ds = GeneralDataset(train, feat_col, target_col)
    valid_ds = GeneralDataset(valid, feat_col, target_col)
    
    n_workers = fc.defaults.cpus

    if sampler is not None:
        # Only the training loader gets the sampler. The validation loader must visit every
        # row exactly once and in order, because the predictions returned below are labelled
        # with `valid.index`; a training sampler there would resample or mis-size validation.
        # NOTE(review): confirm that the `DataLoader` class in scope honours `sampler=`.
        train_loader = DataLoader(train_ds, batch_size=bs, sampler=sampler, num_workers=n_workers)
        valid_loader = DataLoader(valid_ds, batch_size=bs, num_workers=n_workers)
        
        dls = DataLoaders(train_loader, valid_loader)
        
    else:
        
        dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, num_workers=n_workers)
    
    model = model_func()
    
    learn = Learner(dls.to(def_device), model.to(def_device), loss, 
                    metrics= [PearsonCorrCoef(),SpearmanCorrCoef()],
                    cbs = [GradientClip(1.0)] # clip the gradient norm to stabilise training
                   )
    
    if lr_find:
        # get learning rate
        suggestions = learn.lr_find()
        plt.show()
        plt.close()
        print(suggestions)
        # lr_find returns a tuple of suggested rates, while the schedule needs a single
        # number: use the first suggestion
        lr = float(suggestions[0]) if isinstance(suggestions, tuple) else float(suggestions)

        
    print('lr in training is', lr)
    learn.fit_one_cycle(n_epoch,lr) #cbs = [SaveModelCallback(fname = 'best')] # save best model
    
    if save is not None:
        learn.save(save)
        
    # predictions and targets for the validation loader
    pred,target = learn.get_preds()
    
    pred = pd.DataFrame(pred.detach().cpu().numpy(),index=valid.index,columns=target_col)
    target = pd.DataFrame(target.detach().cpu().numpy(),index=valid.index,columns=target_col)
    
    return target, pred

In [None]:
def get_model():
    "Return a new CNN1D_2 sized by the notebook's `n_feature` and `n_target`."
    net = CNN1D_2(n_feature, n_target)
    return net

In [None]:
target, pred = train_dl(df, 
                        feat_col, 
                        target_col,
                        split0, 
                        get_model,
                        n_epoch=1,
                        lr = 1e-2,
                        save = 'test')

lr in training is 0.01




epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.937994,1.231321,0.106572,0.064782,00:01


In [None]:
score_each(target,pred)

overall MSE: 1.2313
Average Pearson: 0.1580 


(1.2313209,
 0.1579942852920301,
       Pearson
 3   -0.037969
 8   -0.045367
 10  -0.057115
 19  -0.044484
 24  -0.059326
 ..        ...
 359  0.247093
 361 -0.147023
 366  0.107366
 367 -0.004609
 373  0.260843
 
 [78 rows x 1 columns])

## DL CV

In [None]:
#| export
@fc.delegates(train_dl)
def train_dl_cv(df, 
                feat_col, 
                target_col, 
                splits, # list of tuples
                model_func, # zero-argument callable returning a new model, e.g. lambda: MLP_1(num_feat, num_target)
                save:str=None,
                **kwargs
                ):
    "Run `train_dl` once per split; return the out-of-fold predictions and a dataframe of per-fold scores."
    fold_preds, fold_scores = [], []

    for fold,split in enumerate(splits):

        print(f'------fold{fold}------')

        # one checkpoint name per fold, or no checkpoint when `save` is None
        fname = f'{save}_fold{fold}' if save is not None else None

        # train on this fold and predict its validation rows
        target, pred = train_dl(df,feat_col,target_col, split, model_func ,save=fname,**kwargs)

        # score the fold and keep its numbers and predictions
        fold_mse, fold_pearson, _ = score_each(target,pred)
        fold_scores.append({'fold': fold, 'mse': fold_mse, 'pearson_avg': fold_pearson})
        fold_preds.append(pred)

    # stack the folds' predictions and sort them by index
    oof = pd.concat(fold_preds).sort_index()

    # one row of metrics per fold
    metrics = pd.DataFrame(fold_scores)

    return oof, metrics

In [None]:
# NOTE(review): identical to the `get_model` defined in the DL Trainer section above;
# this second definition is redundant and could be removed
def get_model():
    return CNN1D_2(n_feature, n_target)

In [None]:
oof,metrics = train_dl_cv(df,feat_col,target_col,splits,get_model,n_epoch=1,lr=3e-3)

------fold0------
lr in training is 0.003




epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.157886,0.985997,0.123256,0.076049,00:01


overall MSE: 0.9860
Average Pearson: 0.2104 
------fold1------
lr in training is 0.003




epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.194019,0.984086,0.130521,0.092631,00:01


overall MSE: 0.9841
Average Pearson: 0.1521 
------fold2------
lr in training is 0.003




epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.15419,0.988698,0.114616,0.064543,00:01


overall MSE: 0.9887
Average Pearson: 0.2677 
------fold3------
lr in training is 0.003




epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.177719,0.97576,0.15624,0.12527,00:01


overall MSE: 0.9758
Average Pearson: 0.1862 
------fold4------
lr in training is 0.003




epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.170352,0.983774,0.135724,0.102884,00:01


overall MSE: 0.9838
Average Pearson: 0.2547 


In [None]:
metrics

Unnamed: 0,fold,mse,pearson_avg
0,0,0.985997,0.210434
1,1,0.984086,0.152108
2,2,0.988698,0.267718
3,3,0.97576,0.186243
4,4,0.983774,0.254678


In [None]:
metrics.pearson_avg.mean()

0.2142360341441809

In [None]:
target = df[target_col]
_,_,corr = score_each(target,oof)

overall MSE: 0.9837
Average Pearson: 0.2142 


In [None]:
corr

Unnamed: 0,Pearson
0,-0.130073
1,-0.210985
2,-0.251176
3,-0.163586
4,-0.054230
...,...
385,0.186961
386,0.336823
387,-0.029309
388,0.029152


## DL Predict

In [None]:
#| export
def predict_dl(df, 
               feat_col, 
               target_col,
               model, # model architecture
               model_pth, # only name, not with .pth
              ):
    """Predict dataframe given a deep learning model

    Loads the checkpoint `model_pth` into `model` and returns a dataframe of predictions
    indexed like `df`, with `target_col` as columns.
    """
    
    # no target columns: the dataset yields features only
    test_dset = GeneralDataset(df,feat_col)
    test_dl = DataLoader(test_dset,bs=512)
    
    # the Learner is created without data and with a placeholder loss; it is only used to load the checkpoint
    learn = Learner(None, model.to(def_device), loss_func=1)
    learn.load(model_pth)
    
    learn.model.eval()
    
    preds = []
    # inference only: do not record autograd history for the forward passes
    with torch.no_grad():
        for data in test_dl:
            inputs = data.to(def_device)
            outputs = learn.model(inputs) #learn.model(x).sigmoid().detach().cpu().numpy()

            preds.append(outputs.detach().cpu().numpy())

    preds = np.concatenate(preds)
    preds = pd.DataFrame(preds,index=df.index,columns=target_col)

    return preds

In [None]:
test = df.loc[split0[1]]

In [None]:
# `model` is the CNN1D_2 instance created in the Models section; predict_dl loads the
# 'test' checkpoint (the name passed as `save` in the train_dl demo above) into it
pred = predict_dl(test.head(3),
                  feat_col,
                  target_col, 
                  model,'test')
pred

Unnamed: 0,-5P,-5G,-5A,-5C,-5S,-5T,-5V,-5I,-5L,-5M,...,4Q,4N,4D,4E,4s,4t,4y,0s,0t,0y
3,-0.355736,-0.371514,-0.81429,-0.573196,1.173128,0.863355,-0.738675,0.366732,0.702826,1.137061,...,-0.286256,-0.485139,0.012519,0.30605,-0.215432,0.959879,0.697288,0.805923,0.199813,-0.39755
8,-0.322134,-0.418208,-0.83548,-0.620543,1.237427,0.920613,-0.78318,0.38532,0.750605,1.191602,...,-0.300988,-0.502365,-0.041983,0.355885,-0.245259,1.002593,0.699817,0.878045,0.185331,-0.444951
10,-0.349195,-0.373037,-0.816447,-0.580855,1.182283,0.871513,-0.739842,0.368429,0.705171,1.146074,...,-0.286629,-0.483317,0.007166,0.30686,-0.217535,0.966914,0.695795,0.811916,0.201817,-0.403425


In [None]:
_,_,corr = score_each(test[target_col].head(3),pred)

overall MSE: 1.4146
Average Pearson: 0.0468 


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()