# Train DL

> A collection of deep learning tools via Fastai

## Setup

In [4]:
#| default_exp dl

In [5]:
#| hide
import sys
sys.path.append("/notebooks/katlas")
from nbdev.showdoc import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [6]:
#| export
from fastbook import *
import fastcore.all as fc,torch.nn.init as init
from fastai.callback.training import GradientClip
from torch.utils.data import WeightedRandomSampler

# katlas
from katlas.core import *
from katlas.feature import *
from katlas.train import *

# sklearn
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr,pearsonr

## Utils

In [7]:
#| export
def seed_everything(seed=123):
    "Seed the Python, NumPy and PyTorch random generators and make cuDNN deterministic."
    # Store the seed in the environment as a string
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Apply the same seed to every random number generator used here
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Turn off cuDNN benchmarking and request deterministic algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [5]:
seed_everything()

In [8]:
#| export
# Default device: prefer Apple MPS, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    def_device = 'mps'
elif torch.cuda.is_available():
    def_device = 'cuda'
else:
    def_device = 'cpu'

In [9]:
def_device

'cpu'

## Load Data

In [10]:
# read training data
df = pd.read_parquet('train_data/combine_t5_kd.parquet').reset_index()

# read data contains info for split
info_df = Data.get_kinase_info().query('pseudo!="1"') # get non-pseudo kinase

# merge info with training data
# NOTE(review): the splits below are computed on `info` but applied to `df`;
# this assumes the merge keeps exactly one row per row of `df`, in the same
# order -- TODO confirm
info = df[['kinase']].merge(info_df)
# NOTE(review): not the last expression of the cell, so this line displays nothing
info.head()

# splits
splits = get_splits(info,stratified='group')
split0 = splits[0]


# column name of feature and target
# features are the columns whose name starts with 'T5_'
feat_col = df.columns[df.columns.str.startswith('T5_')]
# targets are the remaining columns minus the first one
# (presumably the kinase identifier -- TODO confirm)
target_col = df.columns[~df.columns.isin(feat_col)][1:]

StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
# kinase group in train set: 9
# kinase group in test set: 9
---------------------------
# kinase in train set: 312
---------------------------
# kinase in test set: 78
---------------------------
test set: ['EPHA3' 'FES' 'FLT3' 'FYN' 'EPHB1' 'EPHB3' 'FER' 'EPHB4' 'FLT4' 'FGFR1' 'EPHA5' 'TEK' 'DDR2' 'ZAP70' 'LIMK1' 'ULK3' 'JAK1' 'WEE1' 'TESK1' 'MAP2K3' 'AMPKA2' 'ATM' 'CAMK1D' 'CAMK2D' 'CAMK4' 'CAMKK1'
 'CK1D' 'CK1E' 'DYRK2' 'DYRK4' 'HGK' 'IKKE' 'JNK2' 'JNK3' 'KHS1' 'MAPKAPK5' 'MEK2' 'MSK2' 'NDR1' 'NEK6' 'NEK9' 'NIM1' 'NLK' 'OSR1' 'P38A' 'P38B' 'P90RSK' 'PAK1' 'PERK' 'PKCH' 'PKCI' 'PKN1' 'ROCK2'
 'RSK2' 'SIK' 'STLK3' 'TAK1' 'TSSK1' 'ALPHAK3' 'BMPR2' 'CDK10' 'CDK13' 'CDK14' 'CDKL5' 'GCN2' 'GRK4' 'IRE1' 'KHS2' 'MASTL' 'MLK4' 'MNK1' 'MRCKA' 'PRPK' 'QSK' 'SMMLCK' 'SSTK' 'ULK2' 'VRK1']


## Dataset

In [11]:
#| export
class GeneralDataset:
    "A general dataset that can be applied to any dataframe"

    def __init__(self, 
                 df, # a dataframe of values
                 feat_col, # feature columns
                 target_col=None # Will return test set for prediction if target col is None
                ):
        # No target columns means test mode: items are features only
        self.test = target_col is None

        self.X = df[feat_col].values
        self.y = None if self.test else df[target_col].values

        # Number of rows, reported by `len()`
        self.len = df.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        "Return the features at `index`, together with the targets unless in test mode"
        features = torch.Tensor(self.X[index])
        if not self.test:
            return features, torch.Tensor(self.y[index])
        return features

In [12]:
# dataset
ds = GeneralDataset(df,feat_col,target_col)

In [13]:
len(ds)

390

In [14]:
dl = DataLoader(ds, batch_size=64, shuffle=True)

In [15]:
#| export
def get_sampler(info,col):
    "For imbalanced data, get higher weights for less-represented samples"

    labels = info[col]

    # number of rows for each distinct value of `col`
    counts_per_label = labels.value_counts()

    # optional: reduce the differences through a log
    # counts_per_label = counts_per_label.apply(lambda x: np.log(x+1.01))

    # each row gets the inverse frequency of its own label
    inverse_freq = (1. / counts_per_label[labels]).to_numpy()

    # weights are floored at 0.01
    sample_weights = torch.from_numpy(inverse_freq).clamp_min(0.01)

    # draw as many samples as there are rows, with replacement
    n_draws = len(sample_weights)
    return WeightedRandomSampler(sample_weights, n_draws, replacement=True)

In [16]:
sampler = get_sampler(info,'subfamily')

In [17]:
# dataloader
dl = DataLoader(ds, batch_size=64, sampler=sampler)

In [18]:
xb,yb = next(iter(dl))

xb.shape,yb.shape

(torch.Size([64, 1024]), torch.Size([64, 210]))

## Models

### MLP

In [37]:
#| export
def MLP_1(num_features, 
          num_targets,
          hidden_units = [512, 218],
          dp = 0.2):
    """Build a multi-layer perceptron as an `nn.Sequential`.

    Every hidden layer is `Linear -> BatchNorm1d -> Dropout(dp) -> PReLU`,
    followed by a final `Linear` that maps to `num_targets`.

    `hidden_units` is a sequence of hidden layer widths. It is only read,
    never modified. An empty sequence gives a single linear layer.
    """
    # Layer widths from the input through every hidden layer
    widths = [num_features, *hidden_units]

    layers = []
    # One block for each pair of consecutive widths
    for n_in, n_out in zip(widths[:-1], widths[1:]):
        layers.extend([
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.Dropout(dp),
            nn.PReLU()
        ])

    # Output layer, fed by the last hidden width (or the input when there is none)
    layers.append(nn.Linear(widths[-1], num_targets))

    return nn.Sequential(*layers)

In [38]:
n_feature = len(feat_col)
n_target = len(target_col)

In [39]:
model = MLP_1(n_feature, n_target)

In [40]:
model(xb)

tensor([[ 0.3748, -0.2792, -0.3940,  ..., -0.1069,  0.5133, -0.0981],
        [ 0.0130,  0.0171,  0.0975,  ...,  0.1337,  0.9050, -0.4457],
        [-0.0209,  0.6076,  0.2354,  ...,  0.6942, -0.0721,  0.5130],
        ...,
        [-0.2140, -0.1669,  0.5976,  ..., -0.0737,  1.2374, -0.8123],
        [-0.7917, -0.3272,  0.0697,  ...,  0.4106, -0.4402, -0.5049],
        [ 0.1277,  0.3288,  0.9124,  ...,  0.4660, -0.0242, -0.3738]], grad_fn=<AddmmBackward0>)

### CNN1D

***Version 1***

In [41]:
#| export
class CNN1D_1(Module):
    "A small 1D CNN: two conv + max-pool stages followed by two linear layers."

    def __init__(self, 
                 num_features, # length of the input feature vector; sets the size of the first linear layer
                 num_targets):

        self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.flatten = Flatten()

        # The convolutions keep the length (kernel 3, padding 1) and each pool
        # floors it in half, so the flattened size is 8 channels times
        # (num_features // 2) // 2. This also holds when num_features is not a
        # multiple of 4.
        pooled_len = (num_features // 2) // 2
        self.fc1 = nn.Linear(in_features=8 * pooled_len, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_targets)

    def forward(self, x):
        x = x.unsqueeze(1) # need shape (bs, 1, num_features) for CNN
        x = self.pool1(nn.functional.relu(self.conv1(x)))
        x = self.pool2(nn.functional.relu(self.conv2(x)))
        # flatten everything but the batch dimension
        x = self.flatten(x)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [42]:
model = CNN1D_1(n_feature, n_target)

In [43]:
model(xb)

tensor([[-0.0796, -0.0721,  0.0301,  ..., -0.1060,  0.0699, -0.0175],
        [-0.0794, -0.0712,  0.0307,  ..., -0.1062,  0.0697, -0.0174],
        [-0.0793, -0.0726,  0.0301,  ..., -0.1064,  0.0706, -0.0176],
        ...,
        [-0.0798, -0.0733,  0.0304,  ..., -0.1060,  0.0696, -0.0175],
        [-0.0791, -0.0726,  0.0304,  ..., -0.1060,  0.0702, -0.0174],
        [-0.0790, -0.0727,  0.0304,  ..., -0.1063,  0.0704, -0.0174]], grad_fn=<AddmmBackward0>)

***Version 2***

In [44]:
#| export
def init_weights(m, leaky=0.):
    "Initiate any Conv layer with Kaiming norm."
    conv_types = (nn.Conv1d, nn.Conv2d, nn.Conv3d)
    if not isinstance(m, conv_types):
        return  # any other module is left as it is
    init.kaiming_normal_(m.weight, a=leaky)

In [45]:
#| export
def lin_wn(ni,nf,dp=0.1,act=nn.SiLU):
    "Weight norm of linear."
    # batch norm and dropout on the input, then a weight-normalised linear layer
    blocks = [
        nn.BatchNorm1d(ni),
        nn.Dropout(dp),
        nn.utils.weight_norm(nn.Linear(ni, nf)),
    ]
    # optional activation at the end
    if act: blocks.append(act())
    return nn.Sequential(*blocks)

In [46]:
#| export
def conv_wn(ni, nf, ks=3, stride=1, padding=1, dp=0.1,act=nn.ReLU):
    "Weight norm of conv."
    # batch norm and dropout on the input, then a weight-normalised 1D convolution
    blocks = [
        nn.BatchNorm1d(ni),
        nn.Dropout(dp),
        nn.utils.weight_norm(nn.Conv1d(ni, nf, ks, stride, padding)),
    ]
    # optional activation at the end
    if act: blocks.append(act())
    return nn.Sequential(*blocks)

In [47]:
#| export
class CNN1D_2(nn.Module):
    "1D CNN that widens the input with a linear layer, reshapes it to channels, and gates two conv stages by multiplication."

    def __init__(self, ni, nf, amp_scale = 16):
        super().__init__()

        c_in, c_mid, c_out = 256, 512, 512

        # size of the amplified feature vector (256 * 16 = 4096 by default)
        widened = c_in*amp_scale

        # sequence length after the adaptive pooling, and size of the flattened head input
        pooled_len = widened//(c_in*2)
        flat_size = (widened//(c_in*4)) * c_out

        self.lin = lin_wn(ni,widened)

        # (bs, widened) -> (bs, 256, amp_scale)
        self.view = View(-1,c_in,amp_scale)

        first_stage = [
            conv_wn(c_in, c_mid, ks=5, stride=1, padding=2, dp=0.1),
            nn.AdaptiveAvgPool1d(output_size = pooled_len),
            conv_wn(c_mid, c_mid, ks=3, stride=1, padding=1, dp=0.1),
        ]
        self.conv1 = nn.Sequential(*first_stage)

        second_stage = [
            conv_wn(c_mid, c_mid, ks=3, stride=1, padding=1, dp=0.3),
            conv_wn(c_mid, c_out, ks=5, stride=1, padding=2, dp=0.2),
        ]
        self.conv2 = nn.Sequential(*second_stage)

        head_layers = [
            nn.MaxPool1d(kernel_size=4, stride=2, padding=1),
            nn.Flatten(),
            lin_wn(flat_size,nf,act=None),
        ]
        self.head = nn.Sequential(*head_layers)


    def forward(self, x):
        # amplify the features, then reshape to (bs, 256, amp_scale) for conv1d
        x = self.view(self.lin(x))

        # output of the first stage is kept for the multiplicative skip connection
        gate = self.conv1(x)
        x = self.conv2(gate) * gate

        # final block: pool, flatten and project to the targets
        return self.head(x)

In [48]:
model = CNN1D_2(n_feature,n_target).apply(init_weights)

In [49]:
model(xb)

tensor([[ 0.0269,  0.7208, -0.0277,  ...,  0.1137,  0.2809,  0.4883],
        [-0.0075, -0.8614,  0.0939,  ..., -1.0676,  0.7818,  0.4864],
        [ 0.0897, -0.0184, -0.1457,  ..., -0.4230,  0.3348, -0.4991],
        ...,
        [-0.5338, -0.3602,  0.0975,  ..., -0.1803, -0.3673, -0.0224],
        [-0.0796,  0.4378, -0.3398,  ..., -0.2319, -0.1067, -0.2075],
        [ 1.4080, -0.5030, -0.7703,  ...,  0.0632, -1.8563, -1.1198]], grad_fn=<AddmmBackward0>)

## DL Trainer

In [50]:
#| export
def train_dl(df, 
             feat_col, 
             target_col,
             split, # tuple of numpy array for split index
             model_func, # function to get pytorch model
             n_epoch = 4, # number of epochs
             bs = 32, # batch size
             lr = 1e-2, # will be useless if lr_find is True
             loss = mse, # loss function
             save = None, # models/{save}.pth
             sampler = None, # optional sampler, applied to the training loader only
             lr_find=False, # if true, will use lr from lr_find
            ):
    """A DL trainer.

    Trains the model returned by `model_func()` on the rows `split[0]` of `df`
    with the one-cycle policy and evaluates it on the rows `split[1]`.

    Returns `(target, pred)`: two dataframes indexed like the validation rows
    with `target_col` as columns.
    """
    
    train = df.loc[split[0]]
    valid = df.loc[split[1]]
    
    train_ds = GeneralDataset(train, feat_col, target_col)
    valid_ds = GeneralDataset(valid, feat_col, target_col)
    
    n_workers = fc.defaults.cpus

    if sampler is not None:
        
        # NOTE(review): the sampler must yield positions inside `train_ds`
        # (0 .. len(train)-1), so it should be built from the training rows only.
        train_loader = DataLoader(train_ds, batch_size=bs, sampler=sampler,num_workers=n_workers)
        
        # The validation loader is never resampled: predictions must come back
        # once per row and in row order, because they are labelled with
        # `valid.index` below.
        valid_loader = DataLoader(valid_ds, batch_size=bs, shuffle=False, num_workers=n_workers)
        
        dls = DataLoaders(train_loader, valid_loader)
        
    else:
        
        dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, num_workers=n_workers)
    
    model = model_func()
    
    learn = Learner(dls.to(def_device), model.to(def_device), loss, 
                    metrics= [PearsonCorrCoef(),SpearmanCorrCoef()],
                    cbs = [GradientClip(1.0)] # clip gradients at 1.0 to avoid exploding updates
                   )
    
    if lr_find:
        # lr_find returns a namedtuple of suggestions, not a number:
        # take the `valley` suggestion, or the first entry if it has no such field
        suggested = learn.lr_find()
        plt.show()
        plt.close()
        print(suggested)
        lr = getattr(suggested, 'valley', suggested[0])

        
    print('lr in training is', lr)
    learn.fit_one_cycle(n_epoch,lr) #cbs = [SaveModelCallback(fname = 'best')] # save best model
    
    if save is not None:
        learn.save(save)
        
    # predictions and targets of the validation set
    pred,target = learn.get_preds()
    
    pred = pd.DataFrame(pred.detach().cpu().numpy(),index=valid.index,columns=target_col)
    target = pd.DataFrame(target.detach().cpu().numpy(),index=valid.index,columns=target_col)
    
    return target, pred

In [51]:
def get_model():
    "Create a new `CNN1D_2` sized by the notebook-level `n_feature` and `n_target`."
    net = CNN1D_2(n_feature, n_target)
    return net

In [52]:
target, pred = train_dl(df, 
                        feat_col, 
                        target_col,
                        split0, 
                        get_model,
                        n_epoch=1,
                        lr = 1e-2,
                        save = 'test')

lr in training is 0.01


epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,2.209235,1.388019,0.023253,-0.001965,00:06


In [53]:
score_each(target,pred)

overall MSE: 1.3880
Average Pearson: 0.0757 


(1.3880196,
 0.07574083746828988,
       Pearson
 3    0.003423
 8    0.000009
 10  -0.000267
 19   0.015093
 24  -0.008538
 ..        ...
 359  0.064130
 361 -0.056392
 366 -0.141792
 367 -0.089278
 373  0.063749
 
 [78 rows x 1 columns])

## DL CV

In [55]:
#| export
@fc.delegates(train_dl)
def train_dl_cv(df, 
                feat_col, 
                target_col, 
                splits, # list of tuples
                model_func, # zero-argument function returning a new model, e.g. lambda: MLP_1(num_feat, num_target)
                save:str=None,
                **kwargs
                ):
    "Run `train_dl` on every split; return the out-of-fold predictions and a dataframe of metrics per fold."
    
    fold_preds, fold_scores = [], []
    
    for fold,split in enumerate(splits):

        print(f'------fold{fold}------')
        
        # each fold is saved under its own name when `save` is given
        fname = None if save is None else f'{save}_fold{fold}'
        
        # train on this fold and predict its validation rows
        target, pred = train_dl(df,feat_col,target_col, split, model_func ,save=fname,**kwargs)

        # score the fold (the third value returned by score_each is not used)
        fold_mse, fold_pearson, _ = score_each(target,pred)
        
        fold_scores.append(dict(fold=fold, mse=fold_mse, pearson_avg=fold_pearson))
        fold_preds.append(pred)
        

    # stack the predictions of all folds and order them by row index
    oof = pd.concat(fold_preds).sort_index()
    
    # one row of metrics per fold
    metrics = pd.DataFrame(fold_scores)
    
    return oof, metrics

In [56]:
def get_model():
    "Create a new `CNN1D_2`; re-declares the same factory defined earlier in this notebook."
    net = CNN1D_2(n_feature, n_target)
    return net

In [57]:
oof,metrics = train_dl_cv(df,feat_col,target_col,splits,get_model,n_epoch=1,lr=3e-3)

------fold0------
lr in training is 0.003


epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.153321,0.997725,0.076154,0.036538,00:12


overall MSE: 0.9977
Average Pearson: 0.0941 
------fold1------
lr in training is 0.003


epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.154311,0.984537,0.1284,0.078118,00:10


overall MSE: 0.9845
Average Pearson: 0.1442 
------fold2------
lr in training is 0.003


epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.14012,0.994797,0.102739,0.064886,00:12


overall MSE: 0.9948
Average Pearson: 0.1789 
------fold3------
lr in training is 0.003


epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.165069,1.00232,0.061936,0.036632,00:04


overall MSE: 1.0023
Average Pearson: 0.1577 
------fold4------
lr in training is 0.003


epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.185752,0.971004,0.172185,0.113681,00:09


overall MSE: 0.9710
Average Pearson: 0.1760 


In [58]:
metrics

Unnamed: 0,fold,mse,pearson_avg
0,0,0.997725,0.094141
1,1,0.984537,0.144184
2,2,0.994797,0.178877
3,3,1.00232,0.157654
4,4,0.971004,0.176018


In [59]:
metrics.pearson_avg.mean()

0.15017477604007748

In [60]:
target = df[target_col]
_,_,corr = score_each(target,oof)

overall MSE: 0.9901
Average Pearson: 0.1502 


In [61]:
corr

Unnamed: 0,Pearson
0,-0.252464
1,0.012856
2,0.027174
3,0.028486
4,-0.229108
...,...
385,0.223907
386,0.194346
387,0.033649
388,0.032004


## DL Predict

In [65]:
#| export
def predict_dl(df, 
               feat_col, 
               target_col,
               model, # model architecture
               model_pth, # only name, not with .pth
              ):
    
    """Predict dataframe given a deep learning model

    Loads the weights saved under `model_pth` into `model`, runs it on the
    `feat_col` columns of `df` and returns the predictions as a dataframe
    indexed like `df` with `target_col` as columns.
    """
    
    # no target columns: the dataset yields features only
    test_dset = GeneralDataset(df,feat_col)
    test_dl = DataLoader(test_dset,bs=512)
    
    
    # a Learner without data, used only to load the saved weights
    learn = Learner(None, model.to(def_device), loss_func=1)
    learn.load(model_pth)
    
    learn.model.eval()
    
    preds = []
    # inference only: no gradients are needed
    with torch.no_grad():
        for data in test_dl:
            # same device as the model (the original hard-coded CUDA)
            inputs = data.to(def_device)
            outputs = learn.model(inputs) #learn.model(x).sigmoid().detach().cpu().numpy()

            preds.append(outputs.detach().cpu().numpy())

    preds = np.concatenate(preds)
    preds = pd.DataFrame(preds,index=df.index,columns=target_col)

    return preds

In [66]:
test = df.loc[split0[1]]

In [None]:
pred = predict_dl(test.head(3),
                  feat_col,
                  target_col, 
                  model,'test')
pred

In [None]:
score_each(test[target_col],pred)

In [1]:
#| hide
import nbdev; nbdev.nbdev_export()