In [None]:
!pip install iterative-stratification

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
def _list_dir(self):
    """Return the entries of this directory as a list."""
    return list(self.iterdir())


# Notebook convenience: expose the helper as `Path(...).ls()`.
Path.ls = _list_dir

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
from torch import nn

In [None]:
from sklearn.preprocessing import LabelEncoder

class Processor():
    """One-hot encodes categorical columns and remembers the categories it saw.

    The categories found the first time a column is processed are stored in
    ``encoder_dict`` and reused on every later call, so all frames passed
    through the same instance end up with the same indicator columns.
    """

    def __init__(self):
        # Maps column name -> list of category values seen on the first call.
        self.encoder_dict = {}

    def process_df(self, df_, cols):
        """Return a copy of ``df_`` with each column in ``cols`` one-hot encoded.

        Parameters
        ----------
        df_ : pandas.DataFrame
            Input frame; it is copied and never modified.
        cols : iterable of str
            Names of the categorical columns to encode.

        Returns
        -------
        pandas.DataFrame
            The copy with every column in ``cols`` removed and, for each of
            them, one ``{col}_{category}`` indicator column appended.

        Notes
        -----
        On later calls, values that were absent the first time a column was
        processed produce an all-zero indicator row for that column.
        """
        # Categorical Columns
        df = df_.copy()
        for col in cols:
            if col in self.encoder_dict:
                # Re-use the stored categories so the output columns match the
                # ones produced on the first call, even if this frame holds
                # only a subset of the values.
                known = pd.CategoricalDtype(categories=self.encoder_dict[col])
                ohe = pd.get_dummies(df[col].astype(known))
            else:
                ohe = pd.get_dummies(df[col])
                self.encoder_dict[col] = list(ohe.columns)
            ohe_cols = [f'{col}_{x}' for x in self.encoder_dict[col]]
            ohe.columns = ohe_cols
            df[ohe_cols] = ohe
            df = df.drop(col, axis=1)

        return df

In [None]:
# Number of cross-validation folds.
FOLDS = 5
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Directory the CSV files are read from (a Kaggle-style input path).
path = Path('/kaggle/input/lish-moa/')
# `ls` is the helper attached to `Path` in the imports cell; it lists the
# entries of the directory.
path.ls()

In [None]:
# Read the four CSV tables from `path`, in the order the names are listed.
(train_features,
 train_targets_scored,
 train_targets_nonscored,
 test_features) = (
    pd.read_csv(path / csv_name)
    for csv_name in (
        'train_features.csv',
        'train_targets_scored.csv',
        'train_targets_nonscored.csv',
        'test_features.csv',
    )
)

In [None]:
# Select the label columns by name so that neither `sig_id` nor the `folds`
# bookkeeping column is handed to the stratifier as if it were a target.
# Doing it by name also keeps this cell safe to re-run.
fold_label_cols = [
    c for c in train_targets_scored.columns if c not in ('sig_id', 'folds')
]

# -1 marks rows that have not been assigned to a fold yet.
train_targets_scored['folds'] = -1

# Assumes `train_features` and `train_targets_scored` are row-aligned, since
# the positional indices from the split are used as labels in `.loc` below.
cv = MultilabelStratifiedKFold(n_splits=FOLDS)
fold_labels = train_targets_scored[fold_label_cols]
for i, (trn_, val_) in enumerate(cv.split(train_features, fold_labels)):
    # Each row's fold id is the split in which it is part of the validation set.
    train_targets_scored.loc[val_, 'folds'] = i

In [None]:
# Encoder instance used to one-hot encode the categorical feature columns.
cat_proc = Processor()

In [None]:
# One-hot encode the three categorical columns of the feature table.
feat_df = cat_proc.process_df(
    train_features,
    ['cp_type', 'cp_time', 'cp_dose'],
)
# Left-join the scored targets (which also carry the `folds` column) onto the
# encoded features by sample id.
train_df = feat_df.merge(train_targets_scored, how='left', on='sig_id')

In [None]:
# Preview the encoded features; `.head()` keeps the cell output small.
feat_df.head()

In [None]:
class Dataset:
    """Map-style dataset that serves one row's features and targets as float tensors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature and the target columns.
    features, targets : sequence of column labels
        Columns to use as model inputs and as model outputs respectively.
    """

    def __init__(self,df,features, targets):
        self.features = df.loc[:, features]
        self.targets = df.loc[:, targets]
        # Convert to float32 arrays once, so `__getitem__` does not have to
        # build a tensor from a (possibly mixed-dtype) pandas row on every
        # access.
        self._x = self.features.to_numpy(dtype=np.float32)
        self._y = self.targets.to_numpy(dtype=np.float32)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``{'x': features, 'y': targets}`` for the row at position ``idx``."""
        # `torch.tensor` copies, so the returned tensors never alias the cache.
        return {
            'x': torch.tensor(self._x[idx], dtype=torch.float),
            'y': torch.tensor(self._y[idx], dtype=torch.float)
        }

In [None]:
# Model inputs: every encoded feature column except the sample id.
feat_cols = feat_df.columns.drop('sig_id')
# Model outputs: every scored-target column except the sample id and fold id.
target_cols = train_targets_scored.columns.drop(['sig_id', 'folds'])

In [None]:
# Preview the merged features + targets; `.head()` keeps the cell output small.
train_df.head()

In [None]:
# Continuous inputs are the `g-*` and `c-*` columns. Match on the prefix
# rather than on a substring, so a column whose name merely contains "c-" or
# "g-" somewhere else is not picked up for scaling.
# NOTE(review): assumes every continuous feature name starts with one of these
# prefixes - confirm against the feature table.
cont_cols = [c for c in train_df.columns if c.startswith(('g-', 'c-'))]

In [None]:
# Dataset over the whole merged training frame; the cross-validation loop
# further down builds its own per-fold datasets.
ds = Dataset(train_df, feat_cols, target_cols)

In [None]:
class Model(nn.Module):
    """Fully connected network: four hidden blocks followed by an output block.

    Every hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout with
    ``hidden_size`` units; the output block is Linear -> BatchNorm1d and
    yields ``f_out`` values per sample.
    """

    def __init__(self, f_in, f_out, dropout=0, hidden_size=256):
        super().__init__()
        layers = []
        width_in = f_in
        # Modules are created in the same order as they are applied, so the
        # `nn.Sequential` indices run 0..17.
        for _ in range(4):
            layers += [
                nn.Linear(width_in, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(p=dropout),
            ]
            width_in = hidden_size
        layers += [
            nn.Linear(hidden_size, f_out),
            nn.BatchNorm1d(f_out),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the raw outputs."""
        return self.model(x)

In [None]:
from tqdm.notebook import tqdm

In [None]:
class Engine:
    """Bundles a model with its optimizer, scheduler, loss and device, and runs epochs."""

    def __init__(self, model, optimizer,scheduler, criterion, device):
        self.model, self.optimizer = model, optimizer
        self.criterion, self.scheduler = criterion, scheduler
        self.device = device

    def step(self, batch):
        """Move one ``{'x', 'y'}`` batch to the device and return its loss tensor."""
        inputs, labels = (batch[key].to(self.device) for key in ('x', 'y'))
        return self.criterion(self.model(inputs), labels)

    def train_loop(self, dataloader):
        """Train for one pass over ``dataloader``; return the per-batch losses.

        The scheduler, when one is set, is stepped after every optimizer step.
        Each returned loss is a detached NumPy value.
        """
        self.model.train()
        losses = []
        for batch in tqdm(dataloader, total=len(dataloader)):
            self.optimizer.zero_grad()
            batch_loss = self.step(batch)
            batch_loss.backward()
            self.optimizer.step()
            if self.scheduler:
                self.scheduler.step()
            losses.append(batch_loss.detach().cpu().numpy())
        return losses

    def eval_loop(self, dataloader):
        """Evaluate one pass over ``dataloader`` without gradients; return per-batch losses."""
        with torch.no_grad():
            self.model.eval()
            return [
                self.step(batch).detach().cpu().numpy()
                for batch in tqdm(dataloader, total=len(dataloader))
            ]
    
    
            

In [None]:
# Number of training epochs run for each fold.
EPOCHS = 10

In [None]:
from IPython.display import clear_output

In [None]:
class Recorder:
    """Keeps parallel histories of training and validation losses."""

    def __init__(self):
        self.train_loss, self.val_loss = [], []

    def update(self, train_loss, val_loss):
        """Append one new value to each of the two histories."""
        for history, value in (
            (self.train_loss, train_loss),
            (self.val_loss, val_loss),
        ):
            history.append(value)

In [None]:
# Per-fold results, keyed by fold index: loss histories and trained models.
recorders = {}
models = {}

In [None]:
from matplotlib import pyplot as plt

In [None]:
import tabulate

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
for i in range(FOLDS):
    # Hold out fold `i` for validation and train on all remaining folds.
    train = train_df.loc[train_df['folds'] != i].reset_index(drop=True)
    val = train_df.loc[train_df['folds'] == i].reset_index(drop=True)

    # Fit the scaler on the training split only, then apply the same
    # transform to the held-out split.
    scaler = MinMaxScaler()
    train[cont_cols] = scaler.fit_transform(train[cont_cols])
    val[cont_cols] = scaler.transform(val[cont_cols])

    train_ds = Dataset(train, feat_cols, target_cols)
    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=1024, shuffle=True, num_workers=4)

    # Validation must read the held-out rows; the loss does not depend on
    # row order, so there is no need to shuffle.
    val_ds = Dataset(val, feat_cols, target_cols)
    val_dl = torch.utils.data.DataLoader(val_ds, batch_size=1024*4, shuffle=False, num_workers=4)

    # Size the network from the column lists instead of hard-coded widths.
    model = Model(f_in=len(feat_cols), f_out=len(target_cols))
    model.to(DEVICE)

    # OneCycleLR is stepped once per batch by the engine, so it needs the
    # total number of optimizer steps for this fold.
    total_steps = len(train_dl) * EPOCHS

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-1, total_steps=total_steps)
    criterion = nn.BCEWithLogitsLoss()
    engine = Engine(model, optimizer, scheduler, criterion, DEVICE)
    recorders[i] = Recorder()

    for epoch in range(EPOCHS):
        train_loss = engine.train_loop(train_dl)
        val_loss = engine.eval_loop(val_dl)
        # Epoch loss is the unweighted mean of the per-batch losses.
        train_loss = np.stack(train_loss).mean()
        val_loss = np.stack(val_loss).mean()
        recorders[i].update(train_loss, val_loss)
        clear_output()
        print('Fold :: ',i)
        print('Epoch :: ',epoch)
        print("Train Loss :: ",recorders[i].train_loss[-1])
        print("Valid Loss :: ",recorders[i].val_loss[-1])

    # One panel per fold; panels of folds not trained yet stay empty.
    # `squeeze=False` keeps `ax` two-dimensional even when FOLDS == 1.
    fig, ax = plt.subplots(FOLDS, 1, constrained_layout=True, figsize=(10, 30), squeeze=False)
    for j in range(i+1):
        panel = ax[j, 0]
        panel.plot(recorders[j].train_loss, label='train_loss')
        panel.plot(recorders[j].val_loss, label='val_loss')
        panel.set_title(f'Train Loss: {recorders[j].train_loss[-1]} val loss: {recorders[j].val_loss[-1]}')
        panel.legend()
    plt.show()

    # Keep the trained model on the CPU so it can be saved later.
    models[i] = model.cpu()
        

In [None]:
# Write each sample's fold id to `folds.csv`. With pandas' default
# `index=True`, the frame's index is written as an extra unnamed first column.
train_targets_scored[['sig_id','folds']].to_csv('folds.csv')

In [None]:
# Write one weights file per fold (the state dict only, not the module object).
for i in range(FOLDS):
    fold_weights = models[i].state_dict()
    torch.save(fold_weights, f'model_fold_{i}.pth')