import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('ticks')
sns.set_context("poster")
sns.set_palette('colorblind')
import warnings
warnings.filterwarnings('ignore')
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import nn
import torch.nn.functional as F
import torchvision
import time
from sklearn.model_selection import train_test_split
import random
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

## 0. Data Exploration

In [None]:
# Default figure size (width, height) applied to every plot drawn below.
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [None]:
# Load the competition tables: the feature rows for the train and test sets
# and the scored target labels that belong to the training rows.
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
train_targets = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')

In [None]:
# Keep only the non-control samples of the training set.
train_mask = train_features['cp_type'] != 'ctl_vehicle'
# Single .loc call (rows + column) instead of chained indexing.
train_sig_ids = train_features.loc[train_mask, 'sig_id']
# Explicit copy: new columns are added to `train` later on, and writing into
# a slice of `train_features` triggers pandas' SettingWithCopy behaviour.
train = train_features.loc[train_mask].copy()

In [None]:
# Keep only the non-control samples of the test set.
test_mask = test_features['cp_type'] != 'ctl_vehicle'
# Single .loc call (rows + column) instead of chained indexing.
test_sig_ids = test_features.loc[test_mask, 'sig_id']
# Explicit copy: new columns are added to `test` later on, and writing into
# a slice of `test_features` triggers pandas' SettingWithCopy behaviour.
test = test_features.loc[test_mask].copy()

In [None]:
# Names of the two feature families, selected by their column-name prefix.
g_features = [name for name in train.columns if name.startswith('g-')]
c_features = [name for name in train.columns if name.startswith('c-')]

In [None]:
# Keep only the target rows whose sig_id survived the control-sample filter,
# so that targets and `train` describe the same set of samples.
train_targets = train_targets[train_targets['sig_id'].isin(train_sig_ids)]

## 1. Data preprocessing

In [None]:
def make_pca_features(n_comp, train, test, feature_list, name, normalize=False, scaler=None):
    """Append PCA components of ``feature_list`` as extra columns.

    The PCA is fitted on ``train[feature_list]`` only and then applied to
    ``test[feature_list]``. The optional scaler is likewise fitted on the
    train components and re-used for the test components.

    Args:
        n_comp: Number of principal components to compute and append.
        train: Training DataFrame containing the columns in ``feature_list``.
        test: Test DataFrame containing the columns in ``feature_list``.
        feature_list: Column names the PCA is computed from.
        name: Prefix of the new columns, which are named ``'{name}_{i}'``.
        normalize: Whether to rescale the components with ``scaler``.
        scaler: Object exposing ``fit_transform`` / ``transform``. Scaling is
            applied only when ``normalize`` is true AND a scaler is given;
            ``normalize=True`` without a scaler leaves the components as-is.

    Returns:
        ``(train, test)``: copies of the inputs with the ``n_comp`` new
        columns appended. The frames passed in are left unmodified.
    """
    pca = PCA(n_comp)
    train_pca = pca.fit_transform(train[feature_list])
    test_pca = pca.transform(test[feature_list])

    if normalize and scaler is not None:
        train_pca = scaler.fit_transform(train_pca)
        test_pca = scaler.transform(test_pca)

    # Work on copies so the caller's frames (which may themselves be slices
    # of a larger frame) are not silently modified as a side effect.
    train, test = train.copy(), test.copy()
    for i in range(n_comp):
        column = '{0}_{1}'.format(name, i)
        train[column] = train_pca[:, i]
        test[column] = test_pca[:, i]

    return train, test

In [None]:
def preprocess(data):
    """Return a copy of ``data`` with ``cp_time`` and ``cp_dose`` integer-encoded.

    ``cp_time`` values 24 / 48 / 72 become 0 / 1 / 2 and ``cp_dose`` values
    'D1' / 'D2' become 0 / 1. Any other value maps to NaN.

    The input frame is not modified. Encoding in place would make a second
    call on the same frame (e.g. re-running a notebook cell) turn both
    columns into NaN, because the already-encoded values are not mapping keys.

    Args:
        data: DataFrame with ``cp_time`` and ``cp_dose`` columns.

    Returns:
        A new DataFrame with the same columns, in the same order.
    """
    return data.assign(
        cp_time=data['cp_time'].map({24: 0, 48: 1, 72: 2}),
        cp_dose=data['cp_dose'].map({'D1': 0, 'D2': 1}),
    )

In [None]:
# Add 3 standardised PCA components computed from the 'g-' columns.
train_w_pca, test_w_pca = make_pca_features(3, train, test, g_features, 'g_pca', normalize=True, scaler=StandardScaler())

In [None]:
# Add 2 standardised PCA components computed from the 'c-' columns, on top of
# the frames that already carry the 'g_pca' components.
train_w_pca, test_w_pca = make_pca_features(2, train_w_pca, test_w_pca, c_features, 'c_pca', normalize=True, scaler=StandardScaler())

In [None]:
# Encode the categorical columns, drop the identifier columns, and hold out
# 20% of the samples (features and targets split together) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    preprocess(train_w_pca.drop(columns=['sig_id', 'cp_type'])),
    train_targets.drop(columns=['sig_id']),
    test_size=0.2,
)

## 2. Model definition

In [None]:
class TabDataset:
    """Map-style dataset pairing each feature row with its target row.

    ``X`` and ``y`` are converted to float32 arrays once, at construction.
    Indexing a DataFrame row by row with ``.iloc`` on every ``__getitem__``
    call is far slower than indexing an array, and the loader calls
    ``__getitem__`` once per sample per epoch. Changes made to ``X`` / ``y``
    after construction are therefore not reflected in the items.

    Args:
        X: 2-D table of features (DataFrame or array-like), one row per sample.
        y: 2-D table of targets (DataFrame or array-like), one row per sample.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y
        # C-contiguous so that taking one row is a cheap, contiguous read.
        self._features = np.ascontiguousarray(X, dtype=np.float32)
        self._targets = np.ascontiguousarray(y, dtype=np.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        # .copy() gives each returned tensor its own memory, so in-place
        # operations on a sample cannot corrupt the cached arrays.
        X_i = torch.from_numpy(self._features[i].copy())
        y_i = torch.from_numpy(self._targets[i].copy())

        return X_i, y_i

In [None]:
class TabDatasetTest:
    """Map-style dataset over feature rows only (no targets).

    ``X`` is converted to a float32 array once, at construction. Indexing a
    DataFrame row by row with ``.iloc`` on every ``__getitem__`` call is far
    slower than indexing an array. Changes made to ``X`` after construction
    are therefore not reflected in the items.

    Args:
        X: 2-D table of features (DataFrame or array-like), one row per sample.
    """

    def __init__(self, X):
        self.X = X
        # C-contiguous so that taking one row is a cheap, contiguous read.
        self._features = np.ascontiguousarray(X, dtype=np.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        # .copy() gives each returned tensor its own memory, so in-place
        # operations on a sample cannot corrupt the cached array.
        X_i = torch.from_numpy(self._features[i].copy())
        return X_i

In [None]:
# Wrap the training and validation splits as (features, targets) datasets.
train_ds = TabDataset(X_train, y_train)
valid_ds = TabDataset(X_val, y_val)

In [None]:
# Test features go through the same column drop and categorical encoding as
# the training features; there are no targets for them.
test_ds = TabDatasetTest(preprocess(test_w_pca.drop(columns = ['sig_id', 'cp_type'])))

In [None]:
# Sanity check: shape of the feature tensor of the first test sample.
test_ds[0].shape

In [None]:
# Sanity check: dtype of the target tensor of the first training sample.
train_ds[0][1].dtype

In [None]:
# Reshuffle the training samples every epoch; without shuffle=True the
# optimiser sees exactly the same batches, in the same order, in every epoch.
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=8)
# Validation and test loaders keep the dataset order. The prediction rows are
# later matched to sig_ids by position, so the test loader must not shuffle.
valid_dl = DataLoader(valid_ds, batch_size=16, num_workers=8)
test_dl = DataLoader(test_ds)

In [None]:
def lin_block(in_size, out_size):
    """Build one hidden block of the network.

    The block applies, in order: batch normalisation over ``in_size``
    features, dropout with p=0.2, and a weight-normalised linear layer
    mapping ``in_size`` to ``out_size`` features. No activation is included.

    Returns:
        An ``nn.Sequential`` holding the three layers.
    """
    layers = [
        nn.BatchNorm1d(in_size),
        nn.Dropout(0.2),
        nn.utils.weight_norm(nn.Linear(in_size, out_size)),
    ]
    return nn.Sequential(*layers)

In [None]:
class Model(nn.Module):
    """Feed-forward network that outputs one raw logit per target.

    Layout: an input stage (batch norm + weight-normalised linear layer),
    then ``num_blocks`` hidden blocks built by ``lin_block``, then a plain
    linear output layer. ReLU follows the input stage and every hidden
    block. No sigmoid is applied to the output.

    Args:
        in_size: Number of input features.
        hidden_size: Width of every hidden layer.
        out_size: Number of outputs.
        num_blocks: Number of hidden blocks after the input stage.
    """

    def __init__(self, in_size, hidden_size, out_size, num_blocks):
        super().__init__()
        self.num_blocks = num_blocks

        input_layers = [
            nn.BatchNorm1d(in_size),
            nn.utils.weight_norm(nn.Linear(in_size, hidden_size)),
        ]
        self.dense0 = nn.Sequential(*input_layers)

        self.dense_blocks = nn.ModuleList(
            lin_block(hidden_size, hidden_size) for _ in range(self.num_blocks)
        )

        self.final = nn.Linear(hidden_size, out_size)

    def forward(self, x):
        hidden = F.relu(self.dense0(x))
        for block in self.dense_blocks:
            hidden = F.relu(block(hidden))
        return self.final(hidden)

In [None]:
def _training_pass(train_dl, model, loss_func, optimizer):
    """Run one optimisation pass over ``train_dl``; return the summed batch losses."""
    total = 0.0
    model.train()
    for inputs, labels in train_dl:
        inputs = inputs.to(device)
        labels = labels.to(device)

        batch_loss = loss_func(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total += batch_loss.item()
    return total


def _validation_pass(valid_dl, model, loss_func):
    """Evaluate ``model`` on ``valid_dl`` without gradients; return the summed batch losses."""
    total = 0.0
    model.eval()
    with torch.no_grad():
        for inputs, labels in valid_dl:
            inputs = inputs.to(device)
            labels = labels.to(device)
            total += loss_func(model(inputs), labels).item()
    return total


def fit(epochs, train_dl, valid_dl, model, loss_func, score_func, optimizer, scheduler):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Batches are moved to the module-level ``device`` before use. After each
    epoch the mean validation loss is passed to ``scheduler.step`` and a
    progress line is printed.

    Args:
        epochs: Number of passes over ``train_dl``.
        train_dl: Iterable of ``(inputs, labels)`` training batches.
        valid_dl: Iterable of ``(inputs, labels)`` validation batches.
        model: Network to optimise.
        loss_func: Callable ``loss_func(outputs, labels)`` returning a scalar tensor.
        score_func: Accepted for interface compatibility; currently unused.
        optimizer: Optimiser updating ``model``'s parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.

    Returns:
        ``(losses, val_losses)``: per-epoch mean batch loss on the training
        and validation loaders.
    """
    losses, val_losses = [], []

    for epoch in range(epochs):
        t0 = time.time()

        train_loss = _training_pass(train_dl, model, loss_func, optimizer)
        valid_loss = _validation_pass(valid_dl, model, loss_func)

        # Average over the number of batches, not the number of samples.
        train_loss /= len(train_dl)
        valid_loss /= len(valid_dl)

        scheduler.step(valid_loss)

        print(f'[{epoch + 1}, {time.time() - t0:.1f}] train loss: {train_loss}, val loss: {valid_loss}')
        losses.append(train_loss)
        val_losses.append(valid_loss)

    return losses, val_losses

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Display the selected device.
device

In [None]:
# Input and output sizes are read from the data rather than hard-coded, so
# the model keeps matching the tables when the feature engineering changes
# (for example a different number of PCA components).
model = Model(X_train.shape[1], 1024, y_train.shape[1], 2)
model.to(device)

In [None]:
# Column-wise sum of the training targets, i.e. the number of positive
# samples per target (assuming 0/1 labels; verify against the data).
weights = y_train.sum(axis=0)

In [None]:
# Display the per-target sums.
weights

In [None]:
# Per-target positive-class weight: number of training samples divided by the
# per-target sum. A target with no positive sample in this random split would
# divide by zero and give an infinite weight (and then a NaN loss), so the
# divisor is clipped to at least 1. Cast to float32 so the loss stays in the
# same precision as the model outputs.
w = torch.from_numpy((y_train.shape[0] / weights.clip(lower=1).values).astype(np.float32))

In [None]:
# Sanity check: one weight per target column is expected.
w.shape

In [None]:
# Binary cross-entropy on raw logits, with the per-target positive weights.
loss = nn.BCEWithLogitsLoss(pos_weight=w.to(device))
# Adam with an initial learning rate of 1e-2.
optimizer = torch.optim.Adam(model.parameters(), 1e-2)
# Lowers the learning rate when the value passed to scheduler.step() (the
# validation loss, inside fit) stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True)

### 1. No cross validation

In [None]:
# Train for 20 epochs on the single train/validation split. `score_func` is
# passed as None because fit does not use it.
losses_train, losses_val = fit(20, train_dl, valid_dl, model, loss, None, optimizer, scheduler=scheduler)

The model overfits when the dropout layers are removed.

In [None]:
# Learning curves: mean training and validation loss per epoch. When only the
# y values are given, matplotlib uses 0..N-1 as the x axis.
plt.plot(losses_train, label='train')
plt.plot(losses_val, label='val')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend();

### 2. Multilabel cross validation

In [None]:
def inference_fn(model, dataloader, device):
    """Predict probabilities for every batch yielded by ``dataloader``.

    The model is switched to eval mode, each batch is moved to ``device``
    and run without gradient tracking, and a sigmoid is applied to the
    outputs.

    Args:
        model: Network returning raw logits.
        dataloader: Iterable of input tensors (no labels).
        device: Device the batches are moved to before the forward pass.

    Returns:
        A NumPy array with the per-batch predictions concatenated along the
        first axis, in the order the batches were yielded.
    """
    model.eval()

    def _predict(batch):
        # Only the forward pass needs to run under no_grad.
        with torch.no_grad():
            logits = model(batch.to(device))
        return logits.sigmoid().detach().cpu().numpy()

    return np.concatenate([_predict(batch) for batch in dataloader])

In [None]:
# Predicted probabilities for the (non-control) test samples, in loader order.
preds = inference_fn(model, test_dl, device)

In [None]:
# One column per target; every column of train_targets except the first is
# used as a name (the first column is assumed to be 'sig_id' -- verify).
results = pd.DataFrame(preds, columns=train_targets.columns[1:])

# Column assignment aligns on the index, so the ids are re-indexed to 0..N-1
# to line up with the freshly built `results` frame.
test_sig_ids.reset_index(drop=True, inplace=True)
results['sig_id'] = test_sig_ids

In [None]:
# The sample submission supplies the full list of sig_ids in the expected order.
sample_subs = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
# Left-join the predictions onto every expected sig_id. Ids with no
# prediction (the control samples filtered out earlier) get 0 for all targets.
submission = sample_subs[['sig_id']].merge(results, on='sig_id', how='left').fillna(0)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)