In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import random
from matplotlib.ticker import MultipleLocator
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master/')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the five competition CSV files in one pass; the order of the paths
# matches the order of the names on the left-hand side.
(
    train_features,
    train_targets_scored,
    train_targets_nonscored,
    test_features,
    sample_submission,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/lish-moa/train_features.csv",
        "/kaggle/input/lish-moa/train_targets_scored.csv",
        "/kaggle/input/lish-moa/train_targets_nonscored.csv",
        "/kaggle/input/lish-moa/test_features.csv",
        "/kaggle/input/lish-moa/sample_submission.csv",
    )
)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Exports ``PYTHONHASHSEED``, seeds Python's ``random`` module, NumPy's
    global generator and PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), and turns on cuDNN deterministic mode.

    Args:
        seed: Integer seed handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

In [None]:
def add_stats(df, columns, prefix):
    """Append row-wise summary statistics of ``columns`` to ``df`` in place.

    Five columns are added, in this order: ``<prefix>_mean``, ``<prefix>_sum``,
    ``<prefix>_std``, ``<prefix>_kurt`` and ``<prefix>_skew``.

    Args:
        df: DataFrame that receives the new columns (mutated in place).
        columns: DataFrame whose rows are aggregated along ``axis=1``.
        prefix: String prepended to every new column name.

    Returns:
        None.
    """
    aggregations = (
        ('mean', columns.mean),
        ('sum', columns.sum),
        ('std', columns.std),
        ('kurt', columns.kurtosis),
        ('skew', columns.skew),
    )
    # Each statistic is computed only when its turn comes, one per iteration.
    for suffix, aggregate in aggregations:
        df[prefix + '_' + suffix] = aggregate(axis=1)

In [None]:
# Join features with the scored targets, drop the `ctl_vehicle` rows and the
# identifier / treatment-type columns.
train_full = train_features.merge(train_targets_scored, on='sig_id')
train_full = train_full[train_full.cp_type != 'ctl_vehicle']
train_full = train_full.drop(columns=['sig_id', 'cp_type'])
# Pin the dummy dtype: recent pandas versions return bool dummies, which would
# turn `X_train.values` into an object array that torch.tensor() cannot convert.
train_full['cp_dose'] = pd.get_dummies(train_full['cp_dose'], drop_first=True, dtype=np.uint8)

# The first 874 columns are the model inputs, everything after is a target.
# `.copy()` makes X_train an independent frame, so that add_stats() below does
# not write new columns into a slice of `train_full`.
X_train = train_full.iloc[:, :874].copy()
y_train = train_full.iloc[:, 874:]

# Same preprocessing for the test features (the boolean filter and drop()
# already return new frames, so `test_features` itself is left untouched).
X_test = test_features
X_test = X_test[X_test.cp_type != 'ctl_vehicle']
X_test = X_test.drop(columns=['sig_id', 'cp_type'])
X_test['cp_dose'] = pd.get_dummies(X_test['cp_dose'], drop_first=True, dtype=np.uint8)

# Row-wise statistics over the "g" block (columns 2..773), the "c" block
# (columns 774..873) and both blocks together. The positional slices are taken
# before the call, so the columns appended by earlier calls are never included.
add_stats(X_train, X_train.iloc[:, 2:774], 'g')
add_stats(X_train, X_train.iloc[:, 774:874], 'c')
add_stats(X_train, X_train.iloc[:, 2:874], 'gc')

add_stats(X_test, X_test.iloc[:, 2:774], 'g')
add_stats(X_test, X_test.iloc[:, 774:874], 'c')
add_stats(X_test, X_test.iloc[:, 2:874], 'gc')

# Train rows first, test rows second; later cells split X_full back by row count.
X_full = pd.concat([X_train, X_test])

In [None]:
def pca_analyse(columns, num_comp, scaler=None):
    """Return the cumulative explained-variance ratio of a PCA fit.

    Args:
        columns: Feature matrix to decompose.
        num_comp: Passed to ``PCA`` as ``n_components``.
        scaler: Optional transformer exposing ``fit_transform``; when given,
            ``columns`` is transformed with it before the PCA is fitted.

    Returns:
        ``np.cumsum`` of the fitted PCA's ``explained_variance_ratio_``.
    """
    data = columns if scaler is None else scaler.fit_transform(columns)
    fitted_pca = PCA(n_components=num_comp).fit(data)
    return np.cumsum(fitted_pca.explained_variance_ratio_)

PCA for g- features

In [None]:
# Cumulative explained variance of a 700-component PCA on the "g" block
# (columns 2..773) under three preprocessing variants.
g_block = X_full.iloc[:, 2:774]
pca_nonscaled = pca_analyse(g_block, 700)
pca_stand = pca_analyse(g_block, 700, scaler=StandardScaler())
pca_quant = pca_analyse(g_block, 700, scaler=QuantileTransformer(output_distribution="normal"))

In [None]:
# Compare the three cumulative explained-variance curves on a single axes.
fig, ax = plt.subplots(figsize=(10, 10))

for curve, curve_label in (
    (pca_nonscaled, 'nonscaled'),
    (pca_stand, 'stand'),
    (pca_quant, 'quant'),
):
    ax.plot(curve, label=curve_label)

# Horizontal grid line every 0.05 of explained variance.
ax.yaxis.set_major_locator(MultipleLocator(0.05))
ax.grid(which='major')
ax.set(
    xlabel="Число компонент",
    ylabel="Доля объясненной дисперсии",
    title="g- признаки",
)
ax.legend()

Будем использовать QuantileTransformer с num_comp=600, чтобы доля объясненной дисперсии равнялась 0.95

PCA for c- features

In [None]:
# Cumulative explained variance of a 100-component PCA on the "c" block
# (columns 774..873) under three preprocessing variants.
c_block = X_full.iloc[:, 774:874]
pca_nonscaled = pca_analyse(c_block, 100)
pca_stand = pca_analyse(c_block, 100, scaler=StandardScaler())
pca_quant = pca_analyse(c_block, 100, scaler=QuantileTransformer(output_distribution="normal"))

In [None]:
# Compare the three cumulative explained-variance curves on a single axes.
fig, ax = plt.subplots(figsize=(10, 10))

for curve, curve_label in (
    (pca_nonscaled, 'nonscaled'),
    (pca_stand, 'stand'),
    (pca_quant, 'quant'),
):
    ax.plot(curve, label=curve_label)

# Horizontal grid line every 0.05 of explained variance.
ax.yaxis.set_major_locator(MultipleLocator(0.05))
ax.grid(which='major')
ax.set(
    xlabel="Число компонент",
    ylabel="Доля объясненной дисперсии",
    title="с- признаки",
)
ax.legend()

Будем использовать QuantileTransformer с num_comp=80, чтобы доля объясненной дисперсии равнялась 0.95

In [None]:
def add_pca(df, columns, num_comp, prefix, scaler):
    """Return ``df`` with PCA components of ``columns`` appended as new columns.

    Args:
        df: DataFrame to extend. It is not modified; its index is reset
            (``drop=True``) in the returned frame.
        columns: Feature matrix the PCA is fitted on; it must have the same
            number of rows as ``df`` for the column-wise concat to line up.
        num_comp: Passed to ``PCA`` as ``n_components``.
        prefix: String prefix; new columns are named ``<prefix>_pca_<i>``.
        scaler: Transformer exposing ``fit_transform`` that is applied to
            ``columns`` before the PCA, or ``None`` to skip scaling.

    Returns:
        A new DataFrame: ``df`` (with a fresh 0..n-1 index) followed by one
        column per PCA component.
    """
    if scaler is not None:
        columns = scaler.fit_transform(columns)
    comps = PCA(n_components=num_comp).fit_transform(columns)
    # Name the columns from the number of components actually produced, so
    # that non-integer `num_comp` values accepted by PCA work as well.
    component_names = [prefix + f"_pca_{i}" for i in range(comps.shape[1])]
    # A freshly built frame already carries a 0..n-1 index, matching the
    # reset index of `df` below.
    pca_df = pd.DataFrame(comps, columns=component_names)
    return pd.concat((df.reset_index(drop=True), pca_df), axis=1)
    

In [None]:
def variance_threshold(df, columns, thr=0.85):
    """Drop from ``df`` every column of ``columns`` whose variance is below ``thr``.

    Args:
        df: DataFrame to filter; a new frame is returned.
        columns: DataFrame with the candidate columns; their variances are
            taken from a fitted ``VarianceThreshold``.
        thr: Variance cut-off. Columns with variance strictly below it are dropped.

    Returns:
        ``df`` without the low-variance columns.
    """
    selector = VarianceThreshold(threshold=thr).fit(columns)
    is_low_variance = selector.variances_ < thr
    return df.drop(columns=columns.columns[is_low_variance])

In [None]:
# NOTE(review): this rebuilds the unfiltered merge and nothing visible in the
# notebook reads `train_full` afterwards; kept in case other code relies on it.
train_full = train_features.merge(train_targets_scored, on='sig_id')

# Remember the row counts before X_train / X_test are rebound below.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]

# Append 600 PCA components of the "g" block and 80 of the "c" block, each
# fitted on quantile-transformed data. The new columns are appended at the
# end, so the positional slices of the original blocks stay valid.
X_full = add_pca(X_full, X_full.iloc[:, 2:774], 600, 'g',
                 QuantileTransformer(output_distribution="normal"))
X_full = add_pca(X_full, X_full.iloc[:, 774:874], 80, 'c',
                 QuantileTransformer(output_distribution="normal"))

# Drop low-variance columns; `cp_time` and `cp_dose` are never candidates.
X_full = variance_threshold(X_full, X_full.drop(columns=['cp_time', 'cp_dose']), thr=0.9)

# Split back by position: train rows come first in X_full. Slicing from the
# train row count (rather than `[-n_test_rows:]`) stays correct even when
# there are zero test rows.
X_train = X_full.iloc[:n_train_rows]
X_test = X_full.iloc[n_train_rows:]
assert X_test.shape[0] == n_test_rows, "X_full is not train rows followed by test rows"

In [None]:
# Display the (rows, columns) shape of the final training feature matrix.
X_train.shape

In [None]:
class MoaDataset(Dataset):
    """Map-style dataset that serves rows of ``X`` (and ``y``) as float tensors.

    Args:
        X: Indexable feature container with a ``.shape`` attribute
            (the notebook passes 2-D NumPy arrays).
        y: Optional target container indexed the same way as ``X``.
            When omitted, items are feature tensors only.
        device: Optional device identifier stored on the instance as
            ``self.device``. The dataset itself never moves tensors; callers
            do that. It is an explicit argument so that building a dataset
            does not depend on a module-level ``device`` variable existing.
    """

    def __init__(self, X, y=None, device=None):
        self.X = X
        self.y = y
        self.device = device

    def __getitem__(self, idx):
        """Return ``X[idx]`` as a float tensor, paired with ``y[idx]`` when targets exist."""
        features = torch.tensor(self.X[idx], dtype=torch.float)
        if self.y is None:
            return features

        return features, torch.tensor(self.y[idx], dtype=torch.float)

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        return self.X.shape[0]
        

In [None]:
class Model(nn.Module):
    """Three-layer fully connected network that outputs raw logits.

    Each stage is batch norm -> (dropout) -> weight-normalised linear layer;
    the first two stages end with a leaky ReLU, the last returns logits.

    Args:
        num_features: Width of the input vectors.
        num_targets: Number of output logits.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super().__init__()
        # Stage 1: input normalisation and projection to the hidden width.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        # Stage 2: hidden -> hidden.
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.2)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        # Stage 3: hidden -> logits.
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.2)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        """Map a batch of feature vectors to a batch of logits (no sigmoid applied)."""
        hidden = F.leaky_relu(self.dense1(self.batch_norm1(x)))
        hidden = F.leaky_relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        return self.dense3(self.dropout3(self.batch_norm3(hidden)))

In [None]:
def valid_epoch(model, criterion, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` and return the sample-weighted mean loss.

    The model is put into eval mode and gradients are disabled. Each batch loss
    is weighted by its batch size, so a smaller batch does not skew the average.

    Args:
        model: Callable module mapping a feature batch to predictions.
        criterion: Loss callable ``criterion(outputs, targets)`` whose result
            exposes ``.item()``.
        dataloader: Iterable of ``(features, targets)`` tensor pairs.
        device: Device both tensors are moved to before the forward pass.

    Returns:
        Sum of ``batch_loss * batch_size`` divided by the number of samples.
    """
    model.eval()

    loss_sum = 0.
    sample_count = 0
    with torch.no_grad():
        for features, targets in dataloader:
            features, targets = features.to(device), targets.to(device)
            batch_size = features.shape[0]
            batch_loss = criterion(model(features), targets).item()
            loss_sum += batch_loss * batch_size
            sample_count += batch_size

    return loss_sum / sample_count

def train_epoch(model, criterion, optimizer, scheduler, train_dataloader, valid_dataloader, device):
    """Train ``model`` for one epoch, then evaluate it on the validation data.

    Args:
        model: Module being trained; switched to train mode for the pass.
        criterion: Loss callable ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler, also stepped once per batch.
        train_dataloader: Iterable of ``(features, targets)`` training batches.
        valid_dataloader: Iterable of ``(features, targets)`` validation batches.
        device: Device the batches are moved to.

    Returns:
        ``(train_loss, val_loss)``: the sample-weighted mean training loss of
        this epoch and the validation loss computed by ``valid_epoch``.
    """
    model.train()
    total_loss = 0.
    total_num = 0
    for X, y in train_dataloader:
        X = X.to(device)
        y = y.to(device)
        optimizer.zero_grad()

        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Weight by batch size so a smaller batch does not skew the epoch mean.
        total_loss += loss.detach().item() * X.shape[0]
        total_num += X.shape[0]

    train_loss = total_loss / total_num

    # Use the loader that was passed in; the previous code silently read a
    # module-level `val_dataloader` and ignored this parameter.
    val_loss = valid_epoch(model, criterion, valid_dataloader, device)
    return train_loss, val_loss
        
def predict(model, dataset, device, batch_size=1024):
    """Return sigmoid probabilities of ``model`` for every element of ``dataset``.

    Elements are stacked into batches of up to ``batch_size`` instead of being
    pushed through the network one at a time. The model runs in eval mode with
    gradients disabled.

    Args:
        model: Module producing logits for a batch of inputs.
        dataset: Iterable of equally shaped input tensors (one per sample).
        device: Device the batches are moved to for the forward pass.
        batch_size: Maximum number of samples per forward pass (>= 1).

    Returns:
        NumPy array with the per-sample outputs concatenated along axis 0, in
        dataset order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or (from
            ``np.concatenate``) if ``dataset`` is empty.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model.eval()
    result = []

    def _run_batch(samples):
        # Stacking adds the leading batch axis that a per-sample unsqueeze(0) would add.
        inputs = torch.stack(samples).to(device)
        result.append(torch.sigmoid(model(inputs)).cpu().numpy())

    with torch.no_grad():
        pending = []
        for elem in dataset:
            pending.append(elem)
            if len(pending) == batch_size:
                _run_batch(pending)
                pending = []
        if pending:
            # Last, possibly smaller, batch.
            _run_batch(pending)

    return np.concatenate(result)

In [None]:
# Fall back to the CPU when no GPU is visible instead of failing on `.to('cuda')`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# One epoch count shared by the OneCycleLR schedule and the training loop,
# so the two cannot drift apart.
num_epochs = 25

train_loss_list = []  # one list of per-epoch train losses per (seed, fold)
val_loss_list = []    # one list of per-epoch validation losses per (seed, fold)
pred_list = []        # one test-set prediction array per (seed, fold)
test_dataset = MoaDataset(X_test.values)
num_features = X_train.shape[1]
num_targets = y_train.shape[1]

criterion = torch.nn.BCEWithLogitsLoss()
seed_list = np.arange(7)

# Train one model per (seed, fold) pair; every model contributes one set of
# test predictions to the ensemble.
for seed in seed_list:
    seed_everything(seed)
    mskf = MultilabelStratifiedKFold(n_splits=7, shuffle=True, random_state=seed)
    for fold_ind, (train_idx, val_idx) in enumerate(mskf.split(X_train, y_train)):
        curr_train_loss_list = []
        curr_val_loss_list = []

        train_dataset = MoaDataset(X_train.values[train_idx], y_train.values[train_idx])
        train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=1)

        # NOTE(review): shuffling is unnecessary for validation (the loss is a
        # sample-weighted mean); kept so the random stream stays unchanged.
        val_dataset = MoaDataset(X_train.values[val_idx], y_train.values[val_idx])
        val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=True, num_workers=1)

        model = Model(num_features, num_targets, 1024).to(device)
        optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
        # The scheduler is stepped once per batch inside train_epoch, hence
        # `steps_per_epoch=len(train_dataloader)`.
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                                  max_lr=1e-2, epochs=num_epochs,
                                                  steps_per_epoch=len(train_dataloader))

        for epoch_num in range(num_epochs):
            train_loss, val_loss = train_epoch(model, criterion, optimizer, scheduler,
                                               train_dataloader, val_dataloader, device)

            curr_train_loss_list.append(train_loss)
            curr_val_loss_list.append(val_loss)

            print('seed: ', seed, ' fold: ', fold_ind, ' epoch_num: ', epoch_num, ' train loss: ', train_loss)
            print('seed: ', seed, ' fold: ', fold_ind, ' epoch_num: ', epoch_num, ' val loss: ', val_loss)

        train_loss_list.append(curr_train_loss_list)
        val_loss_list.append(curr_val_loss_list)

        pred_list.append(predict(model, test_dataset, device))
        
    
    
    

In [None]:
# Average the last-epoch loss of every trained model (one entry per seed/fold).
final_train_losses = [fold_losses[-1] for fold_losses in train_loss_list]
final_val_losses = [fold_losses[-1] for fold_losses in val_loss_list]

print("curr_train_loss: ", sum(final_train_losses) / len(train_loss_list))
print("curr_val_loss: ", sum(final_val_losses) / len(val_loss_list))

In [None]:
# Mean prediction over all trained models.
ensemble_mean = sum(pred_list) / len(pred_list)

# Predictions exist only for rows that are not `ctl_vehicle`; every other row
# of the submission keeps its initial value of zero.
is_treated = test_features.cp_type != 'ctl_vehicle'
y_test = np.zeros((sample_submission.shape[0], y_train.shape[1]))
y_test[is_treated] = ensemble_mean


In [None]:
# One column per scored target, named after the training target columns
# (column 0 of `train_targets_scored` is skipped: the code treats it as the id).
submission = pd.DataFrame(y_test, columns=train_targets_scored.columns[1:])
# Put `sig_id` first so the output uses the same column layout as the targets
# file, rather than appending the identifier after all target columns.
submission.insert(0, 'sig_id', test_features['sig_id'].to_numpy())

In [None]:
# Write the submission to the current working directory, without the DataFrame index.
submission.to_csv('submission.csv', index=False)