In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every
# file found underneath it, one path per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os, time

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import make_scorer
from sklearn.preprocessing import OneHotEncoder

from scipy.special import expit, logit

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch import nn
import torch.utils.data as utils

import matplotlib.pyplot as plt


In [None]:
data_root = "/kaggle/input/lish-moa/"


def load_competition_csv(file_name):
    """Read a single competition CSV stored directly under ``data_root``."""
    return pd.read_csv(data_root + file_name)


# Features / scored targets for training, features for the test set, and the
# submission template (read in this order).
train_X = load_competition_csv('train_features.csv')
train_Y = load_competition_csv('train_targets_scored.csv')
test_X = load_competition_csv('test_features.csv')
sample_submission = load_competition_csv('sample_submission.csv')

In [None]:
# cp_type and cp_dose have 2 categories each and cp_time has 3, so
# OneHotEncoder returns 2 + 2 + 3 = 7 indicator columns, grouped feature by
# feature in the order given in `categorical_cols`.
categorical_cols = ['cp_type', 'cp_dose', 'cp_time']
onehot_col_names = [
    'cp_type_0', 'cp_type_1',
    'cp_dose_0', 'cp_dose_1',
    'cp_time_24', 'cp_time_48', 'cp_time_72',
]

# The encoder is left at its default (sparse) output and densified by hand:
# the `sparse=` keyword was renamed to `sparse_output=` in newer scikit-learn
# releases, so neither spelling works on every installed version.
ohe = OneHotEncoder()
ohe.fit(train_X[categorical_cols])

# Guard the hard-coded column names against an unexpected number of categories.
n_encoded_cols = sum(len(categories) for categories in ohe.categories_)
assert n_encoded_cols == len(onehot_col_names), (
    "expected %d one-hot columns, encoder produced %d"
    % (len(onehot_col_names), n_encoded_cols)
)
# print(ohe.categories_)

# Add the 7 indicator columns to the train dataset, then do the same thing to
# the test dataset using the encoder that was fitted on train only.
for frame in (train_X, test_X):
    encoded = ohe.transform(frame[categorical_cols])
    onehotfeat = encoded.toarray() if hasattr(encoded, 'toarray') else np.asarray(encoded)
    for col_idx, col_name in enumerate(onehot_col_names):
        frame[col_name] = onehotfeat[:, col_idx]

# drop the original categorical columns (and the sig_id identifier) from the
# datasets; the new indicator columns therefore end up as the last 7 columns
train_X.drop(['cp_type', 'sig_id', 'cp_dose', 'cp_time'], axis=1, inplace=True)
test_X.drop(['cp_type', 'sig_id', 'cp_dose', 'cp_time'], axis=1, inplace=True)
train_Y.drop(['sig_id'],axis=1,inplace=True)

print(train_X, test_X)
print(train_X.columns, test_X.columns)
print(train_X.shape, train_Y.shape, test_X.shape)


In [None]:
# Only the real-valued measurements ("g-" and "c-" prefixed columns) are
# scaled; the categorical indicator columns are left untouched.
g_cols = list(filter(lambda name: name.startswith('g-'), train_X.columns))
c_cols = list(filter(lambda name: name.startswith('c-'), train_X.columns))
transform_feature_list = [*g_cols, *c_cols]


def scale_and_PCA(pca_num_components, train, test, cols_to_transform, transformed_col_name,
                  random_state=0):
    """Standardize the selected columns and project them with PCA.

    The scaler and the PCA are both fitted on the train and test rows stacked
    together, so the returned train features depend on the test features.

    Args:
        pca_num_components: forwarded to ``PCA`` as its ``n_components``.
        train: DataFrame holding the training rows.
        test: DataFrame holding the test rows.
        cols_to_transform: column labels (present in both frames) to transform.
        transformed_col_name: currently unused; kept so existing calls still work.
        random_state: seed forwarded to ``PCA``. Depending on the data shape
            PCA may choose a randomized solver, in which case an unseeded fit
            gives slightly different components on every run.

    Returns:
        ``(train_trans, test_trans)``: the PCA output split back into the
        rows that came from ``train`` and the rows that came from ``test``.
    """
    n_train = train.shape[0]

    # stack rows from both train and test, restricted to the required columns
    stacked = pd.concat(
        [train[cols_to_transform], test[cols_to_transform]], axis=0
    ).reset_index(drop=True)

    # scale every column to zero mean / unit variance
    scaled = StandardScaler().fit_transform(stacked)

    # PCA (seeded for reproducibility)
    pca = PCA(pca_num_components, random_state=random_state)
    projected = pca.fit_transform(scaled)

    # the first n_train rows belong to train, the remainder to test
    return projected[:n_train, :], projected[n_train:, :]


# Scale + PCA the "g-" columns down to 30 components and the "c-" columns
# down to 10 components.
train_X_g, test_X_g = scale_and_PCA(30, train_X, test_X, g_cols, 'g_pca')
train_X_c, test_X_c = scale_and_PCA(10, train_X, test_X, c_cols, 'c_pca')

# Put the "g-" and "c-" components side by side for each split.
data_train = np.concatenate([train_X_g, train_X_c], axis=1)
data_test = np.concatenate([test_X_g, test_X_c], axis=1)

# Stack the train rows on top of the test rows; n marks where test begins.
n = len(data_train)
data = np.concatenate([data_train, data_test], axis=0)

# The categorical indicator columns are the last 7 columns of train/test
# (after the preprocessing done earlier); stack them in the same row order.
cat_blocks = [frame.iloc[:, -7:].to_numpy() for frame in (train_X, test_X)]
cat_data = np.concatenate(cat_blocks, axis=0)

# Final layout: the 7 categorical columns first, the real-valued ones after.
transformed_data = np.concatenate([cat_data, data], axis=1)

# Separate the stacked matrix back into its train and test parts.
transformed_train_data, transformed_test_data = transformed_data[:n], transformed_data[n:]
transformed_train_targets = train_Y.values

print(transformed_train_data.shape, transformed_test_data.shape)

# Torch the data!

In [None]:
# Mini-batch size used when building the data loaders.
batch_size = 64

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


def average_log_loss(y_true, y_pred):
    """Mean of the per-column binary log losses.

    Args:
        y_true: 2-D array of 0/1 labels, one column per output.
        y_pred: 2-D array of predicted positive-class probabilities, indexed
            the same way as ``y_true``.

    Returns:
        The unweighted average of ``log_loss`` over all output columns.

    Raises:
        ValueError: if ``y_true`` is not 2-D (shape unpacking fails).
        ZeroDivisionError: if ``y_true`` has no columns.
    """
    _, num_outputs = y_true.shape
    column_losses = [
        # labels is given explicitly so that a column containing a single
        # class (e.g. an all-zero target) is scored instead of raising.
        log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
        for i in range(num_outputs)
    ]
    return sum(column_losses) / num_outputs


class NNet(nn.Module):
    """Small fully connected network that outputs one raw logit per target.

    Layer layout: Linear -> ReLU -> Linear -> Dropout -> Linear.

    Args:
        in_features: width of the input vector. The default 47 matches the
            7 one-hot columns + 30 "g-" + 10 "c-" PCA components built
            earlier in this notebook.
        out_features: number of output logits. The default 206 is presumably
            the number of scored target columns -- confirm against train_Y.
        hidden_sizes: pair ``(first, second)`` of hidden layer widths.
        dropout: drop probability of the Dropout layer.
    """

    def __init__(self, in_features=47, out_features=206, hidden_sizes=(512, 256), dropout=0.3):
        super().__init__()
        first_hidden, second_hidden = hidden_sizes
        # NOTE(review): there is no activation between the second Linear and
        # the Dropout layer, so the last two Linear layers are separated only
        # by dropout -- confirm this is intended.
        self.mlp = nn.Sequential(
            nn.Linear(in_features, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.Dropout(dropout),
            nn.Linear(second_hidden, out_features))

    def forward(self, x):
        """Return the raw (pre-sigmoid) logits for a batch ``x``."""
        return self.mlp(x)


# Binary cross-entropy on raw logits (sigmoid applied inside the loss),
# averaged over every element of the batch.
loss_fn = nn.BCEWithLogitsLoss(reduction="mean")

In [None]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        # print(data.shape, target.shape)
        optimizer.zero_grad()
        output = model(data.float())
        # print(output.shape, output)
        target = target.type_as(output)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:  # Print loss every 100 batch
            print('Train Epoch: {}\tLoss: {:.6f}'.format(
                epoch, loss.item()))
    test(model, device, train_loader)
    return None

def test(model, device, test_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data.float())

# NOTE(review): disabled 5-fold cross-validation sketch. It is kept as a bare
# string literal, so nothing inside it is executed; as the final expression of
# its notebook cell the string itself is likely echoed as the cell's output.
'''
kfold = KFold(n_splits=5)

for train_index, val_index in kfold.split(transformed_train_data, train_Y.values):  
    print(train_index, val_index)
'''

In [None]:
n = transformed_train_data.shape[0]

# Transform to torch tensors. Every training row is used for fitting: the
# validation split sketched in the comments below is currently disabled.
tensor_train_x = torch.tensor(transformed_train_data, device=device)
tensor_train_y = torch.tensor(transformed_train_targets, device=device)
tensor_test_x = torch.tensor(transformed_test_data, device=device)
# Disabled validation tensors (first quarter of the training rows):
# tensor_val_x = torch.tensor(transformed_train_data[:n//4, :], device=device)
# tensor_val_y = torch.tensor(transformed_train_targets[:n//4, :], device=device)

# Dataset and loader for the training data. Batches are reshuffled every epoch
# so the optimizer does not see the rows in the same fixed order each time.
train_dataset = utils.TensorDataset(tensor_train_x, tensor_train_y)
train_loader = utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test rows keep their original order; batch_size is given explicitly because
# the DataLoader default would yield a single row per batch.
test_loader = utils.DataLoader(tensor_test_x, batch_size=batch_size, shuffle=False)
# Disabled validation loader:
# val_dataset = utils.TensorDataset(tensor_val_x, tensor_val_y)
# val_loader = utils.DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Training configuration.
SEED = 0
NUM_EPOCHS = 20
LEARNING_RATE = 0.001

# Seed torch before the model is built so that weight initialisation and
# dropout masks are repeatable across runs (GPU kernels may still introduce
# small run-to-run differences).
torch.manual_seed(SEED)

model = NNet().to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    train(model, device, train_loader, optimizer, epoch)
    print(epoch)

In [None]:
# Inference: switch to eval mode explicitly so Dropout is disabled no matter
# which cell ran last, and skip autograd bookkeeping for the forward pass.
model.eval()
with torch.no_grad():
    test_logits = model(tensor_test_x.float())

# The network outputs raw logits; expit (the logistic sigmoid) maps them to
# probabilities.
predictions = expit(test_logits.detach().cpu().numpy())
print(predictions, predictions.shape)

# Every column after the first one in the submission template receives one
# prediction column.
target_columns = sample_submission.columns.to_list()[1:]
assert predictions.shape == (len(sample_submission), len(target_columns)), (
    "prediction shape %s does not match the submission template" % (predictions.shape,)
)
sample_submission[target_columns] = predictions
sample_submission.to_csv('submission.csv',index=False)
# !rm ./submission.csv