In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path below the input directory, then print one per line
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train/test feature tables
X_test = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
X_train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')

# Load the scored targets and drop the sig_id column so only target columns remain
targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
y_train = targets_scored.drop(columns=['sig_id'])

In [None]:
def encode_dummies(df):
    '''
    Encode the categorical experiment columns as numeric features.

    df: dataframe containing the columns 'sig_id', 'cp_type', 'cp_dose' and 'cp_time'

    return: a NEW dataframe in which
        - 'cp_time' is divided by 24
        - 'treatment' is 1 where cp_type == 'trt_cp' and 0 otherwise
        - 'D1' is 1 where cp_dose == 'D1' and 0 otherwise
        - 'sig_id', 'cp_type' and 'cp_dose' are removed

    The input dataframe is not modified, so calling the function again on the
    same raw frame gives the same result.
    '''
    # assign() works on a copy; writing df['...'] = ... directly would add the
    # columns to (and rescale cp_time in) the caller's frame.
    encoded = df.assign(
        treatment=1 * (df['cp_type'] == 'trt_cp'),
        D1=1 * (df['cp_dose'] == 'D1'),
        cp_time=df['cp_time'] / 24,
    )
    return encoded.drop(columns=['sig_id', 'cp_type', 'cp_dose'])

In [None]:
# Apply the same encoding to the train and test feature tables
X_train, X_test = encode_dummies(X_train), encode_dummies(X_test)

# Column names of the gene (g-0 .. g-771) and cell (c-0 .. c-99) features
gene_list = ['g-%d' % i for i in range(772)]
cell_list = ['c-%d' % i for i in range(100)]

# Raw numpy views of the two feature groups in the training table
gene_exp = X_train[gene_list].to_numpy()
cell_via = X_train[cell_list].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.utils import resample
from joblib import dump, load
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
def pca_features(df, var_list, n):
    '''
    Fit a PCA on the selected columns and project those same rows.

    df: dataframe holding the columns to decompose
    var_list: names of the columns handed to the PCA
    n: value forwarded to PCA as n_components

    return: tuple (fitted PCA model, component scores of df[var_list])
    '''
    model = PCA(n_components=n)
    selected = df[var_list]
    scores = model.fit_transform(selected)
    return model, scores

def df_to_array(df, var_list, gene, cell):
    '''
    Swap the columns in var_list for two precomputed feature blocks.

    df: dataframe
    var_list: columns removed from df before conversion
    gene: array placed to the right of the remaining df columns
    cell: array placed to the right of gene

    return: numpy array laid out as [remaining df columns | gene | cell]
    '''
    remaining = df.drop(columns=var_list).values
    # remaining is always 2-D, so a column-wise concatenate matches hstack
    return np.concatenate([remaining, gene, cell], axis=1)

def pca_transform(X_train_2, X_train_val, n_gene_components=50, n_cell_components=10):
    '''
    Run PCA on training data and transform both train and test data according to obtained components

    X_train_2: dataframe, training data; both PCAs are fitted on this frame only
    X_train_val: dataframe, test/validation data; projected with the fitted PCAs
    n_gene_components: number of components kept for the g-0 .. g-771 columns
    n_cell_components: number of components kept for the c-0 .. c-99 columns

    return: (X_train, X_val) as numpy arrays; in each, the gene and cell columns
            are replaced by their principal components, appended after the
            remaining columns (gene components first, then cell components)
    '''
    gene_list = ['g-'+str(i) for i in range(772)]
    cell_list = ['c-'+str(i) for i in range(100)]
    raw_columns = gene_list + cell_list

    # Fit on the training frame only, so the second frame never influences the components
    pca_gene, gene_features = pca_features(X_train_2, gene_list, n_gene_components)
    pca_cell, cell_features = pca_features(X_train_2, cell_list, n_cell_components)

    train_array = df_to_array(X_train_2, raw_columns, gene_features, cell_features)
    val_array = df_to_array(
        X_train_val,
        raw_columns,
        pca_gene.transform(X_train_val[gene_list]),
        pca_cell.transform(X_train_val[cell_list]),
    )

    return train_array, val_array

def upsample(X, y, i, size=1000, random_state=None):
    '''
    Upsample the positive responses in the ith column of the y matrix.

    X: numpy array of features, one row per sample
    y: 2-D numpy array of 0/1 responses, rows aligned with X
    i: index of the response column to balance
    size: number of positive rows drawn (with replacement); default 1000
    random_state: optional seed for a reproducible draw; when None the global
                  numpy random state is used

    return: (X_upsampled, y_upsampled) where all negative rows come first,
            followed by the `size` resampled positive rows; y_upsampled is the
            matching 1-D response vector

    raises: ValueError if column i contains no positive response
    '''
    target = y[:, i]
    # indices of positive / negative responses in the ith column
    idx1 = np.flatnonzero(target == 1)
    idx0 = np.flatnonzero(target == 0)

    if idx1.size == 0:
        raise ValueError('column %d has no positive response to upsample' % i)

    # sample with replacement
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    up_idx = rng.choice(idx1, size=size, replace=True)

    X_upsampled = np.vstack((X[idx0], X[up_idx]))
    y_upsampled = np.concatenate((target[idx0], target[up_idx]))

    return X_upsampled, y_upsampled

In [None]:


# #predict probability
# gbc = GradientBoostingClassifier(learning_rate=0.01, max_features=40, n_estimators=180)
# train_loss = []
# y_test_pred = []
# for i in range(y_train_2.shape[1]):
#     y_train_one = y_train_2[:,i]

#     X_train_upsampled, y_train_upsampled = upsample(X_train_2, y_train_2, i)

#     gbc.fit(X_train_upsampled, y_train_upsampled)
#     y_train_pred = gbc.predict_proba(X_train_2)
#     y_pred = gbc.predict_proba(X_test_2)[:,1]

#     train_loss.append(log_loss(y_train_one, y_train_pred, labels=[0,1]))
#     y_test_pred.append(y_pred)

# y_pred = np.array(y_test_pred)
# y_pred = y_pred.T

# #convert to dataframe
# col = y_train.columns
# y_pred_df = pd.DataFrame(y_pred, columns = col)
# sig_id = pd.read_csv('test_features.csv')[['sig_id']]
# y_pred_df = sig_id.join(y_pred_df)

# #write to output
# y_pred_df.to_csv('submission.csv')

# Training with Random Forest

In [None]:
X_train_2, X_test_2 = pca_transform(X_train, X_test)
y_train_2 = y_train.values

# Fixed seed so the fitted forest (and therefore the submission) is repeatable
rf = RandomForestClassifier(n_estimators=250, max_depth=30, random_state=47)
rf.fit(X_train_2, y_train_2)

# With a 2-D target, predict_proba returns one (n_samples, n_classes) array per target column
y_pred_prob = rf.predict_proba(X_test_2)

# Keep the probability of class 1 for every target. The column is looked up via
# rf.classes_ because a target with a single class in the training data yields a
# one-column array, for which a blind [:, 1] would raise IndexError.
y_pred = []
for target_classes, target_prob in zip(rf.classes_, y_pred_prob):
    positive = np.flatnonzero(np.asarray(target_classes) == 1)
    if positive.size:
        y_pred.append(target_prob[:, positive[0]])
    else:
        # class 1 never seen for this target: predicted probability is 0
        y_pred.append(np.zeros(target_prob.shape[0]))

# shape (n_test_samples, n_targets)
y_pred_rf = np.array(y_pred).T


In [None]:
# #convert to dataframe
# col = y_train.columns
# y_pred_df = pd.DataFrame(y_pred, columns = col)
# sig_id = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')[['sig_id']]
# y_pred_df = sig_id.join(y_pred_df)

# #write to output
# y_pred_df.to_csv('submission.csv', index=False)

# Training with Neural Network

In [None]:
import torch
from torch import nn, optim
import torch.nn.functional as F 
import torch.utils.data as data

In [None]:
# Show the encoded training features as a plain numpy array
X_train.to_numpy()

In [None]:
# Wrap features and targets as tensors and serve them in shuffled batches of 256
features_tensor = torch.from_numpy(X_train.values)
targets_tensor = torch.from_numpy(y_train.values)
train_data = data.TensorDataset(features_tensor, targets_tensor)
trainloader = data.DataLoader(train_data, shuffle=True, batch_size=256)

# Test features are kept as one tensor (no loader)
test_data = torch.from_numpy(X_test.values)

In [None]:
class Net(nn.Module):
    '''
    Fully connected classifier: n_features -> 500 -> 400 -> 300 -> n_targets.

    Each hidden layer is followed by ReLU and dropout; the output layer is
    followed by a sigmoid, so forward() returns one value in [0, 1] per target.
    '''

    def __init__(self, n_features, drop_rate, n_targets=206):
        '''
        n_features: width of the input vectors
        drop_rate: dropout probability applied after every hidden layer
        n_targets: number of output units (default 206)
        '''
        super(Net, self).__init__()
        # Stored on the module so forward() does not depend on a notebook global
        self.drop_rate = drop_rate
        self.linear1 = nn.Linear(n_features, 500)
        self.linear2 = nn.Linear(500, 400)
        self.linear3 = nn.Linear(400, 300)
        self.linear4 = nn.Linear(300, n_targets)

    def forward(self, x):
        '''
        x: float tensor of shape (batch, n_features)

        return: tensor of shape (batch, n_targets) with sigmoid outputs
        '''
        for hidden in (self.linear1, self.linear2, self.linear3):
            x = F.relu(hidden(x))
            # F.dropout defaults to training=True; tie it to the module mode so
            # that net.eval() really switches dropout off at inference time.
            x = F.dropout(x, p=self.drop_rate, training=self.training)

        return torch.sigmoid(self.linear4(x))

In [None]:
drop_rate = 0.2
N_EPOCHS = 15

# Seed torch's global generator so weight initialisation, batch shuffling and
# dropout masks are repeatable across Restart & Run All
torch.manual_seed(47)

net = Net(X_train.shape[1], drop_rate)

criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.0008, weight_decay=0.00005)

# Make the training mode explicit (a later cell switches the net to eval mode)
net.train()

for epoch in range(N_EPOCHS):

    # losses of the individual batches of this epoch
    train_loss = []

    for batch in trainloader:

        optimizer.zero_grad()

        # features/targets are stored as float64/int64 tensors; cast for the network and BCELoss
        y_pred = net(batch[0].float())
        loss = criterion(y_pred, batch[1].float())
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

    # Report the mean batch loss after every epoch, not only after the last one
    print("epoch", epoch + 1, "training loss:", np.mean(train_loss))

In [None]:
# Generate predictions on test features
# Inference only: switch to evaluation mode and skip autograd bookkeeping,
# since no gradients are needed for the forward pass on the test set.
net.eval()
with torch.no_grad():
    y_pred = net(test_data.float())
y_pred_nn = y_pred.numpy()


In [None]:
# Ensemble: unweighted mean of the random-forest and neural-network predictions
y_pred = np.add(y_pred_rf, y_pred_nn) / 2

In [None]:

# Build the submission table: sig_id followed by one prediction column per target
sig_id = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')[['sig_id']]
col = y_train.columns
predictions_frame = pd.DataFrame(y_pred, columns=col)
y_pred_df = sig_id.join(predictions_frame)

# Save without the row index
y_pred_df.to_csv('submission.csv', index=False)