**Inspiration:** this notebook - https://www.kaggle.com/damoonshahhosseini/aggregated-neural-networks

and this one - https://www.kaggle.com/haoweiiil/moa-random-forest-with-pca-and-neural-net/notebook

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Feature tables for the training and test splits.
X_train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
X_test = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

# Scored targets; the identifier column is dropped so only label columns remain.
scored_targets = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
y_train = scored_targets.drop(columns=['sig_id'])

In [None]:
def encode_dummies(df):
    """Return a numerically encoded copy of a feature table.

    The input frame is left untouched, so the function can safely be called
    more than once on the same frame (the original in-place version rescaled
    ``cp_time`` again on every call).

    Args:
        df: DataFrame with at least the columns ``sig_id``, ``cp_type``,
            ``cp_dose`` and ``cp_time``.

    Returns:
        A new DataFrame where
        * ``treatment`` is 1 when ``cp_type == 'trt_cp'`` and 0 otherwise,
        * ``D1`` is 1 when ``cp_dose == 'D1'`` and 0 otherwise,
        * ``cp_time`` is divided by 24,
        * ``sig_id``, ``cp_type`` and ``cp_dose`` are removed.
    """
    # ``assign`` works on a copy: new columns are appended in keyword order and
    # ``cp_time`` is replaced in its existing position, matching the column
    # layout produced by the previous in-place implementation.
    encoded = df.assign(
        treatment=1 * (df.cp_type == 'trt_cp'),
        D1=1 * (df.cp_dose == 'D1'),
        cp_time=df.cp_time / 24,
    )
    return encoded.drop(columns=['sig_id', 'cp_type', 'cp_dose'])

In [None]:
# Replace the raw feature tables with their numerically encoded versions.
X_train, X_test = encode_dummies(X_train), encode_dummies(X_test)

# Column names 'g-0' .. 'g-771' and the matching values from the training set.
gene_list = ['g-' + str(idx) for idx in range(772)]
gene_exp = X_train[gene_list].to_numpy()

# Column names 'c-0' .. 'c-99' and the matching values from the training set.
cell_list = ['c-' + str(idx) for idx in range(100)]
cell_via = X_train[cell_list].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.utils import resample
from joblib import dump, load
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
def pca_features(df, var_list, n):
    """Fit a PCA on the selected columns and project them.

    Args:
        df: DataFrame holding the columns named in ``var_list``.
        var_list: column names to feed into the PCA.
        n: value passed to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(fitted_pca, projected)`` where ``projected`` is the result of
        ``fit_transform`` on ``df[var_list]``.
    """
    reducer = PCA(n_components=n)
    projected = reducer.fit_transform(df[var_list])
    return reducer, projected

def df_to_array(df, var_list, gene, cell):
    """Drop ``var_list`` columns from ``df`` and append two extra blocks.

    Args:
        df: source DataFrame.
        var_list: column names removed before conversion to an array.
        gene: 2-D array stacked to the right of the remaining columns.
        cell: 2-D array stacked to the right of ``gene``.

    Returns:
        A single array: remaining ``df`` columns, then ``gene``, then ``cell``.
    """
    remaining = df.drop(columns=var_list).to_numpy()
    return np.hstack([remaining, gene, cell])

def pca_transform(X_train_2, X_train_val):
    """Replace the gene and cell columns of two frames by PCA components.

    Both PCAs are fitted on ``X_train_2`` only; ``X_train_val`` is projected
    with the already-fitted models.

    Args:
        X_train_2: DataFrame used to fit the PCAs (and transformed itself).
        X_train_val: DataFrame transformed with the fitted PCAs.

    Returns:
        Tuple of two arrays ``(train_array, val_array)``; each holds the
        non-gene/non-cell columns followed by 50 gene components and 10 cell
        components.
    """
    gene_list = ['g-' + str(idx) for idx in range(772)]
    cell_list = ['c-' + str(idx) for idx in range(100)]
    replaced_cols = gene_list + cell_list

    # Fit on the first frame; gene PCA first, then cell PCA.
    pca_gene, gene_features = pca_features(X_train_2, gene_list, 50)
    pca_cell, cell_features = pca_features(X_train_2, cell_list, 10)

    train_array = df_to_array(X_train_2, replaced_cols, gene_features, cell_features)

    # Project the second frame with the models fitted above.
    val_gene = pca_gene.transform(X_train_val[gene_list])
    val_cell = pca_cell.transform(X_train_val[cell_list])
    val_array = df_to_array(X_train_val, replaced_cols, val_gene, val_cell)

    return train_array, val_array

def upsample(X, y, i, size=1000):
    """Build a resampled data set for target column ``i``.

    All rows whose label in column ``i`` equals 0 are kept once; rows whose
    label equals 1 are drawn with replacement ``size`` times. Rows with any
    other label value are not included.

    Args:
        X: 2-D feature array, indexed by row.
        y: 2-D label array with the same number of rows as ``X``.
        i: index of the label column to balance.
        size: number of positive rows to draw (default 1000, the value that
            was previously hard-coded).

    Returns:
        Tuple ``(X_upsampled, y_upsampled)``: the selected feature rows and
        the matching 1-D label vector for column ``i``. Negatives come first,
        followed by the drawn positives.
    """
    labels = y[:, i]
    neg_idx = np.where(labels == 0)[0]
    pos_idx = np.where(labels == 1)[0]

    if pos_idx.size:
        drawn_idx = np.random.choice(pos_idx, replace=True, size=size)
    else:
        # np.random.choice raises on an empty population; with no positive
        # rows there is nothing to draw, so only the negatives are returned.
        drawn_idx = pos_idx

    selected = np.concatenate((neg_idx, drawn_idx))
    return X[selected], y[selected, i]

In [None]:
# PCA-reduced arrays for the forest; the PCAs are fitted on the training frame.
X_train_2, X_test_2 = pca_transform(X_train, X_test)
y_train_2 = y_train.values

rf = RandomForestClassifier(n_estimators=250, max_depth=30)
rf.fit(X_train_2, y_train_2)

y_pred_prob = rf.predict_proba(X_test_2)

# y_pred_prob is a sequence with one 2-D array per entry; keep column 1 of
# each and transpose so rows are test samples and columns are entries.
y_pred = [per_target[:, 1] for per_target in y_pred_prob]
y_pred_rf = np.array(y_pred).T

In [None]:
import torch
from torch import nn, optim
import torch.nn.functional as F 
import torch.utils.data as data

In [None]:
# Training tensors, paired in a dataset and served in shuffled batches of 256.
train_features = torch.from_numpy(X_train.values)
train_targets = torch.from_numpy(y_train.values)
train_data = data.TensorDataset(train_features, train_targets)
trainloader = data.DataLoader(train_data, shuffle=True, batch_size=256)

# Test features as a single tensor (no batching).
test_data = torch.from_numpy(X_test.values)

In [None]:
class Net(nn.Module):
    """Eight-layer fully connected network with 206 sigmoid outputs.

    Hidden layers use ReLU followed by dropout; the final layer is followed by
    a sigmoid, so every output lies in [0, 1].

    Args:
        n_features: width of the input vectors.
        drop_rate: dropout probability applied after every hidden layer.
    """

    def __init__(self, n_features, drop_rate):
        super(Net, self).__init__()
        # Keep the rate on the instance; forward() previously read a
        # module-level global and silently ignored this argument.
        self.drop_rate = drop_rate
        self.linear1 = nn.Linear(n_features, 200)
        self.linear2 = nn.Linear(200, 400)
        self.linear3 = nn.Linear(400, 600)
        self.linear4 = nn.Linear(600, 800)
        self.linear5 = nn.Linear(800, 1000)
        self.linear6 = nn.Linear(1000, 600)
        self.linear7 = nn.Linear(600, 200)
        self.linear8 = nn.Linear(200, 206)

    def forward(self, x):
        """Return sigmoid activations of shape ``(..., 206)`` for input ``x``."""
        hidden_layers = (
            self.linear1,
            self.linear2,
            self.linear3,
            self.linear4,
            self.linear5,
            self.linear6,
            self.linear7,
        )
        for layer in hidden_layers:
            x = F.relu(layer(x))
            # F.dropout defaults to training=True; tie it to the module mode
            # so that net.eval() really disables dropout at inference time.
            x = F.dropout(x, p=self.drop_rate, training=self.training)

        return torch.sigmoid(self.linear8(x))
    
    
# Model, loss and optimiser setup.
drop_rate = 0.18
net = Net(X_train.shape[1], drop_rate)

criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.0008, weight_decay=1e-5)

for epoch in range(15):
    # Reset per epoch, so after the loop this holds the last epoch's losses.
    train_loss = []

    for features, targets in trainloader:
        optimizer.zero_grad()

        loss = criterion(net(features.float()), targets.float())
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

# Printed once, after all epochs: mean batch loss of the final epoch.
print("training loss:", np.mean(train_loss))

# Switch to evaluation mode and score the whole test set in one pass.
net.eval()
y_pred = net(test_data.float())
y_pred_nn = y_pred.detach().numpy()

In [None]:

# Unweighted average of the random-forest and neural-network predictions.
y_pred = (y_pred_rf + y_pred_nn) / 2

# Convert to submission format: sig_id first, then one column per target.
col = y_train.columns
sig_id = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')[['sig_id']]
y_pred_df = sig_id.join(pd.DataFrame(y_pred, columns=col))

# Write to output.
y_pred_df.to_csv('submission.csv', index=False)

In [None]:
#       self.linear1 = nn.Linear(n_features, 200)
#       self.linear2 = nn.Linear(200, 180)
#       self.linear3 = nn.Linear(180, 150)
#       self.linear4 = nn.Linear(150, 200)
#       self.linear5 = nn.Linear(200, 180)
#       self.linear6 = nn.Linear(180, 150)
#       self.linear7 = nn.Linear(150, 200)
#       self.linear8 = nn.Linear(200, 206)