In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm
import seaborn as sns
import random
random.seed(123)
pd.set_option('display.max_columns',None)
import os
%config InlineBackend.figure_format = 'svg'
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
# random.seed() above only seeds the stdlib generator. The sampling, splitting
# and training code in this notebook draws from NumPy and PyTorch, so seed both.
np.random.seed(123)
torch.manual_seed(123)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training feature table from the competition input and preview it.
train_features_1= pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
train_features_1.head()

In [None]:
# (rows, columns) of the training feature table.
train_features_1.shape

In [None]:
# Summary statistics for the columns of the training feature table.
train_features_1.describe()

In [None]:
# (rows, columns) of the training feature table.
# NOTE(review): same expression as an earlier cell — looks redundant.
train_features_1.shape

In [None]:
# Number of training rows per distinct cp_type value.
train_features_1.cp_type.value_counts()

In [None]:
# One row of three histograms: cp_type, cp_dose and cp_time in the training set.
plt.figure(figsize=(15,6))
panels = [
    # (column, title, extra keyword arguments for plt.hist)
    ('cp_type', 'Train: Samples treated with compounds', {'color': 'r'}),
    ('cp_dose', 'Train:Doses: Low and High', {}),
    ("cp_time", 'Train:Treatment Duration', {'color': 'g'}),
]
for slot, (column, title, extra) in enumerate(panels):
    ax = plt.subplot2grid((1,3), (0, slot))
    # With data=..., matplotlib resolves the string passed as x to that column.
    plt.hist(x=column, data=train_features_1, alpha=0.85, **extra)
    plt.title(title)
plt.show()

In [None]:
# Split the feature names into the "g-" and "c-" prefixed groups.
feature_names = list(train_features_1.columns)
gene = list(filter(lambda name: name.startswith("g-"), feature_names))
cell = list(filter(lambda name: name.startswith("c-"), feature_names))
print(f"No. of gene features: {len(gene)}")
print(f"No. of cell features: {len(cell)}")

In [None]:
# Histograms of up to 16 randomly chosen "g-" features on a 4x4 grid.
plt.figure(figsize=(16,16))
# replace=False: without it the same feature can be drawn (and plotted) twice.
gene_sel = np.random.choice(len(gene), min(16, len(gene)), replace=False)
for i, col in enumerate(gene_sel):
    plt.subplot(4, 4, i + 1)
    plt.hist(train_features_1.loc[:, gene[col]], bins=100)
    plt.title(gene[col])
plt.tight_layout()  # keep titles from overlapping the panel above
plt.show()

In [None]:
# Histograms of up to 16 randomly chosen "c-" features on a 4x4 grid.
plt.figure(figsize=(16,16))
# replace=False: without it the same feature can be drawn (and plotted) twice.
cell_sel = np.random.choice(len(cell), min(16, len(cell)), replace=False)
for i, col in enumerate(cell_sel):
    plt.subplot(4, 4, i + 1)
    plt.hist(train_features_1.loc[:, cell[col]], bins=100, color='r')
    plt.title(cell[col])
plt.tight_layout()  # keep titles from overlapping the panel above
plt.show()

In [None]:
# Load the scored training targets from the competition input and preview them.
train_targets_scored_1= pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_targets_scored_1.head()

In [None]:
# (rows, columns) of the scored-targets table.
train_targets_scored_1.shape

In [None]:
# Summary statistics for the columns of the scored-targets table.
train_targets_scored_1.describe()

In [None]:
# 80/20 split of row positions. The row count is taken from the loaded table
# instead of a hard-coded number, and random_state makes the split repeatable.
train_id, val_id = train_test_split(
    list(range(len(train_features_1))), test_size=0.2, random_state=123
)

In [None]:
# Sizes of the training and validation position lists.
print(len(train_id))
print(len(val_id))

In [None]:
# Select the split rows by position and drop the first column (the id) from both
# tables. The explicit .copy() makes each frame independent, so the column
# assignments done in later cells do not trigger SettingWithCopyWarning.
train_features = train_features_1.iloc[train_id, 1:].copy()
val_features = train_features_1.iloc[val_id, 1:].copy()

train_labels = train_targets_scored_1.iloc[train_id, 1:].copy()
val_labels = train_targets_scored_1.iloc[val_id, 1:].copy()

# Row counts: features and labels of the same split must agree.
print(len(train_features))
print(len(val_features))
print(len(train_labels))
print(len(val_labels))

In [None]:
# Encode cp_type as an integer in both splits.
# Values missing from the dictionary become NaN under Series.map.
cp_type_dict = dict(trt_cp=0, ctl_vehicle=1)

for split_frame in (train_features, val_features):
    split_frame["cp_type"] = split_frame["cp_type"].map(cp_type_dict)

In [None]:
# Encode cp_dose ("D1"/"D2") as 1/2 in both splits.
cp_dose_dict = dict(D1=1, D2=2)

train_features["cp_dose"] = train_features["cp_dose"].map(cp_dose_dict)
val_features["cp_dose"] = val_features["cp_dose"].map(cp_dose_dict)

In [None]:
# Encode cp_time (24/48/72) as the ordinal codes 1/2/3 in both splits.
cp_time_dict = dict(zip((24, 48, 72), (1, 2, 3)))

train_features["cp_time"], val_features["cp_time"] = (
    train_features["cp_time"].map(cp_time_dict),
    val_features["cp_time"].map(cp_time_dict),
)

In [None]:
# Standard-scale every column except the id and the three cp_* descriptors.
# Despite its name, `targ_columns` lists feature columns of train_features_1.
targ_columns = [
    col for col in train_features_1.columns
    if col not in ('sig_id', 'cp_type', 'cp_dose', 'cp_time')
]

print("Number of target columns are {}".format(len(targ_columns)))

# Fit on the training split only; the same statistics are reused for the
# validation split here and for the test set in a later cell.
std_scal = StandardScaler().fit(train_features[targ_columns].values)
train_targ_columns = std_scal.transform(train_features[targ_columns].values)

# shape[1] is the column count the message announces; len() would report rows.
print("Number of Train target columns are {}".format(train_targ_columns.shape[1]))

val_targ_columns = std_scal.transform(val_features[targ_columns].values)
print("Number of Val target columns are {}".format(val_targ_columns.shape[1]))

# Write the scaled values back into the split frames.
train_features[targ_columns] = train_targ_columns
val_features[targ_columns] = val_targ_columns

In [None]:
# Distribution of the per-row sum over the scaled `targ_columns` features.
# NOTE(review): the title mentions "Scored targets", but the plotted values come
# from train_features rather than train_labels — confirm which was intended.
row_totals = train_features[targ_columns].sum(axis=1)
sns.distplot(row_totals)
plt.title("The Scored targets distribution")
plt.show()

In [None]:
# Names of all label columns; one model is trained per name below.
all_cat = train_labels.columns.tolist()
len(all_cat)

In [None]:
class _ConstantProbaModel:
    """Stand-in classifier for a target whose training labels hold one class.

    LogisticRegression.fit raises ValueError when the labels contain a single
    class, which can happen for rare targets after a random split. This object
    exposes the same two-column ``predict_proba`` layout ([P(0), P(1)]) so the
    scoring and prediction cells work unchanged.
    """

    def __init__(self, positive_proba):
        self.positive_proba = float(positive_proba)

    def predict_proba(self, X):
        """Return an (n_rows, 2) array holding the constant class probabilities."""
        proba = np.empty((len(X), 2))
        proba[:, 0] = 1.0 - self.positive_proba
        proba[:, 1] = self.positive_proba
        return proba


# Fit one independent binary classifier per label column.
model_dict = {}
for cat in tqdm(all_cat):
    cat_labels = train_labels[cat]
    if cat_labels.nunique() < 2:
        # Single class in this split: always predict that class.
        logistic_model = _ConstantProbaModel(cat_labels.iloc[0])
    else:
        logistic_model = LogisticRegression(max_iter=5000)
        logistic_model.fit(train_features, cat_labels)

    # Keep the fitted model under its label name for scoring and prediction.
    model_dict[cat] = logistic_model

In [None]:
def calculate_score(models_dict, val_features, val_labels, all_cat):
    """Return the mean per-label log loss of the fitted models.

    Args:
        models_dict: mapping from label name to a fitted model exposing
            ``predict_proba``.
        val_features: feature matrix passed to every model.
        val_labels: table indexed by label name holding the true 0/1 labels.
        all_cat: iterable of label names to evaluate.

    Returns:
        The unweighted average of the per-label log losses, as a float.
    """
    per_label_losses = [
        # labels=[0, 1] keeps log_loss defined when a label column has one class.
        log_loss(
            val_labels[name],
            models_dict[name].predict_proba(val_features),
            labels=[0, 1],
        )
        for name in tqdm(all_cat)
    ]
    return float(sum(per_label_losses)) / len(per_label_losses)

In [None]:
# Mean per-label log loss of the logistic-regression models on the validation split.
val_score = calculate_score(model_dict, val_features, val_labels, all_cat)
print("Validation score on validation set is {}".format(val_score))

In [None]:
# Load the test feature table, print its shape and preview the first rows.
test_features_1 = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
print(test_features_1.shape)
test_features_1.head()

In [None]:
# Apply the same three encodings that were used on the train/validation splits.
test_encodings = (
    ("cp_type", cp_type_dict),
    ("cp_dose", cp_dose_dict),
    ("cp_time", cp_time_dict),
)
for column, mapping in test_encodings:
    test_features_1[column] = test_features_1[column].map(mapping)

In [None]:
# Scale the test features with the scaler fitted on the training split
# (transform only — nothing is re-fitted on the test data).
test_targ_columns = std_scal.transform(test_features_1[targ_columns].to_numpy())
test_features_1[targ_columns] = test_targ_columns

In [None]:
# Positive-class probability of every label for every test row.
# The model inputs (all columns after the id) do not change between labels, so
# slice them once. The frame is built in one go from a dict rather than by
# inserting 200+ columns one at a time.
test_inputs = test_features_1.iloc[:, 1:]
pred = pd.DataFrame({
    "sig_id": test_features_1.sig_id,
    **{
        cat: model_dict[cat].predict_proba(test_inputs)[:, 1]
        for cat in tqdm(all_cat)
    },
})

In [None]:
# Round only for display. Overwriting `pred` with 1-decimal values would turn
# every probability below 0.05 into exactly 0 before the file is written.
pred.round(1).head()

In [None]:
# (rows, columns) of the prediction table.
pred.shape

In [None]:
# Ids of the test rows whose encoded cp_type is 1 (the code given to
# "ctl_vehicle" by cp_type_dict).
ctl_test = list(test_features_1.loc[test_features_1.cp_type == 1, "sig_id"])
print(len(ctl_test))

# Force every label probability to zero for those rows. A single vectorised
# assignment replaces the per-id loop, which rescanned the frame for each id.
pred.loc[pred.sig_id.isin(ctl_test), all_cat] = 0.0

In [None]:
# Preview the predictions after the control rows were zeroed.
pred.head()

In [None]:
# Write the logistic-regression predictions to submission.csv (no index column).
# NOTE(review): a later cell writes /kaggle/working/submission.csv; if that is
# the working directory, this file is overwritten there — confirm.
pred.to_csv("submission.csv", index=False)

In [None]:
# Reload the three tables from disk, discarding the encoded/scaled versions
# built above, so the second model starts from the unmodified data.
train_features_1 = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
train_targets_scored_1= pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
test_features_1 = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

# Additionally load the non-scored targets, print their shape and preview them.
train_targets_nonscored = pd.read_csv('/kaggle/input/lish-moa/train_targets_nonscored.csv')
print(train_targets_nonscored.shape)
train_targets_nonscored.head()

In [None]:
# Use sig_id as the row index of every table (modified in place).
for table in (
    train_features_1,
    test_features_1,
    train_targets_scored_1,
    train_targets_nonscored,
):
    table.set_index('sig_id', inplace=True)

In [None]:
# Frequency of each row-wise total across the non-scored target columns.
train_targets_nonscored.sum(axis=1).value_counts()

In [None]:
# Recompute the "g-" / "c-" feature-name lists on the reloaded table.
gene, cell = (
    [name for name in train_features_1.columns if name.startswith(prefix)]
    for prefix in ("g-", "c-")
)
print(f"No. of gene features: {len(gene)}")
print(f"No. of cell features: {len(cell)}")

In [None]:
# Show only the scored-target columns whose column sum exceeds 600.
train_targets_scored_1.loc[:, train_targets_scored_1.sum(axis=0) > 600]

In [None]:
# PCA for linear dimensionality reduction of the gene features.
# The projection is fitted on train and test rows stacked together (train
# first), then split back by position. random_state pins the result when the
# randomized SVD solver is selected.
g_pca = PCA(n_components=70, random_state=42).fit_transform(
    pd.concat((train_features_1[gene], test_features_1[gene])).values
)

# Explicit string labels: the default integer labels would collide with the
# labels of the cell PCA frame once both are concatenated onto the features.
g_pca_columns = [f"g_pca_{i}" for i in range(g_pca.shape[1])]
n_train_rows = train_features_1.shape[0]
g_pca_train = pd.DataFrame(g_pca[:n_train_rows], index=train_features_1.index, columns=g_pca_columns)
g_pca_test = pd.DataFrame(g_pca[n_train_rows:], index=test_features_1.index, columns=g_pca_columns)

In [None]:
# PCA of the cell features, fitted on train and test rows stacked together
# (train first) and split back by position. random_state pins the result when
# the randomized SVD solver is selected.
c_pca = PCA(n_components=10, random_state=42).fit_transform(
    pd.concat((train_features_1[cell], test_features_1[cell])).values
)

# Explicit string labels: the default integer labels would collide with the
# labels of the gene PCA frame once both are concatenated onto the features.
c_pca_columns = [f"c_pca_{i}" for i in range(c_pca.shape[1])]
n_train_rows = train_features_1.shape[0]
c_pca_train = pd.DataFrame(c_pca[:n_train_rows], index=train_features_1.index, columns=c_pca_columns)
c_pca_test = pd.DataFrame(c_pca[n_train_rows:], index=test_features_1.index, columns=c_pca_columns)

In [None]:
 # Shapes of the feature table and of the two PCA frames, shown as one tuple.
 train_features_1.shape, c_pca_train.shape, g_pca_train.shape,

In [None]:
# Append the gene and cell PCA components as extra columns (aligned on the index).
train_parts = [train_features_1, g_pca_train, c_pca_train]
test_parts = [test_features_1, g_pca_test, c_pca_test]

train_features_1 = pd.concat(train_parts, axis='columns')
test_features_1 = pd.concat(test_parts, axis='columns')
train_features_1.shape

In [None]:
# Remove the 'ctl_vehicle' rows from the training features and targets, then
# drop the cp_type column itself.
drop_index = train_features_1.index[train_features_1.cp_type == 'ctl_vehicle']
train_features_df = train_features_1.drop(index=drop_index).drop(columns='cp_type')

train_target_df = train_targets_scored_1.drop(index=drop_index)

# Same filtering for the test features (there are no test targets to filter).
drop_index = test_features_1.index[test_features_1.cp_type == 'ctl_vehicle']
test_features_df = test_features_1.drop(index=drop_index).drop(columns='cp_type')

In [None]:
# Convert cp_time / cp_dose into dummy/indicator variables.
# Both frames are first cast to a categorical that uses the training levels.
# get_dummies emits one column per declared level, so train and test end up
# with identical dummy columns even if a level is absent from the test rows.
for cat_col in ['cp_time', 'cp_dose']:
    train_levels = sorted(train_features_df[cat_col].dropna().unique())
    train_features_df[cat_col] = pd.Categorical(train_features_df[cat_col], categories=train_levels)
    test_features_df[cat_col] = pd.Categorical(test_features_df[cat_col], categories=train_levels)

train_features_df = pd.get_dummies(train_features_df, columns=['cp_time', 'cp_dose'], drop_first=True)
test_features_df = pd.get_dummies(test_features_df, columns=['cp_time', 'cp_dose'], drop_first=True)

In [None]:
# Shapes of the encoded train and test feature frames.
train_features_df.shape, test_features_df.shape

In [None]:
# Switch from DataFrames to plain NumPy arrays for scaling and tensor creation.
X_train_all, y_train_all, X_test = (
    frame.to_numpy()
    for frame in (train_features_df, train_target_df, test_features_df)
)

In [None]:
# Standardize: fit the scaler on the training matrix, then apply the same
# transform to both the training and the test matrix.
scaler = StandardScaler().fit(X_train_all)
X_train_all = scaler.transform(X_train_all)
X_test = scaler.transform(X_test)

In [None]:
# Hold out 20% of the rows for validation; random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2, random_state=42)

In [None]:
# Mini-batch size shared by every loader.
BATCH_SIZE = 128

# Full training set, used for the final fit before predicting on the test set.
train_all_dataset = TensorDataset(torch.tensor(X_train_all).float(), torch.tensor(y_train_all).float())
# shuffle=True: without it each epoch replays the same batches in the same order.
train_all_loader = DataLoader(train_all_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Train/validation split, used to monitor the validation loss.
train_dataset = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
val_dataset = TensorDataset(torch.tensor(X_val).float(), torch.tensor(y_val).float())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation order does not affect the averaged loss, so it stays sequential.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# Pull one batch from the training loader and show the tensor shapes.
x, y = next(iter(train_loader))
x.shape, y.shape

In [None]:
class FFNN(nn.Module):
    """Feed-forward network with three weight-normalised linear layers.

    Each linear layer is preceded by batch normalisation and dropout. The two
    hidden layers (2048 and 1024 units) use ELU; the output layer applies a
    sigmoid, so every output lies in (0, 1).

    Args:
        input_size: number of input features per sample.
        output_size: number of outputs per sample.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        wide, narrow = 2048, 1024

        # Stage 1: input -> wide hidden layer (lighter dropout on raw inputs).
        self.bn1 = nn.BatchNorm1d(input_size)
        self.dropout1 = nn.Dropout(0.2)
        self.l1 = nn.utils.weight_norm(nn.Linear(input_size, wide))

        # Stage 2: wide -> narrow hidden layer.
        self.bn2 = nn.BatchNorm1d(wide)
        self.dropout2 = nn.Dropout(0.5)
        self.l2 = nn.utils.weight_norm(nn.Linear(wide, narrow))

        # Stage 3: narrow hidden layer -> outputs.
        self.bn3 = nn.BatchNorm1d(narrow)
        self.dropout3 = nn.Dropout(0.5)
        self.l3 = nn.utils.weight_norm(nn.Linear(narrow, output_size))

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, output_size) probabilities."""
        hidden = F.elu(self.l1(self.dropout1(self.bn1(x))))
        hidden = F.elu(self.l2(self.dropout2(self.bn2(hidden))))
        return torch.sigmoid(self.l3(self.dropout3(self.bn3(hidden))))

In [None]:
# Size the network from the data rather than hard-coding 955 inputs and 206
# outputs, so it keeps working if the feature engineering above changes.
model = FFNN(X_train.shape[1], y_train.shape[1])

In [None]:
# Run the sample batch through the model and show the output shape.
model(x).shape

In [None]:
def train_model(model, optimizer, loss_function, train_loader, val_loader=None, scheduler=None, epochs=1, device=None):
    """Train ``model`` and print the mean loss of every epoch.

    Args:
        model: the ``nn.Module`` to optimise (updated in place).
        optimizer: optimizer built over ``model``'s parameters.
        loss_function: callable ``(prediction, target) -> scalar loss tensor``.
        train_loader: iterable of ``(x, y)`` training batches; must support ``len``.
        val_loader: optional iterable of ``(x, y)`` validation batches. When
            given, the mean validation loss is printed as well.
        scheduler: optional learning-rate scheduler, stepped once per epoch.
        epochs: number of passes over ``train_loader``.
        device: device the batches are moved to. Defaults to the device of the
            model's first parameter, so no module-level ``device`` is required.

    Raises:
        ValueError: if ``train_loader`` or a provided ``val_loader`` has no
            batches (the per-epoch mean would otherwise divide by zero).
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader yields no batches")
    if val_loader is not None and len(val_loader) == 0:
        raise ValueError("val_loader yields no batches")

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(epochs):
        # Validation below switches to eval mode, so re-enable training mode at
        # the start of every epoch (once, not on every batch).
        model.train()
        running_loss = 0.0
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            y_pred = model(x)
            loss = loss_function(y_pred, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        running_loss /= len(train_loader)

        if val_loader is not None:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for x, y in val_loader:
                    x = x.to(device)
                    y = y.to(device)
                    val_loss += loss_function(model(x), y).item()
            val_loss /= len(val_loader)

            print("Epoch: [{}/{}] ".format(epoch + 1, epochs),
                  "Train loss: {:.6f}".format(running_loss),
                  "Val loss: {:.6f} ".format(val_loss))
        else:
            print("Epoch: [{}/{}] ".format(epoch + 1, epochs),
                  "Train loss: {:.6f}".format(running_loss))

        if scheduler is not None:
            scheduler.step()

In [None]:
# Binary cross-entropy on the sigmoid outputs of the network.
loss_function = nn.BCELoss()
# Layer sizes come from the data instead of the hard-coded 955 / 206.
model = FFNN(X_train.shape[1], y_train.shape[1]).to(device)
optimizer = optim.Adam(lr=0.001, params=model.parameters(), weight_decay=1e-5)
# Halve the learning rate every 10 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.5, last_epoch=-1)

In [None]:
# Train for 50 epochs on the training split, printing the validation loss each epoch.
train_model(model, optimizer, loss_function, train_loader, val_loader, epochs=50, scheduler=scheduler)

In [None]:
# Final fit: a fresh model, optimizer and scheduler trained on the complete
# training set (no validation loader), with the same hyper-parameters as above.
loss_function = nn.BCELoss()
# Layer sizes come from the data instead of the hard-coded 955 / 206.
model = FFNN(X_train_all.shape[1], y_train_all.shape[1]).to(device)
optimizer = optim.Adam(lr=0.001, params=model.parameters(), weight_decay=1e-5)
# Halve the learning rate every 10 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.5, last_epoch=-1)
train_model(model, optimizer, loss_function, train_all_loader, epochs=50, scheduler=scheduler)

In [None]:
def predict(model, X):
    """Return the model's outputs for ``X`` as a NumPy array.

    The model is switched to evaluation mode and gradients are disabled. ``X``
    is moved to the device of the model's first parameter, so the function does
    not depend on a module-level ``device`` variable.

    Args:
        model: the ``nn.Module`` to evaluate.
        X: input tensor accepted by ``model``.

    Returns:
        ``numpy.ndarray`` holding ``model(X)``, copied to host memory.
    """
    first_param = next(model.parameters(), None)
    # A parameter-free module has no device of its own; leave X where it is.
    target_device = first_param.device if first_param is not None else X.device

    model.eval()
    with torch.no_grad():
        preds = model(X.to(target_device))
    return preds.cpu().numpy()

In [None]:
# Predict on the scaled test matrix (converted to a float tensor first).
y_pred = predict(model, torch.tensor(X_test).float())

In [None]:
# All-zero frame with one row per test sample and one column per scored target.
submission = pd.DataFrame(
    np.zeros((len(test_features_1), len(train_targets_scored_1.columns))),
    index=test_features_1.index,
    columns=train_targets_scored_1.columns,
)

In [None]:
# Index labels of the test rows that are not 'ctl_vehicle'.
pred_index = test_features_1[test_features_1.cp_type != 'ctl_vehicle'].index

In [None]:
# Fill the selected rows with the network's predictions; other rows stay zero.
# NOTE(review): assumes y_pred rows are in the same order as pred_index — confirm.
submission.loc[pred_index, :] = y_pred

In [None]:
# Turn the index back into a regular column (in place) so that it is written
# out by to_csv(index=False).
submission.reset_index(inplace=True)

In [None]:
# Display the finished submission table.
submission

In [None]:
# Write the submission to the Kaggle working directory without the index column.
submission.to_csv('/kaggle/working/submission.csv', index=False)