In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras.layers import *
from keras import callbacks

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow_addons as tfa

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm

import math


# Classifiers
# RF
from sklearn.ensemble import RandomForestClassifier
# Logistic 
from sklearn.linear_model import LogisticRegression
# N Bayes
from sklearn.naive_bayes import GaussianNB
# Supp Vec
from sklearn.svm import LinearSVC
# XGB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
#KNN
from sklearn.neighbors import KNeighborsClassifier
#LGBM
from lightgbm import LGBMClassifier


In [None]:
# Collect every file under the Kaggle input directory, then echo each path.
paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for found in paths:
    print(found)

In [None]:
# Single place to change when the competition data lives somewhere else.
DATA_DIR = '/kaggle/input/lish-moa'


def _load_csv(filename):
    """Read one CSV file located in DATA_DIR and return it as a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, filename))


test_features = _load_csv('test_features.csv')
train_features = _load_csv('train_features.csv')
train_drug = _load_csv('train_drug.csv')
train_targets = _load_csv('train_targets_scored.csv')
sample_sub = _load_csv('sample_submission.csv')
targets_nonscored = _load_csv('train_targets_nonscored.csv')

In [None]:
# Row labels of vehicle-control vs. compound-treated samples, taken from the
# raw frames before any filtering or re-indexing happens.
test_control_index = test_features.index[test_features['cp_type'] == 'ctl_vehicle']
test_trt_index = test_features.index[test_features['cp_type'] == 'trt_cp']

train_control_index = train_features.index[train_features['cp_type'] == 'ctl_vehicle']
train_trt_index = train_features.index[train_features['cp_type'] == 'trt_cp']

All cleaning: categorical encoding, PCA features, and min-max scaling

In [None]:
def preprocess(df):
    """Encode the categorical experiment columns and drop the row identifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``sig_id``, ``cp_type``, ``cp_time`` and ``cp_dose``.
        Any other columns are passed through unchanged. The input is not
        modified; a copy is processed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``cp_type`` is mapped to 1 (``trt_cp``) / 0
        (``ctl_vehicle``), ``cp_time`` and ``cp_dose`` are replaced by the
        one-hot columns ``cp_time_0..2`` and ``cp_dose_0..1`` appended at the
        end, and ``sig_id`` is removed.

    Raises
    ------
    KeyError
        If one of the four expected columns is missing.
    """
    df = df.copy()
    # Plain column assignment replaces the column (and its dtype) outright;
    # `df.loc[:, col] = ...` may keep the old object dtype on newer pandas.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 1, 'ctl_vehicle': 0})
    # Fixed category lists make get_dummies emit the same columns for every
    # frame, even when a level (e.g. one dose) is absent from that frame.
    df['cp_dose'] = pd.Categorical(df['cp_dose'].map({'D1': 0, 'D2': 1}),
                                   categories=[0, 1])
    df['cp_time'] = pd.Categorical(df['cp_time'].map({24: 0, 48: 1, 72: 2}),
                                   categories=[0, 1, 2])
    # Explicit integer dtype: pandas >= 2.0 defaults to bool dummies, and a
    # frame mixing float and bool columns converts to an object ndarray.
    df = pd.get_dummies(df, columns=['cp_time', 'cp_dose'], dtype=np.uint8)
    del df['sig_id']
    return df

# NOTE: `train_features` / `test_features` are rebound to the engineered
# frames further down, so re-running this cell requires re-running the
# CSV-loading cell first (preprocess() needs the raw columns).

# Number of principal components kept per feature family.
N_GENE_COMPONENTS = 515  # author's note: "use 515 from biplot"
N_CELL_COMPONENTS = 3    # author's note: one principal component explains 80+ % of the variance; using 3


def _pca_scores(train_df, test_df, cols, n_components, prefix, random_state):
    """Fit one PCA on the stacked train+test rows of `cols`.

    Returns a (train, test) pair of DataFrames holding the component scores,
    with columns named ``{prefix}-{i}`` and a fresh 0..n-1 index.
    """
    stacked = pd.concat([train_df[cols], test_df[cols]])
    scores = PCA(n_components=n_components,
                 random_state=random_state).fit_transform(stacked)
    names = [f'{prefix}-{i}' for i in range(n_components)]
    n_train = train_df.shape[0]
    # Slice from the front for both halves; a negative slice (`[-n_test:]`)
    # would return every row when the test frame is empty.
    return (pd.DataFrame(scores[:n_train], columns=names),
            pd.DataFrame(scores[n_train:], columns=names))


# Keep only compound-treated rows, then drop the now-constant cp_type column.
train = (
    preprocess(train_features)
    .query('cp_type == 1')
    .reset_index(drop=True)
    .drop(columns='cp_type')
)
test = (
    preprocess(test_features)
    .query('cp_type == 1')
    .reset_index(drop=True)
    .drop(columns='cp_type')
)

# Drop the identifier so only target columns remain; errors='ignore' keeps
# this statement from raising if the column has already been removed.
train_targets = train_targets.drop(columns=['sig_id'], errors='ignore')

GENES = [col for col in train.columns if col.startswith('g-')]
CELLS = [col for col in train.columns if col.startswith('c-')]

# PCA with sklearn, fitted on train and test together for each family.
gene_pca_train, gene_pca_test = _pca_scores(
    train, test, GENES, N_GENE_COMPONENTS, 'pca_G', 11)
cell_pca_train, cell_pca_test = _pca_scores(
    train, test, CELLS, N_CELL_COMPONENTS, 'pca_C', 42)

train_features = pd.concat([train, gene_pca_train, cell_pca_train], axis=1)
test_features = pd.concat([test, gene_pca_test, cell_pca_test], axis=1)

# Compare names and order, not just the count: the model is fed positional
# arrays, so a reordered or differently named column would go unnoticed.
assert list(train_features.columns) == list(test_features.columns), "data incorrectly formatted"

GENES_pca = [col for col in train_features.columns if col.startswith('pca_G')]
CELLS_pca = [col for col in train_features.columns if col.startswith('pca_C')]

# Numeric columns only; the one-hot dummy columns are left unscaled.
numeric_cols = GENES + GENES_pca + CELLS + CELLS_pca

# Min-max scaler fitted on train and test rows together.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
scaler = preprocessing.MinMaxScaler()
scaler.fit(pd.concat([train_features[numeric_cols], test_features[numeric_cols]]))

train_trans = scaler.transform(train_features[numeric_cols])
test_trans = scaler.transform(test_features[numeric_cols])

# Back to DataFrame form, then re-attach the unscaled dummy columns.
train = pd.concat(
    [pd.DataFrame(train_trans, columns=numeric_cols),
     train_features.drop(numeric_cols, axis=1)],
    axis=1,
)
test = pd.concat(
    [pd.DataFrame(test_trans, columns=numeric_cols),
     test_features.drop(numeric_cols, axis=1)],
    axis=1,
)

In [None]:
# One binary XGBoost model per target column; each model's positive-class
# probabilities for the test rows are appended to `predictions`.
predictions = []
train_targs = train_targets[train_targets.index.isin(train_trt_index)]
assert len(train_targs) == len(train), "targets and features are misaligned"

# Convert once, outside the loop (the features do not change per target).
# An explicit float dtype avoids an object array when the frame mixes float
# columns with bool dummy columns, and using arrays for both fit and predict
# keeps the two calls on the same (name-less) feature layout.
X_train = train.to_numpy(dtype=float)
X_test = test.to_numpy(dtype=float)

for j in range(train_targs.shape[1]):
    y_train = train_targs.iloc[:, j].to_numpy()

    print("XGBoost Fit on column:", j)
    XGB = XGBClassifier(n_estimators=150,
                        max_depth=3, reg_alpha=2, min_child_weight=2,
                        gamma=3, learning_rate=0.05,
                        colsample_bytree=0.6)

    XGB.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    XGB_preds = XGB.predict_proba(X_test)[:, 1]
    predictions.append(XGB_preds)
    print("---")

In [None]:
# Target column names, in the order the sample submission lists them.
cols = sample_sub.columns.drop('sig_id')

# Predictions are stored one array per target; transpose to rows = samples
# and label the rows with the original positions of the treated test samples.
submission = pd.DataFrame(data=predictions).T
submission.columns = cols
submission = submission.set_index(test_trt_index)

# Control samples get an all-zero row for every target.
controls = pd.DataFrame(
    np.zeros((len(test_control_index), len(cols))),
    index=test_control_index,
    columns=cols,
)

# Interleave treated and control rows back into the original row order,
# attach the ids by index alignment, and write the file.
fin = pd.concat([submission, controls], axis=0).sort_index()
fin.insert(0, 'sig_id', sample_sub['sig_id'])
fin.to_csv('submission.csv', index=False)