In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# LOAD FILES
# Each CSV below is read with pd.read_csv, in the listed order, and bound to the
# DataFrame name at the same position in the tuple on the left-hand side.
(
    test_features,
    train_drug,
    train_features,
    train_targets_scored,
    train_targets_nonscored,
    sample_submission,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/test_features.csv',
        '/kaggle/input/lish-moa/train_drug.csv',
        '/kaggle/input/lish-moa/train_features.csv',
        '/kaggle/input/lish-moa/train_targets_scored.csv',
        '/kaggle/input/lish-moa/train_targets_nonscored.csv',
        '/kaggle/input/lish-moa/sample_submission.csv',
    )
)

# **Preprocessing**

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Attach each row's drug_id as the first column of the feature table.
# DataFrame.insert raises ValueError when the column already exists, so the guard
# keeps this cell safe to re-run without reloading the data.
if 'drug_id' not in train_features.columns:
    train_features.insert(loc=0, column='drug_id', value=train_drug['drug_id'])

# Features and scored targets joined on sig_id, so both are resampled together.
train_merged = pd.merge(train_features, train_targets_scored, on=['sig_id'])

# Randomly under-sample the rows using drug_id as the class label. The labels are
# read from the merged frame itself so they are guaranteed to be row-aligned with
# the data being resampled (rather than relying on train_drug's row order).
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(train_merged, train_merged['drug_id'])

In [None]:
def encode_cp_columns(features):
    """Return a copy of ``features`` with the three ``cp_*`` columns made numeric.

    * ``cp_type`` becomes 1 where it equals "trt_cp" and 0 otherwise.
    * ``cp_time`` is divided by 24.
    * ``cp_dose`` becomes 1 where it equals "D1" and 0 otherwise.

    The input frame is left untouched, so the same helper can be applied to the
    training and the test features without chained-assignment warnings.
    """
    return features.assign(
        cp_type=1 * (features["cp_type"] == "trt_cp"),
        cp_time=features["cp_time"] / 24,
        cp_dose=1 * (features["cp_dose"] == "D1"),
    )


# Scored target columns, selected by name instead of by hard-coded column offsets.
target_columns = [name for name in train_targets_scored.columns if name != "sig_id"]

# Training features: everything in the resampled frame except the two identifier
# columns and the targets.
X_train = encode_cp_columns(
    X_resampled.drop(columns=["drug_id", "sig_id"] + target_columns)
)

# Test features get exactly the same encoding.
X_test = encode_cp_columns(test_features.drop(columns=["sig_id"]))

# Training targets, row-aligned with X_train because both come from X_resampled.
y_train = X_resampled[target_columns]

# **Scaling**

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
ss = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(ss.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(ss.transform(X_test), columns=X_test.columns)

# **Principal component analysis**

In [None]:
def reduce_by_feature_group(train_scaled, test_scaled, groups, random_state=0):
    """Reduce each feature group with a PCA that is fitted on the training data only.

    Parameters
    ----------
    train_scaled, test_scaled : pandas.DataFrame
        Scaled feature tables sharing the same column names.
    groups : iterable of (str, int)
        ``(marker, n_components)`` pairs. A column belongs to a group when
        ``marker`` occurs in its name.
    random_state : int
        Seed passed to every PCA so the result is reproducible.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Reduced training and test matrices; the per-group components are
        concatenated column-wise in the order of ``groups``.

    The test data is projected with the PCA fitted on the training data. Fitting a
    separate PCA on the test data would put its components in a different basis
    from the one the model is trained on.
    """
    train_parts, test_parts = [], []
    for marker, n_components in groups:
        group_columns = [name for name in train_scaled.columns if marker in name]
        pca = PCA(n_components=n_components, random_state=random_state)
        pca.fit(train_scaled[group_columns].values)
        train_parts.append(pca.transform(train_scaled[group_columns].values))
        test_parts.append(pca.transform(test_scaled[group_columns].values))
    return np.hstack(train_parts), np.hstack(test_parts)


# (column-name marker, number of principal components kept) for each feature group.
PCA_GROUPS = (("cp_", 2), ("g-", 500), ("c-", 50))

# For training and testing data: same fitted projections applied to both.
X_train_reduced, X_test_reduced = reduce_by_feature_group(
    X_train_scaled, X_test_scaled, PCA_GROUPS
)

In [None]:
# Split for Training and Validation
# The first 70% of the training rows are used for fitting and the remaining 30%
# are held out. The cut-off must be derived from the TRAINING matrix: it indexes
# X_train_reduced and y_train, not the test set.

split_index = round(X_train_reduced.shape[0] * .7)

X_train_split = X_train_reduced[:split_index, :]
X_valid = X_train_reduced[split_index:, :]

y_train_split = y_train.values[:split_index, :]
y_valid = y_train.values[split_index:, :]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Multi-output random forest fitted on every row of the reduced training matrix.
# random_state is fixed so that a fresh "Run All" reproduces the same model, in
# line with the seeded under-sampling step.
clf = RandomForestClassifier(max_depth=8, n_jobs=-1, random_state=0)
clf.fit(X_train_reduced, y_train.values)


In [None]:
# Log loss score for model

from sklearn.base import clone
from sklearn.metrics import log_loss


def positive_class_proba(model, X):
    """Return P(label == 1) as an (n_samples, n_outputs) array.

    For a multi-output classifier ``predict_proba`` returns one
    (n_samples, n_classes) array per output, with columns ordered like the
    matching entry of ``model.classes_``. An output whose training labels never
    contained a 1 has no column for that class; its probability is reported as 0.
    """
    per_output = model.predict_proba(X)
    result = np.zeros((X.shape[0], len(per_output)))
    for k, (proba, classes) in enumerate(zip(per_output, model.classes_)):
        hit = np.flatnonzero(np.asarray(classes) == 1)
        if hit.size:
            result[:, k] = proba[:, hit[0]]
    return result


# `clf` was fitted on all training rows, X_valid included, so scoring it on X_valid
# would give an optimistic number. Fit an unfitted copy with the same
# hyper-parameters on the training split only and score that one instead.
clf_valid = clone(clf).fit(X_train_split, y_train_split)

# Shape (n_valid, n_targets), i.e. the same layout as y_valid, so that np.ravel
# pairs every label with its own prediction.
y_pred2 = positive_class_proba(clf_valid, X_valid)

log_loss(np.ravel(y_valid), np.ravel(y_pred2))

In [None]:
# One (n_samples, n_classes) probability array per target.
test_proba = clf.predict_proba(X_test_reduced)

# Row k holds P(target k == 1) for every test sample. The column of class 1 is
# looked up in clf.classes_ rather than assumed to be index 1: a target whose
# training labels were all 0 has no such column and gets probability 0.
y_pred = np.vstack([
    proba[:, list(classes).index(1)] if 1 in classes else np.zeros(proba.shape[0])
    for proba, classes in zip(test_proba, clf.classes_)
])

# Every column after the first one of the sample submission receives one target's
# predictions; y_pred is (n_targets, n_samples), hence the transpose.
sample_submission[sample_submission.columns.to_list()[1:]] = y_pred.T
sample_submission.head()

In [None]:
# Save the filled-in submission table as submission.csv, leaving out the row index.
sample_submission.to_csv(path_or_buf='submission.csv', index=False)