In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for entry in names:
        full_path = os.path.join(folder, entry)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition CSV files into DataFrames.
# Feature tables for the training and test splits:
train_feat, test_feat = (
    pd.read_csv("../input/lish-moa/train_features.csv"),
    pd.read_csv("../input/lish-moa/test_features.csv"),
)

# Per-row drug identifiers for the training rows, then the scored target labels.
# (The non-scored targets are intentionally not loaded.)
#train_targets_nonscored = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
train_drug = pd.read_csv("../input/lish-moa/train_drug.csv")
train_targets_scored = pd.read_csv("../input/lish-moa/train_targets_scored.csv")

In [None]:
# Show the raw training feature table as this cell's output.
train_feat

In [None]:
# Initial EDA
#
# Prints three summaries: how often each drug_id occurs in train_drug, how many
# values are missing in train_feat, and how imbalanced every scored target is.
print("EDA...")

# columns
# sig_id, cp_type, cp_time, cp_dose, g-0, ..., g-771, c-1, ..., c-99


# train drug
# Rows per drug_id.
t_d_v_c = train_drug['drug_id'].value_counts()
num_row_count_gt_5 = int((t_d_v_c > 5).sum())
print(f"Number of drugs with count greater than 5: {num_row_count_gt_5}")
# Boolean indexing keeps the integer counts; where().dropna() would upcast them to floats.
num_row_count_gt_50 = t_d_v_c[t_d_v_c > 50]
tdvc_dict = num_row_count_gt_50.to_dict()
print(f"Top Drug Counts > 50: {tdvc_dict}")
# Give each frequent drug a synthetic group label: drugGroup0, drugGroup1, ...
drug_group_dict = {k: "drugGroup" + str(i) for i,k in enumerate(tdvc_dict)}
print(f"Drug Group Map: {drug_group_dict}")


# missing data? ... none
missing_count = train_feat.isna().sum()
total_missing_sum = missing_count.sum()
print(f"Total Missing Values: {total_missing_sum}")
# List only the columns that actually contain missing values, with their names.
columns_with_missing = missing_count[missing_count != 0]
if not columns_with_missing.empty:
    print(columns_with_missing.to_string())


# categorical variables
# cp_type:  indicates samples treated with a compound (cp_vehicle) or with a control perturbation (ctrl_vehicle)
# cp_time: indicate treatment duration (24, 48, 72 hours)
# cp_dose: dose (high or low)


# MoA analysis
# Work on every target column at once; sig_id is an identifier, not a target.
target_frame = train_targets_scored.drop(columns="sig_id", errors="ignore")
positive_counts = (target_frame == 1).sum()
negative_counts = (target_frame == 0).sum()
# Counting by comparison (instead of value_counts()[1]) yields 0 rather than a
# KeyError when a target has no positive rows.
target_counts = positive_counts.tolist()

# Ratio of positive to negative rows per target. The thresholds are compared
# numerically so the reported counts match the 0.01 / 0.001 limits in the
# messages (comparing rounded strings effectively tested 0.005 / 0.0005).
class_ratio = positive_counts / negative_counts
small_percents = int((class_ratio < 0.01).sum())
smaller_percents = int((class_ratio < 0.001).sum())

print(f"Scored Target MoA Counts: {target_counts}")
print(f"Number of Class Ratios smaller that 0.01: {small_percents}")
print(f"Number of Class Ratios smaller that 0.001: {smaller_percents}")


# feature engineering?
# not that I can think of

In [None]:
# Data Transformations
#
# One-hot encodes the three categorical columns (fitted on the training data
# only) and builds the model matrices `train` and `test`: all remaining feature
# columns followed by the encoded indicator columns.

# categorical ohe for train and test feats
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
from sklearn.preprocessing import OneHotEncoder

ID_COL = 'sig_id'
CAT_COLS = ['cp_type', 'cp_time', 'cp_dose']

# handle_unknown='ignore': a category unseen during fit encodes as all zeros
# instead of raising when the test set is transformed.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train_feat[CAT_COLS])

# Flatten the per-column category arrays into one list of string column names.
cats = [str(c) for c in np.concatenate(enc.categories_).ravel()]


def _one_hot(frame):
    """Return the one-hot encoding of `frame[CAT_COLS]` as a DataFrame.

    The result reuses `frame.index` so that the column-wise concat below
    aligns row for row even if the input index is not a default RangeIndex.
    """
    dense = enc.transform(frame[CAT_COLS]).toarray()
    return pd.DataFrame(dense, columns=cats, index=frame.index)


# create encodings for train and test
train_ohe = _one_hot(train_feat)
test_ohe = _one_hot(test_feat)

# remove old columns and add new encodings to df
# Dropping by name (rather than slicing off the first four columns by position)
# keeps this correct if the column order of the CSV ever changes.
train = pd.concat([train_feat.drop(columns=[ID_COL] + CAT_COLS), train_ohe], axis=1)
test = pd.concat([test_feat.drop(columns=[ID_COL] + CAT_COLS), test_ohe], axis=1)

# The model is fit on `train` and applied to `test`, so the layouts must match.
assert list(train.columns) == list(test.columns), "train/test feature columns differ"

In [None]:
#Model Building -- Loop through each of the 207 targets
# from datetime import datetime

# start = datetime.now()
# all_params = []
# for i in range(1,train_targets_scored.shape[1]):
#     mod_start = datetime.now()
#     print(f"Starting model {i}...  =============================================================")
#     y = train_targets_scored.iloc[:,i]
#     print(y.value_counts())
    
#     #https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
#     from sklearn.model_selection import train_test_split

#     X_train, X_val, y_train, y_val = train_test_split(train, y, test_size=0.3, random_state=42, stratify=y)

#     # resample -- oversample
#     # https://scikit-learn.org/stable/modules/generated/sklearn.utils.resample.html
#     # from sklearn.utils import resample

#     # df = pd.concat([X_train, y_train], axis=1)
    
#     # #separate by class
#     # neg_c = df[df[y.name]==0]
#     # pos_c = df[df[y.name]==1]

#     # upsampled_df = resample(pos_c, n_samples=len(neg_c), replace=True, stratify=pos_c[y.name], random_state=0)

#     # # re-combine 
#     # upsampled_df = pd.concat([neg_c, pos_c])

#     # print(upsampled_df[y.name].value_counts())

#     # y_train = upsampled_df[y.name]
#     # X_train = upsampled_df.drop(y.name, axis=1)


#     # generate synthetic samples
#     # https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
#     from imblearn.over_sampling import SMOTE

#     sm = SMOTE(random_state=0)

#     X_train, y_train = sm.fit_sample(X_train, y_train)

#     print(y_train.value_counts())


#     #Grid Search
#     #https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
#     from sklearn.model_selection import GridSearchCV
#     from sklearn import svm

#     # parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
#     # svc = svm.SVC()
#     # clf = GridSearchCV(svc, parameters)
#     # clf.fit(X_train, y_train)

#     # print("CV results: " + str(clf.cv_results_))
#     # print("Top CV results: " + str(clf.best_params_))


#     # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#     from sklearn.ensemble import RandomForestClassifier
#     parameters = {'max_depth':[2], 'n_estimators': [100]}
#     rf = RandomForestClassifier(random_state=0)
#     clf = GridSearchCV(rf, parameters)
#     clf.fit(X_train, y_train)

#     #print("CV results: " + str(clf.cv_results_))
#     print("Top CV results: " + str(clf.best_params_))


#     #Validation 
#     val_pred = clf.predict(X_val)

#     # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
#     from sklearn.metrics import accuracy_score
#     # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
#     from sklearn.metrics import classification_report

#     target_names = ['class 0', 'class 1']
#     print(classification_report(y_val, val_pred, target_names=target_names))
#     print("Accuracy: " + str(accuracy_score(y_val, val_pred)))

#     all_params.append(clf.best_params_)

#     mod_end = datetime.now()
#     print("Time to run: " + str(mod_end - mod_start))

# print("Finished training...  =============================================================")
# end = datetime.now()
# print("Total time to run: " + str(end - start))

In [None]:
# Load the sample submission file; the training cell below overwrites its
# target columns with predictions, and it is then written out as the submission.
sample_sub = pd.read_csv("../input/lish-moa/sample_submission.csv")
# Show the template as this cell's output.
sample_sub

In [None]:
#len(all_params)

In [None]:
#Final Train on All Train Data and Predict Test Data
#
# For every scored target column: oversample the minority class with SMOTE,
# fit a random forest on the resampled training matrix, and store the predicted
# probability of the positive class in the matching column of `sample_sub`.
# Probabilities (not hard 0/1 labels) are written because the submission is
# scored with log loss.
from datetime import datetime

# generate synthetic samples
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

SMOTE_K_NEIGHBORS = 5  # imbalanced-learn's default neighbourhood size

start = datetime.now()

# Column 0 is skipped: it holds sig_id, every later column is one target.
for i in range(1,train_targets_scored.shape[1]):
    print(f"Starting model {i}...  =============================================================")
    y = train_targets_scored.iloc[:,i]

    class_counts = y.value_counts()
    if len(class_counts) < 2:
        # Only one label value occurs: there is nothing to fit, so predict that
        # value for every test row (1.0 if it is the positive label, else 0.0).
        sample_sub[y.name] = float(class_counts.index[0] == 1)
        continue

    # SMOTE needs at least k_neighbors + 1 minority samples, so shrink the
    # neighbourhood for very rare targets instead of letting it raise.
    k_neighbors = min(SMOTE_K_NEIGHBORS, int(class_counts.min()) - 1)
    if k_neighbors >= 1:
        sm = SMOTE(random_state=0, k_neighbors=k_neighbors)
        # fit_resample is the current name of the removed fit_sample method.
        X_train, y_train = sm.fit_resample(train, y)
    else:
        # A single minority sample cannot be interpolated; train on the raw data.
        X_train, y_train = train, y

    rf = RandomForestClassifier(random_state=0)
    rf.fit(X_train, y_train)

    # predict_proba columns follow rf.classes_; pick the column for label 1.
    positive_idx = list(rf.classes_).index(1)
    sample_sub[y.name] = rf.predict_proba(test)[:, positive_idx]

print("Finished training...  =============================================================")
end = datetime.now()
print("Total time to run: " + str(end - start))

In [None]:
# Show the submission frame after the training cell has filled in its target columns.
sample_sub

In [None]:
# Write the submission frame to the current working directory, without the index column.
sample_sub.to_csv("./submission.csv", index=False)
# Completion marker so the cell output confirms the write returned.
print("YES")

In [None]:
# Read the written file back and show it, as a check on what was actually saved.
samp2 = pd.read_csv("./submission.csv")
samp2