In [None]:
from IPython.display import display

import pandas as pd
import numpy as np

In [None]:
# Configuration

import logging
import sys
from multiprocessing import cpu_count

# Send log records to stderr at DEBUG level so that both the info and the
# debug messages emitted by the helpers in this notebook are shown.
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# Degree of parallelism passed to estimators through their `n_jobs` argument:
# one job per CPU reported by the system.
n_jobs = cpu_count()

In [None]:
# Load Data

# Read the four competition tables. `index_col=0` turns the first CSV column
# of every file into the frame's index instead of keeping it as a feature.
x_train = pd.read_csv('../input/lish-moa/train_features.csv', index_col=0)
x_test = pd.read_csv('../input/lish-moa/test_features.csv', index_col=0)
y_train = pd.read_csv('../input/lish-moa/train_targets_scored.csv', index_col=0)
submission = pd.read_csv('../input/lish-moa/sample_submission.csv', index_col=0)

# Preview the first rows of each table (one rich output per frame).
display(x_train.head(), x_test.head(), y_train.head(), submission.head())

In [None]:
# Preprocess Data

def preprocess(df):
    """Encode the categorical experiment columns as small integers.

    The encoding is applied to a copy, so the frame passed in is left
    untouched and the raw data stays available to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the `cp_type`, `cp_time` and `cp_dose` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the three columns hold their integer codes.
        Values missing from the mappings below become NaN (`Series.map`).
    """
    encodings = {
        'cp_type': {'trt_cp': 0, 'ctl_vehicle': 1},
        'cp_time': {24: 1, 48: 2, 72: 3},
        'cp_dose': {'D1': 0, 'D2': 1},
    }
    # Copy first: assigning the mapped columns directly on `df` would silently
    # rewrite the caller's frame as well as the returned one.
    encoded = df.copy()
    for column, codes in encodings.items():
        encoded[column] = encoded[column].map(codes)
    return encoded

# Encode the categorical columns of both feature tables. The names are rebound
# to the encoded frames, so run this cell once per kernel session: a second run
# would pass already-encoded values back into `preprocess`.
x_train, x_test = preprocess(x_train), preprocess(x_test)

# Preview the encoded tables.
display(x_train.head(), x_test.head())

In [None]:
# Define the scorer function

from sklearn.metrics import roc_auc_score, log_loss, f1_score

def scorer(y_true, y_pred):
    """Average log loss, ROC AUC and F1 over the label columns.

    Both inputs are 2-D arrays with one column per label. Before scoring, a
    row of ones is appended to each of them so that a label column consisting
    only of zeros still contains a positive example.

    Returns
    -------
    tuple
        `(mean log loss, mean ROC AUC, mean F1)`; F1 is computed on the
        predictions thresholded with `> 0.5`.
    """
    # Pad truth and predictions with the same dummy all-ones row.
    truth = np.vstack((y_true, np.ones((1, y_true.shape[1]))))
    preds = np.vstack((y_pred, np.ones((1, y_pred.shape[1]))))

    n_labels = truth.shape[1]
    total_ll, total_auc, total_f1 = 0, 0, 0
    for col in range(n_labels):
        target, estimate = truth[:, col], preds[:, col]
        total_ll += log_loss(target, estimate)
        total_auc += roc_auc_score(target, estimate)
        total_f1 += f1_score(target, estimate > 0.5)
    return total_ll / n_labels, total_auc / n_labels, total_f1 / n_labels

In [None]:
# Define the method to evaluate a model

from skmultilearn.model_selection import IterativeStratification

import pickle
from time import time
from pathlib import Path
from os import path

def _log_scores(log_loss_, auc, f1):
    """Log the three averaged cross-validation scores at INFO level."""
    logging.info("The Average Log Loss is {}".format(log_loss_))
    logging.info("The Average AUC is {}".format(auc))
    logging.info("The Average f1 is {}".format(f1))


def eval_model(model, _x_train, _y_train, n_splits=3, id_=None):
    """Cross-validate `model` on multi-label data and return its mean scores.

    The data is split with `IterativeStratification`; on every fold the model
    is fitted, used to predict the held-out part and scored with `scorer`.

    Parameters
    ----------
    model : estimator
        Object whose `fit(X, y)` returns the estimator itself and which
        provides `predict(X)`.
    _x_train, _y_train : array-like
        Features and multi-label targets; both are converted with `np.array`.
    n_splits : int, default 3
        Number of cross-validation folds.
    id_ : str, optional
        Cache key. When given, results live under `output/<id_>/`:
        `score.pkl` holds the averaged scores and `cv_<i>.pkl` the
        `(y_val, y_pred)` pair of fold `i`. If `score.pkl` already exists the
        stored scores are returned and nothing is fitted.

    Returns
    -------
    tuple
        `(log_loss, auc, f1)`, each averaged over the folds.
    """
    start_time = time()
    logging.info('*' * 20)
    logging.info("Evaluating model {}".format(id_ if id_ else model))

    output = None
    score_file = None

    # Try to load saved result from disk if exists
    if id_:
        output = Path('output') / id_
        output.mkdir(parents=True, exist_ok=True)
        score_file = output / 'score.pkl'
        if score_file.exists():
            logging.debug("Loading result from disk")
            # NOTE: unpickling can execute arbitrary code; only caches written
            # by this notebook should ever be placed in the output directory.
            with open(score_file, 'rb') as cached:
                log_loss_, auc, f1 = pickle.load(cached)
            _log_scores(log_loss_, auc, f1)
            return log_loss_, auc, f1

    # Convert data into numpy
    _x_train = np.array(_x_train)
    _y_train = np.array(_y_train)

    # Use iterative stratification (rather than sklearn's k-fold splitters)
    # for multi-label data to handle imbalance.
    kf = IterativeStratification(n_splits=n_splits, order=1)

    log_loss_, auc, f1 = 0.0, 0.0, 0.0
    for i, (train_index, test_index) in enumerate(kf.split(_x_train, _y_train)):
        x_train_, x_val_ = _x_train[train_index], _x_train[test_index]
        y_train_, y_val_ = _y_train[train_index], _y_train[test_index]

        # Add a dummy sample (all-zero features, all-one labels) to make sure
        # every label column of the training fold contains both classes.
        x_train_ = np.vstack((x_train_, np.zeros((1, x_train_.shape[1]))))
        y_train_ = np.vstack((y_train_, np.ones((1, y_train_.shape[1]))))

        y_pred_ = model.fit(x_train_, y_train_).predict(x_val_)

        log_loss_val, auc_val, f1_val = scorer(y_val_, y_pred_)

        # Pickle y_val_ and y_pred_; the context manager closes the file even
        # if pickling fails.
        if id_:
            with open(output / "cv_{}.pkl".format(i), 'wb') as fold_file:
                pickle.dump((y_val_, y_pred_), fold_file)

        # Update the scores
        log_loss_ += log_loss_val
        auc += auc_val
        f1 += f1_val

    log_loss_ /= n_splits
    auc /= n_splits
    f1 /= n_splits
    if id_:
        with open(score_file, 'wb') as score_out:
            pickle.dump((log_loss_, auc, f1), score_out)
    _log_scores(log_loss_, auc, f1)
    logging.debug("Used {:.2f}s".format(time() - start_time))
    return log_loss_, auc, f1

In [None]:
# Run a Linear Regression for demo

from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, RandomForestRegressor, StackingRegressor
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

# eval_model(LinearRegression(n_jobs=n_jobs), x_train, y_train, id_='lr')

In [None]:
# Define PCA Function

from sklearn.decomposition import PCA

def _pca_project(_x_train, _x_test, columns, n_components, prefix):
    """Fit one PCA on the stacked train+test `columns` and split it back."""
    stacked = pd.concat([_x_train[columns], _x_test[columns]])
    projected = PCA(n_components=n_components, random_state=42).fit_transform(stacked)

    names = [f'{prefix}-{i}' for i in range(n_components)]
    # Train rows come first in `stacked`. Slicing the test part from `n_train`
    # (not with a negative length) stays correct when the test frame is empty,
    # where `projected[-0:]` would return every row.
    n_train = _x_train.shape[0]
    train_part = pd.DataFrame(projected[:n_train], columns=names)
    test_part = pd.DataFrame(projected[n_train:], columns=names)
    return train_part, test_part


def pca(_x_train, _x_test, n_gene=200, n_cell=50):
    """Replace the gene and cell feature groups by their PCA components.

    Columns whose names start with `g-` and with `c-` are reduced separately.
    Each PCA is fitted on the train and test rows together, then the result is
    split back into a train part and a test part.

    Parameters
    ----------
    _x_train, _x_test : pandas.DataFrame
        Feature tables sharing the same `g-*` and `c-*` columns.
    n_gene : int, default 200
        Number of components kept for the `g-*` columns.
    n_cell : int, default 50
        Number of components kept for the `c-*` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train_features, test_features)` with columns `pca_G-0..` followed by
        `pca_C-0..`. Only the PCA columns are returned (every other input
        column is dropped) and both frames carry a fresh positional index
        rather than the index of the inputs.
    """
    genes = [col for col in _x_train.columns if col.startswith('g-')]
    cells = [col for col in _x_train.columns if col.startswith('c-')]

    train_gene_pca, test_gene_pca = _pca_project(_x_train, _x_test, genes, n_gene, 'pca_G')
    train_cells_pca, test_cells_pca = _pca_project(_x_train, _x_test, cells, n_cell, 'pca_C')

    # Generate new training and test data
    _train_features = pd.concat((train_gene_pca, train_cells_pca), axis=1)
    _test_features = pd.concat((test_gene_pca, test_cells_pca), axis=1)
    return _train_features, _test_features

In [None]:
# # Tuning PCA with linear regression
#
# cols = ['log_loss', 'roc_auc', 'f1']
# df = pd.DataFrame(columns=cols)
# for n_genes in [10, 20, 50, 100, 200, 400, 500]:
#     for n_cells in [10, 25, 50, 75]:
#         train_features, test_features = pca(x_train, x_test, n_genes, n_cells)
#         ll, auc, f1 = eval_model(LinearRegression(n_jobs=n_jobs),train_features, y_train, id_='lr_{}_{}'.format(n_genes, n_cells))
#         df = df.append(pd.Series([ll, auc, f1], name='lr_{}_{}'.format(n_genes, n_cells), index=cols))
#
# display(df.sort_values(by=['log_loss']).head())

In [None]:
# Make hist plot
import matplotlib.pyplot as plt

# The root logger of this notebook is configured at DEBUG level; raise the
# threshold of matplotlib's own logger to WARNING so that its debug/info
# records are not emitted while plotting.
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)

def make_hist_plot(df, title, id_):
    """Draw, save and show one bar chart per column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per experiment (the index supplies the bar labels) and one
        column per score.
    title : str
        Prefix of every figure title; the column name is appended to it.
    id_ : str
        Prefix of the saved files, written as `fig/<id_>_<column>.png`.
    """
    # `savefig` does not create missing directories, so make sure the target
    # folder exists before the first figure is written.
    Path("fig").mkdir(parents=True, exist_ok=True)
    labels = list(df.index)

    def _plot(score):
        logging.debug(df[score])
        heights = df[score].values
        plt.figure(figsize=(16, 9))
        plt.bar(labels, heights)
        plt.xticks(labels, labels, rotation='vertical')
        plt.ylabel(score)
        plt.title("{} {}".format(title, score))
        # Leave room below the axes for the vertical tick labels.
        plt.subplots_adjust(bottom=0.4)

        plt.savefig("fig/{}_{}.png".format(id_, score))
        plt.show()

    for col in df:
        _plot(col)

def make_plot(df, title, id_, x_label=None, log_x=False):
    """Draw, save and show one line plot per column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        The index supplies the x values; each column is one score to plot.
    title : str
        Prefix of every figure title; the column name is appended to it.
    id_ : str
        Prefix of the saved files, written as `fig/<id_>_<column>.png`.
    x_label : str, optional
        Label of the x axis; no label is set when omitted.
    log_x : bool, default False
        Use a logarithmic x axis.
    """
    # `savefig` does not create missing directories, so make sure the target
    # folder exists before the first figure is written.
    Path("fig").mkdir(parents=True, exist_ok=True)
    x_values = list(df.index)

    def _plot(score):
        logging.debug(df[score])
        y_values = df[score].values
        plt.plot(x_values, y_values)
        if x_label:
            plt.xlabel(x_label)
        plt.ylabel(score)
        if log_x:
            plt.xscale('log')
        plt.title("{} {}".format(title, score))

        plt.savefig("fig/{}_{}.png".format(id_, score))
        plt.show()

    for col in df:
        _plot(col)

In [None]:
# # Make plot of tuning PCA
#
# make_hist_plot(df, 'Tuning PCA', 'tuning_pca')

In [None]:
# Use 20 gene components and 10 cell components, the setting kept as the best
# PCA configuration. `pca` returns only the PCA columns, so these two frames
# are the complete model inputs used by the cells below.
train_features, test_features = pca(x_train, x_test, 20, 10)

In [None]:
# # Tuning over sampling per label, tested with lr
# df = pd.DataFrame(columns=cols)
# ll, auc, f1 = eval_model(MultiOutputClassifier(LogisticRegression(max_iter=1e4, C=0.01), n_jobs=n_jobs),
#                          train_features, y_train, id_='lr_100_25_c_0.01')
# df = df.append(pd.Series([ll, auc, f1], name='lr_100_25_c_0.01', index=cols))
#
# # MultiOutputWithSampling is a class I implemented to treat each label as separate training task and train the model.
# from MultiOutputWithSampling import MultiOutputWithSampling
#
# # 0.01, 0.02, and 0.03 result in errors
# for ss in [0.04, 0.1, 0.2, 0.25, 0.5]:
#     ll, auc, f1 = eval_model(
#         MultiOutputWithSampling(LogisticRegression(max_iter=1e4, C=0.01), sampling_strategy=ss, n_jobs=n_jobs),
#         train_features, y_train, id_='lr_100_25_c_0.01_ss_{}_separate_sampling'.format(ss))
#     df = df.append(pd.Series([ll, auc, f1], name='lr_100_25_c_0.01_ss_{}_separate_sampling'.format(ss), index=cols))
# make_hist_plot(df, "Tuning sampling strategy per label on LR, C=0.01", 'sampling')
# # No separate sampling performs better

In [None]:
# # Ridge, Tuning a
# tuning_a = np.logspace(-2, 3, 6)
# df = pd.DataFrame(columns=cols)
# for a in tuning_a:
#     ll, auc, f1 = eval_model(MultiOutputRegressor(Ridge(alpha=a), n_jobs=n_jobs), train_features, y_train,
#                              id_='ridge_100_25_a_{:.2f}'.format(a))
#     df = df.append(pd.Series([ll, auc, f1], name=a, index=cols))
# make_plot(df, "Ridge, Tuning a", "ridge_tuning_a", x_label='a', log_x=True)

In [None]:
# The best ridge is a=1000, i.e. the largest value of the commented-out tuning
# grid np.logspace(-2, 3, 6).
# NOTE(review): the optimum sits on the edge of that grid, so larger alphas
# were not explored - confirm whether the grid should be extended.
best_ridge = Ridge(alpha=1000)

In [None]:
# # Logistic Regression, Tuning C
# tuning_c = np.logspace(-2, 3, 6)
# df = pd.DataFrame(columns=cols)
# for c in tuning_c:
#     ll, auc, f1 = eval_model(MultiOutputClassifier(LogisticRegression(max_iter=1e4, C=c), n_jobs=n_jobs),
#                              train_features, y_train, id_='lr_100_25_c_{:.2f}'.format(c))
#     df = df.append(pd.Series([ll, auc, f1], name=c, index=cols))
# make_plot(df, "LR, Tuning C", "lr_tuning_c", x_label='C', log_x=True)

In [None]:
# The best LR is C=0.01.
# `max_iter` is written as an integer: scikit-learn documents it as an int and
# releases that validate estimator parameters reject the float 1e4 when the
# model is fitted.
best_lr = LogisticRegression(max_iter=10000, C=0.01)

In [None]:
# # Random Forest, tuning max_depth
# df = pd.DataFrame(columns=cols)
# for n_estimators in [50, 200, 500]:
#     for max_depth in [1, 3, 6, 10]:
#         ll, auc, f1 = eval_model(MultiOutputRegressor(
#             RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=43,
#                                   min_samples_split=10), n_jobs=n_jobs), train_features, y_train,
#             id_='rfr_{}_{}_43_10'.format(n_estimators, max_depth))
#         df = df.append(pd.Series([ll, auc, f1], name=max_depth, index=cols))
# display(df.sort_values(by='log_loss').head())
# make_hist_plot(df, "Random Forest, Tuning n_estimators and max_depth", "rfr_tuning_n_d")

In [None]:
# The best Random Forest is max_depth=3
# MultiOutputRegressor fits one RandomForestRegressor per target column and
# spreads those fits over `n_jobs` workers; `random_state` is fixed so the
# forests are reproducible.

best_rf = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=50, max_depth=3, random_state=43, min_samples_split=10), n_jobs=n_jobs)

In [None]:
# # NN model, Tuning hidden_layer
# df = pd.DataFrame(columns=cols)
# best_loss = np.inf
# best_h = None
# for i in [50, 100, 200]:
#     for j in [50, 100, 200]:
#         for k in [50, 100, 200]:
#             ll, auc, f1 = eval_model(
#                 MLPRegressor(hidden_layer_sizes=(i, j, k), random_state=1, max_iter=1500, learning_rate='adaptive',
#                              warm_start=True), train_features, y_train, id_='nn_100_25_h_{}_{}_{}'.format(i, j, k))
#             if best_loss > ll:
#                 best_loss = ll
#                 best_h = (i, j, k)
#             df = df.append(pd.Series([ll, auc, f1], name='nn_100_25_h_{}_{}_{}'.format(i, j, k), index=cols))
# make_hist_plot(df, "Neural Network", "nn_tuning_hidden")
# logging.info("The best hidden layer configuration is {}".format(best_h))
# logging.info("The best log loss is {}".format(best_loss))

In [None]:
# The best nn is hidden_layer=(200,100,100)
# NOTE(review): with warm_start=True a repeated fit() on this instance resumes
# from the previously learned weights. eval_model calls fit() once per fold on
# the same object, so cross-validation folds would not be independent for this
# estimator - confirm that this is intended.
best_nn = MLPRegressor(hidden_layer_sizes=(200, 100, 100), random_state=1, max_iter=1500, warm_start=True)

In [None]:
# Fit the selected model on every training row, then predict the test set.
best_model = best_rf
y_pred = best_model.fit(train_features, y_train).predict(test_features)
# Optional caching of the fitted model together with its predictions:
# pickle.dump((best_model, y_pred), open("best_model.pkl", 'wb'))
# best_model, y_pred = pickle.load(open("best_model.pkl", 'rb'))

# Wrap the predictions in a frame labelled like the sample submission; rows
# are matched to the submission index by position.
submission = pd.DataFrame(data=y_pred, columns=submission.columns, index=submission.index)

display(submission.head())

In [None]:
# Write the predictions to `submission.csv` in the working directory; the
# frame's index is written as the first column of the file.
submission.to_csv('submission.csv')

In [None]:
# # Remove pickle files

# import shutil

# shutil.rmtree("/kaggle/working/output")