In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then echo the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import log_loss
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the four competition CSV files in one pass; the unpacking order below
# matches the order of the paths in the tuple.
train_features, train_targets_scored, test_features, s_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
        '../input/lish-moa/sample_submission.csv',
    )
)

Log loss metric (averaged over all target columns)

In [None]:
def average_log_loss(y_true, y_pred):
    """Return the mean of the per-column binary log losses.

    Each column is scored independently with ``sklearn.metrics.log_loss`` and
    the column losses are averaged with equal weight.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_outputs)
        Ground-truth labels; every column is treated as a 0/1 problem.
    y_pred : array-like of shape (n_samples, n_outputs)
        Predicted scores for the positive class, one column per output.

    Returns
    -------
    float
        Sum of the column log losses divided by the number of columns.

    Raises
    ------
    ValueError
        If ``y_true`` is not 2-D or the two inputs differ in shape.
    """
    # Accept DataFrames / nested lists as well as ndarrays: positional slicing
    # with ``[:, i]`` below only works on arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            'y_true and y_pred must be 2-D arrays of identical shape, got '
            f'{y_true.shape} and {y_pred.shape}'
        )

    num_outputs = y_true.shape[1]
    loss = 0.00
    for i in range(num_outputs):
        # ``labels`` is given explicitly so the label set does not have to be
        # inferred from a column that may contain only one class.
        loss += log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
    return loss / num_outputs

Preprocessing

In [None]:
def preprocess(df):
    """Drop the id column and numerically encode the three ``cp_*`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``sig_id``, ``cp_dose``, ``cp_time`` and ``cp_type``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``sig_id`` in which
        ``cp_dose`` is encoded as D1 -> -1, D2 -> 1,
        ``cp_time`` as 24 -> -1, 48 -> 0, 72 -> 1 and
        ``cp_type`` as trt_cp -> -1, ctl_vehicle -> 1.

    Raises
    ------
    ValueError
        If one of the encoded columns holds a non-null value that has no
        entry in its encoding table.
    """
    encodings = {
        'cp_dose': {'D1': -1, 'D2': 1},
        'cp_time': {24: -1, 48: 0, 72: 1},
        'cp_type': {'trt_cp': -1, 'ctl_vehicle': 1},
    }

    # ``drop`` returns a new frame, so the assignments below never touch the
    # caller's data.
    df = df.drop(columns=['sig_id'])
    for column, mapping in encodings.items():
        encoded = df[column].map(mapping)
        # ``Series.map`` turns every value missing from the mapping into NaN.
        # Missing values that were already present pass through unchanged, but
        # a non-null value that became NaN is an unknown category: fail here
        # rather than let NaNs leak into the model inputs.
        unmapped = encoded.isna() & df[column].notna()
        if unmapped.any():
            unknown = sorted(set(map(repr, df.loc[unmapped, column])))
            raise ValueError(
                f"Unexpected value(s) in column '{column}': {', '.join(unknown)}"
            )
        df[column] = encoded
    return df

In [None]:
# Targets: strip the id column, then keep a plain ndarray view for modelling.
train_targets_scored = train_targets_scored.drop(columns=['sig_id'])
targets_np = train_targets_scored.to_numpy()

# Features: run the same preprocessing on the train and test frames.
train_features, test_features = preprocess(train_features), preprocess(test_features)

Feature scaling and PCA transform

In [None]:
# Split the feature columns into three groups according to their name prefix.
g_cols = list(filter(lambda name: name.startswith('g-'), train_features.columns))
c_cols = list(filter(lambda name: name.startswith('c-'), train_features.columns))
cp_cols = list(filter(lambda name: name.startswith('cp_'), train_features.columns))

def scaler_and_PCA(pca_num_components, train, test, variance_threshold=0.8, random_state=0):
    """Select, standardise and PCA-reduce a feature group.

    The train and test matrices are stacked row-wise and every transformer
    (variance filter, scaler, PCA) is fitted on the stacked data, so the test
    rows influence the fitted transforms.

    Parameters
    ----------
    pca_num_components : int
        Passed to ``PCA`` as ``n_components``.
    train : numpy.ndarray of shape (n_train, n_features)
        Training feature matrix.
    test : numpy.ndarray of shape (n_test, n_features)
        Test feature matrix with the same columns as ``train``.
    variance_threshold : float, default 0.8
        Passed to ``VarianceThreshold`` as ``threshold``; the default is the
        value that was previously hard-coded.
    random_state : int or None, default 0
        Seed forwarded to ``PCA`` so that repeated runs give the same
        components if a randomized SVD solver is selected.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_trans, test_trans)``: the first ``n_train`` rows and the
        remaining rows of the transformed stacked matrix.
    """
    n_train = train.shape[0]
    stacked = np.concatenate((train, test), axis=0)

    # variance threshold
    selector = VarianceThreshold(threshold=variance_threshold)
    selected = selector.fit_transform(stacked)

    # scale
    scaled = StandardScaler().fit_transform(selected)

    # PCA (seeded for reproducibility)
    pca = PCA(n_components=pca_num_components, random_state=random_state)
    reduced = pca.fit_transform(scaled)

    # Undo the row-wise stacking.
    train_trans = reduced[:n_train, :]
    test_trans = reduced[n_train:, :]
    return train_trans, test_trans


# "g-" columns: reduce to 80 principal components.
train_X_g, test_X_g = scaler_and_PCA(
    80,
    train_features[g_cols].to_numpy(),
    test_features[g_cols].to_numpy(),
)

# "c-" columns: reduce to 20 principal components.
train_X_c, test_X_c = scaler_and_PCA(
    20,
    train_features[c_cols].to_numpy(),
    test_features[c_cols].to_numpy(),
)

# Final design matrices: the cp_* columns first, then the "g-" and "c-" components.
features_np = np.hstack([train_features[cp_cols].to_numpy(), train_X_g, train_X_c])
test_np = np.hstack([test_features[cp_cols].to_numpy(), test_X_g, test_X_c])
print('Shape after scaler and PCA', features_np.shape)

Cross-validated logistic regression (one model per target column)

In [None]:
class _ConstantProbabilityModel:
    """Stand-in classifier for a target with no positive example in a fold.

    ``predict_proba`` returns two columns, P(class 0) and P(class 1), so the
    result can be indexed with ``[:, 1]`` exactly like the output of a fitted
    binary ``LogisticRegression``.
    """

    def __init__(self, positive_probability):
        self.positive_probability = positive_probability

    def predict_proba(self, X):
        proba = np.empty((len(X), 2))
        proba[:, 0] = 1.0 - self.positive_probability
        proba[:, 1] = self.positive_probability
        return proba


all_categories = list(train_targets_scored.columns)

# Keep the per-target models of the fold with the lowest validation loss.
best_model = None
best_loss = float('inf')
kf = KFold(n_splits=5)
for j, (train_indices, val_indices) in enumerate(kf.split(features_np), start=1):
    X_train, Y_train = features_np[train_indices, :], targets_np[train_indices, :]
    X_val, Y_val = features_np[val_indices, :], targets_np[val_indices, :]

    model_dict = {}
    print('FIT')
    for i, category in enumerate(tqdm(all_categories)):
        if Y_train[:, i].max() == 0:
            # No positive label for this target in the training fold, so a
            # logistic regression cannot be fitted. Previously the model of
            # the preceding target was reused here, which scored a different
            # target (and raised NameError if it happened on the first
            # column). Use a constant Laplace-smoothed prior instead:
            # (0 positives + 1) / (n_train + 2).
            model_dict[category] = _ConstantProbabilityModel(1.0 / (Y_train.shape[0] + 2))
        else:
            logistic_model = LogisticRegression(C=0.1, max_iter=1000, class_weight={0: 0.4, 1: 0.6})
            logistic_model.fit(X_train, Y_train[:, i])
            # saving model
            model_dict[category] = logistic_model

    print('PREDICT')
    Y_pred = np.zeros(Y_val.shape)
    for i, category in enumerate(tqdm(all_categories)):
        Y_pred[:, i] = model_dict[category].predict_proba(X_val)[:, 1]

    print('VALIDATE')
    cur_loss = average_log_loss(Y_val, Y_pred)
    print('Log_loss', j, cur_loss)
    if cur_loss < best_loss:
        best_model = model_dict
        best_loss = cur_loss

print('Best loss is:', best_loss)

Predict on the test set and write the submission file

In [None]:
all_categories = list(train_targets_scored.columns)

print('PREDICT RESULT')
# Allocate an explicit float buffer instead of reusing the sample submission's
# values: if those were read with an integer dtype, writing probabilities into
# them would truncate every prediction.
Y_res = np.zeros((test_np.shape[0], len(all_categories)), dtype=float)
for i, category in enumerate(tqdm(all_categories)):
    Y_res[:, i] = best_model[category].predict_proba(test_np)[:, 1]

# POSTPROCESS
# Zero every prediction for rows whose cp_type is encoded as 1 (ctl_vehicle in
# preprocess). The named column is used rather than a positional index into
# test_np because cp_time and cp_dose also use the value 1 in their encodings.
is_control = test_features['cp_type'].to_numpy() == 1
Y_res[is_control, :] = 0.0

s_res = pd.DataFrame(Y_res, columns=all_categories)
s_res = pd.concat([s_submission['sig_id'], s_res], axis=1)
s_res.to_csv('submission.csv', index=False)