In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os, time

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt

In [None]:
# Directory holding the competition CSV files.
data_path = "/kaggle/input/lish-moa/"

# Read the four CSVs in a fixed order: training features, training targets,
# test features, and the submission template.
train_X, train_Y, test_X, sample_submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        'train_features.csv',
        'train_targets_scored.csv',
        'test_features.csv',
        'sample_submission.csv',
    )
)

In [None]:
#Preprocessing data

In [None]:
# Columns that are one-hot encoded; every other feature column is left as-is.
categorical_cols = ['cp_time', 'cp_type', 'cp_dose']

# cp_time is cast to string so that pd.get_dummies treats it as a category
# instead of passing it through as a number.
train_X['cp_time'] = train_X['cp_time'].astype(str)
test_X['cp_time'] = test_X['cp_time'].astype(str)

# One-hot encode the categorical columns of each split.
train_dummies = pd.get_dummies(train_X[categorical_cols])
test_dummies = pd.get_dummies(test_X[categorical_cols])

# Force the test split onto exactly the training dummy columns (same names,
# same order). A category absent from test becomes an all-zero column and a
# category seen only in test is dropped, so both splits always share one
# feature layout.
test_dummies = (
    test_dummies
    .reindex(columns=train_dummies.columns, fill_value=0)
    .astype(train_dummies.dtypes.to_dict())
)

# Replace the original categorical columns with their one-hot encoding
# (the dummy columns end up last, in the order listed in categorical_cols).
train_X = train_X.drop(columns=categorical_cols).join(train_dummies)
test_X = test_X.drop(columns=categorical_cols).join(test_dummies)

# Keep the ids aside and remove them from the feature / target frames.
test_ids = test_X.pop('sig_id')
train_ids = train_X.pop('sig_id')
train_Y = train_Y.drop(columns=['sig_id'])

# Show a preview rather than the full frames.
print(train_X.head(), test_X.head())
print(train_X.columns, test_X.columns)
print(train_X.shape, train_Y.shape, test_X.shape)

In [None]:
#removing mean and scaling to unit variance

In [None]:
def plot_data(a):
    """Plot a 100-bin, density-normalised histogram of ``a`` as a line.

    Parameters
    ----------
    a : array_like
        Values to histogram (``np.histogram`` works on the flattened input).

    Returns
    -------
    None
        The figure is drawn with ``plt.plot`` and displayed with ``plt.show``.
    """
    # `normed` was removed from np.histogram (NumPy 1.24); `density=True` is
    # the supported way to get a histogram that integrates to 1.
    hist, bins = np.histogram(a, bins=100, density=True)
    # Plot each bin's density at the midpoint of its two edges.
    bin_centers = (bins[1:] + bins[:-1]) * 0.5
    plt.plot(bin_centers, hist)
    plt.show()


# Only the real-valued columns (names prefixed "g-" or "c-") are scaled;
# the one-hot categorical columns are left untouched.
g_cols = list(filter(lambda name: name.startswith('g-'), train_X.columns))
c_cols = list(filter(lambda name: name.startswith('c-'), train_X.columns))
transform_feature_list = [*g_cols, *c_cols]


def scale_and_PCA(pca_num_components, train, test, cols_to_transform, transformed_col_name, random_state=0):
    """Standardise the selected columns and reduce them with PCA.

    The scaler and the PCA are both fitted on the train and test rows stacked
    together, then the projected rows are split back into the two sets.

    Parameters
    ----------
    pca_num_components : int
        Number of principal components to keep.
    train, test : pandas.DataFrame
        Frames that both contain every column in ``cols_to_transform``.
    cols_to_transform : list of str
        Columns to scale and project.
    transformed_col_name : str
        Unused; kept only so existing call sites keep working.
    random_state : int or None, default 0
        Seed forwarded to ``PCA`` so repeated runs give the same components
        when scikit-learn picks a randomized solver. Pass ``None`` to leave
        the PCA unseeded.

    Returns
    -------
    (train_trans, test_trans) : tuple of numpy.ndarray
        Projected rows for train (first ``len(train)`` rows) and for test
        (the remaining rows), each with ``pca_num_components`` columns.
    """
    n = train.shape[0]

    # Stack train rows on top of test rows for the requested columns.
    data = pd.concat(
        [train[cols_to_transform], test[cols_to_transform]],
        axis=0,
        ignore_index=True,
    )

    # Remove the mean and scale to unit variance.
    scaled_data = StandardScaler().fit_transform(data)

    # Project onto the leading principal components (seeded for repeatability).
    pca = PCA(n_components=pca_num_components, random_state=random_state)
    pca_data = pca.fit_transform(scaled_data)

    # The first n rows came from train, the rest from test.
    return pca_data[:n, :], pca_data[n:, :]

# Scale + PCA the "g-" features, keeping the top 10 components.
train_X_g, test_X_g = scale_and_PCA(10, train_X, test_X, g_cols, 'g_pca')

# Scale + PCA the "c-" features, keeping the top 5 components.
train_X_c, test_X_c = scale_and_PCA(5, train_X, test_X, c_cols, 'c_pca')

# Put the "g-" and "c-" components side by side for each split.
data_train = np.concatenate((train_X_g, train_X_c), axis=1)
data_test = np.concatenate((test_X_g, test_X_c), axis=1)

# Stack the train rows on top of the test rows.
n = data_train.shape[0]
data = np.concatenate((data_train, data_test), axis=0)

# Pick the one-hot columns by name instead of by position, so the selection
# does not depend on them being exactly the last 7 columns of the frame.
cat_cols = [
    col for col in train_X.columns
    if col.startswith(('cp_time_', 'cp_type_', 'cp_dose_'))
]
missing_in_test = [col for col in cat_cols if col not in test_X.columns]
if missing_in_test:
    raise ValueError(
        "test_X is missing one-hot columns present in train_X: %s" % missing_in_test
    )
cat_data = np.concatenate(
    (train_X[cat_cols].to_numpy(), test_X[cat_cols].to_numpy()), axis=0
)

# Categorical columns go in front of the real-valued PCA columns.
transformed_data = np.concatenate((cat_data, data), axis=1)

# Separate the train / test rows again.
transformed_train_data = transformed_data[:n, :]
transformed_test_data = transformed_data[n:, :]

print(transformed_train_data.shape, transformed_test_data.shape)

In [None]:
#average loss function used for cross-validation

In [None]:
def average_log_loss(y_true, y_pred):
    """Return the log loss averaged over the output columns.

    Each column is scored independently with ``log_loss`` (labels fixed to
    ``[0, 1]`` so a column containing a single class can still be scored) and
    the per-column losses are averaged.

    Parameters
    ----------
    y_true : array_like, shape (num_samples, num_outputs)
        Ground-truth 0/1 labels.
    y_pred : array_like, shape (num_samples, num_outputs)
        Predicted probability of label 1 for each sample and output.

    Returns
    -------
    float
        Mean of the per-column log losses.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, differ in shape, or have no columns.
    """
    # Accept lists / DataFrames as well as ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must be 2-D with identical shapes, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    num_outputs = y_true.shape[1]
    if num_outputs == 0:
        raise ValueError("y_true has no output columns to score")

    column_losses = [
        log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
        for i in range(num_outputs)
    ]
    return sum(column_losses) / num_outputs

In [None]:
#Upsampling for those features which have low positive outputs

In [None]:
# Work with the targets as a plain NumPy array.
transformed_train_targets = train_Y.values

# Map each target column index to the list of row indices whose label is 1.
class_1_upsample = {
    col_idx: np.flatnonzero(transformed_train_targets[:, col_idx] == 1).tolist()
    for col_idx in range(transformed_train_targets.shape[1])
}

In [None]:
#logistic regression that uses multi-output classifier

In [None]:
def _make_lr_model(c):
    """Build an unfitted MultiOutputClassifier wrapping LogisticRegression with C=``c``."""
    return MultiOutputClassifier(
        LogisticRegression(C=c, class_weight={0: 0.4, 1: 0.6}), n_jobs=-1
    )


def _augment_training_rows(X_part, Y_part):
    """Return ``(X, Y)`` with rows appended so every target column has a positive.

    For each target column with no label 1 in ``Y_part``, all rows of the full
    training set labelled 1 for that column (looked up in ``class_1_upsample``)
    are appended. One synthetic row of all-ones features with all-ones labels
    is then appended as well.
    """
    # Target columns that have no positive label in this subset.
    missing_cols = np.flatnonzero(~(Y_part == 1).any(axis=0)).tolist()

    # Union of the positive row indices of those columns (each row added once).
    extra_rows = sorted({row for col in missing_cols for row in class_1_upsample[col]})
    if extra_rows:
        # NOTE(review): when called on a CV training fold these rows can only
        # come from the matching validation fold (the training fold has no
        # positives for the column), so the validation loss for that fold is
        # optimistic. Confirm this trade-off is acceptable.
        X_part = np.concatenate([X_part, transformed_train_data[extra_rows, :]], axis=0)
        Y_part = np.concatenate([Y_part, transformed_train_targets[extra_rows, :]], axis=0)

    # Synthetic all-ones row: gives every target column at least one label 1
    # (presumably so no per-target classifier is fitted on a single class).
    X_part = np.concatenate([X_part, np.ones((1, X_part.shape[1]))], axis=0)
    Y_part = np.concatenate(
        [Y_part, np.ones((1, Y_part.shape[1]), dtype=Y_part.dtype)], axis=0
    )
    return X_part, Y_part


best_model = None
best_loss = float('inf')
best_C = None

# 5-fold CV; the same splits are reused for every candidate C.
kf = KFold(n_splits=5)

for c in [1.0, 0.1, 0.01]:
    fold_losses = []
    for train_indices, val_indices in kf.split(transformed_train_data):
        start = time.time()
        X_train, Y_train = _augment_training_rows(
            transformed_train_data[train_indices, :],
            transformed_train_targets[train_indices, :],
        )
        X_val = transformed_train_data[val_indices, :]
        Y_val = transformed_train_targets[val_indices, :]

        # Fresh estimator per fold so no fitted state is shared between folds.
        LR_model = _make_lr_model(c)
        LR_model.fit(X_train, Y_train)
        stop = time.time()
        print("Time taken to fit LR for C=", c, ": ", stop-start, " seconds")

        # predict_proba gives one (n_samples, 2) array per target; take the
        # class-1 column and transpose to (n_samples, n_targets).
        probs_val = LR_model.predict_proba(X_val)
        preds = np.array(probs_val)[:, :, 1].T
        fold_losses.append(average_log_loss(Y_val, preds))

    # Compare candidates on their mean loss over all folds, not on one fold.
    cur_loss = float(np.mean(fold_losses))
    print("Mean CV loss for C=", c, ": ", cur_loss)

    if cur_loss < best_loss:
        best_loss = cur_loss
        best_C = c

# Refit the winning configuration on the full training set so the final
# model is not tied to whichever fold happened to be fitted last.
X_full, Y_full = _augment_training_rows(transformed_train_data, transformed_train_targets)
best_model = _make_lr_model(best_C)
best_model.fit(X_full, Y_full)

In [None]:
#Run the best model on the test-data and produce submission.csv

In [None]:
# Score the test features with the selected model.
print("best LR model has C=", best_C)
probs = best_model.predict_proba(transformed_test_data)

# Take index 1 of the last axis (probability of label 1) for every target,
# then transpose so rows are samples and columns are targets.
preds = np.array(probs)[:, :, 1].T

# Every column after the first one in the template receives the predictions.
target_columns = sample_submission.columns.to_list()[1:]
sample_submission[target_columns] = preds

# Write the submission file.
sample_submission.to_csv('submission.csv', index=False)