# PCA + Logistic Regression

In this notebook I present a simple treatment of the data, using only PCA and logistic regression. Let's import the necessary packages.

In [None]:
import numpy as np
import pandas as pd
import sys
import time
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import pdb

Now let's load the data.

In [None]:
# Folder that holds the competition CSV files.
dir_data = '/kaggle/input/lish-moa/'


def _read_indexed(file_name):
    """Load one CSV from `dir_data`, using its first column as the row index."""
    return pd.read_csv(dir_data + file_name, index_col=0)


# Training features and the two target tables (scored / non-scored).
df_train_orig = _read_indexed('train_features.csv')
df_target_orig = _read_indexed('train_targets_scored.csv')
df_target_nonscored_orig = _read_indexed('train_targets_nonscored.csv')
# Submission template and test features.
df_submiss = _read_indexed('sample_submission.csv')
df_test_orig = _read_indexed('test_features.csv')

In [None]:
# Preview the first rows of the training features.
df_train_orig.head()

Two columns ('cp_type' and 'cp_dose') are categorical. The 'cp_type' column has just two values: 'trt_cp' indicates a sample treated with a compound, while 'ctl_vehicle' indicates a control sample, which has no MoAs. Let's check it.

In [None]:
# List the distinct values found in the 'cp_type' column.
df_train_orig['cp_type'].unique()

In [None]:
# Index labels of the training rows whose 'cp_type' is 'ctl_vehicle'.
idx_vehicle = df_train_orig.loc[df_train_orig['cp_type'].eq('ctl_vehicle')].index
# Column-wise sum of the scored targets over those rows.
df_target_orig.loc[idx_vehicle].sum()

In [None]:
# Grand total of the scored targets over the 'ctl_vehicle' rows
# (sum per column first, then over all columns).
df_target_orig.loc[idx_vehicle].sum().sum()

This will be useful later, since we know that any sample with 'cp_type' == 'ctl_vehicle' has target zero for every MoA.
MoAs represent the particular actions of the drugs under analysis. Let's see how many positive examples (MoA = 1) there are for each MoA.

In [None]:
# Sum each target column over all rows and show the totals in ascending order.
df_target_orig.sum(axis=0).sort_values()

For 'nfkb_inhibitor' there are 832 examples, while for 'atp-sensitive_potassium_channel_antagonist' and 'erbb2_inhibitor' there is just one each. This can cause trouble later, so we add one dummy example for each of them. Besides, the training set is unbalanced.

In [None]:
# Per-target totals, sorted ascending so the bars grow from left to right.
ds_to_plot = df_target_orig.sum(axis=0).sort_values()
moa_labels = ds_to_plot.index.tolist()
bar_positions = range(len(ds_to_plot))

plt.figure(figsize=(16,8))
plt.bar(x=bar_positions, height=ds_to_plot, tick_label=moa_labels)
# Re-apply the same labels rotated and in a very small font.
plt.xticks(ticks=bar_positions, labels=moa_labels, rotation=90, fontsize=3)
plt.xlabel('MoA')
plt.ylabel('number of samples')

Let's duplicate the rows that hold the only example for the MoAs 'atp-sensitive_potassium_channel_antagonist' and 'erbb2_inhibitor' (otherwise MultilabelStratifiedShuffleSplit will complain about having just one example).

In [None]:
# Feature columns that receive a small offset in the duplicated rows; the
# first three columns are skipped (presumably the cp_* treatment descriptors
# -- verify against the CSV header).
cols_no_cp = df_train_orig.columns[3:]
# Target columns that have exactly one positive example.
cols_oneEx = df_target_orig.columns[df_target_orig.sum(axis=0) == 1].tolist()
# Rows that are positive for at least one of those columns. Built for any
# number of columns, so the cell no longer assumes there are exactly two and
# becomes a no-op (instead of raising IndexError) when it is re-run and no
# single-example column is left.
boole = df_target_orig[cols_oneEx].eq(1).any(axis=1)
ids_oneEx = df_target_orig.loc[boole].index.tolist()
ids_oneEx_dummy = [i + '_dum' for i in ids_oneEx]
for id_orig, id_dummy in zip(ids_oneEx, ids_oneEx_dummy):
    # Append a copy of the feature row and of the target row under the new id.
    df_train_orig.loc[id_dummy] = df_train_orig.loc[id_orig]
    df_target_orig.loc[id_dummy] = df_target_orig.loc[id_orig]
    # Shift the copied features by 0.1 so the dummy is not an exact duplicate.
    df_train_orig.loc[id_dummy, cols_no_cp] = df_train_orig.loc[id_orig, cols_no_cp] + 0.1

Apply one-hot encoding to the categorical columns of both the training and test sets.

In [None]:
# Remember which index labels belong to which set, so the combined frame can
# be split again afterwards.
idx_train = df_train_orig.index
idx_test = df_test_orig.index
# Encode train and test together so that both end up with the same dummy columns.
df_all = pd.get_dummies(pd.concat([df_train_orig, df_test_orig]))
# Back to two separate frames, selected by label.
df_train, df_test = df_all.loc[idx_train], df_all.loc[idx_test]


## Short discussion on the number of MoAs for each example
Let's check if every sample has one or more MoAs

In [None]:
# Distinct values of the per-row target sum (number of MoAs per sample), ascending.
df_target_orig.sum(axis=1).sort_values().unique()

So there can be up to 7 MoAs for one sample. Let's see how many samples there are for each number of MoAs.

In [None]:
# How many rows have each per-row target sum.
df_target_orig.sum(axis=1).value_counts()

We see that only 6 examples have 7 MoAs, while 9367 samples have 0 MoAs. Are the latter all 'ctl_vehicle'?

In [None]:
# Size of idx_vehicle, i.e. the number of training rows with 'cp_type' == 'ctl_vehicle'.
print('numer of ctl_vehicles= ', len(idx_vehicle))

Therefore, there are 9367-1866 examples that have no MoAs but are not control samples. How can that be?
Actually, we found that some of these samples have MoAs in the file 'train_targets_nonscored.csv'. Let's put the two together and drop the 'idx_vehicle' rows.

In [None]:
# Scored and non-scored targets side by side (joined on the index), with the
# 'ctl_vehicle' rows removed.
df_target_whole = (
    df_target_orig
    .join(df_target_nonscored_orig, how='left')
    .drop(idx_vehicle)
)
# How many of the remaining rows have each per-row target sum.
df_target_whole.sum(axis=1).value_counts()


We actually see that 3664 samples with 'cp_type' == 'trt_cp' have no MoAs at all. This suggests that the data are not complete, i.e. these are examples whose MoAs are not reported in the data.

However, we are not using the non-scored data in our analysis.

## Preparation of the model

Let's apply standard scaling, PCA (keeping 90% of the variance) and logistic regression, in that order, putting everything together in a pipeline. We wrap the logistic regression in a MultiOutputClassifier so that one classifier is fitted per MoA.

In [None]:
# Steps are declared in the order the pipeline runs them: scale, reduce, classify.
scaler = StandardScaler()
pca = PCA(n_components=0.9)
# C=0.008 appears to be the best after some tests
logistic = LogisticRegression(C=0.008, tol=0.1, max_iter=10000)
pipe = Pipeline(
    steps=[
        ('scaler', scaler),
        ('pca', pca),
        # The same logistic regression is wrapped for multi-output use; n_jobs=-1 as in the original setup.
        ('moc', MultiOutputClassifier(logistic, n_jobs=-1)),
    ]
)

We split the samples using multilabel stratification, so that each split approximately keeps the proportion of positive examples of every MoA.

In [None]:
# Five shuffle splits with a 30% hold-out each; the fixed random_state keeps
# the splits the same from run to run.
sss = MultilabelStratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
X = df_train
# Rows of X and y are later selected by position (iloc), so order the targets
# by X's index labels instead of trusting that both tables share the same row
# order. Raises KeyError if a training id has no target row.
y = df_target_orig.loc[X.index]

Now some useful functions to compute the log loss and the predictions.

In [None]:
def log_loss_metric(y_true, y_pred):
    """Return the mean of the column-wise binary log losses.

    Both arguments are indexed as 2-D arrays (``arr[:, i]``), one column per
    target. Each column is cast to float and scored with ``log_loss`` using
    ``labels=[0, 1]``, so a column that contains a single class is still
    accepted.
    """
    n_targets = y_true.shape[1]
    per_target = [
        log_loss(y_true[:, col].astype(float), y_pred[:, col].astype(float), labels = [0,1])
        for col in range(n_targets)
    ]
    return np.mean(per_target)

###############################################
def compute_logloss_0(clf, X, y, string):
    """Predict probabilities for X, print their mean log loss against y, return them.

    ``clf.predict_proba(X)`` is iterated as one 2-D array per target, and
    column 1 of each array is kept (presumably the probability of class 1 --
    this assumes the classes are ordered [0, 1]). ``y`` must expose ``.shape``
    and ``.values``; ``string`` is the prefix of the printed message.

    Returns an array of shape ``y.shape`` holding the kept probabilities.
    """
    per_target_proba = clf.predict_proba(X)

    y_pred = np.zeros(y.shape)
    for col, class_proba in enumerate(per_target_proba):
        y_pred[:, col] = class_proba[:, 1]

    print(string + ' log loss ', log_loss_metric(y.values, y_pred))

    return y_pred
###############################################
def predict_test(clf, X, y):
    """Return the column-1 probabilities predicted by ``clf`` for X.

    ``clf.predict_proba(X)`` is iterated as one 2-D array per target, and
    column 1 of each array is kept (presumably the probability of class 1 --
    this assumes the classes are ordered [0, 1]). ``y`` is used only for its
    ``.shape``, which sizes the returned array.
    """
    per_target_proba = clf.predict_proba(X)

    y_pred = np.zeros(y.shape)
    for col, class_proba in enumerate(per_target_proba):
        y_pred[:, col] = class_proba[:, 1]

    return y_pred


In [None]:
# One array of test-set predictions per split; they are collected here.
y_pred_list = []
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    # Positional selection of this split's rows.
    X_train_sss, y_train_sss = X.iloc[train_index], y.iloc[train_index]
    X_test_sss, y_test_sss = X.iloc[test_index], y.iloc[test_index]

    # Refit the whole pipeline on this split's training part.
    pipe.fit(X_train_sss, y_train_sss)
    print('*** split n =', i, ' ***')
    # Report the log loss on the fitted part and on the held-out part.
    y_pred_train_sss = compute_logloss_0(pipe, X_train_sss, y_train_sss, 'X_train_sss ')
    y_pred_test_sss = compute_logloss_0(pipe, X_test_sss, y_test_sss, 'X_test_sss ')
    # Predict the real test set; df_submiss only provides the output shape.
    y_pred = predict_test(pipe, df_test, df_submiss)
    y_pred_list.append(y_pred)
    

## Submission

In [None]:
# Element-wise average of the per-split test predictions.
y_pred_mean = np.mean(np.array(y_pred_list), axis=0)
# Wrap it in a frame with the submission template's columns and the test ids as index.
df_sub = pd.DataFrame(y_pred_mean, index=idx_test, columns=df_submiss.columns)
# Rows whose 'cp_type' is 'ctl_vehicle' get 0.0 for every target.
idx_test_vehicle = df_test_orig.loc[df_test_orig['cp_type'].eq('ctl_vehicle')].index
df_sub.loc[idx_test_vehicle] = 0.0
# Turn the index back into a regular column so it is written as data.
df_sub = df_sub.reset_index()
df_sub.to_csv('submission.csv', index = False)