In [1]:
%%writefile inference.py
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from joblib import dump, load
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from datetime import date
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Writing inference.py


In [2]:
%%writefile -a inference.py
# ---- run configuration -------------------------------------------------

# Directory holding the competition CSV files (trailing slash required,
# because file names are appended by plain string concatenation).
data_folder = '../input/lish-moa/'

# Tag identifying this model.
model_name = 'lr'

# Seed reserved for the stochastic parts of the run.
xseed = 43

# Number of cross-validation folds.
nfolds = 5

# Number of principal components kept by the PCA step.
nof_comp = 250

Appending to inference.py


In [3]:
%%writefile -a inference.py
# Read the three competition tables from `data_folder`, in this order:
# training features, test features, scored training targets.
_csv_names = ('train_features.csv', 'test_features.csv', 'train_targets_scored.csv')
xtrain, xtest, ytrain = (pd.read_csv(data_folder + _name) for _name in _csv_names)

Appending to inference.py


In [4]:
%%writefile -a inference.py
# The categorical columns have very few distinct values, so they are encoded
# by hand as 0/1 indicator columns instead of going through an encoder.
# Train and test receive exactly the same treatment.
for _frame in (xtrain, xtest):
    # cp_time: one indicator per listed duration; any other value is the
    # implicit baseline (presumably 72 - confirm against the data).
    for _hours in (24, 48):
        _frame[f'cp_time_{_hours}'] = (_frame['cp_time'] == _hours) + 0

    # cp_dose: a single indicator for dose 'D1'.
    _frame['cp_dose_D1'] = (_frame['cp_dose'] == 'D1') + 0

    # cp_type: a single indicator for control-vehicle rows.
    _frame['cp_type_control'] = (_frame['cp_type'] == 'ctl_vehicle') + 0

    # The raw categorical columns are no longer needed once encoded.
    _frame.drop(['cp_time', 'cp_dose', 'cp_type'], axis = 1, inplace = True)

Appending to inference.py


In [5]:
%%writefile -a inference.py
# prepare split
# The iterative-stratification package is attached as a dataset rather than
# installed, so its folder must be put on sys.path before importing from it.
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Seed the splitter so the folds (and therefore the OOF predictions) are
# reproducible between runs. random_state is only accepted together with
# shuffle=True by recent scikit-learn splitters, hence both are set.
kf = MultilabelStratifiedKFold(n_splits = nfolds, shuffle = True, random_state = xseed)

# separation: keep the row identifiers aside and remove them from the
# matrices passed to the model.
# NOTE(review): assumes ytrain rows are in the same order as xtrain rows -
# confirm against the data.
id_train = xtrain.pop('sig_id')
id_test = xtest.pop('sig_id')
ytrain.drop(columns = 'sig_id', inplace = True)

# storage matrices for OOF / test predictions
n_targets = ytrain.shape[1]
prval = np.zeros(ytrain.shape)
prfull = np.zeros((xtest.shape[0], n_targets))

Appending to inference.py


In [6]:
%%writefile -a inference.py
# base model definition through an sklearn Pipeline: PCA for dimensionality
# reduction followed by a logistic regression.
# random_state is set on PCA because, depending on the sklearn version and the
# data shape, its default solver may be a randomized SVD; seeding it keeps the
# projection (and every downstream prediction) reproducible.
pca = PCA(n_components = nof_comp, random_state = xseed)
logistic = LogisticRegression(max_iter=10000, tol=0.1, C = 0.5)
base_model = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# a pipeline can be fed into MultiOutputClassifier just like a regular
# estimator would; n_jobs=-1 fits the per-target models in parallel
mo_base = MultiOutputClassifier(base_model, n_jobs=-1)

Appending to inference.py


In [7]:
%%writefile -a inference.py
# Cross-validated training: for every fold, fit on the training part, store
# the out-of-fold (OOF) probabilities and add this fold's share of the
# averaged test prediction.
for id0, id1 in kf.split(xtrain, ytrain):

    # kf.split yields positional row indices, so rows are selected with .iloc
    # (the same positions index the prval array below).
    x0, x1 = xtrain.iloc[id0], xtrain.iloc[id1]
    # np.array makes a copy, so the label patch below never touches ytrain
    y0 = np.array(ytrain.iloc[id0])

    # Workaround for targets with no positive example in this training part:
    # LogisticRegression cannot be fitted on a single class, so one fake
    # positive label is planted on the first row of each such column.
    empty_cols = np.flatnonzero(y0.sum(axis = 0) == 0)
    if empty_cols.size:
        y0[0, empty_cols] = 1

    # fit model
    mo_base.fit(x0, y0)

    # generate the predictions: one (n_rows, 2) probability array per target
    vpred = mo_base.predict_proba(x1)
    fpred = mo_base.predict_proba(xtest)

    # column 1 of each array is the probability of the positive class
    for ii, (val_proba, test_proba) in enumerate(zip(vpred, fpred)):
        prval[id1, ii] = val_proba[:, 1]
        prfull[:, ii] += test_proba[:, 1] / nfolds

Appending to inference.py


In [8]:
%%writefile -a inference.py
# Turn the raw prediction matrices into labelled frames: one column per
# target (named as in ytrain) plus the matching row identifiers.
prval = pd.DataFrame(prval, columns = ytrain.columns).assign(sig_id = id_train)
prfull = pd.DataFrame(prfull, columns = ytrain.columns).assign(sig_id = id_test)

Appending to inference.py


In [9]:
%%writefile -a inference.py
# Out-of-fold score: log loss computed separately for every target column,
# then averaged over the targets.
metrics = [
    log_loss(ytrain[target_name], prval[target_name])
    for target_name in ytrain.columns
]
print(f'OOF Metric: {np.round(np.mean(metrics),4)}')

Appending to inference.py


In [10]:
%%writefile -a inference.py
# Put the identifier first, followed by the targets in their original order.
target_cols = list(ytrain.columns)
xcols = ['sig_id'] + target_cols
prval = prval[xcols]; prfull = prfull[xcols]

# Save the OOF predictions as a plain float matrix (targets only).
# Slicing `.values` of the whole frame would give an object array because of
# the string sig_id column; np.save would pickle it and np.load would then
# refuse to read it unless allow_pickle=True is passed.
np.save('log-reg-oof.npy', prval[target_cols].to_numpy(dtype = float))

# actual submission
prfull.to_csv('submission.csv', index = False)

Appending to inference.py


In [11]:
! python inference.py

OOF Metric: 0.0215
