In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

from joblib import dump, load
import pickle

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import log_loss

from datetime import date

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from xgboost import XGBClassifier

from category_encoders import CountEncoder


In [None]:
# settings
# Directory holding the competition CSV files. The trailing slash is required
# because file paths are built by plain string concatenation.
data_folder = '../input/lish-moa/'
# Prefix for every file this notebook writes ('' = current working directory).
output_folder = ''

# Random seed for stochastic steps.
xseed = 42

# Number of cross-validation folds; also the divisor used when averaging the
# per-fold test predictions.
nfolds = 5



# XGB

In [None]:
# Tag embedded in the names of the prediction files written by this cell.
model_name = 'xgb'

## Data
# Read the train features, test features and scored training targets, in that order.
xtrain, xtest, ytrain = (
    pd.read_csv(f'{data_folder}{csv_name}')
    for csv_name in ('train_features.csv', 'test_features.csv', 'train_targets_scored.csv')
)

## FE

# summary statistics per row per group of columns
gcols = [f for f in xtrain.columns if 'g-' in f]
ccols = [f for f in xtrain.columns if 'c-' in f]

# The same row-wise aggregates are added to train and test, in the same order,
# so both frames end up with an identical feature layout.
for frame in (xtrain, xtest):
    frame['g_min'] = frame[gcols].min(axis = 1)
    frame['g_max'] = frame[gcols].max(axis = 1)
    frame['g_mean'] = frame[gcols].mean(axis = 1)
    frame['g_sd'] = frame[gcols].std(axis = 1)

    frame['c_min'] = frame[ccols].min(axis = 1)
    frame['c_max'] = frame[ccols].max(axis = 1)
    frame['c_mean'] = frame[ccols].mean(axis = 1)
    frame['c_sd'] = frame[ccols].std(axis = 1)
    # median, not std: using std here would just duplicate c_sd
    frame['c_median'] = frame[ccols].median(axis = 1)

# categorical cols

enc = LabelEncoder()
category_cols = ['cp_dose', 'cp_type']
print(category_cols)

for cols in category_cols:
    # Integer-code the category: fit on train, reuse the same mapping for test.
    # LabelEncoder numbers the classes in sorted order.
    xtrain[cols] = enc.fit_transform(xtrain[cols])
    xtest[cols] = enc.transform(xtest[cols])

    # Count encoding: how often each code occurs over train and test stacked
    # together. The frames must be concatenated; `xtrain[cols] + xtest[cols]`
    # would add the codes element-wise by index instead of pooling the rows.
    level_counts = pd.concat([xtrain[cols], xtest[cols]], ignore_index = True).value_counts()
    xtrain[cols+'_cnt'] = xtrain[cols].map(level_counts)
    xtest[cols+'_cnt'] = xtest[cols].map(level_counts)
    
    
## Model

# prepare split
kf = KFold(n_splits = nfolds)

# separation: take the row identifiers out of the frames so that only model
# inputs / targets remain; the ids are kept for labelling the outputs later.
id_train = xtrain.pop('sig_id')
id_test = xtest.pop('sig_id')
del ytrain['sig_id']

# Containers for out-of-fold predictions (one row per training sample) and
# fold-averaged test predictions (one row per test sample), one column per target.
n_targets = ytrain.shape[1]
prval = np.zeros((ytrain.shape[0], n_targets))
prfull = np.zeros((xtest.shape[0], n_targets))

# classifier: one XGBoost model per target column, wrapped in a single-step pipeline
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist'))
clf = Pipeline(steps = [('classify', classifier)])

# Hyper-parameters of the per-target XGBoost estimator.
xgb_settings = {
    'colsample_bytree': 0.6522,
    'gamma': 3.6975,
    'learning_rate': 0.0503,
    'max_delta_step': 2.0706,
    'max_depth': 10,
    'min_child_weight': 31.5800,
    'n_estimators': 166,
    'subsample': 0.8639,
}

# Prefix each name so it addresses step 'classify' -> its wrapped estimator.
params = {f'classify__estimator__{name}': value for name, value in xgb_settings.items()}

_ = clf.set_params(**params)

## Fitting
# For each fold: train on the fold's treated rows, predict the held-out rows
# (stored in prval) and the full test set (averaged over folds into prfull).
for id0, id1 in kf.split(xtrain):

    # KFold yields positional row numbers, so select rows by position.
    x0, x1 = xtrain.iloc[id0], xtrain.iloc[id1]
    y0 = ytrain.iloc[id0]

    # drop where cp_type==ctl_vehicle (baseline) from the training part only;
    # after the label encoding applied earlier in this cell that level is code 0.
    treated_rows = np.where(x0['cp_type'] != 0)[0]
    x0 = x0.iloc[treated_rows]
    y0 = y0.iloc[treated_rows]

    # fit model -- through the pipeline, which is also what predicts below
    clf.fit(x0, y0)

    # generate predictions: predict_proba returns one array per target;
    # keep the positive-class column and arrange as (rows, targets)
    vpreds = np.array(clf.predict_proba(x1))[:, :, 1].T
    fpreds = np.array(clf.predict_proba(xtest))[:, :, 1].T

    # normalize the probabilities: shift every target column so that its mean
    # equals that target's positive rate in the training rows of this fold.
    # The shift can push values below 0 or above 1, so clip back to [0, 1].
    train_rate = y0.mean(axis = 0).to_numpy()
    vpreds = np.clip(vpreds - vpreds.mean(axis = 0) + train_rate, 0, 1)
    fpreds = np.clip(fpreds - fpreds.mean(axis = 0) + train_rate, 0, 1)

    prval[id1, :] = vpreds
    prfull += fpreds / nfolds


## prep files
# Output layout: sig_id first, followed by the target columns.
xcols = ['sig_id'] + list(ytrain.columns)

# Turn the raw prediction arrays into labelled frames and attach the row ids.
prval = pd.DataFrame(prval, columns = ytrain.columns).assign(sig_id = id_train)[xcols]
prfull = pd.DataFrame(prfull, columns = ytrain.columns).assign(sig_id = id_test)[xcols]

# Day+month stamp used in the output file names.
todate = date.today().strftime("%d%m")
print(todate)

for frame, prefix in ((prval, 'prval_'), (prfull, 'prfull_')):
    frame.to_csv(f'{output_folder}{prefix}{model_name}_{todate}.csv', index = False)

# Keep independent copies for the ensemble cell; prval / prfull are reused later.
prval_xgb, prfull_xgb = prval.copy(), prfull.copy()


# LR

In [None]:
# Tag embedded in the names of the prediction files written by this cell.
model_name = 'lr'

## Data

# Re-read the raw files so this cell starts from untouched frames:
# train features, test features and scored training targets, in that order.
xtrain, xtest, ytrain = (
    pd.read_csv(f'{data_folder}{csv_name}')
    for csv_name in ('train_features.csv', 'test_features.csv', 'train_targets_scored.csv')
)

## FE
# Replace the categorical columns cp_time, cp_dose and cp_type with 0/1
# indicator columns; each source column is removed once it has been encoded.
print(set(xtrain['cp_time']), set(xtest['cp_time']) )

# cp_time -> indicators for 24 and 48
for frame in (xtrain, xtest):
    hours = frame.pop('cp_time')
    frame['cp_time_24'] = hours.eq(24).astype('int64')
    frame['cp_time_48'] = hours.eq(48).astype('int64')

# cp_dose -> indicator for 'D1'
print(set(xtrain['cp_dose']), set(xtest['cp_dose']) )
for frame in (xtrain, xtest):
    frame['cp_dose_D1'] = frame.pop('cp_dose').eq('D1').astype('int64')

# cp_type -> indicator for 'ctl_vehicle'
for frame in (xtrain, xtest):
    frame['cp_type_control'] = frame.pop('cp_type').eq('ctl_vehicle').astype('int64')

# summary statistics per row per group of columns
gcols = [f for f in xtrain.columns if 'g-' in f]
ccols = [f for f in xtrain.columns if 'c-' in f]

# The same row-wise aggregates are added to train and test, in the same order,
# so both frames end up with an identical feature layout.
for frame in (xtrain, xtest):
    frame['g_min'] = frame[gcols].min(axis = 1)
    frame['g_max'] = frame[gcols].max(axis = 1)
    frame['g_mean'] = frame[gcols].mean(axis = 1)
    frame['g_sd'] = frame[gcols].std(axis = 1)

    frame['c_min'] = frame[ccols].min(axis = 1)
    frame['c_max'] = frame[ccols].max(axis = 1)
    frame['c_mean'] = frame[ccols].mean(axis = 1)
    frame['c_sd'] = frame[ccols].std(axis = 1)
    # median, not std: using std here would just duplicate c_sd
    frame['c_median'] = frame[ccols].median(axis = 1)

## Model

kf = KFold(n_splits = nfolds)

# separation: take the row identifiers out of the frames so that only model
# inputs / targets remain; the ids are kept for labelling the outputs later.
id_train = xtrain.pop('sig_id')
id_test = xtest.pop('sig_id')
del ytrain['sig_id']

# Containers for out-of-fold predictions (one row per training sample) and
# fold-averaged test predictions (one row per test sample), one column per target.
prval = np.zeros(ytrain.shape)
prfull = np.zeros((xtest.shape[0], ytrain.shape[1]))

#  model definition
# Seed the PCA: depending on the solver scikit-learn selects, the decomposition
# may be randomized, and an unseeded run would then not be reproducible.
pca = PCA(n_components = 350, random_state = xseed)
# a large tolerance is used to make the fits faster
logistic = LogisticRegression(max_iter=10000, tol=0.1, C = 0.25)
base_model = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# One PCA + logistic regression pipeline per target column, fitted in parallel.
mo_base = MultiOutputClassifier(base_model, n_jobs=-1)

# fitting
# For each fold: train on the fold's training rows, predict the held-out rows
# (stored in prval) and the full test set (averaged over folds into prfull).
for id0, id1 in kf.split(xtrain):

    # KFold yields positional row numbers, so select rows by position.
    x0, x1 = xtrain.iloc[id0], xtrain.iloc[id1]
    # explicit copy: the workaround below writes into y0 and must not touch ytrain
    y0 = ytrain.iloc[id0].to_numpy(copy = True)

    # workaround for targets with no positive example in this training fold:
    # LogisticRegression cannot be fitted on a single class, so the first
    # training row is flagged positive for every such target.
    empty_cols = np.flatnonzero(y0.sum(axis = 0) == 0)
    if len(empty_cols):
        y0[0, empty_cols] = 1

    # fit model
    mo_base.fit(x0, y0)

    # generate the prediction: predict_proba returns one (rows, 2) array per
    # target; keep the positive-class column of each and stack them side by side
    vpred = mo_base.predict_proba(x1)
    fpred = mo_base.predict_proba(xtest)

    prval[id1, :] = np.column_stack([p[:, 1] for p in vpred])
    prfull += np.column_stack([p[:, 1] for p in fpred]) / nfolds

# prep files
# Output layout: sig_id first, followed by the target columns.
xcols = ['sig_id'] + list(ytrain.columns)

# Turn the raw prediction arrays into labelled frames and attach the row ids.
prval = pd.DataFrame(prval, columns = ytrain.columns).assign(sig_id = id_train)[xcols]
prfull = pd.DataFrame(prfull, columns = ytrain.columns).assign(sig_id = id_test)[xcols]

# Day+month stamp used in the output file names.
todate = date.today().strftime("%d%m")
print(todate)

for frame, prefix in ((prval, 'prval_'), (prfull, 'prfull_')):
    frame.to_csv(f'{output_folder}{prefix}{model_name}_{todate}.csv', index = False)

# Keep independent copies for the ensemble cell; prval / prfull are overwritten there.
prval_lr, prfull_lr = prval.copy(), prfull.copy()


# Ensemble

In [None]:
# Blend weight of the XGB predictions; the LR predictions get the remainder,
# so the two weights sum to 1 and the blend stays a valid probability.
alpha = 0.9

# Per-target out-of-fold log loss for: XGB alone, LR alone, and the blend.
metrics1 = []
metrics2 = []
metrics3 = []
metrics_combo = []

for _target in ytrain.columns:
    # labels=[0, 1] keeps log_loss defined even if a target column holds one class only
    metrics1.append(log_loss(ytrain.loc[:, _target], prval_xgb.loc[:, _target], labels = [0, 1]))
    metrics2.append(log_loss(ytrain.loc[:, _target], prval_lr.loc[:, _target], labels = [0, 1]))
    prcombo = alpha * prval_xgb.loc[:, _target] + (1 - alpha) * prval_lr.loc[:, _target]
    metrics3.append(log_loss(ytrain.loc[:, _target], prcombo, labels = [0, 1]))
    # prval / prfull are overwritten with the blended values (sig_id is left as is)
    prval.loc[:, _target] = prcombo
    prfull.loc[:, _target] = alpha * prfull_xgb.loc[:, _target] + (1 - alpha) * prfull_lr.loc[:, _target]

# Printed in the order: XGB, LR, blend.
print(f'OOF Metric: {np.round(np.mean(metrics1),4)}')
print(f'OOF Metric: {np.round(np.mean(metrics2),4)}')
print(f'OOF Metric: {np.round(np.mean(metrics3),4)}')



In [None]:
# Tag embedded in the names of the blended prediction files.
model_name = 'ens'
todate = date.today().strftime("%d%m")
print(todate)

# Dated copies of the out-of-fold and test predictions, then the test
# predictions once more under the fixed name 'submission.csv'.
stamp = f'{model_name}_{todate}.csv'
prval.to_csv(f'{output_folder}prval_{stamp}', index = False)
prfull.to_csv(f'{output_folder}prfull_{stamp}', index = False)
prfull.to_csv(f'{output_folder}submission.csv', index = False)
