# Testing model load

In [1]:
%%writefile inference.py

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputClassifier

import os
import warnings
warnings.filterwarnings('ignore')
import datetime
from time import time

Writing inference.py


## Load train, test and submission data

In [2]:
%%writefile -a inference.py

# Read the four competition CSVs in a fixed order and bind each resulting
# frame to its name: train features, train targets, test features, and the
# sample submission.
train_features, train_targets, test_features, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
        '../input/lish-moa/sample_submission.csv',
    )
)

Appending to inference.py


## Preprocess data

In [3]:
%%writefile -a inference.py

def preprocess_features(df):
    """Return an encoded copy of a features frame; the input is left untouched.

    ``cp_dose`` is mapped D1 -> 0, D2 -> 1 and ``cp_time`` is mapped
    24 -> 0, 48 -> 1, 72 -> 2. Values outside these mappings become NaN
    (standard ``Series.map`` behaviour). The ``sig_id`` column is dropped;
    all other columns are passed through unchanged.
    """
    dose_codes = {'D1': 0, 'D2': 1}
    time_codes = {24: 0, 48: 1, 72: 2}
    encoded = df.assign(
        cp_dose=df['cp_dose'].map(dose_codes),
        cp_time=df['cp_time'].map(time_codes),
    )
    return encoded.drop(columns=['sig_id'])

# Training features: encode once. train_features2 keeps cp_type so control
# rows can be identified when the model is evaluated later; train_features
# is the same data without cp_type.
train_features2 = preprocess_features(train_features)
train_features = train_features2.drop(columns=['cp_type'])

# Test features: same encoding, cp_type removed.
test_features = preprocess_features(test_features).drop(columns=['cp_type'])

# Training targets: remove the id column.
train_targets = train_targets.drop(columns=['sig_id'])

# Plain numpy arrays for prediction and scoring.
X_train_features = train_features.to_numpy()
y_train_targets = train_targets.to_numpy()
X_test = test_features.to_numpy()

Appending to inference.py


## Load model

In [4]:
%%writefile -a inference.py

import joblib

# Location of the serialized model file.
MODEL_PATH = '../input/xgboost-baseline-saved-model/xgboost_final_model.sav'

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only point this at model files you trust.
loaded_model = joblib.load(MODEL_PATH)

Appending to inference.py


## Test model prediction on training data

In [5]:
%%writefile -a inference.py

SEED = 42
NFOLDS = 5
np.random.seed(SEED)

# NOTE(review): loaded_model is used as-is and is never refit inside the loop.
# If it was trained on these same rows, the per-fold numbers below are
# in-sample scores rather than true out-of-fold scores; confirm how the
# saved model was produced before relying on them.
oof_preds = np.zeros(y_train_targets.shape)
# Placeholder with the (test rows, targets) shape; the test-prediction step
# later in the script reassigns this name.
test_preds = np.zeros((test_features.shape[0], y_train_targets.shape[1]))
oof_losses = []
kf = KFold(n_splits=NFOLDS)  # no shuffling requested

# Only the validation indices are needed: nothing is trained here, so the
# training slices are not materialised (they would be large unused copies).
for fn, (_, val_idx) in enumerate(kf.split(X_train_features, y_train_targets)):
    print('Starting fold: ', fn)
    X_val, y_val = X_train_features[val_idx], y_train_targets[val_idx]

    # predict_proba is expected to return a list with one (rows, classes)
    # array per target; stack them, keep the probability of class 1, and
    # transpose so the result lines up with y_val as (rows, targets).
    val_preds = loaded_model.predict_proba(X_val)
    val_preds = np.array(val_preds)[:, :, 1].T
    oof_preds[val_idx] = val_preds

    # labels=[0, 1] keeps the metric defined even if the flattened targets of
    # a fold happen to contain a single class.
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds), labels=[0, 1])
    oof_losses.append(loss)

print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))

Appending to inference.py


## Predict on test data

In [6]:
%%writefile -a inference.py

# Score the test rows: stack the per-target outputs of predict_proba,
# select the class-1 probability, and orient the result as (rows, targets).
per_target_proba = np.array(loaded_model.predict_proba(X_test))
test_preds = np.transpose(per_target_proba[:, :, 1])

Appending to inference.py


## OOF log loss

In [7]:
%%writefile -a inference.py

# Control-vehicle rows get a prediction of 0 for every target before scoring.
control_mask = train_features2['cp_type'].eq('ctl_vehicle')
oof_preds[control_mask] = 0

# Log loss over all (row, target) pairs, flattened into one long vector.
overall_loss = log_loss(y_train_targets.ravel(), oof_preds.ravel())
print('OOF log loss: ', overall_loss)

Appending to inference.py


## Submission file

In [8]:
%%writefile -a inference.py

# Control-vehicle samples are zeroed when the training predictions are scored,
# so apply the same rule to the submitted predictions. cp_type was dropped from
# test_features during preprocessing, so re-read just that column; rows come
# back in file order, which is the order test_preds was computed in.
test_cp_type = pd.read_csv('../input/lish-moa/test_features.csv', usecols=['cp_type'])
is_control = (test_cp_type['cp_type'] == 'ctl_vehicle').to_numpy()
submission_preds = np.where(is_control[:, None], 0.0, test_preds)

# The first column of the sample submission is left as-is (assumed to be the
# id column); every remaining column receives the predictions positionally.
sub.iloc[:, 1:] = submission_preds
sub.to_csv('submission.csv', index=False)

Appending to inference.py


In [9]:
%%writefile -a inference.py

# Write the training-set prediction matrix to disk; by this point in the
# script the control-vehicle rows have already been set to 0.
np.save(file='xgb-oof.npy', arr=oof_preds)

Appending to inference.py


In [10]:
# Run the script assembled by the %%writefile cells above in a separate Python process.
! python inference.py

Starting fold:  0
Starting fold:  1
Starting fold:  2
Starting fold:  3
Starting fold:  4
[0.014979477369147594, 0.01504830126307787, 0.014895901999032702, 0.015053951496768293, 0.0171170575506215]
Mean OOF loss across folds 0.01541893793572959
STD OOF loss across folds 0.0008509863748471652
OOF log loss:  0.015179309323355594
[0m