# Machine Learning

In [5]:
import pickle
import warnings

import hyperopt
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, rand, tpe
from hyperopt.pyll.base import scope
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import ndcg_score, roc_auc_score
from sklearn.model_selection import (
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from tpot import TPOTClassifier
from xgboost.sklearn import XGBClassifier

warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
train = pd.read_csv('./data/train_df.csv', index_col=0)

In [7]:
train.shape

(213451, 126)

In [8]:
le = LabelEncoder()
train['target'] = le.fit_transform(train['country_destination'])
lb = LabelBinarizer()
lb.fit(train['target'])

LabelBinarizer()

In [9]:
mapping = dict(zip(range(len(le.classes_)), le.classes_))
mapping

{0: 'AU',
 1: 'CA',
 2: 'DE',
 3: 'ES',
 4: 'FR',
 5: 'GB',
 6: 'IT',
 7: 'NDF',
 8: 'NL',
 9: 'PT',
 10: 'US',
 11: 'other'}

In [10]:
target = train['target']
feature = train.drop(['target', 'country_destination'], axis=1)

In [11]:
print(target.shape, feature.shape)

(213451,) (213451, 125)


### Base Models comparison

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    feature, target, train_size=.25, random_state=42)

In [8]:
clf_list = {'dummy': DummyClassifier(), 'lr': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=800),
            'rfc': RandomForestClassifier(), 'et': ExtraTreesClassifier(), 'gradient': GradientBoostingClassifier(),
            'lgb': LGBMClassifier(objective='multiclass', num_class=12)}

In [9]:
def model_train(name, reg):
    fit = reg.fit(X_train, y_train)
    pred = fit.predict(X_test)
    score = fit.predict_proba(X_test)
    ndcg = ndcg_score(lb.transform(y_test), score, k=5)
    roc = roc_auc_score(y_test, score, multi_class='ovr')
    print('{} has ndcg score of {:.3f} and roc of {:.3f}'.format(name, ndcg, roc))

In [12]:
for name, reg in clf_list.items():
    model_train(name, reg)

dummy has ndcg score of 0.531 and roc of 0.500
lr has ndcg score of 0.825 and roc of 0.656
rfc has ndcg score of 0.809 and roc of 0.557
et has ndcg score of 0.797 and roc of 0.552
gradient has ndcg score of 0.824 and roc of 0.648
lgb has ndcg score of 0.825 and roc of 0.632


### Cross Validation

In [34]:
for name, reg in clf_list.items():
    scores = cross_validate(reg, feature, target, cv=4, scoring=(
        'roc_auc_ovr'), return_train_score=True)
    print('{} has average roc of {:.3f}'.format(
        name, np.mean(scores['train_score'])))

dummy has average roc of 0.500
lr has average roc of 0.687
rfc has average roc of 0.842
et has average roc of 0.845
gradient has average roc of 0.700
lgb has average roc of 0.754


**Tree based** models appear to work well for this classification problem. **Lightgbm** would also be considered.

### Tunning

In [13]:
number_generations = 10
population_size = 20
offspring_size = 50
scoring_function = 'roc_auc_ovr'

In [14]:
tpot_clf = TPOTClassifier(generations=number_generations, population_size=population_size,
                          offspring_size=offspring_size, scoring=scoring_function,
                          verbosity=2, random_state=2, cv=5)

In [None]:
tpot_clf.fit(X_train, y_train)
print(tpot_clf.score(X_test, y_test))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=520.0, style=ProgressStyle(de…

In [10]:
space = {'max_depth': hp.quniform('max_depth', 10, 50, 4),
         'n_estimators': scope.int(hp.quniform('n_estimators', 200, 2000, 10)),
         'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 4)),
         'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 2, 10, 4)),
         'bootstrap': hp.choice('bootstrap', [True, False]),
         'max_features': hp.choice('max_features', ['auto', 'sqrt'])
         }

In [11]:
def obj_rf(params):
    clf = RandomForestClassifier(**params)
    best_score = cross_val_score(
        clf, X_train, y_train, scoring='roc_auc_ovr', cv=2).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [14]:
best_rf = fmin(fn=obj_rf, space=space, max_evals=25,
               rstate=np.random.RandomState(42), algo=rand.suggest)

100%|██████████| 25/25 [37:51<00:00, 90.84s/trial, best loss: -0.636198424672282]  


In [15]:
best_rf

{'bootstrap': 0,
 'max_depth': 16.0,
 'max_features': 0,
 'min_samples_leaf': 8.0,
 'min_samples_split': 8.0,
 'n_estimators': 210.0}

In [25]:
rf_tuned = RandomForestClassifier(n_estimators=210, min_samples_leaf=8,
                                  min_impurity_split=8, max_depth=16, bootstrap=True).fit(feature, target)
pickle.dump(rf_tuned, open('./data/rf_tuned.sav', 'wb'))

In [89]:
model_train('rf_tuned', rf_tuned)

rf_tuned has ndcg score of 0.806 and roc of 0.500


In [24]:
def obj_et(params):
    clf = ExtraTreesClassifier(**params)
    best_score = cross_val_score(
        clf, X_train, y_train, scoring='roc_auc_ovr', cv=2).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [26]:
best_et = fmin(fn=obj_et, space=space, max_evals=25,
               rstate=np.random.RandomState(42), algo=tpe.suggest)

100%|██████████| 25/25 [24:33<00:00, 58.93s/trial, best loss: -0.636270277556644] 


In [29]:
et_tuned = ExtraTreesClassifier(n_estimators=1990, max_depth=24,
                                min_samples_leaf=8, min_samples_split=8).fit(feature, target)
pickle.dump(et_tuned, open('./data/et_tuned.sav', 'wb'))

In [18]:
space_lgb = {'max_depth': scope.int(hp.quniform('max_depth', 10, 50, 25)),
             'n_estimators': scope.int(hp.quniform('n_estimators', 200, 2000, 10)),
             'learning_rate': hp.choice('learning_rate', [0.05, 0.1, 0.3]),
             'min_child_samples': scope.int(hp.quniform('min_child_samples', 4, 200, 50)),
             'num_leaves': scope.int(hp.quniform('num_leaves', 20, 100, 20))
             }

In [19]:
def obj_lgb(params):
    clf = LGBMClassifier(**params)
    best_score = cross_val_score(
        clf, X_train, y_train, scoring='roc_auc_ovr', cv=2).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [22]:
best_lgb = fmin(fn=obj_lgb, space=space_lgb, max_evals=25,
                rstate=np.random.RandomState(42), algo=rand.suggest)

100%|██████████| 25/25 [1:12:39<00:00, 174.38s/trial, best loss: -0.612053920174981] 


In [23]:
best_lgb

{'learning_rate': 0,
 'max_depth': 25.0,
 'min_child_samples': 150.0,
 'n_estimators': 220.0,
 'num_leaves': 60.0}

In [24]:
lgb_tuned = LGBMClassifier(n_estimators=220, min_child_samples=150,
                           num_leaves=60, learning_rate=0.05, max_depth=25).fit(feature, target)
pickle.dump(lgb_tuned, open('./data/lgb_tuned.sav', 'wb'))