# Machine Learning

In [1]:
import pickle
import warnings

import hyperopt
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, rand, tpe
from hyperopt.pyll.base import scope
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import make_scorer, ndcg_score, roc_auc_score
from sklearn.model_selection import (
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split
)
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
train = pd.read_csv('./data/train_df.csv', index_col=0)

In [3]:
train.shape

(213451, 126)

In [4]:
le = LabelEncoder()
train['target'] = le.fit_transform(train['country_destination'])
lb = LabelBinarizer()
lb.fit(train['target'])

LabelBinarizer()

In [5]:
mapping = dict(zip(range(len(le.classes_)), le.classes_))
mapping

{0: 'AU',
 1: 'CA',
 2: 'DE',
 3: 'ES',
 4: 'FR',
 5: 'GB',
 6: 'IT',
 7: 'NDF',
 8: 'NL',
 9: 'PT',
 10: 'US',
 11: 'other'}

In [6]:
target = train['target']
feature = train.drop(['target', 'country_destination'], axis=1)

In [7]:
print(target.shape, feature.shape)

(213451,) (213451, 125)


### Base Models comparison

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    feature, target, train_size=.25, random_state=42)

In [9]:
clf_list = {'dummy': DummyClassifier(), 'lr': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=800),
            'rfc': RandomForestClassifier(), 'et': ExtraTreesClassifier(), 'gradient': GradientBoostingClassifier(),
            'lgb': LGBMClassifier(objective='multiclass', num_class=12)}

In [10]:
def model_train(name, reg):
    fit = reg.fit(X_train, y_train)
    pred = fit.predict(X_test)
    score = fit.predict_proba(X_test)
    ndcg = ndcg_score(lb.transform(y_test), score, k=5)
#     roc = roc_auc_score(y_test, score, multi_class='ovr')
    print('{} has ndcg score of {:.3f}'.format(name, ndcg))

In [15]:
for name, reg in clf_list.items():
    model_train(name, reg)

dummy has ndcg score of 0.530 and roc of 0.500
lr has ndcg score of 0.825 and roc of 0.656
rfc has ndcg score of 0.809 and roc of 0.558
et has ndcg score of 0.797 and roc of 0.552
gradient has ndcg score of 0.824 and roc of 0.649
lgb has ndcg score of 0.825 and roc of 0.632


### Cross Validation

In [11]:
def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    score : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score_c(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)

In [12]:
ndcg_scorer = make_scorer(ndcg_score_c, needs_proba=True, k=5)

In [13]:
for name, reg in clf_list.items():
    scores = cross_validate(reg, X_train, y_train, cv=4, scoring=
        ndcg_scorer, return_train_score=True)
    print('{} has average roc of {:.3f}'.format(
        name, np.mean(scores['train_score'])))

dummy has average roc of 0.646
lr has average roc of 0.827
rfc has average roc of 0.862
et has average roc of 0.862
gradient has average roc of 0.832
lgb has average roc of 0.838


**Tree based** models appear to work well for this classification problem. **Lightgbm** would also be considered.

### Tunning

In [11]:
space = {'max_depth': hp.quniform('max_depth', 10, 50, 4),
         'n_estimators': scope.int(hp.quniform('n_estimators', 200, 2000, 10)),
         'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 4)),
         'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 2, 10, 4)),
         'bootstrap': hp.choice('bootstrap', [True, False]),
#          'max_features': hp.choice('max_features', ['auto', 'sqrt'])
         }

In [15]:
def obj_rf(params):
    clf = RandomForestClassifier(**params)
    best_score = cross_val_score(
        clf, X_train, y_train, scoring=ndcg_scorer, cv=5).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [16]:
best_rf = fmin(fn=obj_rf, space=space, max_evals=40,
               rstate=np.random.RandomState(42), algo=tpe.suggest)

100%|████████████████████████████████████████████| 40/40 [3:45:44<00:00, 338.61s/trial, best loss: -0.8229951734812391]


In [17]:
best_rf

{'bootstrap': 1,
 'max_depth': 32.0,
 'max_features': 1,
 'min_samples_leaf': 4.0,
 'min_samples_split': 8.0,
 'n_estimators': 1520.0}

In [24]:
rf_tune_test = RandomForestClassifier(n_estimators=1520, min_samples_leaf=4,
                                  min_impurity_split=8, max_depth=32, bootstrap=True)

In [None]:
model_train('rf_tuned', rf_tune_test)

In [18]:
rf_tuned = RandomForestClassifier(n_estimators=1520, min_samples_leaf=4,
                                  min_impurity_split=8, max_depth=32, bootstrap=True).fit(feature, target)
pickle.dump(rf_tuned, open('./data/rf_tuned.sav', 'wb'))

In [14]:
def obj_et(params):
    clf = ExtraTreesClassifier(**params)
    best_score = cross_val_score(
        clf, X_train, y_train, scoring=ndcg_scorer, cv=5).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [17]:
best_et = fmin(fn=obj_et, space=space, max_evals=40,
               rstate=np.random.RandomState(42), algo=tpe.suggest)

100%|████████████████████████████████████████████| 40/40 [3:06:33<00:00, 279.83s/trial, best loss: -0.8231016965544086]


In [18]:
best_et

{'bootstrap': 1,
 'max_depth': 44.0,
 'max_features': 1,
 'min_samples_leaf': 4.0,
 'min_samples_split': 4.0,
 'n_estimators': 640.0}

In [20]:
et_tuned_test = ExtraTreesClassifier(n_estimators=640, max_depth=44,
                                min_samples_leaf=4, min_samples_split=4)

In [21]:
model_train('et_tuned', et_tuned_test)

et_tuned has ndcg score of 0.822


In [20]:
et_tuned = ExtraTreesClassifier(n_estimators=640, max_depth=44,
                                min_samples_leaf=4, min_samples_split=4).fit(feature, target)
pickle.dump(et_tuned, open('./data/et_tuned.sav', 'wb'))

In [13]:
space_lgb = {'max_depth': scope.int(hp.quniform('max_depth', 10, 50, 25)),
             'n_estimators': scope.int(hp.quniform('n_estimators', 200, 2000, 10)),
             #              'learning_rate': hp.choice('learning_rate', [0.05, 0.1, 0.3]),
             'min_child_samples': scope.int(hp.quniform('min_child_samples', 4, 200, 50)),
             'num_leaves': scope.int(hp.quniform('num_leaves', 20, 100, 20))
             }

In [14]:
def obj_lgb(params):
    clf = LGBMClassifier(**params)
    best_score = cross_val_score(
        clf, X_train, y_train, scoring=ndcg_scorer, cv=5).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [15]:
best_lgb = fmin(fn=obj_lgb, space=space_lgb, max_evals=40,
                rstate=np.random.RandomState(42), algo=tpe.suggest)

100%|████████████████████████████████████████████| 40/40 [4:48:50<00:00, 433.27s/trial, best loss: -0.8242607317366591]


In [16]:
best_lgb

{'max_depth': 0.0,
 'min_child_samples': 150.0,
 'n_estimators': 620.0,
 'num_leaves': 20.0}

In [17]:
lgb_tuned_test = LGBMClassifier(n_estimators=620, min_child_samples=150,
                           num_leaves=20, learning_rate=0.05)

In [18]:
model_train('lgb_tuned', lgb_tuned_test)

lgb_tuned has ndcg score of 0.824


In [19]:
lgb_tuned = LGBMClassifier(n_estimators=620, min_child_samples=150,
                           num_leaves=20, learning_rate=0.05).fit(feature, target)
pickle.dump(lgb_tuned, open('./data/lgb_tuned.sav', 'wb'))