# Machine Learning

In [11]:
import warnings

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense
from keras.metrics import TopKCategoricalAccuracy
from keras.models import Sequential, load_model
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import ndcg_score, make_scorer
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
train = pd.read_csv('./data/train_df.csv', index_col=0)

In [3]:
le = LabelEncoder()
train['target'] = le.fit_transform(train['country_destination'])
lb = LabelBinarizer()
lb.fit(train['target'])

LabelBinarizer()

In [4]:
mapping = dict(zip(range(len(le.classes_)), le.classes_))
mapping

{0: 'AU',
 1: 'CA',
 2: 'DE',
 3: 'ES',
 4: 'FR',
 5: 'GB',
 6: 'IT',
 7: 'NDF',
 8: 'NL',
 9: 'PT',
 10: 'US',
 11: 'other'}

In [5]:
target = train['target']
feature = train.drop(['target', 'country_destination'], axis=1)

In [6]:
print(target.shape, feature.shape)

(213451,) (213451, 125)


### Baseline Model

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    feature, target, train_size=.25, random_state=42)

In [8]:
def model_train(name, reg):
    fit = reg.fit(X_train, y_train)
    score = fit.predict_proba(X_test)
    ndcg = ndcg_score(lb.transform(y_test), score, k=5)
    print('{} has ndcg score of {:.3f}'.format(name, ndcg))

In [9]:
def dcg_score(y_true, y_score, k=5):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score_c(ground_truth, predictions, k=5):
    
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)

    scores = []
    
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)

In [12]:
ndcg_scorer = make_scorer(ndcg_score_c, needs_proba=True, k=5)

In [13]:
def base_model():
    model = Sequential()
    model.add(Dense(240, input_dim=125, activation='relu'))
    model.add(Dense(60, activation='relu'))
    model.add(Dense(12, activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy', 'top_k_categorical_accuracy'])
    return model

In [15]:
estimator = KerasClassifier(
    build_fn=base_model, epochs=15, batch_size=128, verbose=0)
kfold = KFold(n_splits=4, shuffle=True)
results = cross_val_score(estimator, X_train, y_train, cv=kfold, scoring=ndcg_scorer)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [19]:
print('ndcg score: {:.4f}'.format(results.mean()))

ndcg score: 0.8246


In [20]:
estimator.fit(feature, lb.transform(target), verbose=False)

<keras.callbacks.callbacks.History at 0x179e5432a20>

In [21]:
estimator.model.save(filepath='./data/deep_learn.tf')

In [23]:
estimator.fit(X_train, y_train)
score = estimator.predict_proba(X_test)
ndcg = ndcg_score(lb.transform(y_test), score, k=5)
print('model has ndcg score of {:.4f}'.format(ndcg))

model has ndcg score of 0.8251
