# Machine Learning

In [51]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import TransformedTargetRegressor
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import ndcg_score, roc_auc_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from xgboost.sklearn import XGBClassifier

In [2]:
train = pd.read_csv('./data/train_df.csv', index_col=0)

In [3]:
le = LabelEncoder()
train['target'] = le.fit_transform(train['country_destination'])

In [4]:
mapping = dict(zip(range(len(le.classes_)), le.classes_))
mapping

{0: 'AU',
 1: 'CA',
 2: 'DE',
 3: 'ES',
 4: 'FR',
 5: 'GB',
 6: 'IT',
 7: 'NDF',
 8: 'NL',
 9: 'PT',
 10: 'US',
 11: 'other'}

In [5]:
target = train['target']
feature = train.drop(['target', 'country_destination'], axis=1)

In [6]:
print(target.shape, feature.shape)

(213451,) (213451, 51)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    feature, target, train_size=.25, random_state=42)

In [22]:
def ary_prep(data):
    """function to convert the predictions and test dataset
    into form that be accepted by ndgc_score function

    Args:
    data : array to be transformed

    Returns:
    array that with shape of (1, n) where n is 
    the number of samples
    """
    list_d = data.tolist()
    ndcg_array = np.asarray([list_d])
    return ndcg_array

### Base Models comparison

In [59]:
clf_list = {'dummy': DummyClassifier(), 'lr': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
            'rfc': RandomForestClassifier(), 'gradient': GradientBoostingClassifier(),
            'xgb': XGBClassifier(objective='multi:softmax', num_class=12), 'lgb': LGBMClassifier(objective='multiclass', num_class=12)}

In [60]:
def model_train(name, reg):
    fit = reg.fit(X_train, y_train)
    pred = fit.predict(X_test)
    score = fit.predict_proba(X_test)
    ndcg = ndcg_score(ary_prep(y_test), ary_prep(pred), k=5)
    roc = roc_auc_score(y_test, score, multi_class='ovo')
    print('{} has ndcg score of {:.3f} and roc of {:.3f}'.format(name, ndcg, roc))

In [61]:
for name, reg in clf_list.items():
    model_train(name, reg)



dummy has ndcg score of 0.715 and roc of 0.500
lr has ndcg score of 0.764 and roc of 0.550
rfc has ndcg score of 0.752 and roc of 0.529
gradient has ndcg score of 0.785 and roc of 0.548
xgb has ndcg score of 0.768 and roc of 0.547
lgb has ndcg score of 0.766 and roc of 0.544


### KFold Validation