# Machine Learning

In [1]:
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import TransformedTargetRegressor
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import ndcg_score, roc_auc_score
from sklearn.model_selection import KFold, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from xgboost.sklearn import XGBClassifier

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
train = pd.read_csv('./data/train_df.csv', index_col=0)

In [3]:
le = LabelEncoder()
train['target'] = le.fit_transform(train['country_destination'])
lb = LabelBinarizer()
lb.fit(train['target'])

LabelBinarizer()

In [4]:
mapping = dict(zip(range(len(le.classes_)), le.classes_))
mapping

{0: 'AU',
 1: 'CA',
 2: 'DE',
 3: 'ES',
 4: 'FR',
 5: 'GB',
 6: 'IT',
 7: 'NDF',
 8: 'NL',
 9: 'PT',
 10: 'US',
 11: 'other'}

In [5]:
target = train['target']
feature = train.drop(['target', 'country_destination'], axis=1)

In [6]:
print(target.shape, feature.shape)

(213451,) (213451, 125)


In [7]:
feature.head()

Unnamed: 0_level_0,signup_method_facebook,signup_method_google,affiliate_channel_content,affiliate_channel_direct,affiliate_channel_other,affiliate_channel_remarketing,affiliate_channel_sem-brand,affiliate_channel_sem-non-brand,affiliate_channel_seo,signup_app_Moweb,...,action_detail_view_listing,action_detail_wishlist,action_detail_your_listings,action_detail_your_trips,device_type_Blackberry,device_type_Chromebook,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Windows Desktop,device_type_iPodtouch
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gxn3p5htnn,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
820tgsjxq7,1,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4ft3gnwmtx,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bjjt8pjhuk,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87mebub9p4,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    feature, target, train_size=.25, random_state=42)

In [9]:
def ary_prep(data):
    """function to convert the predictions and test dataset
    into form that be accepted by ndgc_score function

    Args:
    data : array to be transformed

    Returns:
    array that with shape of (1, n) where n is 
    the number of samples
    """
    list_d = data.tolist()
    ndcg_array = np.asarray([list_d])
    return ndcg_array

### Base Models comparison

In [31]:
clf_list = {'dummy': DummyClassifier(), 'lr': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=800),
            'rfc': RandomForestClassifier(), 'et': ExtraTreesClassifier(), 'gradient': GradientBoostingClassifier(),
            'lgb': LGBMClassifier(objective='multiclass', num_class=12)}

In [32]:
def model_train(name, reg):
    fit = reg.fit(X_train, y_train)
    pred = fit.predict(X_test)
    score = fit.predict_proba(X_test)
    ndcg = ndcg_score(lb.transform(y_test), score, k=5)
    roc = roc_auc_score(y_test, score, multi_class='ovr')
    print('{} has ndcg score of {:.3f} and roc of {:.3f}'.format(name, ndcg, roc))

In [33]:
for name, reg in clf_list.items():
    model_train(name, reg)

KeyboardInterrupt: 

### Cross Validation

In [13]:
# ndcg_df = pd.DataFrame(columns=clf_list.keys())

In [15]:
# kf = KFold(n_splits=4)

In [27]:
# result = []
# for train_idx, test_idx in kf.split(feature):
#     X_tr_idx, X_ts_idx = feature.iloc[train_idx], feature.iloc[train_idx]
#     y_tr_idx, y_ts_idx = target.iloc[train_idx], target.iloc[train_idx]
# #     print(feature.iloc[train_idx], feature.iloc[test_idx])
#     for name, reg in clf_list.items():
# #         for i in range(4):
    
#         fit = reg.fit(X_tr_idx, y_tr_idx)
#         score = fit.predict_proba(X_ts_idx)
#         ndcg = ndcg_score(lb.transform(y_ts_idx), score, k=5)
#         print(ndcg)
        
# #         ndcg_df.loc[i, name]

In [None]:
for name, reg in clf_list.items():
    scores = cross_validate(reg, feature, target, cv=4, scoring=(
        'roc_auc_ovr'), return_train_score=True)
    print('{} has average roc of {:.3f}'.format(
        name, np.mean(scores['train_score'])))

dummy has average roc of 0.500
lr has average roc of 0.687
rfc has average roc of 0.842
et has average roc of 0.845


**Tree based** models appear to work well for this classification problem