In [233]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer

In [9]:
train_users = pd.read_csv('../data/train_data.csv')

In [4]:
train_users.fillna(-1, inplace=True)

In [11]:
train, test = train_test_split(train_users, test_size = 0.3, stratify = train_users['country_destination'])

Train

In [448]:

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.
    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.
    Returns
    -------
    score : float
    """

    order = np.argsort(y_score)[::-1]
    
    y_true = np.take(y_true, order[:k])
    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(rel) + 2)
    return np.sum(gain / discounts)


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.
    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.
    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.
    Returns
    -------
    score : float
    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(ground_truth, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        if best < 1e-8:
            scores.append(1.)
        else:
            score = float(actual) / float(best)
            scores.append(score)

    return np.mean(scores)

In [449]:
ndcg_score(y[:1], predict[:1])

[0.00695321 0.00883711 0.00825913 0.01040157 0.01656157 0.01485613
 0.01320898 0.74104977 0.00771238 0.00678723 0.12987906 0.03549389] [ 7 10 11  4  5] 7
7 [0] 7


ValueError: 7 is not in list

In [241]:
ndcg_scorer = make_scorer(ndcg_score, greater_is_better = True, needs_proba = False)

In [284]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", nthread=-1)

params = {
        'max_depth': [6],
        'n_estimators': [20, 25, 30, 35],
        'learning_rate' : [0.2, 0.3]
    }

clf = GridSearchCV(
    xgb_model,
    param_grid = params,
    cv = 5,
    verbose = 8,
    n_jobs = 4,
    scoring = ndcg_scorer
)

In [None]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train['country_destination'])

In [286]:
X = train.drop(['country_destination', 'id', 'most_used_device'], axis = 1).values

In [350]:
predict = xgb_model.predict_proba(X)

In [287]:
best_fit = clf.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed: 27.1min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 85.5min
[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed: 107.9min finished


In [288]:
best_fit.best_params_

{'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 20}

In [261]:
test_users = pd.read_csv('../data/test_data.csv')

In [263]:
y_train = train_users['country_destination']
train_users.drop(['country_destination', 'id', 'most_used_device'], axis=1, inplace=True)
x_train = train_users.values

label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

In [264]:
x_test = test_users.drop(['id', 'most_used_device'], axis = 1).values

In [289]:
"""
# 0.7390
param = {
    'max_depth': 8,
    'learning_rate': 0.5,
    'verbosity': 3,
    'objective': 'multi:softprob',
    'n_jobs': -1,
    'gamma': 0,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'base_score': 0.5,
    'random_state': 42
}
"""
"""
# 0.74597 Time Doesn't take much
param = {
    'max_depth' : 6, 
    'learning_rate' :0.3, 
    'n_estimators' : 25,
    'objective' : 'multi:softprob',
    'subsample' : 0.5, 
    'colsample_bytree' : 0.5, 
    'seed' : 0,
    'n_jobs' : -1,
    'verbosity' : 3
}
"""

param = {
    'max_depth' : 6, 
    'learning_rate' :0.2, 
    'n_estimators' : 20,
    'objective' : 'multi:softprob',
    'subsample' : 0.5, 
    'colsample_bytree' : 0.5, 
    'seed' : 0,
    'n_jobs' : -1,
    'verbosity' : 3
}

In [290]:
xgb_model = xgb.XGBClassifier(**param)

In [293]:
xgb_model.fit(x_train, encoded_y_train)

[22:53:58] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 0 pruned nodes, max_depth=6
[22:53:58] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=6
[22:53:58] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=6
[22:53:58] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[22:53:59] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 0 pruned nodes, max_depth=6
[22:53:59] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=6
[22:53:59] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 66 extra nodes, 0 pruned nodes, max_depth=6
[22:53:59] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 0 pruned nodes, max_depth=6
[22:53:59] INFO: src/tree/updater_prune.cc:74: tree pr

[22:54:23] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[22:54:23] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=6
[22:54:25] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=6
[22:54:25] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[22:54:25] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=6
[22:54:25] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 0 pruned nodes, max_depth=6
[22:54:26] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=6
[22:54:26] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=6
[22:54:26] INFO: src/tree/updater_prune.cc:74: tree pru

[22:54:50] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=6
[22:54:50] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 0 pruned nodes, max_depth=6
[22:54:50] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[22:54:50] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 0 pruned nodes, max_depth=6
[22:54:52] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=6
[22:54:53] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 82 extra nodes, 0 pruned nodes, max_depth=6
[22:54:53] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=6
[22:54:53] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 94 extra nodes, 0 pruned nodes, max_depth=6
[22:54:53] INFO: src/tree/updater_prune.cc:74: tree pru

[22:55:19] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=6
[22:55:19] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[22:55:19] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=6
[22:55:19] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=6
[22:55:20] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[22:55:20] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 88 extra nodes, 0 pruned nodes, max_depth=6
[22:55:23] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=6
[22:55:23] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=6
[22:55:23] INFO: src/tree/updater_prune.cc:74: tree pru

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=20, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              silent=None, subsample=0.5, verbosity=3)

In [278]:
from collections import defaultdict

In [269]:
_feature_importance = defaultdict(float)
for col, importance in zip(train_users.columns, xgb_model.feature_importances_):
    _feature_importance[col] = importance

In [270]:
sorted(_feature_importance.items(), key = lambda x:x[1], reverse = True)

[('age', 0.11800498),
 ('pending.1', 0.092495136),
 ('confirm_email.1', 0.07967741),
 ('first_browser_unknown', 0.05062814),
 ('pending', 0.038483728),
 ('manage_listing', 0.03628897),
 ('signup_method_basic', 0.03389077),
 ('signup_method_facebook', 0.033729024),
 ('authenticate', 0.032510664),
 ('nans', 0.029824276),
 ('account_created_year', 0.025917633),
 ('age_group', 0.020358242),
 ('at_checkpoint', 0.017414428),
 ('pending.2', 0.0155201135),
 ('verify', 0.013380551),
 ('affiliate_channel_content', 0.01320313),
 ('header_userpic.1', 0.010606088),
 ('gender_female', 0.009989167),
 ('first_active_year', 0.009893741),
 ('requested.1', 0.009605634),
 ('manage_listing.1', 0.008594246),
 ('complete_status.1', 0.008478024),
 ('active', 0.0081962785),
 ('create.1', 0.007940214),
 ('gender_unknown', 0.0075251907),
 ('at_checkpoint.2', 0.0075206347),
 ('ask_question', 0.0070593576),
 ('language_en', 0.0060469364),
 ('day_pauses', 0.005973164),
 ('ask_question.2', 0.0059453542),
 ('first_de

In [301]:
predicted = xgb_model.predict_proba(x_train)

In [294]:
y_test = xgb_model.predict_proba(x_test)

In [296]:
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(test_users['id'])):
    idx = test_users['id'][i]
    ids += [idx] * 5
    cts += label_encoder.inverse_transform(np.argsort(y_test[i])[::-1])[:5].tolist()

In [297]:
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv',index=False)