In [None]:
#General structure:
#-Tune model on training data via a grid search
#-Run that model on stacked data

In [6]:
import pandas as pd
import numpy as np
import math
import datetime

In [7]:
#Import munged data
train_m = pd.read_csv('train_starting.csv', index_col=0)
test_m = pd.read_csv('test_starting.csv', index_col = 0)

In [8]:
print list(train_m.columns)

['id', 'date_account_created', 'timestamp_first_active', 'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'country_destination', 'lag_account_created', 'lag_first_active', 'lag_account_created_first_active', 'bookings', 'population_in_thousands', 'sum_secs_elapsed', 'counts']


In [9]:
#Remove date variables (leaving only the time-lag variables involving dates)

excl = list(train_m.columns)
toremove = ['id', 'date_account_created', 'timestamp_first_active', 'date_first_booking', 'population_in_thousands',
            'lag_account_created', 'bookings', 'lag_first_active', 
            #'lag_account_created_first_active', 
              'country_destination'] 
map(lambda x: excl.remove(x), toremove)

[None, None, None, None, None, None, None, None, None]

In [10]:
train_xcl = train_m.loc[:, excl]
test_xcl = test_m.loc[:, excl]

In [11]:
from sklearn.feature_extraction import DictVectorizer

#Concatenate data in order to dummify variables
frames = [train_xcl, test_xcl]
all_data = pd.concat(frames)
all_data = all_data.T.to_dict().values()
train_xdic = train_xcl.T.to_dict().values()
test_xdic = test_xcl.T.to_dict().values()

#Proper format for AdaBoost (dummify variables)
vec = DictVectorizer()
vec.fit(all_data)
train_xvec = vec.transform(train_xdic)
test_xvec = vec.transform(test_xdic)



In [13]:
#Response variable
train_y = train_m.loc[:,'country_destination']

In [15]:
from sklearn import preprocessing
#Transform response variable
le = preprocessing.LabelEncoder()
le.fit(train_y)
train_ytrans = le.transform(train_y)
print type(train_ytrans)

<type 'numpy.ndarray'>


In [40]:
trainx_vecmiss = pd.DataFrame(train_xvec.toarray()).fillna(-1)

In [42]:
import sklearn.grid_search as gs
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier()

# The next few cells perform a grid search for AdaBoost parameters
grid_param = [{'learning_rate': [.01], 'n_estimators': [100, 200, 300, 600]}]

grid_search = gs.GridSearchCV(model, grid_param, cv=5).fit(trainx_vecmiss, train_ytrans)



In [44]:
grid_search.grid_scores_

[mean: 0.58347, std: 0.00004, params: {'n_estimators': 100, 'learning_rate': 0.01},
 mean: 0.58347, std: 0.00004, params: {'n_estimators': 200, 'learning_rate': 0.01},
 mean: 0.58347, std: 0.00004, params: {'n_estimators': 300, 'learning_rate': 0.01},
 mean: 0.59740, std: 0.01037, params: {'n_estimators': 600, 'learning_rate': 0.01}]

In [45]:
grid_param = [{'learning_rate': [.01, .001, .0001], 'n_estimators': [600]}]

grid_search = gs.GridSearchCV(model, grid_param, cv=5).fit(trainx_vecmiss, train_ytrans)

In [48]:
grid_search.grid_scores_

[mean: 0.59740, std: 0.01037, params: {'n_estimators': 600, 'learning_rate': 0.01},
 mean: 0.58347, std: 0.00004, params: {'n_estimators': 600, 'learning_rate': 0.001},
 mean: 0.58347, std: 0.00004, params: {'n_estimators': 600, 'learning_rate': 0.0001}]

In [None]:
grid_param = [{'learning_rate': [.01], 'n_estimators': [450, 500, 550, 600, 650]}]

grid_search = gs.GridSearchCV(model, grid_param, cv=5).fit(trainx_vecmiss, train_ytrans)

In [53]:
grid_search.grid_scores_

[mean: 0.58660, std: 0.00604, params: {'n_estimators': 450, 'learning_rate': 0.01},
 mean: 0.59339, std: 0.01087, params: {'n_estimators': 500, 'learning_rate': 0.01},
 mean: 0.59491, std: 0.01049, params: {'n_estimators': 550, 'learning_rate': 0.01},
 mean: 0.59740, std: 0.01037, params: {'n_estimators': 600, 'learning_rate': 0.01},
 mean: 0.60216, std: 0.01198, params: {'n_estimators': 650, 'learning_rate': 0.01}]

In [56]:
grid_param = [{'learning_rate': [.01, .001], 'n_estimators': [650]}]

grid_search = gs.GridSearchCV(model, grid_param, cv=5).fit(trainx_vecmiss, train_ytrans)

In [57]:
grid_search.grid_scores_

#Optimal parameters are n_estimators = 650, learning_rate = .01

[mean: 0.60216, std: 0.01198, params: {'n_estimators': 650, 'learning_rate': 0.01},
 mean: 0.58347, std: 0.00004, params: {'n_estimators': 650, 'learning_rate': 0.001}]

In [85]:
#Import stacked data

trainf = pd.read_csv('training_for_final_models.csv')
testf = pd.read_csv('test_for_final_models.csv')
print list(trainf.columns)

['id', 'gender', 'age', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'country_destination', 'sum_secs_elapsed', 'counts', 'pred_lag_account_created', 'pred_lag_first_active', 'pred_bookings']


In [86]:
trainf.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,sum_secs_elapsed,counts,pred_lag_account_created,pred_lag_first_active,pred_bookings
0,gxn3p5htnn,,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF,,,NB,NB,NB
1,820tgsjxq7,MALE,35-39,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF,,,same day,before,early
2,4ft3gnwmtx,FEMALE,55-59,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US,,,greater 1 day,greater 1 day,NB
3,bjjt8pjhuk,FEMALE,40-44,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other,,,NB,NB,waited
4,87mebub9p4,,40-44,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US,,,greater 1 day,greater 1 day,NB


In [90]:
excl = list(trainf.columns)
toremove = ['id', 'country_destination'] 
map(lambda x: excl.remove(x), toremove)
excl

['gender',
 'age',
 'signup_method',
 'signup_flow',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'first_affiliate_tracked',
 'signup_app',
 'first_device_type',
 'first_browser',
 'sum_secs_elapsed',
 'counts',
 'pred_lag_account_created',
 'pred_lag_first_active',
 'pred_bookings']

In [91]:
train_xcl = trainf.loc[:, excl]
test_xcl = testf.loc[:, excl]

In [93]:
from sklearn.feature_extraction import DictVectorizer

#Transform data to format for AdaBoost (dummify variables)
frames = [train_xcl, test_xcl]
all_data = pd.concat(frames)
all_data = all_data.T.to_dict().values()
train_xdic = train_xcl.T.to_dict().values()
test_xdic = test_xcl.T.to_dict().values()

vec = DictVectorizer()
vec.fit(all_data)
train_xvec = vec.transform(train_xdic)
test_xvec = vec.transform(test_xdic)

In [94]:
train_y = trainf.loc[:,'country_destination']

In [95]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_y)
train_ytrans = le.transform(train_y)
print type(train_ytrans)

<type 'numpy.ndarray'>


In [97]:
trainx_vecmiss = pd.DataFrame(train_xvec.toarray()).fillna(-1)

In [None]:
import numpy as np

model2 = AdaBoostClassifier()
 
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

#ndcg is the objective function of the Airbnb Kaggle competition. This was taken from a script form Kaggle.
#ndcg_score is a scoring object using the ndcg score function.
#dcg_score is a helper function.
def dcg_score(y_true, y_score, k=5):
 """Discounted cumulative gain (DCG) at rank K.

 Parameters
 ----------
 y_true : array, shape = [n_samples]
     Ground truth (true relevance labels).
 y_score : array, shape = [n_samples, n_classes]
     Predicted scores.
 k : int
     Rank.

 Returns
 -------
 score : float
 """
 order = np.argsort(y_score)[::-1]
 y_true = np.take(y_true, order[:k])

 gain = 2 ** y_true - 1

 discounts = np.log2(np.arange(len(y_true)) + 2)
 return np.sum(gain / discounts)


def ndcg_score(ground_truth, predictions, k=5):
 """Normalized discounted cumulative gain (NDCG) at rank K.

 Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
 recommendation system based on the graded relevance of the recommended
 entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
 ranking of the entities.

 Parameters
 ----------
 ground_truth : array, shape = [n_samples]
     Ground truth (true labels represended as integers).
 predictions : array, shape = [n_samples, n_classes]
     Predicted probabilities.
 k : int
     Rank.

 Returns
 -------
 score : float

 Example
 -------
 >>> ground_truth = [1, 0, 2]
 >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
 >>> score = ndcg_score(ground_truth, predictions, k=2)
 1.0
 >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
 >>> score = ndcg_score(ground_truth, predictions, k=2)
 0.6666666666
 """
 lb = LabelBinarizer()
 lb.fit(range(len(predictions) + 1))
 T = lb.transform(ground_truth)

 scores = []

 # Iterate over each y_true and compute the DCG score
 for y_true, y_score in zip(T, predictions):
     actual = dcg_score(y_true, y_score, k)
     best = dcg_score(y_true, y_true, k)
     score = float(actual) / float(best)
     scores.append(score)

 return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba= True, k=5)

In [64]:
#Use optimal parameters previously found to predict country destination on stacked data.

import sklearn.grid_search as gs

#max_depth_values = [5, 6, 7]
learning_rate_values = [.01]
# subsample_values = [0.7]
# colsample_bytree_values = [0.7]
n_estimators = [650]
# gamma = [0]

params = {'learning_rate': learning_rate_values,
        'n_estimators' : n_estimators}

grid = gs.GridSearchCV(model2, params, scoring=ndcg_scorer, cv=5)

In [98]:
grid.fit(trainx_vecmiss, train_ytrans)
grid.grid_scores_

[mean: 0.81753, std: 0.00607, params: {'n_estimators': 650, 'learning_rate': 0.01}]

In [99]:
testx_vecmiss = pd.DataFrame(test_xvec.toarray()).fillna(-1)


In [100]:
predictions = grid.predict_proba(testx_vecmiss)

In [101]:
#Print out top 5 predicted countries

test_id = testf['id']

  
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(test_id)):
   idx = test_id[i]
   ids += [idx] * 5
   cts += le.inverse_transform(np.argsort(predictions[i])[::-1])[:5].tolist()
  
sub1 = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
print sub1.head
sub1.to_csv('sub1.csv',index=False)

<bound method DataFrame.head of                 id country
0       5uwns89zht     NDF
1       5uwns89zht      US
2       5uwns89zht   other
3       5uwns89zht      FR
4       5uwns89zht      IT
5       jtl0dijy2j     NDF
6       jtl0dijy2j      US
7       jtl0dijy2j   other
8       jtl0dijy2j      FR
9       jtl0dijy2j      IT
10      xx0ulgorjt     NDF
11      xx0ulgorjt      US
12      xx0ulgorjt   other
13      xx0ulgorjt      FR
14      xx0ulgorjt      IT
15      6c6puo6ix0     NDF
16      6c6puo6ix0      US
17      6c6puo6ix0   other
18      6c6puo6ix0      FR
19      6c6puo6ix0      IT
20      czqhjk3yfe     NDF
21      czqhjk3yfe      US
22      czqhjk3yfe   other
23      czqhjk3yfe      FR
24      czqhjk3yfe      IT
25      szx28ujmhf      US
26      szx28ujmhf     NDF
27      szx28ujmhf   other
28      szx28ujmhf      FR
29      szx28ujmhf      IT
...            ...     ...
310450  8yvhec201j     NDF
310451  8yvhec201j      US
310452  8yvhec201j   other
310453  8yvhec201j     