In [25]:
# This code is used to tune the parameters of XGBoost by successive grid search in the parameter space, as well as finding
# the optimal number of trees by cross-validation.

import os
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from calculate_auc import calculate_auc
from utils import stratify

n_boot = 10
split = 0.9

ft_dir = 'features_long/'

# list feature files
files = os.listdir(ft_dir)
files = files[:40]

# reading top locations
with open('top_locations.dat', 'rb') as f:
    location_top = pickle.load(f)
f.close()

feature_all = []
target_all = []

for filename in files:
    with open(ft_dir+filename, 'rb') as f:  
        feature, target = pickle.load(f)

        # only keeping locations in location_top and encoding them
        ind = np.array([], int)
        for (i,loc) in enumerate(target['location']):
            if loc in location_top:
                ind = np.append(ind, i)
                target.loc[i,'elocation'] = np.where(location_top==loc)[0]
        feature = feature.loc[ind,:]
        target = target.loc[ind]
        
        feature = feature.reset_index(drop=True)
        target = target.reset_index(drop=True)
        
        # change encoded column data type to int
        target['elocation'] = target['elocation'].astype(int)
        
        feature_all.append(feature)
        target_all.append(target)
        
    f.close()


# for max_depth in np.arange(3,10,2):
#     for min_child_weight in np.arange(1,6,2):
# for max_depth in np.arange(2,5,1):
#     for min_child_weight in np.arange(4,7,1):
# for gamma in [k/10.0 for k in range(0,5)]:
# for gamma in [k/10.0 for k in range(4,7)]:
# for subsample in [k/10.0 for k in range(6,10)]:
#     for colsample_bytree in [kk/10.0 for kk in range(6,10)]:
# for subsample in np.arange(.75,9,.05):
#     for colsample_bytree in np.arange(.85,1,.05):
# for reg_alpha in [1e-5, 1e-2, 0.1, 1, 100]:
# for reg_alpha in [1e-7, 1e-6, 1e-5]:

confs = []
aucs = []
aucs_mean = []
labels = []
inds = np.arange(0,len(feature_all),1)
inds_split = int(np.floor(split*len(feature_all)))
np.random.seed(seed=0)

for i in range(n_boot):

    # training set
    np.random.shuffle(inds)
    ind_train = inds[:inds_split]
    ind_test = inds[inds_split:]

    x_train = pd.concat([feature_all[j] for j in ind_train], axis=0)
    y_train = pd.concat([target_all[j]['elocation'] for j in ind_train], axis=0)
    x_train = x_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    # stratification
    (x_train, y_train) = stratify(x_train, y_train)

    # test set
    x_test = pd.concat([feature_all[j] for j in ind_test], axis=0)
    y_test = pd.concat([target_all[j]['elocation'] for j in ind_test], axis=0)
    x_test = x_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # remove foursquare features (sensor)
#     x_train = x_train.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7','fsq 8','fsq distance'],axis=1)
#     x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7','fsq 8','fsq distance'],axis=1)
#     x_train = x_train.reset_index(drop=True)
#     x_test = x_test.reset_index(drop=True)

    # model (sensor)
#     gbm = xgb.XGBClassifier(max_depth=6, n_estimators=50, learning_rate=0.05, nthread=12, subsample=0.25, \
#                         colsample_bytree=0.5, max_delta_step=0, gamma=3, objective='mlogloss', reg_alpha=0.5, \
#                         missing=np.nan)

    # model (sensor + foursquare)
    gbm = xgb.XGBClassifier(max_depth=4, n_estimators=1000, learning_rate=0.01, nthread=12, subsample=0.8, \
                        colsample_bytree=0.9, max_delta_step=0, gamma=0.4, objective='mlogloss', \
                        reg_alpha=0, reg_lambda=1, \
                        missing=np.nan, min_child_weight=4, scale_pos_weight=1, seed=27)

    # for finding the best n_estimators
    gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='mlogloss', verbose=False,\
           early_stopping_rounds=50)

    # after finding the best n_estimators
#     gbm.fit(x_train, y_train)

    # training performance
    y_pred = gbm.predict(x_train)
    conf_train, roc_auc_train = calculate_auc(y_pred, y_train, location_top.size)

    # test
    y_pred = gbm.predict(x_test)
    conf, roc_auc = calculate_auc(y_pred, y_test, location_top.size)

    labels.append(np.unique(y_test))
    confs.append(conf)
    aucs.append(roc_auc)
    aucs_mean.append(np.nanmean(roc_auc))
    print(gbm.best_iteration, np.nanmean(roc_auc))
#         print(np.nanmean(roc_auc))

#     print(np.unique(y_test))
#     print(roc_auc_train, np.nanmean(roc_auc_train))
#     print(roc_auc, np.nanmean(roc_auc))

print(np.nanmean(np.array(aucs_mean)))

253 0.5904970115
717 0.600456461153
761 0.580378332217
998 0.737453861384
439 0.668473367389
443 0.620994280013
283 0.59603436232
565 0.622919265874
586 0.657950585767
374 0.610265376598
0.628542290421


In [13]:
np.arange(2,5,1)

array([2, 3, 4])