In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from calculate_auc import calculate_auc
import time
from copy import deepcopy
from utils import stratify

# --- run configuration ---
save_results = True   # when True, results are pickled to 'auc_all.dat' at the end of the run
do_stratify = False   # NOTE(review): this flag is not consulted anywhere in the visible code

n_boot = 100          # number of random train/test repetitions
split = 0.7           # fraction of feature files that go into the training set
np.random.seed(seed=0)

ft_dir = 'features_long/'

# list feature files
# os.listdir returns entries in arbitrary, platform-dependent order. Sorting
# makes the seeded shuffles further down select the same files on every
# machine, so the train/test splits are actually reproducible.
files = sorted(os.listdir(ft_dir))
# files.remove('1515656.dat')
# print('subject 1515656 removed due to spoofed gps.')

# reading top locations
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('top_locations.dat', 'rb') as f:
    location_top = pickle.load(f)

# Load every feature file, keep only the samples recorded at one of the top
# locations, and encode each kept location as an integer class label.
feature_all = []
target_all = []

# Position of each location inside location_top; that position becomes the
# class label stored in the 'elocation' column.
# (assumes location_top is a 1-D array of unique, hashable values -- TODO confirm)
location_code = {loc: code for code, loc in enumerate(np.asarray(location_top))}

for filename in files:
    # NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
    with open(os.path.join(ft_dir, filename), 'rb') as f:
        feature, target = pickle.load(f)

    # only keeping locations in location_top and encoding them
    # Locations missing from location_code map to NaN and are dropped.
    codes = target['location'].map(location_code)
    keep = codes.notna().to_numpy()

    # The mask is applied by position, so feature and target must be
    # row-aligned and of equal length (the original label-based lookup
    # relied on the same alignment).
    feature = feature.loc[keep, :].reset_index(drop=True)
    target = target.loc[keep].reset_index(drop=True)

    # Encoded labels as int. The column is created even when no row survives
    # the filter, so a file without any top location yields an empty frame
    # instead of raising KeyError.
    target['elocation'] = codes[keep].to_numpy().astype(int)

    feature_all.append(feature)
    target_all.append(target)

# Repeated random train/test evaluation. Each repetition shuffles the feature
# files, trains a gradient-boosted classifier on the first `split` fraction of
# them and scores it on the remainder.
confs = []
aucs = []
labels = []  # NOTE(review): never filled (the append below is commented out), so it stays empty
inds = np.arange(0, len(feature_all), 1)
inds_split = int(np.floor(split*len(feature_all)))

# Both sides of the split need at least one file, otherwise pd.concat below
# fails with an uninformative "No objects to concatenate".
if not 0 < inds_split < len(feature_all):
    raise ValueError('split=%r over %d feature files leaves the training or test set empty'
                     % (split, len(feature_all)))

# Hyperparameters are identical for every repetition, so they are defined once.
# 'mlogloss' is an evaluation metric, not a learning objective; the
# multi-class objective is 'multi:softprob'.
#
# model (sensor) -- alternative settings kept for reference:
#     dict(max_depth=6, n_estimators=50, learning_rate=0.05, nthread=12, subsample=0.25,
#          colsample_bytree=0.5, max_delta_step=0, gamma=3, objective='multi:softprob', reg_alpha=0.5,
#          missing=np.nan)
#     dict(max_depth=4, n_estimators=200, learning_rate=0.025, nthread=12, subsample=0.2,
#          colsample_bytree=0.5, max_delta_step=0, gamma=0.4, objective='multi:softprob', reg_alpha=0,
#          reg_lambda=1, missing=np.nan, min_child_weight=4)
#
# model (sensor + foursquare) -- earlier setting kept for reference:
#     dict(max_depth=6, n_estimators=75, learning_rate=0.05, nthread=12, subsample=0.25,
#          colsample_bytree=0.2, max_delta_step=0, gamma=3, objective='multi:softprob', reg_alpha=0.5,
#          missing=np.nan)
gbm_params = dict(max_depth=4, n_estimators=300, learning_rate=0.025, nthread=12, subsample=0.25,
                  colsample_bytree=0.2, max_delta_step=0, gamma=0.4, objective='multi:softprob',
                  reg_alpha=0, reg_lambda=1, missing=np.nan, min_child_weight=4)

for i in range(n_boot):

    print('------------------')
    print(i)

    # Split by file rather than by row, so all samples from one file land on
    # the same side of the split.
    np.random.shuffle(inds)
    ind_train = inds[:inds_split]
    ind_test = inds[inds_split:]

    # training set
    x_train = pd.concat([feature_all[j] for j in ind_train], axis=0)
    y_train = pd.concat([target_all[j]['elocation'] for j in ind_train], axis=0)
    x_train = x_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    # stratification (project helper from utils; its behaviour is not visible here)
    # NOTE(review): applied on every repetition regardless of the `do_stratify`
    # flag set in the configuration -- confirm which behaviour is intended.
    (x_train, y_train) = stratify(x_train, y_train)

    # test set
    x_test = pd.concat([feature_all[j] for j in ind_test], axis=0)
    y_test = pd.concat([target_all[j]['elocation'] for j in ind_test], axis=0)
    x_test = x_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # remove foursquare features
#     x_train = x_train.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7','fsq 8','fsq distance'],axis=1)
#     x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7','fsq 8','fsq distance'],axis=1)
#     x_train = x_train.reset_index(drop=True)
#     x_test = x_test.reset_index(drop=True)

    # a fresh, untrained model for every repetition
    gbm = xgb.XGBClassifier(**gbm_params)

    # fitting model
#     gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='mlogloss', verbose=False, early_stopping_rounds=50)
#     print('best n_estimators: ',gbm.best_iteration)
    gbm.fit(x_train, y_train)

    # training performance
    y_pred = gbm.predict(x_train)
    conf_train, roc_auc_train = calculate_auc(y_pred, y_train, location_top.size)

    # test
    y_pred = gbm.predict(x_test)
    conf, roc_auc = calculate_auc(y_pred, y_test, location_top.size)

#     labels.append(np.unique(y_test))
    confs.append(conf)
    aucs.append(roc_auc)

#     print(np.unique(y_test))
    print(location_top)
    # nanmean: the AUC arrays may contain NaN entries, which are skipped in the average
    print(roc_auc_train, np.nanmean(roc_auc_train))
    print(roc_auc, np.nanmean(roc_auc))

# Persist the per-repetition AUCs and confusion matrices, plus the `labels`
# list, as a single pickled list in 'auc_all.dat'.
if save_results:
    results = [aucs, confs, labels]
    # the context manager closes the file when the block exits
    with open('auc_all.dat', 'wb') as out_file:
        pickle.dump(results, out_file)




In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Class-label distributions left over from the last repetition of the loop
# above: training labels, test labels and the test-set predictions.
# One bin per encoded class, centred on the integer label, shared by all three
# series so the bars are directly comparable; transparency keeps the
# overlapping histograms visible.
fig, ax = plt.subplots()
class_bins = np.arange(location_top.size + 1) - 0.5
for values, name in ((y_train, 'y_train'), (y_test, 'y_test'), (y_pred, 'y_pred (test)')):
    ax.hist(values, bins=class_bins, alpha=0.5, label=name)
ax.set(xlabel='encoded location', ylabel='count', title='Class distribution (last repetition)')
ax.legend();

In [None]:
from sklearn.metrics import roc_auc_score
# Sanity check: what does roc_auc_score do when y_true contains a single class?
# ROC AUC needs both positives and negatives. scikit-learn is expected to
# raise ValueError here (NOTE(review): some releases may instead warn and
# return nan -- confirm for the installed version). The exception is caught
# and shown so this demonstration does not abort a Restart & Run All.
y_true_single_class = np.array([0,0,0,0,0,0,0,0])
y_score_example = np.array([0,0,1,0,0,0,0,0])
try:
    print(roc_auc_score(y_true_single_class, y_score_example))
except ValueError as err:
    print('roc_auc_score is undefined for single-class y_true:', err)

In [None]:
# Shape of the test feature frame left over from the last repetition of the
# train/test loop.
x_test.shape