In [3]:
import os
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from calculate_auc import calculate_auc
import time
from copy import deepcopy
from utils import stratify

save_results = True
do_stratify = False

n_boot = 100
split = 0.7
np.random.seed(seed=0)

ft_dir = 'features_long/'

# list feature files
files = os.listdir(ft_dir)
# files.remove('1515656.dat')
# print('subject 1515656 removed due to spoofed gps.')

# reading top locations
with open('top_locations.dat', 'rb') as f:
    location_top = pickle.load(f)
f.close()

feature_all = []
target_all = []

for filename in files:
    with open(ft_dir+filename, 'rb') as f:  
        feature, target = pickle.load(f)

        # only keeping locations in location_top and encoding them
        ind = np.array([], int)
        for (i,loc) in enumerate(target['location']):
            if loc in location_top:
                ind = np.append(ind, i)
                target.loc[i,'elocation'] = np.where(location_top==loc)[0]
        feature = feature.loc[ind,:]
        target = target.loc[ind]
        feature = feature.reset_index(drop=True)
        target = target.reset_index(drop=True)
        
        # change encoded column data type to int
        target['elocation'] = target['elocation'].astype(int)
        
        feature_all.append(feature)
        target_all.append(target)
        
    f.close()

confs = []
aucs = []
labels = []
inds = np.arange(0,len(feature_all),1)
inds_split = int(np.floor(split*len(feature_all)))

for i in range(n_boot):
    
    print('------------------')
    print(i)
    
    # training set
    np.random.shuffle(inds)
    ind_train = inds[:inds_split]
    ind_test = inds[inds_split:]
    
    x_train = pd.concat([feature_all[j] for j in ind_train], axis=0)
    y_train = pd.concat([target_all[j]['elocation'] for j in ind_train], axis=0)
    x_train = x_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    
    # stratification
    (x_train, y_train) = stratify(x_train, y_train)
    
    # test set
    x_test = pd.concat([feature_all[j] for j in ind_test], axis=0)
    y_test = pd.concat([target_all[j]['elocation'] for j in ind_test], axis=0)
    x_test = x_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    
    # remove foursquare features
    x_train = x_train.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7','fsq 8','fsq distance'],axis=1)
    x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7','fsq 8','fsq distance'],axis=1)
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    
    # model (sensor)
#     gbm = xgb.XGBClassifier(max_depth=6, n_estimators=50, learning_rate=0.05, nthread=12, subsample=0.25, \
#                         colsample_bytree=0.5, max_delta_step=0, gamma=3, objective='mlogloss', reg_alpha=0.5, \
#                         missing=np.nan)
    gbm = xgb.XGBClassifier(max_depth=4, n_estimators=200, learning_rate=0.025, nthread=12, subsample=0.2, \
                        colsample_bytree=0.5, max_delta_step=0, gamma=0.4, objective='mlogloss', reg_alpha=0, \
                        reg_lambda=1, missing=np.nan, min_child_weight=4)
    
    # model (sensor + foursquare)
#     gbm = xgb.XGBClassifier(max_depth=6, n_estimators=75, learning_rate=0.05, nthread=12, subsample=0.25, \
#                         colsample_bytree=0.2, max_delta_step=0, gamma=3, objective='mlogloss', reg_alpha=0.5, \
#                         missing=np.nan)
#     gbm = xgb.XGBClassifier(max_depth=4, n_estimators=300, learning_rate=0.025, nthread=12, subsample=0.25, \
#                         colsample_bytree=0.2, max_delta_step=0, gamma=0.4, objective='mlogloss', reg_alpha=0, \
#                         reg_lambda=1, missing=np.nan, min_child_weight=4)
    
    # fitting model
#     gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='mlogloss', verbose=False, early_stopping_rounds=50)
#     print('best n_estimators: ',gbm.best_iteration)
    gbm.fit(x_train, y_train)
    
    # training performance
    y_pred = gbm.predict(x_train)
    y_p = gbm.predict_proba(x_train)
    conf_train, roc_auc_train = calculate_auc(y_pred, y_train, y_p, location_top.size)

    # test
    y_pred = gbm.predict(x_test)
    y_p = gbm.predict_proba(x_test)
    conf, roc_auc = calculate_auc(y_pred, y_test, y_p, location_top.size)
    
#     labels.append(np.unique(y_test))
    confs.append(conf)
    aucs.append(roc_auc)

#     print(np.unique(y_test))
    print(location_top)
    print(roc_auc_train, np.nanmean(roc_auc_train))
    print(roc_auc, np.nanmean(roc_auc))

if save_results:
    with open('auc_sensor_pro.dat','wb') as f:
        pickle.dump([aucs, confs, labels], f)
    f.close()


------------------
0
['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.98889242  0.99272484  0.99153464  0.97967401  0.97814699  0.96436991
  0.96577151  0.93726007  0.90271491  0.98845679  0.91504788] 0.964053998053
[ 0.80715071  0.86678636  0.83516994  0.83774582  0.8775253   0.89876525
  0.83365081  0.74587632  0.7714233   0.96374233  0.8755049 ] 0.846667367226
------------------
1
['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.9878799   0.99403409  0.99387446  0.9805709   0.98170388  0.96413246
  0.9602296   0.9459128   0.90352441  0.98970347  0.92314479] 0.965882796807
[ 0.83398355  0.89130635  0.81457658  0.79925594  0.88075997  0.86921291
  0.9146982   0.75138277  0.76528384  0.97131922  0.85251

['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.99084467  0.99100377  0.9910454   0.98432939  0.98067145  0.96243154
  0.96818226  0.93985368  0.91257248  0.99078841  0.91609138] 0.966164948929
[ 0.77020871  0.89454025  0.86498135  0.81930786  0.87745669  0.87555026
  0.83191443  0.73087303  0.80625806  0.95105164  0.87121277] 0.844850457952
------------------
18
['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.98911208  0.99124959  0.9915165   0.98352971  0.98452695  0.9657673
  0.967893    0.94008672  0.90469188  0.98982141  0.91948013] 0.966152296786
[ 0.74476947  0.8987554   0.83495289  0.78571271  0.89574849  0.88359716
  0.82838195  0.73481066  0.7952171   0.96234756  0.86124758] 0.838685543648
-

['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.99107346  0.99245951  0.99585427  0.98137034  0.98151327  0.9609918
  0.96764773  0.94968849  0.91001766  0.98912703  0.92039323] 0.967285163424
[ 0.71274752  0.81443017  0.83546821  0.83246439  0.88346775  0.86238036
  0.81076987  0.75427096  0.80888377  0.96606068  0.84266467] 0.829418939664
------------------
35
['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.98878726  0.99314847  0.99283501  0.97983311  0.98273761  0.96245602
  0.96512976  0.94370018  0.90525365  0.99127674  0.91478525] 0.96544936807
[ 0.78810707  0.83118781  0.85426334  0.86357746  0.86475647  0.84183907
  0.83407176  0.78103813  0.79540818  0.96319991  0.87097781] 0.844402455869
--

['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.98790976  0.99147642  0.98731021  0.98200688  0.9812665   0.96529023
  0.96306668  0.94113332  0.91010454  0.99067768  0.91308586] 0.964848007135
[ 0.82845087  0.90588402  0.73086457  0.73486965  0.86681522  0.86102116
  0.88902249  0.76302268  0.78492081  0.95424072  0.81865043] 0.830705691661
------------------
52
['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.986662    0.99076272  0.99060381  0.97864694  0.98269715  0.96355669
  0.96786186  0.94410451  0.89289403  0.98978961  0.91733895] 0.964083479457
[ 0.73485874  0.86242133  0.88580395  0.84492853  0.88310948  0.83949159
  0.88610017  0.81375221  0.76717858  0.95002485  0.86143679] 0.848100566841


['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.98835445  0.99214752  0.99031985  0.98334702  0.98193174  0.96395029
  0.97128481  0.94483541  0.90592367  0.99061033  0.91680228] 0.966318850971
[ 0.74802535  0.82018089  0.83912427  0.7859193   0.8652756   0.8257107
  0.87946715  0.81990953  0.77662172  0.96430778  0.8522966 ] 0.834258080992
------------------
69
['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.99196148  0.99232007  0.99418071  0.98410521  0.98161794  0.95842989
  0.96287557  0.94692354  0.9122813   0.99158629  0.91788893] 0.966742811537
[ 0.80871075  0.88706705  0.81739929  0.80356173  0.89318527  0.86803347
  0.78809511  0.71461615  0.79136879  0.95915142  0.86492414] 0.836010288392
-

['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.98918736  0.99214674  0.98997368  0.97840278  0.98167458  0.96406237
  0.96573203  0.93968363  0.90556025  0.99023814  0.91792556] 0.964962464695
[ 0.78031725  0.91049184  0.79260776  0.85266444  0.86539472  0.85254691
  0.83739347  0.73957982  0.76018322  0.9526174   0.84677861] 0.835506856453
------------------
86
['Travel Or Transport' 'Nightlife Spot' 'Spiritual' 'Outdoors & Recreation'
 'Arts & Entertainment' 'Work' 'Professional Or Medical Office'
 "Another's Home" 'Food' 'Home' 'Shop Or Store']
[ 0.98785223  0.99062726  0.99287749  0.97885642  0.98246323  0.96103862
  0.96503725  0.94139574  0.91216412  0.98920178  0.91475113] 0.96511502356
[ 0.81331728  0.84234745  0.82308391  0.81892294  0.8822079   0.9009168
  0.82797356  0.75027404  0.81363055  0.95790752  0.84135767] 0.842903602871
--

In [5]:
x_train.shape

(98197, 45)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(y_train);
plt.hist(y_test);
plt.hist(y_pred);

In [None]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(np.array([0,0,0,0,0,0,0,0]),np.array([0,0,1,0,0,0,0,0])))

In [None]:
x_test.shape