In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from calculate_confusion_matrix import calculate_confusion_matrix
import time
from copy import deepcopy
from utils import one_hot_encoder

save_results = True
do_stratify = False

n_boot = 100
split = 0.7
np.random.seed(seed=0)

ft_dir = 'features_long/'

# list feature files
files = os.listdir(ft_dir)
# files.remove('1515656.dat')
# print('subject 1515656 removed due to spoofed gps.')

# reading top locations
with open('top_locations.dat', 'rb') as f:
    location_top = pickle.load(f)
f.close()

feature_all = []
target_all = []

for filename in files:
    with open(ft_dir+filename, 'rb') as f:  
        feature, target = pickle.load(f)

        # only keeping locations in location_top
        ind = np.array([], int)
        for (i,loc) in enumerate(target['location']):
            if loc in location_top:
                ind = np.append(ind, i)
        feature = feature.loc[ind,:]
        target = target.loc[ind]
        feature = feature.reset_index(drop=True)
        target = target.reset_index(drop=True)
        
        feature_all.append(feature)
        target_all.append(target)
        
    f.close()

confs = []
aucs = []
labels = []
inds = np.arange(0,len(feature_all),1)
inds_split = int(np.floor(split*len(feature_all)))

for i in range(n_boot):
    
    print('------------------')
    print(i)
#     if i==6:
#         print 'subject skipped because of lack of data'
#         continue
    
    # training set
    np.random.shuffle(inds)
    ind_train = inds[:inds_split]
    ind_test = inds[inds_split:]
    
    x_train = pd.concat([feature_all[j] for j in ind_train], axis=0)
    y_train = pd.concat([target_all[j]['location'] for j in ind_train], axis=0)
    x_train = x_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    
    # test set
    x_test = pd.concat([feature_all[j] for j in ind_test], axis=0)
    y_test = pd.concat([target_all[j]['location'] for j in ind_test], axis=0)
    x_test = x_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    
    # remove foursquare features
    x_train = x_train.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7','fsq 8','fsq distance'],axis=1)
    x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7','fsq 8','fsq distance'],axis=1)
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    
    # model (sensor)
    gbm = xgb.XGBClassifier(max_depth=6, n_estimators=50, learning_rate=0.05, nthread=12, subsample=0.25, \
                        colsample_bytree=0.5, max_delta_step=0, gamma=3, objective='mlogloss', reg_alpha=0.5, \
                        missing=np.nan)
    # model (sensor + foursquare)
#     gbm = xgb.XGBClassifier(max_depth=6, n_estimators=75, learning_rate=0.05, nthread=12, subsample=0.25, \
#                         colsample_bytree=0.2, max_delta_step=0, gamma=3, objective='mlogloss', reg_alpha=0.5, \
#                         missing=np.nan)
    
    # fitting model
#     gbm.fit(x_train, y_train, eval_set=[(x_train,y_train),(x_test, y_test)], eval_metric='mlogloss', verbose=True)
#     print gbm.evals_result()
    gbm.fit(x_train, y_train)
    
    # training performance
    y_pred = gbm.predict(x_train)
    conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

    # test
    y_pred = gbm.predict(x_test)
    conf, roc_auc = calculate_confusion_matrix(y_pred, y_test)
    
    labels.append(np.unique(y_test))
    confs.append(conf)
    aucs.append(roc_auc)

    print(np.unique(y_test))
    print(roc_auc_train, np.nanmean(roc_auc_train))
    print(roc_auc, np.nanmean(roc_auc))

# saving the results
if save_results:
    with open('auc_location_new_10fold3_1515656removed_sensor.dat','wb') as f:
        pickle.dump([aucs, confs, labels], f)
    f.close()




------------------
0
["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.66598778  0.55990159  0.72941539  0.9497208   0.63467255  0.55915709
  0.59896687  0.84165839  0.52666667  0.51860088  0.88351558] 0.678933052448
[ 0.60002663  0.50645887  0.64502279  0.92823747  0.52461317  0.5
  0.53786062  0.79588927  0.5         0.5         0.80499874] 0.622100686879
------------------
1
["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.70300678  0.56302895  0.73445271  0.94683704  0.65144579  0.51388889
  0.61650957  0.84422384  0.5862069   0.50306748  0.87483855] 0.685227862298
[ 0.60096233  0.50614982  0.63369812  0.90012821  0.5152418   0.49992824
  0.5844229   0.76859975  0.5         0.5         0.71883501] 0.

["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.71201271  0.55875688  0.70605695  0.94755261  0.67065353  0.56129944
  0.63801084  0.84715376  0.51293103  0.55164319  0.86818201] 0.688568452381
[ 0.57571522  0.49969884  0.64482708  0.89532198  0.51962462  0.51724547
  0.54252141  0.76133738  0.5         0.5         0.70834627] 0.605876207122
------------------
18
["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.69168052  0.54179626  0.7462927   0.95253806  0.6591899   0.56071556
  0.61258841  0.84411162  0.52013423  0.54497354  0.89236909] 0.68785362672
[ 0.56264276  0.51188348  0.6347856   0.91489993  0.49942707  0.5
  0.55311809  0.7928055   0.5         0.5         0.76049592] 0.611823486786
--------

["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.69850374  0.5646133   0.73257523  0.95048434  0.66517743  0.5460021
  0.6126202   0.84808571  0.6         0.53895856  0.8892967 ] 0.6951197569
[ 0.58004922  0.51489829  0.64337745  0.93097368  0.54853643  0.5
  0.59538298  0.80146186  0.5         0.5         0.79559825] 0.628207104859
------------------
35
["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.68909338  0.54501175  0.72780372  0.94888726  0.65040338  0.57458364
  0.6035636   0.84218431  0.57988166  0.54929577  0.86955394] 0.689114764196
[ 0.59609779  0.51958599  0.63811923  0.89337727  0.54447482  0.50634263
  0.55891975  0.8079004   0.5         0.5         0.70063265] 0.615040957723
----------

["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.71918091  0.55520865  0.70706112  0.95194246  0.66457085  0.54115226
  0.59339886  0.84672634  0.53793103  0.55384615  0.87665192] 0.686151869977
[ 0.59373222  0.51991141  0.61472509  0.90696046  0.52485551  0.5
  0.56784623  0.79035669  0.5         0.5         0.71993977] 0.612575216078
------------------
52
["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.69940468  0.56513539  0.73458427  0.95063367  0.66805593  0.57240795
  0.61517355  0.84121928  0.59196575  0.56060606  0.88356593] 0.698432041021
[ 0.57247281  0.51373752  0.64408732  0.91834543  0.51542624  0.5
  0.55945962  0.77053165  0.5         0.5         0.82198212] 0.619640247278
--------------

["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.69880596  0.55933941  0.7221423   0.94406953  0.62039105  0.59054225
  0.62285918  0.84614818  0.5777027   0.55095478  0.8720597 ] 0.691365003131
[ 0.59873375  0.49976884  0.6392035   0.93545805  0.5157191   0.4991511
  0.52314566  0.81053599  0.5         0.5         0.77278718] 0.617682107368
------------------
69
["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.68539934  0.56331374  0.71375751  0.94795825  0.6303503   0.55565026
  0.61316477  0.8533857   0.59        0.55339806  0.88582005] 0.69019981482
[ 0.58880467  0.51053142  0.62581029  0.92574418  0.54345554  0.49992607
  0.55449959  0.78921699  0.5         0.49985346  0.77226706] 0.619100843398
--

["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.68145573  0.59059821  0.74765219  0.94492523  0.54198473  0.55697385
  0.6045697   0.84126329  0.5942029   0.55136986  0.88813994] 0.685739602933
[ 0.5702909   0.49951192  0.64679076  0.91546814  0.52934202  0.49992996
  0.55755306  0.80584629  0.5         0.5         0.69358942] 0.610756588885
------------------
86
["Another's Home" 'Arts & Entertainment' 'Food' 'Home' 'Nightlife Spot'
 'Outdoors & Recreation' 'Professional Or Medical Office' 'Shop Or Store'
 'Spiritual' 'Travel Or Transport' 'Work']
[ 0.71476156  0.55045995  0.71644397  0.95154121  0.65146235  0.56520711
  0.61401998  0.85621403  0.50813008  0.54938272  0.88063747] 0.687114584234
[ 0.59094591  0.51324927  0.61406425  0.91998445  0.52301278  0.50374662
  0.52592095  0.78427636  0.5         0.5         0.78035057] 0.614141013685


In [None]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(np.array([1,0,1,0,0,0,0,0]),np.array([0,0,0,0,0,0,0,0])))

In [None]:
files