# Performance

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from math import cos, asin, sqrt, pi
from sklearn.model_selection import KFold,StratifiedKFold,RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [2]:
#helper functions
def distance(lat1, lon1, lat2, lon2):
    p = pi/180
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    return 12742000 * asin(sqrt(a))

def cumulative_distance(lat_longs):
    l=[]
    prev_lat,prev_long=lat_longs[0]

    for lat,long in lat_longs:
        l.append(distance(lat,long,prev_lat,prev_long)+1e-7)
        prev_lat=lat
        prev_long=long
        
    return l

def time_zone_cal(s):
    hour=int(s.split(':')[0])

    if 6<hour<=9:
        time_zone='Early_Morning'
    elif 9<hour<=12:
        time_zone='Morning'
    elif 12<hour<=17:
        time_zone='Afternoon'
    elif 17<hour<=23:
        time_zone='Evening'
    return time_zone

In [3]:
#reading Raw Data
df=pd.read_csv('Processed_Bus_Trail_data_8B.csv')

df=df[df.wifi_count!='Not found'].copy()
df.wifi_count=df.wifi_count.apply(float)
df.edge_wifi_count=df.edge_wifi_count.apply(float)

#Normalizing edge values with edge distance
df['next_hop_distance']=cumulative_distance(df[['lat','long']].values)

df['edge_wifi_count']=df.edge_wifi_count/df.next_hop_distance
#df['d_edge_wifi_count']=df.d_edge_wifi_count/df.next_hop_distance

#df['edge_honk_duration']=df.edge_honk_duration.apply(float)/df.next_hop_distance
#df['d_edge_honk_duration']=df.d_edge_honk_duration.apply(float)/df.next_hop_distance

#df['honk_duration']=df.honk_duration.apply(float)
#df['d_honk_duration']=df.d_honk_duration.apply(float)

#calculating timeZone
df['time_zone']=df.start_time.apply(time_zone_cal)

#Rebuilding meaning full features from sparse features
df['highly_populated_poi_exist']=(df.school+df.medical+df.other_poi+df.park).apply(lambda e:'Yes' if np.ceil(e)==1 else 'No')
df['road_exist_percent']=df.high_way+df.two_way+df.one_way

#Selected Columns
#columns=\
#['time_zone','stay_duration',
# 'wifi_count', 'edge_wifi_count', 'honk_duration',
# 'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count','d_honk_duration', 
# 'd_edge_honk_duration', 'human_made', 'natural_land','road_exist_percent',
# 'highly_populated_poi_exist', 'RSI', 'Is_Bus_stop','Is_Turn', 'Is_Signal',
# 'Is_Congestion', 'Is_Adhoc']

columns=\
['time_zone','stay_duration',
 'wifi_count', 'edge_wifi_count',
 'human_made', 'natural_land','road_exist_percent',
 'highly_populated_poi_exist', 'RSI', 'Is_Bus_stop','Is_Turn', 'Is_Signal',
 'Is_Congestion', 'Is_Adhoc']

#New dataFrame is returned
new_df=df[columns].copy()

# Our Data

In [4]:
new_df.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
0,Early_Morning,502,22.0,0.0,0.173161,0.653202,0.173637,No,0.0,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
1,Early_Morning,23,0.0,0.004538,0.061028,0.764945,0.162584,Yes,2.0650003666749184,Bus_stop,Turn,Not Signal,Not Congestion,Not Adhoc
2,Early_Morning,5,2.0,0.009952,0.126635,0.637599,0.234868,Yes,1.306766369102926,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
3,Early_Morning,15,0.0,0.039576,0.128025,0.651821,0.219086,Yes,1.4383484533761353,Bus_stop,Not Turn,Not Signal,Not Congestion,Not Adhoc
4,Early_Morning,16,1.0,0.001964,0.040138,0.796306,0.163556,No,1.2647938983606837,Not Bus_stop,Turn,Not Signal,Not Congestion,Not Adhoc


# Raw-Features

In [5]:
#feature_names=['time_zone','stay_duration',
#               'wifi_count', 'edge_wifi_count', 'honk_duration',
#               'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count','d_honk_duration', 
#               'd_edge_honk_duration', 'human_made', 'natural_land','road_exist_percent',
#               'highly_populated_poi_exist', 'RSI']
feature_names=\
['time_zone','stay_duration',
 'wifi_count', 'edge_wifi_count',
 'human_made', 'natural_land','road_exist_percent',
 'highly_populated_poi_exist', 'RSI']

data=new_df[feature_names].values

#converting categorical features in to integer encoding....
#categorical_features=[0,13]
categorical_features=[0,7]

categorical_names = {}
for feature in categorical_features:
    le = sklearn.preprocessing.LabelEncoder()
    le.fit(data[:, feature])
    data[:, feature] = le.transform(data[:, feature])
    categorical_names[feature] = le.classes_
    
# OneHot Encoding Categorical features
encoder = ColumnTransformer([('encoder',sklearn.preprocessing.OneHotEncoder(),categorical_features)],remainder='passthrough')
data=encoder.fit_transform(data)
#data=data.toarray()

# Encoding Label

In [6]:
def get_labels_for(poi_column='Is_Bus_stop'):
    labels=new_df[poi_column]
    le= sklearn.preprocessing.LabelEncoder()
    le.fit(labels)
    labels = le.transform(labels)
    class_names = le.classes_
    return class_names,labels

# Train_On_POI

In [7]:
def get_data(data,labels,train_index,test_index):
    return data[train_index,:],labels[train_index],data[test_index,:],labels[test_index]


def get_metrics_from_the_model(train,labels_train,test,labels_test):
    rf=RandomForestClassifier(n_estimators=20,max_depth=8)
    rf.fit(train, labels_train)

    pred_train= rf.predict(train)
    pred_test= rf.predict(test)
    
    performance={'tr_acc':accuracy_score(labels_train,pred_train),
                 'tr_precision':precision_score(labels_train,pred_train,average='weighted'),
                 'tr_recall':recall_score(labels_train,pred_train,average='weighted'),
                 'tr_f1-score':f1_score(labels_train,pred_train,average='weighted'),

                 'te_acc':accuracy_score(labels_test,pred_test),
                 'te_precision':precision_score(labels_test, pred_test,average='weighted'),
                 'te_recall':recall_score(labels_test, pred_test,average='weighted'),
                 'te_f1-score':f1_score(labels_test, pred_test,average='weighted')}
    return performance

In [8]:
def get_performance_of_poi_column(poi_column=None,fold=5,fold_repeat=10):
    # Spliting in Train and Test set
    class_names,labels=get_labels_for(poi_column)
    
    train_acc=[]
    train_prec=[]
    train_recall=[]
    train_f1=[]
    
    test_acc=[]
    test_prec=[]
    test_recall=[]
    test_f1=[]
    
    #np.random.seed(0)
    #kf=StratifiedKFold(n_splits=fold) #this K Fold method takes class imbalance into account....
    kf=RepeatedStratifiedKFold(n_splits=fold,n_repeats=fold_repeat) #this will repeat StratifiedKFold N times
    #kf = KFold(n_splits=fold)
    
    for train_index, test_index in kf.split(data,labels):
        train,labels_train,test,labels_test=get_data(data,labels,train_index,test_index)
        perf=get_metrics_from_the_model(train,labels_train,test,labels_test)
        
        train_acc.append(perf['tr_acc'])
        train_prec.append(perf['tr_precision'])
        train_recall.append(perf['tr_recall'])
        train_f1.append(perf['tr_f1-score'])
        
        test_acc.append(perf['te_acc'])
        test_prec.append(perf['te_precision'])
        test_recall.append(perf['te_recall'])
        test_f1.append(perf['te_f1-score'])


    performance={'poi_column':poi_column,

                 'Size':str(len(labels))+">>"+str({class_names[0]:len(labels)-labels.sum(),
                                                   class_names[1]:labels.sum()}),
                 'tr_acc':{'mean':np.mean(train_acc),'std':np.std(train_acc)},
                 'tr_precision':{'mean':np.mean(train_prec),'std':np.std(train_prec)},
                 'tr_recall':{'mean':np.mean(train_recall),'std':np.std(train_recall)},
                 'tr_f1-score':{'mean':np.mean(train_f1),'std':np.std(train_f1)},

                 'te_acc':{'mean':np.mean(test_acc),'std':np.std(test_acc)},
                 'te_precision':{'mean':np.mean(test_prec),'std':np.std(test_prec)},
                 'te_recall':{'mean':np.mean(test_recall),'std':np.std(test_recall)},
                 'te_f1-score':{'mean':np.mean(test_f1),'std':np.std(test_f1)}}
    return performance

In [9]:
#input
fold=5
fold_repeat=10

l=[]
for column in ['Is_Bus_stop','Is_Turn','Is_Signal','Is_Congestion','Is_Adhoc']:
    l.append(get_performance_of_poi_column(column,fold=fold,fold_repeat=fold_repeat))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [10]:
perf=pd.DataFrame(l)
perf.to_csv('./logs/rf_performance_kfold_8B_data.csv',index=False)
perf

Unnamed: 0,poi_column,Size,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,"1769>>{'Bus_stop': 596, 'Not Bus_stop': 1173}","{'mean': 0.9470463955600807, 'std': 0.00451819...","{'mean': 0.9469721366095709, 'std': 0.00456281...","{'mean': 0.9470463955600807, 'std': 0.00451819...","{'mean': 0.9469507425708688, 'std': 0.00456690...","{'mean': 0.8837211312238921, 'std': 0.01643204...","{'mean': 0.8831226720831776, 'std': 0.01679577...","{'mean': 0.8837211312238921, 'std': 0.01643204...","{'mean': 0.8828570864661711, 'std': 0.01713601..."
1,Is_Turn,"1769>>{'Not Turn': 1378, 'Turn': 391}","{'mean': 0.9538715637539679, 'std': 0.00684961...","{'mean': 0.9554739037963518, 'std': 0.00630781...","{'mean': 0.9538715637539679, 'std': 0.00684961...","{'mean': 0.9520611232072456, 'std': 0.00748791...","{'mean': 0.8878459051551673, 'std': 0.01389214...","{'mean': 0.8895329795732659, 'std': 0.01504883...","{'mean': 0.8878459051551673, 'std': 0.01389214...","{'mean': 0.8771900907207383, 'std': 0.01688324..."
2,Is_Signal,"1769>>{'Not Signal': 1702, 'Signal': 67}","{'mean': 0.9984595935397576, 'std': 0.00098608...","{'mean': 0.9984630610433307, 'std': 0.00098233...","{'mean': 0.9984595935397576, 'std': 0.00098608...","{'mean': 0.9984375863795362, 'std': 0.00101028...","{'mean': 0.9879022422816536, 'std': 0.00466484...","{'mean': 0.9878926758846738, 'std': 0.00468622...","{'mean': 0.9879022422816536, 'std': 0.00466484...","{'mean': 0.9866964940470448, 'std': 0.00571106..."
3,Is_Congestion,"1769>>{'Congestion': 18, 'Not Congestion': 1751}","{'mean': 0.9937819468567207, 'std': 0.00112078...","{'mean': 0.9938220120460112, 'std': 0.00110642...","{'mean': 0.9937819468567207, 'std': 0.00112078...","{'mean': 0.992342646539994, 'std': 0.001728428...","{'mean': 0.9898257070149326, 'std': 0.00137803...","{'mean': 0.9797568292421721, 'std': 0.00272880...","{'mean': 0.9898257070149326, 'std': 0.00137803...","{'mean': 0.9847650538327545, 'std': 0.00206018..."
4,Is_Adhoc,"1769>>{'Adhoc': 953, 'Not Adhoc': 816}","{'mean': 0.9228519394701642, 'std': 0.00809854...","{'mean': 0.922940249777843, 'std': 0.008091096...","{'mean': 0.9228519394701642, 'std': 0.00809854...","{'mean': 0.9228178975593678, 'std': 0.00811693...","{'mean': 0.8206931707239, 'std': 0.01783252661...","{'mean': 0.821211393091059, 'std': 0.017905700...","{'mean': 0.8206931707239, 'std': 0.01783252661...","{'mean': 0.8203755660184099, 'std': 0.01790537..."


In [11]:
#NICE