# Performance

In [1]:
import os
from math import cos, asin, sqrt, pi

import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing  # LabelEncoder / OneHotEncoder are referenced as sklearn.preprocessing.*
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import KFold,StratifiedKFold,RepeatedStratifiedKFold

In [2]:
#helper functions
def distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance between two points given in degrees.

    The result is scaled by 12742000, i.e. the Earth's mean diameter in
    metres (2 x 6371 km), so the returned value is a distance in metres.
    """
    deg_to_rad = pi / 180

    # Latitude and longitude contributions to the haversine term.
    lat_term = 0.5 - cos((lat2 - lat1) * deg_to_rad) / 2
    lon_term = cos(lat1 * deg_to_rad) * cos(lat2 * deg_to_rad) * (1 - cos((lon2 - lon1) * deg_to_rad)) / 2

    haversine = lat_term + lon_term
    return 12742000 * asin(sqrt(haversine))

def cumulative_distance(lat_longs):
    """Return, for every point, its distance from the previous point.

    Despite the name, the values are per-hop distances, not a running total.
    The first point is compared with itself, so its entry is just the
    1e-7 offset.

    Parameters
    ----------
    lat_longs : sequence of (lat, long) pairs
        Must support ``len()`` and indexing (e.g. a list or an (n, 2) array).

    Returns
    -------
    list of float
        One entry per input point; an empty list for empty input.
    """
    # Guard: indexing element 0 of an empty sequence would raise IndexError.
    if len(lat_longs) == 0:
        return []

    hops = []
    prev_lat, prev_long = lat_longs[0]

    for lat, long in lat_longs:
        # The 1e-7 offset keeps every entry strictly positive so callers can
        # divide by it without hitting a zero distance.
        hops.append(distance(lat, long, prev_lat, prev_long) + 1e-7)
        prev_lat, prev_long = lat, long

    return hops

def time_zone_cal(s):
    """Map a time string to a coarse part-of-day label.

    Parameters
    ----------
    s : str
        Time string whose first ``':'``-separated field is the hour.

    Returns
    -------
    str
        'Early_Morning' for hours 7-9, 'Morning' for 10-12, 'Afternoon' for
        13-17, 'Evening' for 18-23 and 'Night' for 0-6.

    Raises
    ------
    ValueError
        If the hour field is not an integer in the range 0-23.
    """
    hour = int(s.split(':')[0])

    if not 0 <= hour <= 23:
        raise ValueError('hour out of range 0-23 in time string: %r' % (s,))

    if 6 < hour <= 9:
        return 'Early_Morning'
    if 9 < hour <= 12:
        return 'Morning'
    if 12 < hour <= 17:
        return 'Afternoon'
    if 17 < hour <= 23:
        return 'Evening'

    # Hours 0-6 matched no branch before, which left the result unbound and
    # raised UnboundLocalError; give them an explicit label instead.
    return 'Night'

In [3]:
# Load the processed bus-trail records
df = pd.read_csv('Processed_Bus_Trail_data_54F.csv')

# Distance of each row from the previous row's (lat, long); the edge-level
# measurements below are expressed per unit of this distance
df['next_hop_distance'] = cumulative_distance(df[['lat', 'long']].values)

# Edge wifi counts -> count per unit distance
for edge_col in ('edge_wifi_count', 'd_edge_wifi_count'):
    df[edge_col] = df[edge_col] / df['next_hop_distance']

# Edge honk durations -> float, then duration per unit distance
for edge_col in ('edge_honk_duration', 'd_edge_honk_duration'):
    df[edge_col] = df[edge_col].apply(float) / df['next_hop_distance']

# Point-level honk durations only need the float conversion
for honk_col in ('honk_duration', 'd_honk_duration'):
    df[honk_col] = df[honk_col].apply(float)

# Part-of-day label derived from the hour of start_time
df['time_zone'] = df['start_time'].apply(time_zone_cal)

# Collapse the sparse POI columns into one flag: 'Yes' when the ceiling of
# their sum is exactly 1 (i.e. the sum lies in (0, 1]), otherwise 'No'
poi_share = df['school'] + df['medical'] + df['other_poi'] + df['park']
df['highly_populated_poi_exist'] = poi_share.apply(
    lambda share: 'Yes' if np.ceil(share) == 1 else 'No'
)

# Combined share of the three road-type columns
df['road_exist_percent'] = df['high_way'] + df['two_way'] + df['one_way']

# Columns carried forward: model features followed by the five Is_* labels
columns = [
    'time_zone', 'stay_duration',
    'wifi_count', 'edge_wifi_count', 'honk_duration',
    'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count', 'd_honk_duration',
    'd_edge_honk_duration', 'human_made', 'natural_land', 'road_exist_percent',
    'highly_populated_poi_exist', 'RSI', 'Is_Bus_stop', 'Is_Turn', 'Is_Signal',
    'Is_Congestion', 'Is_Adhoc',
]

# Independent copy so later edits do not touch df
new_df = df[columns].copy()

# Our Data

In [4]:
# Preview the first rows of the selected feature and label columns.
new_df.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
0,Early_Morning,488,13,0.0,103.0,0.0,13,0.0,103.0,0.0,0.171148,0.657088,0.171764,No,0.0,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
1,Early_Morning,5,0,0.020363,1.0,0.047513,0,0.020363,2.0,0.040726,0.175362,0.642954,0.181684,No,1.762508,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
2,Early_Morning,6,0,0.006551,2.0,0.045856,0,0.006551,3.0,0.039306,0.103336,0.769924,0.12674,No,1.582408,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
3,Early_Morning,15,0,0.020829,8.0,0.020829,0,0.020829,8.0,0.020829,0.073111,0.78193,0.144959,No,1.751993,Bus_stop,Not Turn,Not Signal,Not Congestion,Not Adhoc
4,Early_Morning,26,2,0.002184,8.0,0.022932,2,0.002184,8.0,0.022932,0.057776,0.77358,0.158518,Yes,2.459421,Bus_stop,Not Turn,Not Signal,Not Congestion,Not Adhoc


# Raw-Features

In [5]:
# Model input columns, in the order they appear in the feature matrix
feature_names = [
    'time_zone', 'stay_duration',
    'wifi_count', 'edge_wifi_count', 'honk_duration',
    'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count', 'd_honk_duration',
    'd_edge_honk_duration', 'human_made', 'natural_land', 'road_exist_percent',
    'highly_populated_poi_exist', 'RSI',
]

# Feature matrix as a NumPy array
data = new_df[feature_names].values

# Positions in feature_names of the categorical columns
# (0 -> 'time_zone', 13 -> 'highly_populated_poi_exist')
categorical_features = [0, 13]

# Step 1: replace each categorical column by integer codes, keeping the
# original class names per column index
categorical_names = {}
for feature in categorical_features:
    le = sklearn.preprocessing.LabelEncoder().fit(data[:, feature])
    data[:, feature] = le.transform(data[:, feature])
    categorical_names[feature] = le.classes_

# Step 2: one-hot encode the integer-coded columns; every other column is
# passed through unchanged
encoder = ColumnTransformer(
    transformers=[('encoder', sklearn.preprocessing.OneHotEncoder(), categorical_features)],
    remainder='passthrough',
)
data = encoder.fit_transform(data)
# Left disabled as in the original: densify with data.toarray() only if the
# transformer hands back a sparse matrix
#data=data.toarray()

# Encoding Label

In [6]:
def get_labels_for(poi_column='Is_Bus_stop'):
    """Integer-encode one label column of the module-level ``new_df``.

    Returns a ``(class_names, labels)`` pair: the encoder's class names and
    the encoded label array for every row of ``new_df``.
    """
    raw_labels = new_df[poi_column]
    label_encoder = sklearn.preprocessing.LabelEncoder().fit(raw_labels)
    encoded = label_encoder.transform(raw_labels)
    return label_encoder.classes_, encoded

# Train_On_POI

In [7]:
def get_data(data,labels,train_index,test_index):
    """Slice a feature matrix and label array into train and test parts.

    Returns ``(train_rows, train_labels, test_rows, test_labels)`` selected
    by the given row indices.
    """
    train_rows = data[train_index, :]
    train_labels = labels[train_index]

    test_rows = data[test_index, :]
    test_labels = labels[test_index]

    return train_rows, train_labels, test_rows, test_labels


def get_metrics_from_the_model(train,labels_train,test,labels_test,
                               n_estimators=20,max_depth=8,random_state=None):
    """Fit a random forest on the train split and score both splits.

    Parameters
    ----------
    train, labels_train : training feature matrix and labels.
    test, labels_test : held-out feature matrix and labels.
    n_estimators, max_depth : forest hyper-parameters; the defaults are the
        values this notebook has always used.
    random_state : seed forwarded to ``RandomForestClassifier``. The default
        ``None`` keeps the previous unseeded behaviour; pass an int to make
        the fitted model reproducible.

    Returns
    -------
    dict
        Keys ``'<split>_acc'``, ``'<split>_precision'``, ``'<split>_recall'``
        and ``'<split>_f1-score'`` for ``<split>`` in ``('tr', 'te')``.
        Precision, recall and F1 use ``average='weighted'``.
    """
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                max_depth=max_depth,
                                random_state=random_state)
    rf.fit(train, labels_train)

    performance = {}
    # Train split first, then test split, so key order matches the original.
    for prefix, features, truth in (('tr', train, labels_train),
                                    ('te', test, labels_test)):
        predicted = rf.predict(features)
        performance[prefix + '_acc'] = accuracy_score(truth, predicted)
        performance[prefix + '_precision'] = precision_score(truth, predicted, average='weighted')
        performance[prefix + '_recall'] = recall_score(truth, predicted, average='weighted')
        performance[prefix + '_f1-score'] = f1_score(truth, predicted, average='weighted')
    return performance

In [8]:
def get_performance_of_poi_column(poi_column=None,fold=5,fold_repeat=10,random_state=None):
    """Cross-validate the random forest for one label column.

    Runs repeated stratified k-fold over the module-level ``data`` matrix and
    the labels of ``poi_column``, then summarises every metric returned by
    ``get_metrics_from_the_model`` as mean and standard deviation.

    Parameters
    ----------
    poi_column : str
        Label column of ``new_df``. It must be supplied: the ``None`` default
        is passed straight to ``get_labels_for`` and is not a valid column.
    fold : int
        Number of folds per repetition.
    fold_repeat : int
        Number of repetitions of the stratified k-fold.
    random_state : int or None
        Seed for the fold splitter. ``None`` (default) keeps the previous
        unseeded splits; pass an int for reproducible splits.

    Returns
    -------
    dict
        ``'poi_column'``, ``'Size'`` (``"<n_rows>>>{class: count, ...}"``) and,
        for each metric key, ``{'mean': float, 'std': float}``.
    """
    class_names, labels = get_labels_for(poi_column)

    metric_keys = ('tr_acc', 'tr_precision', 'tr_recall', 'tr_f1-score',
                   'te_acc', 'te_precision', 'te_recall', 'te_f1-score')
    scores = {key: [] for key in metric_keys}

    # Stratified folds keep the class ratio in every split; repeated N times.
    kf = RepeatedStratifiedKFold(n_splits=fold, n_repeats=fold_repeat,
                                 random_state=random_state)

    for train_index, test_index in kf.split(data, labels):
        train, labels_train, test, labels_test = get_data(data, labels, train_index, test_index)
        perf = get_metrics_from_the_model(train, labels_train, test, labels_test)
        for key in metric_keys:
            scores[key].append(perf[key])

    # Rows per encoded class. bincount gives the same numbers as the former
    # len/sum arithmetic for two classes and also works for more than two.
    class_counts = np.bincount(labels, minlength=len(class_names))

    # Cast to builtin str/int/float: NumPy scalars stringify as
    # np.int64(...)/np.float64(...) on NumPy >= 2, which would leak into the
    # 'Size' text and the saved CSV cells.
    size_by_class = {str(name): int(count)
                     for name, count in zip(class_names, class_counts)}

    performance = {'poi_column': poi_column,
                   'Size': str(len(labels)) + ">>" + str(size_by_class)}
    for key in metric_keys:
        performance[key] = {'mean': float(np.mean(scores[key])),
                            'std': float(np.std(scores[key]))}
    return performance

In [9]:
# Cross-validation settings shared by every label column
fold = 5
fold_repeat = 10

# Label columns to evaluate, one model family per column
poi_columns = ('Is_Bus_stop', 'Is_Turn', 'Is_Signal', 'Is_Congestion', 'Is_Adhoc')

# One performance summary dict per label column, in the order above
l = []
for column in poi_columns:
    summary = get_performance_of_poi_column(column, fold=fold, fold_repeat=fold_repeat)
    l.append(summary)

In [10]:
# One row per label column, one column per metric summary
perf = pd.DataFrame(l)

# to_csv does not create missing directories, so make sure ./logs exists
# before writing (no-op when it is already there)
os.makedirs('./logs', exist_ok=True)
perf.to_csv('./logs/rf_performance_kfold_54F_data.csv', index=False)

# Last expression of the cell: render the table
perf

Unnamed: 0,poi_column,Size,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,"3840>>{'Bus_stop': 1506, 'Not Bus_stop': 2334}","{'mean': 0.9060677083333333, 'std': 0.00728851...","{'mean': 0.9062570838923808, 'std': 0.00729954...","{'mean': 0.9060677083333333, 'std': 0.00728851...","{'mean': 0.9053884754581901, 'std': 0.00739852...","{'mean': 0.8287500000000001, 'std': 0.01274244...","{'mean': 0.828761630738504, 'std': 0.013116517...","{'mean': 0.8287500000000001, 'std': 0.01274244...","{'mean': 0.8261000634710034, 'std': 0.01310576..."
1,Is_Turn,"3840>>{'Not Turn': 2819, 'Turn': 1021}","{'mean': 0.90603515625, 'std': 0.0061827058897...","{'mean': 0.9114937379503776, 'std': 0.00486531...","{'mean': 0.90603515625, 'std': 0.0061827058897...","{'mean': 0.9001418755042316, 'std': 0.00733995...","{'mean': 0.8379166666666668, 'std': 0.01185751...","{'mean': 0.8367443924726875, 'std': 0.01324338...","{'mean': 0.8379166666666668, 'std': 0.01185751...","{'mean': 0.8228533702978672, 'std': 0.01459060..."
2,Is_Signal,"3840>>{'Not Signal': 3576, 'Signal': 264}","{'mean': 0.9731575520833334, 'std': 0.00271587...","{'mean': 0.9737678796055476, 'std': 0.00261662...","{'mean': 0.9731575520833334, 'std': 0.00271587...","{'mean': 0.9701061239783276, 'std': 0.00338337...","{'mean': 0.9511458333333335, 'std': 0.00521640...","{'mean': 0.9475659626378907, 'std': 0.00715179...","{'mean': 0.9511458333333335, 'std': 0.00521640...","{'mean': 0.9409987633419558, 'std': 0.00807666..."
3,Is_Congestion,"3840>>{'Congestion': 865, 'Not Congestion': 2975}","{'mean': 0.8517708333333333, 'std': 0.00548259...","{'mean': 0.8692705409226136, 'std': 0.00438071...","{'mean': 0.8517708333333333, 'std': 0.00548259...","{'mean': 0.8236978060995227, 'std': 0.00851714...","{'mean': 0.7997135416666665, 'std': 0.00788030...","{'mean': 0.7804144852653618, 'std': 0.01610972...","{'mean': 0.7997135416666665, 'std': 0.00788030...","{'mean': 0.7555378186196207, 'std': 0.01187274..."
4,Is_Adhoc,"3840>>{'Adhoc': 1314, 'Not Adhoc': 2526}","{'mean': 0.8302864583333333, 'std': 0.00685320...","{'mean': 0.8435021345366568, 'std': 0.00686543...","{'mean': 0.8302864583333333, 'std': 0.00685320...","{'mean': 0.8177841781956312, 'std': 0.00809264...","{'mean': 0.72, 'std': 0.012947606593230106}","{'mean': 0.7112970564051111, 'std': 0.01743664...","{'mean': 0.72, 'std': 0.012947606593230106}","{'mean': 0.6909920517519947, 'std': 0.01543878..."


In [11]:
#NICE