# Performance

In [1]:
from math import asin, cos, pi, sqrt
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold, RepeatedStratifiedKFold, StratifiedKFold

In [2]:
#helper functions
def distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance scaled by 12742000; that constant looks like Earth's mean
        diameter in metres (2 * 6,371,000), so the result is presumably in
        metres.
    """
    deg_to_rad = pi/180

    # The two halves of the haversine term, evaluated in the same order as a
    # single combined expression would be.
    lat_term = cos((lat2-lat1)*deg_to_rad)/2
    lon_term = cos(lat1*deg_to_rad) * cos(lat2*deg_to_rad) * (1-cos((lon2-lon1)*deg_to_rad))/2
    haversine = 0.5 - lat_term + lon_term

    return 12742000 * asin(sqrt(haversine))

def cumulative_distance(lat_longs):
    """Distance from each point to the point before it.

    Despite the name, the values are per-hop distances, not a running total.

    Parameters
    ----------
    lat_longs : indexable sequence of (lat, long) pairs
        Points in travel order. Must be non-empty: the first element is read
        up front, so an empty sequence raises ``IndexError``.

    Returns
    -------
    list of float
        One entry per input point: ``distance(point, previous point) + 1e-7``.
        The first point is paired with itself. The ``1e-7`` offset keeps every
        entry strictly positive so the values can be used as divisors.
    """
    first_lat,first_long=lat_longs[0]

    # Same points shifted by one position, with the first point duplicated at
    # the front so that element i lines up with the predecessor of point i.
    predecessors=[(first_lat,first_long)]
    predecessors.extend((lat,long) for lat,long in lat_longs)

    return [distance(lat,long,prev_lat,prev_long)+1e-7
            for (lat,long),(prev_lat,prev_long) in zip(lat_longs,predecessors)]

def time_zone_cal(s):
    """Map a clock-time string to a coarse time-of-day label.

    Parameters
    ----------
    s : str
        Time string whose first ``':'``-separated field is the hour,
        e.g. ``'08:15:00'``.

    Returns
    -------
    str
        ``'Early_Morning'`` for hours 7-9, ``'Morning'`` for 10-12,
        ``'Afternoon'`` for 13-17, ``'Evening'`` for 18-23.

    Raises
    ------
    ValueError
        If the hour field is not an integer, or if the hour falls outside
        7-23 (hours 0-6 have no label). Previously the uncovered hours
        surfaced as an ``UnboundLocalError`` on the return statement.
    """
    hour=int(s.split(':')[0])

    # (exclusive lower bound, inclusive upper bound, label)
    bands=((6,9,'Early_Morning'),
           (9,12,'Morning'),
           (12,17,'Afternoon'),
           (17,23,'Evening'))

    for lower,upper,label in bands:
        if lower<hour<=upper:
            return label

    raise ValueError('time_zone_cal: hour %d in %r is outside the labelled range 7-23' % (hour, s))


#reading Raw Data
def process_df(name):
    """Load a processed bus-trail CSV and build the modelling frame.

    Parameters
    ----------
    name : str
        Path of the CSV file passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A copy holding the selected feature columns plus the five ``Is_*``
        label columns. Rows whose ``wifi_count`` equals the string
        ``'Not found'`` are removed.

    Notes
    -----
    The honk-duration and ``d_*`` features are not used at the moment, so
    they are neither converted nor returned.
    """
    frame=pd.read_csv(name)

    # Keep only rows that carry a wifi reading, then make both wifi columns numeric.
    frame=frame[frame.wifi_count!='Not found'].copy()
    frame['wifi_count']=frame.wifi_count.apply(float)
    frame['edge_wifi_count']=frame.edge_wifi_count.apply(float)

    # Normalise the edge value by the length of the hop leading to each row.
    # The hop length is computed over the rows that survived the filter above.
    hop_lengths=cumulative_distance(frame[['lat','long']].values)
    frame['next_hop_distance']=hop_lengths
    frame['edge_wifi_count']=frame.edge_wifi_count/frame.next_hop_distance

    # Time-of-day label derived from the start time.
    frame['time_zone']=frame.start_time.apply(time_zone_cal)

    # Collapse the sparse POI columns into one flag: 'Yes' exactly when the
    # summed value rounds up to 1 (a sum of 0, or a sum above 1, gives 'No').
    def poi_flag(poi_total):
        if np.ceil(poi_total)==1:
            return 'Yes'
        return 'No'

    poi_total=frame.school+frame.medical+frame.other_poi+frame.park
    frame['highly_populated_poi_exist']=poi_total.apply(poi_flag)

    # Collapse the three road-type columns into a single value.
    frame['road_exist_percent']=frame.high_way+frame.two_way+frame.one_way

    # Features first, then the POI label columns.
    selected_columns=[
        'time_zone','stay_duration',
        'wifi_count','edge_wifi_count',
        'human_made','natural_land','road_exist_percent',
        'highly_populated_poi_exist','RSI',
        'Is_Bus_stop','Is_Turn','Is_Signal',
        'Is_Congestion','Is_Adhoc',
    ]

    return frame[selected_columns].copy()

# Our Data

In [3]:
# Build the training frame from the 54F file and the held-out test frame from the 8B file.
train_df=process_df('Processed_Bus_Trail_data_54F.csv')
test_df=process_df('Processed_Bus_Trail_data_8B.csv')

  result = method(y)


# Raw-Features

In [4]:
def process_features(train_df,test_df):
    """Encode the feature columns of both frames into model-ready matrices.

    The two frames are stacked before encoding, so the label and one-hot
    encoders are fitted on train and test together and both outputs share
    the same column layout.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing at least the columns listed in ``feature_names``.

    Returns
    -------
    tuple
        ``(train, test)``: the encoded rows belonging to ``train_df`` and to
        ``test_df`` respectively, split by position after encoding.
    """
    # Imported explicitly: reaching these through the bare ``sklearn`` name
    # only works when some other import has already loaded the
    # ``sklearn.preprocessing`` submodule as a side effect.
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    feature_names=\
        ['time_zone','stay_duration',
         'wifi_count', 'edge_wifi_count',
         'human_made', 'natural_land','road_exist_percent',
         'highly_populated_poi_exist', 'RSI']

    # Positions of the categorical columns inside ``feature_names``
    # (evaluates to [0, 7]); derived by name so the indices stay correct
    # if the feature list is edited.
    categorical_columns=['time_zone','highly_populated_poi_exist']
    categorical_features=[feature_names.index(column) for column in categorical_columns]

    data=pd.concat([train_df,test_df],axis=0)[feature_names].values

    # Replace each categorical column with integer codes, in place.
    for feature in categorical_features:
        data[:, feature] = LabelEncoder().fit_transform(data[:, feature])

    # One-hot encode the categorical columns; all other columns pass through unchanged.
    encoder = ColumnTransformer([('encoder',OneHotEncoder(),categorical_features)],remainder='passthrough')
    data=encoder.fit_transform(data)

    n_train=train_df.shape[0]
    return data[:n_train],data[n_train:]

In [5]:
# Encode both frames in one call; process_features stacks them before encoding and splits the result back.
train,test=process_features(train_df,test_df)

# Encoding Label

In [6]:
def get_labels_for(train_df,test_df,poi_column='Is_Bus_stop'):
    """Integer-encode one label column across the train and test frames.

    The encoder is fitted on the stacked column, so both splits use the same
    class-to-integer mapping.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that both contain ``poi_column``.
    poi_column : str, default 'Is_Bus_stop'
        Name of the label column to encode.

    Returns
    -------
    tuple
        ``(class_names, train_labels, test_labels)`` where ``class_names`` is
        the encoder's ``classes_`` array (position = integer code) and the
        two label arrays are the encoded values split by position.
    """
    # Imported explicitly: reaching it through the bare ``sklearn`` name only
    # works when some other import has already loaded the
    # ``sklearn.preprocessing`` submodule as a side effect.
    from sklearn.preprocessing import LabelEncoder

    labels=pd.concat([train_df,test_df],axis=0)[poi_column]

    le=LabelEncoder()
    encoded=le.fit_transform(labels)

    n_train=train_df.shape[0]
    return le.classes_,encoded[:n_train],encoded[n_train:]

# Train_On_POI

In [7]:
def _class_counts(class_names,labels):
    """Return ``{class name: sample count}`` for integer-encoded ``labels``."""
    # bincount handles any number of classes; plain str/int keep the printed
    # summary free of NumPy scalar reprs.
    counts=np.bincount(labels,minlength=len(class_names))
    return {str(name):int(count) for name,count in zip(class_names,counts)}


def _split_performance(prefix,size_key,class_names,labels,predictions):
    """Return the size summary and weighted metrics of one split, keyed with ``prefix``."""
    return {size_key:str(len(labels))+">>"+str(_class_counts(class_names,labels)),
            prefix+'_acc':accuracy_score(labels,predictions),
            prefix+'_precision':precision_score(labels,predictions,average='weighted'),
            prefix+'_recall':recall_score(labels,predictions,average='weighted'),
            prefix+'_f1-score':f1_score(labels,predictions,average='weighted')}


def get_performance_of_poi_column(poi_column=None):
    """Train a random forest for one POI label and report train/test metrics.

    Reads the module-level ``train`` and ``test`` feature matrices and the
    module-level ``train_df`` and ``test_df`` frames; those cells must have
    been run first.

    Parameters
    ----------
    poi_column : str
        Name of the label column to predict. The ``None`` default is only a
        placeholder: it is forwarded to ``get_labels_for`` as the column name.

    Returns
    -------
    dict
        ``'poi_column'`` followed by, for the train split, ``'train_size'``,
        ``'tr_acc'``, ``'tr_precision'``, ``'tr_recall'``, ``'tr_f1-score'``,
        and the corresponding ``'test_size'`` / ``'te_*'`` keys for the test
        split. Each size entry is ``"<n>>>{class: count, ...}"``.
        Precision, recall and F1 use ``average='weighted'``.
    """
    # Encode the requested label column for both splits.
    class_names,labels_train,labels_test=get_labels_for(train_df,test_df,poi_column)

    # Seed the estimator itself rather than reseeding NumPy's global RNG, so
    # the fit is reproducible without touching global state. An integer
    # random_state of 1 should draw the same stream as np.random.seed(1)
    # followed by random_state=None.
    rf=RandomForestClassifier(n_estimators=20,max_depth=8,random_state=1)
    rf.fit(train, labels_train)

    performance={'poi_column':poi_column}
    performance.update(_split_performance('tr','train_size',class_names,labels_train,rf.predict(train)))
    performance.update(_split_performance('te','test_size',class_names,labels_test,rf.predict(test)))
    return performance

In [8]:
# Fit and score one model per POI label column; `l` collects one result dict per column.
l=[]
for column in ['Is_Bus_stop','Is_Turn','Is_Signal','Is_Congestion','Is_Adhoc']:
    l.append(get_performance_of_poi_column(column))

In [9]:
# One row per POI label column, one column per reported metric.
perf=pd.DataFrame(l)

# Create the log directory if it is missing so a fresh checkout can run this
# cell without a manual mkdir; to_csv does not create parent directories.
perf_path=Path('./logs/rf_performance_train_54F_test_8B.csv')
perf_path.parent.mkdir(parents=True,exist_ok=True)
perf.to_csv(perf_path)
perf

Unnamed: 0,poi_column,train_size,tr_acc,tr_precision,tr_recall,tr_f1-score,test_size,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,"3840>>{'Bus_stop': 1506, 'Not Bus_stop': 2334}",0.902604,0.902497,0.902604,0.902025,"1769>>{'Bus_stop': 596, 'Not Bus_stop': 1173}",0.671001,0.657574,0.671001,0.661645
1,Is_Turn,"3840>>{'Not Turn': 2819, 'Turn': 1021}",0.927083,0.927257,0.927083,0.92499,"1769>>{'Not Turn': 1378, 'Turn': 391}",0.745619,0.678481,0.745619,0.698279
2,Is_Signal,"3840>>{'Not Signal': 3576, 'Signal': 264}",0.973177,0.973573,0.973177,0.970265,"1769>>{'Not Signal': 1702, 'Signal': 67}",0.957038,0.925499,0.957038,0.941004
3,Is_Congestion,"3840>>{'Congestion': 865, 'Not Congestion': 2975}",0.847917,0.859143,0.847917,0.82039,"1769>>{'Congestion': 18, 'Not Congestion': 1751}",0.964952,0.979496,0.964952,0.97217
4,Is_Adhoc,"3840>>{'Adhoc': 1314, 'Not Adhoc': 2526}",0.832031,0.846926,0.832031,0.81926,"1769>>{'Adhoc': 953, 'Not Adhoc': 816}",0.535331,0.635324,0.535331,0.471615


In [10]:
#NICE