# Imports

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
import pandas as pd
import os

## Constants

In [2]:
parent_dir = r"../data/54ft/"
train_df_name = r"54ft_train.csv"

#feature # binding
F=\
    {1:'stay_duration', #f1
     2:'mfcc0',3:'mfcc1',4:'mfcc2',5:'mfcc3',6:'mfcc4', #f2,f3,f4,f5,f6
     7:'wifi_count', 8:'edge_wifi_count', #f7,f8
     9:'RSI', #f9
     10:'human_made', 11:'natural_land',12:'road_exist_percent',13:'highly_populated_poi_exist'#f10,f11,f12,f13
    }

selected_feat=\
    {
        'Is_Bus_stop':[F[e] for e in [1,2,3,4,5,6,7,8]],
        'Is_Turn':[F[e] for e in [1,2,3,4,5,6,7,8]],
        'Is_Signal':[F[e] for e in [1,2,3,4,5,6,7,8]],
        'Is_Congestion':[F[e] for e in [1,2,3,4,5,6,7,8]],
        'Is_Adhoc':[F[e] for e in [1,2,3,4,5,6,7,8]]
    }


#file Names
zone_names=list(map(lambda e:f"{e}_test_split.csv",["station","dvc_more","54ft_road","junction_mall","prantika_bus_stand"]))

_7day_test=["54ft_test.csv"]

In [3]:
#Modeling function
def get_metrics_from_the_model(train,labels_train,test,labels_test,poi,zone):
    rf=RandomForestClassifier(n_estimators=100,max_depth=8,random_state=42)
    rf.fit(train, labels_train)

    pred_train= rf.predict(train)
    pred_test= rf.predict(test)
    
    performance={'poi':poi,'zone':zone,
                 'tr_acc':accuracy_score(labels_train,pred_train),
                 'tr_precision':precision_score(labels_train,pred_train,average='weighted'),
                 'tr_recall':recall_score(labels_train,pred_train,average='weighted'),
                 'tr_f1-score':f1_score(labels_train,pred_train,average='weighted'),

                 'te_acc':accuracy_score(labels_test,pred_test),
                 'te_precision':precision_score(labels_test, pred_test,average='weighted'),
                 'te_recall':recall_score(labels_test, pred_test,average='weighted'),
                 'te_f1-score':f1_score(labels_test, pred_test,average='weighted')}
    return performance,pred_test
    

## Modeling

In [4]:
perf=[] #performance list


for poi_column in ['Is_Bus_stop','Is_Turn','Is_Signal','Is_Congestion','Is_Adhoc']:
    feature_names=selected_feat[poi_column] #feature selection

    for test_df_name in (zone_names+_7day_test):#iterator name

        zoneName=test_df_name.split('_test_split.csv')[0].split('.')[0]
        #output_result_filename = f"{poi_column}_test_result_{zoneName}_zone.csv"

        #train_data_processing
        train_csv_df = pd.read_csv(os.path.join(parent_dir, train_df_name))
        train_df = train_csv_df[feature_names+[poi_column]]

        #test data processing
        test_csv_df = pd.read_csv(os.path.join(parent_dir, test_df_name))
        test_df = test_csv_df[feature_names+[poi_column]]


        #SMOTE on training data & get features ,labels
        X = train_df[feature_names].copy()
        y = train_df[poi_column].copy()

        smote = SMOTE()
        X_resampled, y_resampled = smote.fit_sample(X, y)
        train_data = X_resampled.copy()
        train_labels = y_resampled.copy()

        #Test set get features & labels
        test_data = test_df[feature_names].values
        test_labels = test_df[poi_column].values

        #Training with Random Forest
        performance,pred_test=get_metrics_from_the_model(train_data,train_labels,test_data,test_labels,poi_column,zoneName)

        #adding to performance list
        perf.append(performance)

        #creating prediction file
        # test_csv_df[f'Prediction {poi_column}']=pred_test
        # result_df=test_csv_df[['start_date','start_time','end_time',f'Prediction {poi_column}']].copy()
        # result_df.columns=['instance_date', 'instance_start_time', 'instance_end_time',f'Prediction {poi_column}']
        # result_df.to_csv(os.path.join(parent_dir, output_result_filename), index=False)

        print(f'Completed for POI:{poi_column} and Zone:{zoneName}') #get results of length:{result_df.shape[0]}')

Completed for POI:Is_Bus_stop and Zone:station
Completed for POI:Is_Bus_stop and Zone:dvc_more
Completed for POI:Is_Bus_stop and Zone:54ft_road
Completed for POI:Is_Bus_stop and Zone:junction_mall
Completed for POI:Is_Bus_stop and Zone:prantika_bus_stand
Completed for POI:Is_Bus_stop and Zone:54ft_test
Completed for POI:Is_Turn and Zone:station
Completed for POI:Is_Turn and Zone:dvc_more
Completed for POI:Is_Turn and Zone:54ft_road
Completed for POI:Is_Turn and Zone:junction_mall
Completed for POI:Is_Turn and Zone:prantika_bus_stand
Completed for POI:Is_Turn and Zone:54ft_test
Completed for POI:Is_Signal and Zone:station
Completed for POI:Is_Signal and Zone:dvc_more
Completed for POI:Is_Signal and Zone:54ft_road
Completed for POI:Is_Signal and Zone:junction_mall
Completed for POI:Is_Signal and Zone:prantika_bus_stand
Completed for POI:Is_Signal and Zone:54ft_test
Completed for POI:Is_Congestion and Zone:station
Completed for POI:Is_Congestion and Zone:dvc_more
Completed for POI:Is_Cong

# Saving Performance for all Modelings

In [5]:
df=pd.DataFrame(perf)
df.to_csv(os.path.join(parent_dir, 'modeling_performance_temporal.csv'),index=False)
df

Unnamed: 0,poi,zone,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,station,0.800333,0.801127,0.800333,0.800202,0.84,0.844118,0.84,0.836111
1,Is_Bus_stop,dvc_more,0.809333,0.810216,0.809333,0.809198,0.649351,0.721179,0.649351,0.667769
2,Is_Bus_stop,54ft_road,0.805,0.806281,0.805,0.804796,0.681564,0.685263,0.681564,0.682884
3,Is_Bus_stop,junction_mall,0.794,0.794301,0.794,0.793947,0.842105,0.848042,0.842105,0.844429
4,Is_Bus_stop,prantika_bus_stand,0.799333,0.79995,0.799333,0.79923,0.701149,0.935751,0.701149,0.796039
5,Is_Bus_stop,54ft_test,0.803667,0.80417,0.803667,0.803585,0.662013,0.678638,0.662013,0.666052
6,Is_Turn,station,0.839814,0.844865,0.839814,0.839225,0.64,0.857725,0.64,0.711842
7,Is_Turn,dvc_more,0.819551,0.824657,0.819551,0.818839,0.571429,0.611003,0.571429,0.583815
8,Is_Turn,54ft_road,0.82448,0.831037,0.82448,0.823606,0.50838,0.82077,0.50838,0.602427
9,Is_Turn,junction_mall,0.825849,0.831289,0.825849,0.825131,0.539474,0.797737,0.539474,0.572574


In [6]:
#NICE

# On only Testing Dataset

In [9]:
df_pref_testset=df[df.zone=='54ft_test'].copy()
df_pref_testset.to_csv(os.path.join(parent_dir, 'modeling_performance_on_testset_temporal.csv'),index=False)
df_pref_testset

Unnamed: 0,poi,zone,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
5,Is_Bus_stop,54ft_test,0.803667,0.80417,0.803667,0.803585,0.662013,0.678638,0.662013,0.666052
11,Is_Turn,54ft_test,0.812979,0.815683,0.812979,0.812578,0.584864,0.648354,0.584864,0.60574
17,Is_Signal,54ft_test,0.886403,0.894262,0.886403,0.885834,0.713446,0.921359,0.713446,0.786196
23,Is_Congestion,54ft_test,0.787492,0.792061,0.787492,0.786658,0.801616,0.950793,0.801616,0.868655
29,Is_Adhoc,54ft_test,0.819034,0.823166,0.819034,0.818454,0.59809,0.597571,0.59809,0.597815


In [10]:
print('Mean f1:',df_pref_testset['te_f1-score'].mean(),'Std f1:',df_pref_testset['te_f1-score'].std())

Mean f1: 0.7048915344955882 Std f1: 0.11856842296933302


In [9]:
#DONE