# Imports

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
import pandas as pd
import os

## Constants

In [2]:
parent_dir = r"../data/54ft/"
train_df_name = r"54ft_train.csv"

#feature # binding
F=\
    {1:'stay_duration', #f1
     2:'mfcc0',3:'mfcc1',4:'mfcc2',5:'mfcc3',6:'mfcc4', #f2,f3,f4,f5,f6
     7:'wifi_count', 8:'edge_wifi_count', #f7,f8
     9:'RSI', #f9
     10:'human_made', 11:'natural_land',12:'road_exist_percent',13:'highly_populated_poi_exist'#f10,f11,f12,f13
    }

selected_feat=\
    {
        'Is_Bus_stop':[F[e] for e in [9,10,11,12,13]],
        'Is_Turn':[F[e] for e in [9,10,11,12,13]],
        'Is_Signal':[F[e] for e in [9,10,11,12,13]],
        'Is_Congestion':[F[e] for e in [9,10,11,12,13]],
        'Is_Adhoc':[F[e] for e in [9,10,11,12,13]]
    }


#file Names
zone_names=list(map(lambda e:f"{e}_test_split.csv",["station","dvc_more","54ft_road","junction_mall","prantika_bus_stand"]))

_7day_test=["54ft_test.csv"]

In [3]:
#Modeling function (md#6)
def get_metrics_from_the_model(train,labels_train,test,labels_test,poi,zone):
    rf=RandomForestClassifier(n_estimators=100,max_depth=8,random_state=42) 
    rf.fit(train, labels_train)

    pred_train= rf.predict(train)
    pred_test= rf.predict(test)
    
    performance={'poi':poi,'zone':zone,
                 'tr_acc':accuracy_score(labels_train,pred_train),
                 'tr_precision':precision_score(labels_train,pred_train,average='weighted'),
                 'tr_recall':recall_score(labels_train,pred_train,average='weighted'),
                 'tr_f1-score':f1_score(labels_train,pred_train,average='weighted'),

                 'te_acc':accuracy_score(labels_test,pred_test),
                 'te_precision':precision_score(labels_test, pred_test,average='weighted'),
                 'te_recall':recall_score(labels_test, pred_test,average='weighted'),
                 'te_f1-score':f1_score(labels_test, pred_test,average='weighted')}
    return performance,pred_test
    

## Modeling

In [4]:
perf=[] #performance list


for poi_column in ['Is_Bus_stop','Is_Turn','Is_Signal','Is_Congestion','Is_Adhoc']:
    feature_names=selected_feat[poi_column] #feature selection

    for test_df_name in (zone_names+_7day_test):#iterator name

        zoneName=test_df_name.split('_test_split.csv')[0].split('.')[0]
        #output_result_filename = f"{poi_column}_test_result_{zoneName}_zone.csv"

        #train_data_processing
        train_csv_df = pd.read_csv(os.path.join(parent_dir, train_df_name))
        train_df = train_csv_df[feature_names+[poi_column]]

        #test data processing
        test_csv_df = pd.read_csv(os.path.join(parent_dir, test_df_name))
        test_df = test_csv_df[feature_names+[poi_column]]


        #SMOTE on training data & get features ,labels
        X = train_df[feature_names].copy()
        y = train_df[poi_column].copy()

        smote = SMOTE()
        X_resampled, y_resampled = smote.fit_sample(X, y)
        train_data = X_resampled.copy()
        train_labels = y_resampled.copy()

        #Test set get features & labels
        test_data = test_df[feature_names].values
        test_labels = test_df[poi_column].values

        #Training with Random Forest
        performance,pred_test=get_metrics_from_the_model(train_data,train_labels,test_data,test_labels,poi_column,zoneName)

        #adding to performance list
        perf.append(performance)

        #creating prediction file
        # test_csv_df[f'Prediction {poi_column}']=pred_test
        # result_df=test_csv_df[['start_date','start_time','end_time',f'Prediction {poi_column}']].copy()
        # result_df.columns=['instance_date', 'instance_start_time', 'instance_end_time',f'Prediction {poi_column}']
        # result_df.to_csv(os.path.join(parent_dir, output_result_filename), index=False)

        print(f'Completed for POI:{poi_column} and Zone:{zoneName}') #get results of length:{result_df.shape[0]}')

Completed for POI:Is_Bus_stop and Zone:station
Completed for POI:Is_Bus_stop and Zone:dvc_more
Completed for POI:Is_Bus_stop and Zone:54ft_road
Completed for POI:Is_Bus_stop and Zone:junction_mall
Completed for POI:Is_Bus_stop and Zone:prantika_bus_stand
Completed for POI:Is_Bus_stop and Zone:54ft_test
Completed for POI:Is_Turn and Zone:station
Completed for POI:Is_Turn and Zone:dvc_more
Completed for POI:Is_Turn and Zone:54ft_road
Completed for POI:Is_Turn and Zone:junction_mall
Completed for POI:Is_Turn and Zone:prantika_bus_stand
Completed for POI:Is_Turn and Zone:54ft_test
Completed for POI:Is_Signal and Zone:station
Completed for POI:Is_Signal and Zone:dvc_more
Completed for POI:Is_Signal and Zone:54ft_road
Completed for POI:Is_Signal and Zone:junction_mall
Completed for POI:Is_Signal and Zone:prantika_bus_stand
Completed for POI:Is_Signal and Zone:54ft_test
Completed for POI:Is_Congestion and Zone:station
Completed for POI:Is_Congestion and Zone:dvc_more
Completed for POI:Is_Cong

# Saving Performance for all Modelings

In [5]:
df=pd.DataFrame(perf)
df.to_csv(os.path.join(parent_dir, 'modeling_performance_spatial.csv'),index=False)
df

Unnamed: 0,poi,zone,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,station,0.868333,0.871341,0.868333,0.868066,0.64,0.736765,0.64,0.631884
1,Is_Bus_stop,dvc_more,0.867,0.867824,0.867,0.866925,0.87013,0.867023,0.87013,0.865534
2,Is_Bus_stop,54ft_road,0.865333,0.867382,0.865333,0.865145,0.793296,0.847156,0.793296,0.774327
3,Is_Bus_stop,junction_mall,0.885,0.88636,0.885,0.884899,0.855263,0.879026,0.855263,0.834285
4,Is_Bus_stop,prantika_bus_stand,0.872667,0.873576,0.872667,0.872589,0.942529,0.93144,0.942529,0.936952
5,Is_Bus_stop,54ft_test,0.870667,0.873217,0.870667,0.870445,0.821455,0.83499,0.821455,0.823527
6,Is_Turn,station,0.853505,0.859459,0.853505,0.852896,0.94,0.94375,0.94,0.928111
7,Is_Turn,dvc_more,0.854053,0.860329,0.854053,0.853414,0.675325,0.636073,0.675325,0.626214
8,Is_Turn,54ft_road,0.856243,0.860438,0.856243,0.855824,0.944134,0.959313,0.944134,0.948501
9,Is_Turn,junction_mall,0.850219,0.855425,0.850219,0.849669,0.921053,0.918991,0.921053,0.919068


In [6]:
#NICE

# On only Testing Dataset

In [9]:
df_pref_testset=df[df.zone=='54ft_test'].copy()
df_pref_testset.to_csv(os.path.join(parent_dir, 'modeling_performance_on_testset_spatial.csv'),index=False)
df_pref_testset

Unnamed: 0,poi,zone,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
5,Is_Bus_stop,54ft_test,0.870667,0.873217,0.870667,0.870445,0.821455,0.83499,0.821455,0.823527
11,Is_Turn,54ft_test,0.853505,0.858857,0.853505,0.852957,0.806025,0.856384,0.806025,0.815665
17,Is_Signal,54ft_test,0.885969,0.90468,0.885969,0.884635,0.754592,0.940712,0.754592,0.816403
23,Is_Congestion,54ft_test,0.803886,0.808161,0.803886,0.803203,0.576782,0.956554,0.576782,0.71099
29,Is_Adhoc,54ft_test,0.807102,0.807925,0.807102,0.806973,0.698751,0.696862,0.698751,0.695918


In [10]:
print('Mean f1:',df_pref_testset['te_f1-score'].mean(),'Std f1:',df_pref_testset['te_f1-score'].std())

Mean f1: 0.772500657088298 Std f1: 0.0633297170435552


In [9]:
#DONE