# Imports

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
import pandas as pd
import os

## Constants

In [2]:
parent_dir = r"../data/54ft/"
train_df_name = r"54ft_train.csv"

#feature # binding
F=\
    {1:'stay_duration', #f1
     2:'mfcc0',3:'mfcc1',4:'mfcc2',5:'mfcc3',6:'mfcc4', #f2,f3,f4,f5,f6
     7:'wifi_count', 8:'edge_wifi_count', #f7,f8
     9:'RSI', #f9
     10:'human_made', 11:'natural_land',12:'road_exist_percent',13:'highly_populated_poi_exist'#f10,f11,f12,f13
    }

selected_feat=\
    {
        'Is_Bus_stop':[F[e] for e in [10,12,11,1,8]],
        'Is_Turn':[F[e] for e in [10,12,11,1,8,9]],
        'Is_Signal':[F[e] for e in [12,11,10,8,9]],
        'Is_Congestion':[F[e] for e in [2,11,10,9,12,8,6,1]],
        'Is_Adhoc':[F[e] for e in [10,11,1,12,2]]
    }


#file Names
zone_names=list(map(lambda e:f"{e}_test_split.csv",["station","dvc_more","54ft_road","junction_mall","prantika_bus_stand"]))

_7day_test=["54ft_test.csv"]

In [3]:
#Modeling function
def get_metrics_from_the_model(train,labels_train,test,labels_test,poi,zone):
    rf=RandomForestClassifier(n_estimators=100,max_depth=8,random_state=42)
    rf.fit(train, labels_train)

    pred_train= rf.predict(train)
    pred_test= rf.predict(test)
    
    performance={'poi':poi,'zone':zone,
                 'tr_acc':accuracy_score(labels_train,pred_train),
                 'tr_precision':precision_score(labels_train,pred_train,average='weighted'),
                 'tr_recall':recall_score(labels_train,pred_train,average='weighted'),
                 'tr_f1-score':f1_score(labels_train,pred_train,average='weighted'),

                 'te_acc':accuracy_score(labels_test,pred_test),
                 'te_precision':precision_score(labels_test, pred_test,average='weighted'),
                 'te_recall':recall_score(labels_test, pred_test,average='weighted'),
                 'te_f1-score':f1_score(labels_test, pred_test,average='weighted')}
    return performance,pred_test
    

## Modeling

In [4]:
perf=[] #performance list


for poi_column in ['Is_Bus_stop','Is_Turn','Is_Signal','Is_Congestion','Is_Adhoc']:
    feature_names=selected_feat[poi_column] #feature selection

    for test_df_name in (zone_names+_7day_test):#iterator name

        zoneName=test_df_name.split('_test_split.csv')[0].split('.')[0]
        output_result_filename = f"{poi_column}_test_result_{zoneName}_zone.csv"

        #train_data_processing
        train_csv_df = pd.read_csv(os.path.join(parent_dir, train_df_name))
        train_df = train_csv_df[feature_names+[poi_column]]

        #test data processing
        test_csv_df = pd.read_csv(os.path.join(parent_dir, test_df_name))
        test_df = test_csv_df[feature_names+[poi_column]]


        #SMOTE on training data & get features ,labels
        X = train_df[feature_names].copy()
        y = train_df[poi_column].copy()

        smote = SMOTE()
        X_resampled, y_resampled = smote.fit_sample(X, y)
        train_data = X_resampled.copy()
        train_labels = y_resampled.copy()

        #Test set get features & labels
        test_data = test_df[feature_names].values
        test_labels = test_df[poi_column].values

        #Training with Random Forest
        performance,pred_test=get_metrics_from_the_model(train_data,train_labels,test_data,test_labels,poi_column,zoneName)

        #adding to performance list
        perf.append(performance)

        #creating prediction file
        test_csv_df[f'Prediction {poi_column}']=pred_test
        result_df=test_csv_df[['start_date','start_time','end_time',f'Prediction {poi_column}']].copy()
        result_df.columns=['instance_date', 'instance_start_time', 'instance_end_time',f'Prediction {poi_column}']
        result_df.to_csv(os.path.join(parent_dir, output_result_filename), index=False)

        print(f'Completed for POI:{poi_column} and Zone:{zoneName} get results of length:{result_df.shape[0]}')

Completed for POI:Is_Bus_stop and Zone:station get results of length:50
Completed for POI:Is_Bus_stop and Zone:dvc_more get results of length:77
Completed for POI:Is_Bus_stop and Zone:54ft_road get results of length:179
Completed for POI:Is_Bus_stop and Zone:junction_mall get results of length:76
Completed for POI:Is_Bus_stop and Zone:prantika_bus_stand get results of length:87
Completed for POI:Is_Bus_stop and Zone:54ft_test get results of length:1361
Completed for POI:Is_Turn and Zone:station get results of length:50
Completed for POI:Is_Turn and Zone:dvc_more get results of length:77
Completed for POI:Is_Turn and Zone:54ft_road get results of length:179
Completed for POI:Is_Turn and Zone:junction_mall get results of length:76
Completed for POI:Is_Turn and Zone:prantika_bus_stand get results of length:87
Completed for POI:Is_Turn and Zone:54ft_test get results of length:1361
Completed for POI:Is_Signal and Zone:station get results of length:50
Completed for POI:Is_Signal and Zone:dvc

# Saving Performance for all Modelings

In [5]:
df=pd.DataFrame(perf)
df.to_csv(os.path.join(parent_dir, 'modeling_performance.csv'),index=False)
df

Unnamed: 0,poi,zone,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,station,0.917333,0.917958,0.917333,0.917302,0.92,0.929412,0.92,0.918056
1,Is_Bus_stop,dvc_more,0.916667,0.917475,0.916667,0.916626,0.87013,0.867349,0.87013,0.868022
2,Is_Bus_stop,54ft_road,0.915667,0.916355,0.915667,0.915632,0.854749,0.883577,0.854749,0.847272
3,Is_Bus_stop,junction_mall,0.916,0.916908,0.916,0.915954,0.907895,0.906741,0.907895,0.907121
4,Is_Bus_stop,prantika_bus_stand,0.910333,0.911718,0.910333,0.910258,0.804598,0.940475,0.804598,0.863252
5,Is_Bus_stop,54ft_test,0.921667,0.922412,0.921667,0.921632,0.838354,0.850705,0.838354,0.840185
6,Is_Turn,station,0.911281,0.914384,0.911281,0.911115,1.0,1.0,1.0,1.0
7,Is_Turn,dvc_more,0.912103,0.914505,0.912103,0.911975,0.701299,0.678623,0.701299,0.67346
8,Is_Turn,54ft_road,0.911829,0.914479,0.911829,0.911688,0.938547,0.947726,0.938547,0.94181
9,Is_Turn,junction_mall,0.910186,0.913688,0.910186,0.909996,0.868421,0.894205,0.868421,0.875346


In [6]:
#NICE

# On only Testing Dataset

In [7]:
df_pref_testset=df[df.zone=='54ft_test'].copy()
df_pref_testset.to_csv(os.path.join(parent_dir, 'modeling_performance_on_testset.csv'),index=False)
df_pref_testset

Unnamed: 0,poi,zone,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
5,Is_Bus_stop,54ft_test,0.921667,0.922412,0.921667,0.921632,0.838354,0.850705,0.838354,0.840185
11,Is_Turn,54ft_test,0.911008,0.914474,0.911008,0.910821,0.82219,0.848158,0.82219,0.828858
17,Is_Signal,54ft_test,0.949609,0.952628,0.949609,0.949525,0.909625,0.951446,0.909625,0.924011
23,Is_Congestion,54ft_test,0.869156,0.87204,0.869156,0.868902,0.782513,0.948285,0.782513,0.856827
29,Is_Adhoc,54ft_test,0.823864,0.826562,0.823864,0.823499,0.708303,0.709359,0.708303,0.70872


In [8]:
print('Mean f1:',df_pref_testset['te_f1-score'].mean(),'Std f1:',df_pref_testset['te_f1-score'].std())

Mean f1: 0.8317201121905864 Std f1: 0.0780332268657584


In [9]:
#DONE