# Imports

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
import pandas as pd
import os

## Constants

In [2]:
# Dataset folder and the training CSV inside it
parent_dir = "../data/54ft/"
train_df_name = "54ft_train.csv"

# Feature number -> column name binding (f1 .. f13), numbered from 1
F = dict(enumerate(
    [
        'stay_duration',                                 # f1
        'mfcc0', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4',     # f2 .. f6
        'wifi_count', 'edge_wifi_count',                 # f7, f8
        'RSI',                                           # f9
        'human_made', 'natural_land',                    # f10, f11
        'road_exist_percent',                            # f12
        'highly_populated_poi_exist',                    # f13
    ],
    start=1,
))

# Feature numbers selected for each POI label, in the order they are fed to the model
_feature_ids_by_poi = {
    'Is_Bus_stop': [10, 12, 11, 1, 8],
    'Is_Turn': [10, 12, 11, 1, 8, 9],
    'Is_Signal': [12, 11, 10, 8, 9],
    'Is_Congestion': [2, 11, 10, 9, 12, 8, 6, 1],
    'Is_Adhoc': [10, 11, 1, 12, 2],
}

# POI label -> list of feature column names (resolved through F)
selected_feat = {
    poi: [F[feature_id] for feature_id in feature_ids]
    for poi, feature_ids in _feature_ids_by_poi.items()
}


# File names of the per-zone test splits
zone_names = [
    f"{zone}_test_split.csv"
    for zone in ("station", "dvc_more", "54ft_road", "junction_mall", "prantika_bus_stand")
]

# File name of the 7-day test set
_7day_test = ["54ft_test.csv"]

In [3]:
#Modeling function
def _weighted_split_scores(prefix, y_true, y_pred):
    """Return accuracy and weighted precision/recall/F1 for one data split.

    Keys are ``<prefix>_acc``, ``<prefix>_precision``, ``<prefix>_recall`` and
    ``<prefix>_f1-score``. ``average='weighted'`` weights each class by its
    support, so on an imbalanced split the scores are dominated by the
    majority class.
    """
    return {
        f'{prefix}_acc': accuracy_score(y_true, y_pred),
        f'{prefix}_precision': precision_score(y_true, y_pred, average='weighted'),
        f'{prefix}_recall': recall_score(y_true, y_pred, average='weighted'),
        f'{prefix}_f1-score': f1_score(y_true, y_pred, average='weighted'),
    }


def get_metrics_from_the_model(train,labels_train,test,labels_test,poi,zone,
                               n_estimators=100,max_depth=8,random_state=None):
    """Fit a random forest and report weighted train/test metrics.

    Parameters
    ----------
    train, labels_train
        Training features and labels; the forest is fitted on these.
    test, labels_test
        Evaluation features and labels used for the ``te_*`` metrics.
    poi, zone
        Copied unchanged into the result under the 'poi' and 'zone' keys.
    n_estimators, max_depth
        Forwarded to ``RandomForestClassifier``. The defaults (100 and 8) are
        the values that used to be hard-coded here.
    random_state
        Forwarded to ``RandomForestClassifier``. The default ``None`` keeps the
        forest unseeded; pass an int to make the fit repeatable.

    Returns
    -------
    tuple
        ``(performance, pred_test, rf)``: the metrics dict, the predictions
        for ``test`` and the fitted classifier.
    """
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                max_depth=max_depth,
                                random_state=random_state)
    rf.fit(train, labels_train)

    pred_train = rf.predict(train)
    pred_test = rf.predict(test)

    # Key order is kept as: identifiers, train metrics, test metrics.
    performance = {'poi': poi, 'zone': zone}
    performance.update(_weighted_split_scores('tr', labels_train, pred_train))
    performance.update(_weighted_split_scores('te', labels_test, pred_test))
    return performance, pred_test, rf
    

## Modeling

In [4]:
# Target label to model; one of:
# 'Is_Bus_stop', 'Is_Turn', 'Is_Signal', 'Is_Congestion', 'Is_Adhoc'
poi_column = 'Is_Congestion'
feature_names = selected_feat[poi_column]  # feature subset chosen for this POI

# Test CSV to evaluate on (e.g. _7day_test[0] or an entry of zone_names)
test_df_name = "54ft_test.csv"

# Fixed seed so SMOTE generates the same synthetic samples on every run.
# NOTE: the random forest fitted by the call at the end of this cell is not
# seeded by that call, so the metrics can still vary slightly between runs.
SMOTE_SEED = 42


###################################################################################################################

# Short tag for the test set: drop a '_test_split.csv' suffix when present,
# then everything after the first '.'
zoneName = test_df_name.split('_test_split.csv')[0].split('.')[0]
output_result_filename = f"{poi_column}_test_result_{zoneName}_zone.csv"

# Training data: keep the selected features plus the target column
train_csv_df = pd.read_csv(os.path.join(parent_dir, train_df_name))
train_df = train_csv_df[feature_names + [poi_column]]

# Test data: same columns as the training frame
test_csv_df = pd.read_csv(os.path.join(parent_dir, test_df_name))
test_df = test_csv_df[feature_names + [poi_column]]


# Oversample the training data only; the test set keeps its original balance
X = train_df[feature_names].copy()
y = train_df[poi_column].copy()

smote = SMOTE(random_state=SMOTE_SEED)
# fit_resample is the supported API; the older fit_sample alias is no longer
# available in recent imbalanced-learn releases.
# Arrays (not DataFrames) are passed so the resampled training data has the
# same type as the test arrays below, whatever imbalanced-learn version is
# installed; this avoids scikit-learn's feature-name mismatch warning.
X_resampled, y_resampled = smote.fit_resample(X.values, y.values)
train_data = X_resampled.copy()
train_labels = y_resampled.copy()

# Test features and labels as arrays
test_data = test_df[feature_names].values
test_labels = test_df[poi_column].values

# Train the random forest and collect train/test metrics
performance, pred_test, rf = get_metrics_from_the_model(
    train_data, train_labels, test_data, test_labels, poi_column, zoneName)

In [5]:
# Per-class precision/recall/F1 on the test set
test_report = classification_report(test_labels, pred_test)
print(test_report)

              precision    recall  f1-score   support

           0       0.97      0.80      0.88      1328
           1       0.00      0.03      0.01        33

    accuracy                           0.78      1361
   macro avg       0.49      0.42      0.44      1361
weighted avg       0.95      0.78      0.86      1361



In [8]:
# Class balance of the training labels before SMOTE oversampling
train_class_counts = y.value_counts()
train_class_counts

0    1647
1     832
Name: Is_Congestion, dtype: int64