In [1]:
import glob
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
from library.preprocessing import new_Processing_before_journal

In [2]:
all_folders=glob.glob("./Trails/*/*/")

In [3]:
test_dates=['2019-06-28','2019-06-30','2019-07-01','2019-07-02','2019-07-03','2019-07-04','2019-07-05']
formatted_test_dates=['-'.join(d.split("-")[::-1]) for d in test_dates] #reversing order

In [4]:
#train folders
train_folders=[f for f in all_folders if f.split("\\")[-2].split("_")[0] not in formatted_test_dates]

#test folders
test_folders=[f for f in all_folders if f.split("\\")[-2].split("_")[0] in formatted_test_dates]

# Modelling

In [5]:
#trqining_data
train_df=pd.concat([new_Processing_before_journal(e+"ALL_DATA.csv") for e in train_folders])#Raw data is processed

#testing_data
test_df=pd.concat([new_Processing_before_journal(e+"ALL_DATA.csv") for e in test_folders])#Raw data is processed


In [6]:
#feature # binding
F=\
    {1:'stay_duration', #f1
     2:'mfcc0',3:'mfcc1',4:'mfcc2',5:'mfcc3',6:'mfcc4', #f2,f3,f4,f5,f6
     7:'wifi_count', 8:'edge_wifi_count', #f7,f8
     9:'RSI', #f9
     10:'human_made', 11:'natural_land',12:'road_exist_percent',13:'highly_populated_poi_exist'#f10,f11,f12,f13
    }

selected_feat=\
    {
        'Is_Bus_stop':[F[e] for e in [10,12,11,1,8]],
        'Is_Turn':[F[e] for e in [10,12,11,1,8,9]],
        'Is_Signal':[F[e] for e in [12,11,10,8,9]],
        'Is_Congestion':[F[e] for e in [2,11,10,9,12,8,6,1]],
        'Is_Adhoc':[F[e] for e in [10,11,1,12,2]]
    }

In [7]:
#Modeling function
def get_metrics_from_the_model(train,labels_train,test,labels_test,poi):
    rf=RandomForestClassifier(n_estimators=100,max_depth=8,random_state=42)
    rf.fit(train, labels_train)

    pred_train= rf.predict(train)
    pred_test= rf.predict(test)
    
    performance={'poi':poi,
                 'tr_acc':accuracy_score(labels_train,pred_train),
                 'tr_precision':precision_score(labels_train,pred_train,average='weighted'),
                 'tr_recall':recall_score(labels_train,pred_train,average='weighted'),
                 'tr_f1-score':f1_score(labels_train,pred_train,average='weighted'),

                 'te_acc':accuracy_score(labels_test,pred_test),
                 'te_precision':precision_score(labels_test, pred_test,average='weighted'),
                 'te_recall':recall_score(labels_test, pred_test,average='weighted'),
                 'te_f1-score':f1_score(labels_test, pred_test,average='weighted')}
    return performance,rf

In [8]:
perf=[] #performance list
rfs={} #rf dict


for poi_column in ['Is_Bus_stop','Is_Turn','Is_Signal','Is_Congestion','Is_Adhoc']:
    feature_names=selected_feat[poi_column] #feature selection



    #SMOTE on training data & get features ,labels
    X = train_df[feature_names].copy()
    y = train_df[poi_column].copy()

    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_sample(X, y)
    train_data = X_resampled.copy()
    train_labels = y_resampled.copy()

    #Test set get features & labels
    test_data = test_df[feature_names].values
    test_labels = test_df[poi_column].values

    #Training with Random Forest
    performance,rf=get_metrics_from_the_model(train_data,train_labels,test_data,test_labels,poi_column)

    #adding to performance list
    perf.append(performance)
    rfs[poi_column]=rf

    print(f'Completed for POI:{poi_column}')

Completed for POI:Is_Bus_stop
Completed for POI:Is_Turn
Completed for POI:Is_Signal
Completed for POI:Is_Congestion
Completed for POI:Is_Adhoc


In [9]:
df=pd.DataFrame(perf)
df.to_csv("./logs/model_performace.csv",index=False)
df

Unnamed: 0,poi,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,0.917,0.917785,0.917,0.916961,0.847906,0.857368,0.847906,0.849475
1,Is_Turn,0.908543,0.912643,0.908543,0.908316,0.817046,0.845914,0.817046,0.824259
2,Is_Signal,0.950261,0.952104,0.950261,0.95021,0.920647,0.953043,0.920647,0.931857
3,Is_Congestion,0.877353,0.879943,0.877353,0.877143,0.769287,0.950636,0.769287,0.848734
4,Is_Adhoc,0.818182,0.820573,0.818182,0.817842,0.717855,0.718685,0.717855,0.718194


In [10]:
#Saving Models
with open('./logs/rfs.pickle', 'wb') as handle:
    pickle.dump(rfs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
#NICE