In [1]:
import glob
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
from library.preprocessing import new_Processing_before_journal

In [2]:
all_folders=glob.glob("./Trails/*/*/")

In [3]:
test_dates=['2019-06-28','2019-06-30','2019-07-01','2019-07-02','2019-07-03','2019-07-04','2019-07-05']
formatted_test_dates=['-'.join(d.split("-")[::-1]) for d in test_dates] #reversing order

In [4]:
#train folders
train_folders=[f for f in all_folders if f.split("\\")[-2].split("_")[0] not in formatted_test_dates]

#test folders
test_folders=[f for f in all_folders if f.split("\\")[-2].split("_")[0] in formatted_test_dates]

# Modelling

In [5]:
#trqining_data
train_df=pd.concat([new_Processing_before_journal(e+"ALL_DATA.csv") for e in train_folders])#Raw data is processed

#testing_data
test_df=pd.concat([new_Processing_before_journal(e+"ALL_DATA.csv") for e in test_folders])#Raw data is processed


In [6]:
#feature # binding
F=\
    {1:'stay_duration', #f1
     2:'mfcc0',3:'mfcc1',4:'mfcc2',5:'mfcc3',6:'mfcc4', #f2,f3,f4,f5,f6
     7:'wifi_count', 8:'edge_wifi_count', #f7,f8
     9:'RSI', #f9
     10:'human_made', 11:'natural_land',12:'road_exist_percent',13:'highly_populated_poi_exist'#f10,f11,f12,f13
    }

selected_feat=\
    {
        'Is_Bus_stop':[F[e] for e in [10,12,11,1,8]],
        'Is_Turn':[F[e] for e in [10,12,11,1,8,9]],
        'Is_Signal':[F[e] for e in [12,11,10,8,9]],
        'Is_Congestion':[F[e] for e in [2,11,10,9,12,8,6,1]],
        'Is_Adhoc':[F[e] for e in [10,11,1,12,2]]
    }

In [7]:
#Modeling function
def get_metrics_from_the_model(train,labels_train,test,labels_test,poi):
    rf=LogisticRegression(random_state=42)
    #print(labels_train)
    rf.fit(train, labels_train)

    pred_train= rf.predict(train)
    pred_test= rf.predict(test)
    
    performance={'poi':poi,
                 'tr_acc':accuracy_score(labels_train,pred_train),
                 'tr_precision':precision_score(labels_train,pred_train,average='weighted'),
                 'tr_recall':recall_score(labels_train,pred_train,average='weighted'),
                 'tr_f1-score':f1_score(labels_train,pred_train,average='weighted'),

                 'te_acc':accuracy_score(labels_test,pred_test),
                 'te_precision':precision_score(labels_test, pred_test,average='weighted'),
                 'te_recall':recall_score(labels_test, pred_test,average='weighted'),
                 'te_f1-score':f1_score(labels_test, pred_test,average='weighted')}
    return performance,rf

In [8]:
perf=[] #performance list
rfs={} #rf dict


for poi_column in ['Is_Bus_stop','Is_Turn','Is_Signal','Is_Congestion','Is_Adhoc']:
    feature_names=selected_feat[poi_column] #feature selection



    #SMOTE on training data & get features ,labels
    X = train_df[feature_names].copy()
    y = train_df[poi_column].copy()

    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    train_data = X_resampled.copy()
    train_labels = y_resampled.copy()

    #Test set get features & labels
    test_data = test_df[feature_names].values
    test_labels = test_df[poi_column].values

    #Training with Random Forest
    performance,rf=get_metrics_from_the_model(train_data,train_labels,test_data,test_labels,poi_column)

    #adding to performance list
    perf.append(performance)
    rfs[poi_column]=rf

    print(f'Completed for POI:{poi_column}')



Completed for POI:Is_Bus_stop
Completed for POI:Is_Turn
Completed for POI:Is_Signal
Completed for POI:Is_Congestion
Completed for POI:Is_Adhoc


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
df=pd.DataFrame(perf)
df.to_csv("./logs/model_performace_logreg.csv",index=False)
df

Unnamed: 0,poi,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,0.637333,0.637765,0.637333,0.637049,0.618663,0.626555,0.618663,0.621617
1,Is_Turn,0.589814,0.591586,0.589814,0.58782,0.637766,0.678365,0.637766,0.652486
2,Is_Signal,0.726542,0.731747,0.726542,0.724998,0.703894,0.919675,0.703894,0.779254
3,Is_Congestion,0.628112,0.628137,0.628112,0.628094,0.821455,0.950524,0.821455,0.880459
4,Is_Adhoc,0.614489,0.622688,0.614489,0.607938,0.56576,0.574021,0.56576,0.567478


In [10]:
#Saving Models
with open('./logs/logreg.pickle', 'wb') as handle:
    pickle.dump(rfs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
coefficient=[]
for poi_column in rfs.keys():
    coefficient.append(dict(zip(selected_feat[poi_column]+['bias'],[*rfs[poi_column].coef_[0], *rfs[poi_column].intercept_])))

df=pd.DataFrame(coefficient,index=rfs.keys())

In [12]:
l=\
list(zip(['human_made','road_exist_percent', 'natural_land',
'stay_duration','edge_wifi_count', 'RSI', 'mfcc0', 'mfcc4'],
         
['coef_f10', 'coef_f12', 'coef_f11', 'coef_f1', 
'coef_f8', 'coef_f9', 'coef_f2', 'coef_f6']))

l.sort(key=lambda e:int(e[1][6:]))

dic=dict(l)

columns=list(dic.keys())+['bias']
rename_columns=list(dic.values())+['bias']

In [13]:
df1=df[columns].copy()
df1.columns=rename_columns

In [18]:
df1.round(4).to_csv("./logs/logreg_coef.csv")
df1.round(4)

Unnamed: 0,coef_f1,coef_f2,coef_f6,coef_f8,coef_f9,coef_f10,coef_f11,coef_f12,bias
Is_Bus_stop,0.0079,,,2.0685,,6.1855,0.8604,-3.8805,-0.778
Is_Turn,0.0039,,,3.6678,0.0002,-3.1079,-1.1234,-0.4826,0.9781
Is_Signal,,,,9.8328,0.027,2.1746,9.2393,6.7665,-8.0492
Is_Congestion,0.0066,0.0111,-0.0893,3.9221,0.0634,-0.1316,-1.008,-0.1918,-0.7179
Is_Adhoc,-0.0075,0.001,,,,-1.1121,0.2917,0.5945,-0.078


In [15]:
#NICE