In [1]:
import glob
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
from library.preprocessing import new_Processing_before_journal

In [2]:
all_folders=glob.glob("./Trails/*/*/")

In [3]:
test_dates=['2019-06-28','2019-06-30','2019-07-01','2019-07-02','2019-07-03','2019-07-04','2019-07-05']
formatted_test_dates=['-'.join(d.split("-")[::-1]) for d in test_dates] #reversing order

In [4]:
#train folders
train_folders=[f for f in all_folders if f.split("\\")[-2].split("_")[0] not in formatted_test_dates]

#test folders
test_folders=[f for f in all_folders if f.split("\\")[-2].split("_")[0] in formatted_test_dates]

# Modelling

In [5]:
#trqining_data
train_df=pd.concat([new_Processing_before_journal(e+"ALL_DATA.csv") for e in train_folders])#Raw data is processed

#testing_data
test_df=pd.concat([new_Processing_before_journal(e+"ALL_DATA.csv") for e in test_folders])#Raw data is processed


In [6]:
#feature # binding
F=\
    {1:'stay_duration', #f1
     2:'mfcc0',3:'mfcc1',4:'mfcc2',5:'mfcc3',6:'mfcc4', #f2,f3,f4,f5,f6
     7:'wifi_count', 8:'edge_wifi_count', #f7,f8
     9:'RSI', #f9
     10:'human_made', 11:'natural_land',12:'road_exist_percent',13:'highly_populated_poi_exist'#f10,f11,f12,f13
    }

selected_feat=\
    {
        'Is_Bus_stop':[F[e] for e in range(1,14)],
        'Is_Turn':[F[e] for e in range(1,14)],
        'Is_Signal':[F[e] for e in range(1,14)],
        'Is_Congestion':[F[e] for e in range(1,14)],
        'Is_Adhoc':[F[e] for e in range(1,14)]
    }

In [7]:
#Modeling function
def get_metrics_from_the_model(train,labels_train,test,labels_test,poi):
    rf=LogisticRegression(random_state=42)
    #print(labels_train)
    rf.fit(train, labels_train)

    pred_train= rf.predict(train)
    pred_test= rf.predict(test)
    
    performance={'poi':poi,
                 'tr_acc':accuracy_score(labels_train,pred_train),
                 'tr_precision':precision_score(labels_train,pred_train,average='weighted'),
                 'tr_recall':recall_score(labels_train,pred_train,average='weighted'),
                 'tr_f1-score':f1_score(labels_train,pred_train,average='weighted'),

                 'te_acc':accuracy_score(labels_test,pred_test),
                 'te_precision':precision_score(labels_test, pred_test,average='weighted'),
                 'te_recall':recall_score(labels_test, pred_test,average='weighted'),
                 'te_f1-score':f1_score(labels_test, pred_test,average='weighted')}
    return performance,rf

In [8]:
perf=[] #performance list
rfs={} #rf dict


for poi_column in ['Is_Bus_stop','Is_Turn','Is_Signal','Is_Congestion','Is_Adhoc']:
    feature_names=selected_feat[poi_column] #feature selection



    #SMOTE on training data & get features ,labels
    X = train_df[feature_names].copy()
    y = train_df[poi_column].copy()

    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    train_data = X_resampled.copy()
    train_labels = y_resampled.copy()

    #Test set get features & labels
    test_data = test_df[feature_names].values
    test_labels = test_df[poi_column].values

    #Training with Random Forest
    performance,rf=get_metrics_from_the_model(train_data,train_labels,test_data,test_labels,poi_column)

    #adding to performance list
    perf.append(performance)
    rfs[poi_column]=rf

    print(f'Completed for POI:{poi_column}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Completed for POI:Is_Bus_stop
Completed for POI:Is_Turn


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Completed for POI:Is_Signal
Completed for POI:Is_Congestion
Completed for POI:Is_Adhoc


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
df=pd.DataFrame(perf)
df.to_csv("./logs/model_performace_logreg.csv",index=False)
df

Unnamed: 0,poi,tr_acc,tr_precision,tr_recall,tr_f1-score,te_acc,te_precision,te_recall,te_f1-score
0,Is_Bus_stop,0.604,0.604359,0.604,0.603659,0.548861,0.58355,0.548861,0.554058
1,Is_Turn,0.547645,0.547668,0.547645,0.547591,0.490816,0.615047,0.490816,0.517814
2,Is_Signal,0.625543,0.62568,0.625543,0.625441,0.512123,0.900788,0.512123,0.625524
3,Is_Congestion,0.620826,0.620942,0.620826,0.620735,0.800882,0.950769,0.800882,0.868209
4,Is_Adhoc,0.613068,0.622738,0.613068,0.605294,0.512858,0.542795,0.512858,0.504676


In [10]:
#Saving Models
with open('./logs/logreg.pickle', 'wb') as handle:
    pickle.dump(rfs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
coefficient=[]
for poi_column in rfs.keys():
    coefficient.append(dict(zip(selected_feat[poi_column]+['bias'],[*rfs[poi_column].coef_[0], *rfs[poi_column].intercept_])))

df=pd.DataFrame(coefficient,index=rfs.keys())[list(F.values())+['bias']]

In [12]:
df.columns=[f'coef_f{e}' for e in F.keys()]+['bias']

In [13]:
df.round(4).T.to_csv("./logs/logreg_coef.csv")
df.round(4).T

Unnamed: 0,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
coef_f1,0.0013,0.0015,-0.0112,0.0043,0.0013
coef_f2,-0.0109,-0.0043,-0.01,0.0112,0.0011
coef_f3,-0.0183,-0.0051,-0.0252,0.0147,0.0054
coef_f4,-0.003,0.0112,-0.0197,0.0144,-0.0088
coef_f5,-0.0484,-0.0015,0.0096,-0.0668,0.0321
coef_f6,0.0529,-0.0163,0.0513,-0.0474,0.0012
coef_f7,0.0609,0.0086,0.068,0.0508,-0.1517
coef_f8,0.1261,0.1453,1.116,0.0803,-0.0662
coef_f9,-0.0137,-0.0116,-0.0018,0.0623,-0.0553
coef_f10,0.5162,-0.209,-0.8386,-0.1327,0.0679


In [21]:
import pandas as pd
df=pd.read_csv("./logs/logreg_coef.csv",index_col="Unnamed: 0")[:-1]

In [39]:
print("Feature Importance Order according to BuStop")
{'Is_Bus_stop':[10,12,11,1,8],
'Is_Turn':[10,12,11,1,8,9],
'Is_Signal':[12,11,10,8,9],
'Is_Congestion':[2,11,10,9,12,8,6,1],
'Is_Adhoc':[10,11,1,12,2]}

Feature Importance Order according to BuStop


{'Is_Bus_stop': [10, 12, 11, 1, 8],
 'Is_Turn': [10, 12, 11, 1, 8, 9],
 'Is_Signal': [12, 11, 10, 8, 9],
 'Is_Congestion': [2, 11, 10, 9, 12, 8, 6, 1],
 'Is_Adhoc': [10, 11, 1, 12, 2]}

In [40]:
print("Feature Importance Order according to Logistic Regression")
{'Is_Bus_stop':[int(e.replace("coef_f","")) for e in df.Is_Bus_stop.sort_values(ascending=False)[:5].index.to_list()],
'Is_Turn':[int(e.replace("coef_f","")) for e in df.Is_Turn.sort_values(ascending=False)[:6].index.to_list()],
'Is_Signal':[int(e.replace("coef_f","")) for e in df.Is_Signal.sort_values(ascending=False)[:5].index.to_list()],
'Is_Congestion':[int(e.replace("coef_f","")) for e in df.Is_Congestion.sort_values(ascending=False)[:8].index.to_list()],
'Is_Adhoc':[int(e.replace("coef_f","")) for e in df.Is_Adhoc.sort_values(ascending=False)[:5].index.to_list()]}

Feature Importance Order according to Logistic Regression


{'Is_Bus_stop': [10, 11, 13, 8, 7],
 'Is_Turn': [13, 8, 12, 11, 4, 7],
 'Is_Signal': [11, 8, 12, 7, 6],
 'Is_Congestion': [13, 8, 9, 7, 3, 4, 2, 1],
 'Is_Adhoc': [11, 10, 5, 12, 3]}

In [14]:
#NICE