In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import math

In [2]:
#retrieving the data
parkingdata = pd.read_csv("parking_duration_of_parking_event_vs_street_ID.csv")
parkingdata = parkingdata.sample(n=10000)
parkingdata.size

130000

In [3]:
#Converting all the feature attributes to uppercase for uniformity

parkingdata['Area Name'] = parkingdata['Area Name'].str.upper()
parkingdata['Street Name'] = parkingdata['Street Name'].str.upper()
parkingdata['Between Street 1'] = parkingdata['Between Street 1'].str.upper()
parkingdata['Between Street 2'] = parkingdata['Between Street 2'].str.upper()
parkingdata['Street Marker'] = parkingdata['Street Marker'].str.upper()
parkingdata['Sign'] = parkingdata['Sign'].str.upper()

parkingdata.head(5)

Unnamed: 0,Area Name,Street Name,Between Street 1,Between Street 2,Side Of Street,Street Marker,Arrival Time,Departure Time,Duration of Parking Event (in seconds),Sign,In Violation?,Street ID,Device ID
10985376,JOLIMONT,ST ANDREWS PLACE,MACARTHUR STREET,LANSDOWNE STREET,4,12463S,24/08/2012 09:59:32 AM,24/08/2012 10:29:30 AM,1798,1P MTR M-SAT 7:30-18:30,0,1290,6620
10413156,REGENCY,SPRING STREET,VICTORIA PARADE,LONSDALE STREET,2,94E,07/08/2012 11:50:06 AM,07/08/2012 01:00:04 PM,4198,2P MTR M-F 7:30-16:00,0,1288,61
10069969,THE MAC,FRANKLIN STREET,ELIZABETH STREET,SWANSTON STREET,4,6531S,27/07/2012 01:54:57 PM,27/07/2012 01:57:20 PM,143,1/2P MTR M-SAT 7:30-19:30,0,681,2782
7347290,TITLES,A'BECKETT STREET,QUEEN STREET,ELIZABETH STREET,4,6005S,07/05/2012 12:20:38 PM,07/05/2012 12:23:50 PM,192,1P MTR M-SAT 7:30-19:30,0,5,2433
8721811,VICTORIA MARKET,FRANKLIN STREET,WILLAM STREET,QUEEN STREET,1,C6858,18/06/2012 09:27:30 AM,18/06/2012 09:29:22 AM,112,2P MTR M-SAT 7:30-20:30,0,681,2972


In [4]:
#Removing all redundant extra whitespaces
for x in parkingdata.columns:
    if parkingdata[x].dtype == object:
        parkingdata[x] = parkingdata[x].str.strip()

In [5]:
#converting object to datetime
parkingdata['Arrival Time'] = pd.to_datetime(parkingdata['Arrival Time'])
parkingdata['Departure Time'] = pd.to_datetime(parkingdata['Departure Time'])

In [6]:
#Target field identification
target = parkingdata['In Violation?']
target.head(10)

10985376    0
10413156    0
10069969    0
7347290     0
8721811     0
9625910     1
4758553     0
2177274     0
1752240     0
7725055     0
Name: In Violation?, dtype: int64

In [7]:
#Preliminary Target Features identification (Pre-Hill Climbing)

#Convert possible targets to dtype = int
intDf = parkingdata.copy()
#Drop useless fields
intDf.drop('Arrival Time', axis = 1, inplace = True)
intDf.drop('Departure Time', axis = 1, inplace = True)
intDf.drop('Street Marker', axis = 1, inplace = True)
#Drop target field
intDf.drop('In Violation?', axis = 1, inplace = True)
#Drop already int fields (Will add back after)
intDf.drop('Side Of Street', axis = 1, inplace = True)
intDf.drop('Duration of Parking Event (in seconds)', axis = 1, inplace = True)
intDf.drop('Street ID', axis = 1, inplace = True)
intDf.drop('Device ID', axis = 1, inplace = True)

#Transform the remainder non-int fields' values to unique int identifiers
for column in intDf:
    unique_vals = intDf[column].unique()
    intDf[column].replace(to_replace = unique_vals, value = list(range(len(unique_vals))), inplace = True)

#Create dataframe for all the relevant features
features = intDf.copy()

#Add back int fields (If not the target field)
features['Side Of Street'] = parkingdata['Side Of Street']
features['Duration of Parking Event (in seconds)'] = parkingdata['Duration of Parking Event (in seconds)']
features['Street ID'] = parkingdata['Street ID']
features['Device ID'] = parkingdata['Device ID']
#Add back datetime features with only the meaningful subset of data
features['Arrival Hour'] = parkingdata['Arrival Time'].dt.hour
features['Departure Hour'] = parkingdata['Departure Time'].dt.hour

#Table view of features dataframe
features.head(10)

Unnamed: 0,Area Name,Street Name,Between Street 1,Between Street 2,Sign,Side Of Street,Duration of Parking Event (in seconds),Street ID,Device ID,Arrival Hour,Departure Hour
10985376,0,0,0,0,0,4,1798,1290,6620,9,10
10413156,1,1,1,1,1,2,4198,1288,61,11,13
10069969,2,2,2,2,2,4,143,681,2782,13,13
7347290,3,3,3,3,3,4,192,5,2433,12,12
8721811,4,2,4,4,4,1,112,681,2972,9,9
9625910,3,3,3,3,3,4,4795,5,7339,9,10
4758553,5,4,5,5,3,3,1277,123,1339,10,10
2177274,6,5,6,4,5,3,2071,894,1816,11,11
1752240,7,6,7,6,6,2,180,641,3215,12,12
7725055,0,0,0,0,7,3,5236,1290,6599,11,13


In [8]:
#Parameter Tuning

#Define the parameters to tune and the values to tune to
params_logReg = [
                {'penalty': ['l1', 'l2', 'elasticnet', 'none'], 
                'C' : np.logspace(-4, 4, 20), 
                'solver' : ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],  
                'max_iter' : [100, 1000, 2500, 5000]}
            ]

In [11]:
cv_method = RepeatedStratifiedKFold(n_splits = 5, 
                                    n_repeats = 1, 
                                    random_state = 1)

gs_logReg = GridSearchCV( LogisticRegression(max_iter=15000), 
                      param_grid = params_logReg, 
                      cv = cv_method,
                      verbose = True,
                      scoring = 'accuracy',
                      n_jobs = -1,
                      return_train_score = True)

In [12]:
#Fit the model with the dataset
bestModel = gs_logReg.fit(features, target)

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 510 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done 954 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1466 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2094 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3062 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 3944 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 4866 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 6105 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 7463 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 8000 out of 8000 | elapsed: 16.0min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/mod

In [13]:
gs_logReg.best_params_

{'C': 0.615848211066026, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}

In [14]:
#Find the best values for the  parameters of the model (Formatted output)
best_p = bestModel.best_estimator_.get_params()['penalty']
best_C = bestModel.best_estimator_.get_params()['C']
best_max =  bestModel.best_estimator_.get_params()['max_iter']
best_solver =  bestModel.best_estimator_.get_params()['solver']

print('Best penalty:', best_p)
print('Best C:', best_C)
print('Best max iteration:', best_max)
print('Best solver:', best_solver)

Best penalty: l2
Best C: 0.615848211066026
Best max iteration: 1000
Best solver: lbfgs


In [15]:
#Visualise the parameter configurations from fitting the model with the dataset
results_logReg = pd.DataFrame(gs_logReg.cv_results_['params'])
results_logReg['test_score'] = gs_logReg.cv_results_['mean_test_score']
results_logReg

Unnamed: 0,C,max_iter,penalty,solver,test_score
0,0.0001,100,l1,lbfgs,
1,0.0001,100,l1,newton-cg,
2,0.0001,100,l1,liblinear,0.8911
3,0.0001,100,l1,sag,
4,0.0001,100,l1,saga,0.8911
...,...,...,...,...,...
1595,10000.0000,5000,none,lbfgs,0.9038
1596,10000.0000,5000,none,newton-cg,0.9035
1597,10000.0000,5000,none,liblinear,
1598,10000.0000,5000,none,sag,0.8933


In [16]:
#Re-Define model with the optimal parameter values AFTER HILL CLIMBING
logReg = metric = LogisticRegression(penalty = best_p, 
                               C = best_C, 
                               solver = best_solver, 
                               max_iter = best_max 
                               )

In [17]:
# Hill climbing w/ SVM
new_Ind = []
cur_MaxScore = 0.0
col_num = len(features.columns)
col_Ind_Random = shuffle(range(0, col_num), random_state = 1)
features_array = features.values

for cur_f in range(col_num):
    new_Ind.append(col_Ind_Random[cur_f])
    newData = features_array[:, new_Ind]
    x_train, x_test, y_train, y_test = train_test_split(newData, target, test_size=0.2, random_state=1)
    fit = logReg.fit(x_train, y_train)
    cur_Score = logReg.score(x_test, y_test)
    
    if cur_Score < cur_MaxScore:
        new_Ind.remove(col_Ind_Random[cur_f])
    else:
        cur_MaxScore = cur_Score
        print ("Score with " + str(len(new_Ind)) + " selected features: " + str(cur_Score))
print("\nIndexs of the desired features")
print(new_Ind)

Score with 1 selected features: 0.8575
Score with 2 selected features: 0.8575
Score with 3 selected features: 0.8575
Score with 4 selected features: 0.8575
Score with 5 selected features: 0.8575
Score with 6 selected features: 0.8925
Score with 7 selected features: 0.9
Score with 8 selected features: 0.9
Score with 9 selected features: 0.9005

Indexs of the desired features
[2, 3, 4, 9, 1, 6, 0, 7, 10]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
features_hc = pd.DataFrame()
for index in new_Ind:
    colName = features.columns[index]
    features_hc[colName] = features[colName]
features_hc.head(10)

Unnamed: 0,Between Street 1,Between Street 2,Sign,Arrival Hour,Street Name,Duration of Parking Event (in seconds),Area Name,Street ID,Departure Hour
10985376,0,0,0,9,0,1798,0,1290,10
10413156,1,1,1,11,1,4198,1,1288,13
10069969,2,2,2,13,2,143,2,681,13
7347290,3,3,3,12,3,192,3,5,12
8721811,4,4,4,9,2,112,4,681,9
9625910,3,3,3,9,3,4795,3,5,10
4758553,5,5,3,10,4,1277,5,123,10
2177274,6,4,5,11,5,2071,6,894,11
1752240,7,6,6,12,6,180,7,641,12
7725055,0,0,7,11,0,5236,0,1290,13


In [19]:
bestModel = gs_logReg.fit(features_hc, target)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 678 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 1157 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1658 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2472 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3509 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 4619 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 5704 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 7169 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done 7977 out of 8000 | elapsed: 13.0min remaining:    2.1s
[Parallel(n_jobs=-1)]: Done 8000 out of 8000 | elapsed: 13.2min finished


In [20]:
gs_logReg.best_params_

{'C': 545.5594781168514,
 'max_iter': 100,
 'penalty': 'l2',
 'solver': 'liblinear'}

In [23]:
best_p_hc = bestModel.best_estimator_.get_params()['penalty']
best_C_hc = bestModel.best_estimator_.get_params()['C']
best_max_hc =  bestModel.best_estimator_.get_params()['max_iter']
best_solver_hc =  bestModel.best_estimator_.get_params()['solver']

print('Best penalty:', best_p_hc)
print('Best C:', best_C_hc)
print('Best max iteration:', best_max_hc)
print('Best solver:', best_solver_hc)

Best penalty: l2
Best C: 545.5594781168514
Best max iteration: 100
Best solver: liblinear


In [24]:
#Visualise the parameter configurations from fitting the model with the dataset
results_logReg = pd.DataFrame(gs_logReg.cv_results_['params'])
results_logReg['test_score'] = gs_logReg.cv_results_['mean_test_score']
results_logReg

Unnamed: 0,C,max_iter,penalty,solver,test_score
0,0.0001,100,l1,lbfgs,
1,0.0001,100,l1,newton-cg,
2,0.0001,100,l1,liblinear,0.8593
3,0.0001,100,l1,sag,
4,0.0001,100,l1,saga,0.8597
...,...,...,...,...,...
1595,10000.0000,5000,none,lbfgs,0.9025
1596,10000.0000,5000,none,newton-cg,0.9024
1597,10000.0000,5000,none,liblinear,
1598,10000.0000,5000,none,sag,0.8869


In [26]:
logReg = metric = LogisticRegression(penalty = best_p_hc, 
                               C = best_C_hc, 
                               solver = best_solver_hc, 
                               max_iter = best_max_hc 
                               )

In [27]:
#Defining training and testing groups
x_train, x_test, y_train, y_test = train_test_split(features_hc, target, test_size = 0.5, random_state = 4)

#Training the model previously defined
logReg.fit(x_train, y_train)

#Obtaining and printing out results from the model (Confusion Matrix)
predicted = logReg.predict(x_test)
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix")
print(cm)

#Printing the numerical result of the confusion matrix
print("\n[Train/test split] score: {:.5f}".format(logReg.score(x_test, y_test)))

Confusion Matrix
[[4198  124]
 [ 391  287]]

[Train/test split] score: 0.89700


In [28]:
#Repeat with KFold - creating groups
kf = KFold(n_splits = 5, random_state = 4, shuffle = True)


#Repeat with KFold - Training model (previously defined) and obtaining its output
kFoldTotal = 0
for k, (train_index, test_index) in enumerate(kf.split(features_hc)):
    x_train, x_test = features_hc.iloc[train_index], features_hc.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    logReg.fit(x_train, y_train)
    kFoldTotal += logReg.score(x_test, y_test)
    print("[fold {0}] score: {1:.5f}".format(k, logReg.score(x_test, y_test)))

#Printing out the results
roundedTotal = round(kFoldTotal/5, 5)
print("\nLogistic Regression mean score [5 folds] = " + str(roundedTotal))

[fold 0] score: 0.90100
[fold 1] score: 0.89700
[fold 2] score: 0.90050
[fold 3] score: 0.90800
[fold 4] score: 0.90300

Logistic Regression mean score [5 folds] = 0.9019
