In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import math

In [2]:
#retrieving the data
parkingdata = pd.read_csv("parking_duration_of_parking_event_vs_street_ID.csv")
parkingdata = parkingdata.sample(n=10000)
parkingdata.size

130000

In [3]:
#Converting all the feature attributes to uppercase for uniformity

parkingdata['Area Name'] = parkingdata['Area Name'].str.upper()
parkingdata['Street Name'] = parkingdata['Street Name'].str.upper()
parkingdata['Between Street 1'] = parkingdata['Between Street 1'].str.upper()
parkingdata['Between Street 2'] = parkingdata['Between Street 2'].str.upper()
parkingdata['Street Marker'] = parkingdata['Street Marker'].str.upper()
parkingdata['Sign'] = parkingdata['Sign'].str.upper()

parkingdata.head(5)

Unnamed: 0,Area Name,Street Name,Between Street 1,Between Street 2,Side Of Street,Street Marker,Arrival Time,Departure Time,Duration of Parking Event (in seconds),Sign,In Violation?,Street ID,Device ID
341304,REGENCY,EXHIBITION STREET,LT LONSDALE STREET,LONSDALE STREET,2,562E,11/10/2011 11:45:04 AM,11/10/2011 11:58:50 AM,826,2P MTR M-SAT 7:30-20:30,0,647,144
6292566,THE MAC,FRANKLIN STREET,SWANSTON STREET,VICTORIA STREET,1,C6446,02/04/2012 12:53:40 PM,02/04/2012 12:54:41 PM,61,2P MTR M-SAT 7:30-20:30,0,681,4848
8423665,PRINCES THEATRE,BOURKE STREET,EXHIBITION STREET,SPRING STREET,3,2346N,08/06/2012 10:02:28 AM,08/06/2012 10:08:24 AM,356,1P MTR M-SAT 7:30-19:30,0,123,5884
1492224,PRINCES THEATRE,LONSDALE STREET,EXHIBITION STREET,SPRING STREET,1,C2728,11/11/2011 12:11:00 PM,11/11/2011 02:03:23 PM,6743,2P MTR M-SAT 7:30-20:30,0,894,1911
3761112,CHINATOWN,RUSSELL STREET,BOURKE STREET,LT COLLINS STREET,1,C760,17/01/2012 03:51:43 PM,17/01/2012 03:52:47 PM,64,1P MTR M-SAT 7:30-19:30,0,1221,441


In [4]:
#Removing all redundant extra whitespaces
for x in parkingdata.columns:
    if parkingdata[x].dtype == object:
        parkingdata[x] = parkingdata[x].str.strip()

In [5]:
#converting object to datetime
parkingdata['Arrival Time'] = pd.to_datetime(parkingdata['Arrival Time'])
parkingdata['Departure Time'] = pd.to_datetime(parkingdata['Departure Time'])

In [6]:
#Target field identification
duration = parkingdata['Duration of Parking Event (in seconds)'].copy()

print('Minimum parking duration (seconds): ' + str(duration.min()))
print('Maximum parking duration (seconds): ' + str(duration.max()))
print('\nMinimum parking duration (hour): ' + str(duration.min()/3600))
print('Maximum parking duration (hour): ' + str(duration.max()/3600))
print('\nMinimum parking duration is under [' + str(math.trunc(duration.min()/3600) + 1) + '] hours')
print('Maximum parking duration is under [' + str(math.trunc(duration.max()/3600) + 1) + '] hours')

durList = []
static = 1
df = pd.DataFrame()

for record in duration.values:
    hour = 3600
    index = 0
    while static == 1:
        if record < hour:
#             print("\nDuration in seconds: " + str(record))
            data = math.trunc(record/3600) + 1
            durList.append(data)
#             print("Duration is under " + str(data) + " hours")
            break
        else:
            hour += 3600

df.insert(0, 'Duration (Hours)', durList, True)
target = df['Duration (Hours)']
target.head(10)

Minimum parking duration (seconds): 1
Maximum parking duration (seconds): 86258

Minimum parking duration (hour): 0.0002777777777777778
Maximum parking duration (hour): 23.960555555555555

Minimum parking duration is under [1] hours
Maximum parking duration is under [24] hours


0    1
1    1
2    1
3    2
4    1
5    1
6    1
7    1
8    1
9    1
Name: Duration (Hours), dtype: int64

In [7]:
#Preliminary Target Features identification (Pre-Hill Climbing)

#Convert possible targets to dtype = int
intDf = parkingdata.copy()
#Drop useless fields
intDf.drop('Arrival Time', axis = 1, inplace = True)
intDf.drop('Departure Time', axis = 1, inplace = True)
intDf.drop('Street Marker', axis = 1, inplace = True)
#Drop target field
intDf.drop('Duration of Parking Event (in seconds)', axis = 1, inplace = True)
#Drop already int fields (Will add back after)
intDf.drop('In Violation?', axis = 1, inplace = True)
intDf.drop('Side Of Street', axis = 1, inplace = True)
intDf.drop('Street ID', axis = 1, inplace = True)
intDf.drop('Device ID', axis = 1, inplace = True)

#Transform the remainder non-int fields' values to unique int identifiers
for column in intDf:
    unique_vals = intDf[column].unique()
    intDf[column].replace(to_replace = unique_vals, value = list(range(len(unique_vals))), inplace = True)

#Create dataframe for all the relevant features
features = intDf.copy()

#Add back int fields (If not the target field)
features['In Violation?'] = parkingdata['In Violation?']
features['Side Of Street'] = parkingdata['Side Of Street']
features['Street ID'] = parkingdata['Street ID']
features['Device ID'] = parkingdata['Device ID']
#Add back datetime features with only the meaningful subset of data
features['Arrival Hour'] = parkingdata['Arrival Time'].dt.hour
features['Departure Hour'] = parkingdata['Departure Time'].dt.hour

#Table view of features dataframe
features.head(10)

Unnamed: 0,Area Name,Street Name,Between Street 1,Between Street 2,Sign,In Violation?,Side Of Street,Street ID,Device ID,Arrival Hour,Departure Hour
341304,0,0,0,0,0,0,2,647,144,11,11
6292566,1,1,1,1,0,0,1,681,4848,12,12
8423665,2,2,2,2,1,0,3,123,5884,10,10
1492224,2,3,2,2,0,0,1,894,1911,12,14
3761112,3,4,3,3,1,0,1,1221,441,15,15
5007351,4,3,4,4,2,0,4,894,1813,15,15
7288477,5,5,4,4,3,0,4,911,1043,18,18
268541,6,6,5,5,4,0,5,839,921,19,19
11389467,7,7,6,6,5,0,4,16,6415,17,18
8470518,8,8,7,7,1,0,1,1171,5809,14,14


In [8]:
#Parameter Tuning

#Define the parameters to tune and the values to tune to
params_logReg = [
                {'penalty': ['l1', 'l2', 'elasticnet', 'none'], 
                'solver' : ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],  
                'max_iter' : [100, 1000, 2500, 5000]}
            ]

In [9]:
cv_method = RepeatedStratifiedKFold(n_splits = 5, 
                                    n_repeats = 1, 
                                    random_state = 1)

gs_logReg = GridSearchCV( LogisticRegression(max_iter=80000), 
                      param_grid = params_logReg, 
                      cv = cv_method,
                      verbose = True,
                      scoring = 'accuracy',
                      n_jobs = -1,
                      return_train_score = True)

In [10]:
#Fit the model with the dataset
bestModel = gs_logReg.fit(features, target)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 106.1min finished


In [11]:
gs_logReg.best_params_

{'max_iter': 1000, 'penalty': 'none', 'solver': 'newton-cg'}

In [12]:
#Find the best values for the  parameters of the model (Formatted output)
best_p = bestModel.best_estimator_.get_params()['penalty']
best_C = bestModel.best_estimator_.get_params()['C']
best_max =  bestModel.best_estimator_.get_params()['max_iter']
best_solver =  bestModel.best_estimator_.get_params()['solver']

print('Best penalty:', best_p)
print('Best C:', best_C)
print('Best max iteration:', best_max)
print('Best solver:', best_solver)

Best penalty: none
Best C: 1.0
Best max iteration: 1000
Best solver: newton-cg


In [13]:
#Visualise the parameter configurations from fitting the model with the dataset
results_logReg = pd.DataFrame(gs_logReg.cv_results_['params'])
results_logReg['test_score'] = gs_logReg.cv_results_['mean_test_score']
results_logReg

Unnamed: 0,max_iter,penalty,solver,test_score
0,100,l1,lbfgs,
1,100,l1,newton-cg,
2,100,l1,liblinear,0.8608
3,100,l1,sag,
4,100,l1,saga,0.8004
...,...,...,...,...
75,5000,none,lbfgs,0.8461
76,5000,none,newton-cg,0.8903
77,5000,none,liblinear,
78,5000,none,sag,0.8004


In [14]:
#Re-Define model with the optimal parameter values AFTER HILL CLIMBING
logReg = metric = LogisticRegression(penalty = best_p, 
                               C = best_C, 
                               solver = best_solver, 
                               max_iter = best_max 
                               )

In [15]:
# Hill climbing w/ SVM
new_Ind = []
cur_MaxScore = 0.0
col_num = len(features.columns)
col_Ind_Random = shuffle(range(0, col_num), random_state = 1)
features_array = features.values

for cur_f in range(col_num):
    new_Ind.append(col_Ind_Random[cur_f])
    newData = features_array[:, new_Ind]
    x_train, x_test, y_train, y_test = train_test_split(newData, target, test_size=0.2, random_state=1)
    fit = logReg.fit(x_train, y_train)
    cur_Score = logReg.score(x_test, y_test)
    
    if cur_Score < cur_MaxScore:
        new_Ind.remove(col_Ind_Random[cur_f])
    else:
        cur_MaxScore = cur_Score
        print ("Score with " + str(len(new_Ind)) + " selected features: " + str(cur_Score))
print("\nIndexs of the desired features")
print(new_Ind)

Score with 1 selected features: 0.801
Score with 2 selected features: 0.801
Score with 3 selected features: 0.801




Score with 4 selected features: 0.801
Score with 5 selected features: 0.801
Score with 6 selected features: 0.801




Score with 7 selected features: 0.801




Score with 8 selected features: 0.801




Score with 9 selected features: 0.801
Score with 10 selected features: 0.8225

Indexs of the desired features
[2, 3, 4, 1, 6, 0, 7, 10, 8, 5]




In [16]:
features_hc = pd.DataFrame()
for index in new_Ind:
    colName = features.columns[index]
    features_hc[colName] = features[colName]
features_hc.head(10)

Unnamed: 0,Between Street 1,Between Street 2,Sign,Street Name,Side Of Street,Area Name,Street ID,Departure Hour,Device ID,In Violation?
341304,0,0,0,0,2,0,647,11,144,0
6292566,1,1,0,1,1,1,681,12,4848,0
8423665,2,2,1,2,3,2,123,10,5884,0
1492224,2,2,0,3,1,2,894,14,1911,0
3761112,3,3,1,4,1,3,1221,15,441,0
5007351,4,4,2,3,4,4,894,15,1813,0
7288477,4,4,3,5,4,5,911,18,1043,0
268541,5,5,4,6,5,6,839,19,921,0
11389467,6,6,5,7,4,7,16,18,6415,0
8470518,7,7,1,8,1,8,1171,14,5809,0


In [17]:
bestModel = gs_logReg.fit(features_hc, target)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 106.8min finished


In [18]:
gs_logReg.best_params_

{'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}

In [19]:
best_p_hc = bestModel.best_estimator_.get_params()['penalty']
best_C_hc = bestModel.best_estimator_.get_params()['C']
best_max_hc =  bestModel.best_estimator_.get_params()['max_iter']
best_solver_hc =  bestModel.best_estimator_.get_params()['solver']

print('Best penalty:', best_p_hc)
print('Best C:', best_C_hc)
print('Best max iteration:', best_max_hc)
print('Best solver:', best_solver_hc)

Best penalty: l2
Best C: 1.0
Best max iteration: 100
Best solver: liblinear


In [20]:
#Visualise the parameter configurations from fitting the model with the dataset
results_logReg = pd.DataFrame(gs_logReg.cv_results_['params'])
results_logReg['test_score'] = gs_logReg.cv_results_['mean_test_score']
results_logReg

Unnamed: 0,max_iter,penalty,solver,test_score
0,100,l1,lbfgs,
1,100,l1,newton-cg,
2,100,l1,liblinear,0.8236
3,100,l1,sag,
4,100,l1,saga,0.8004
...,...,...,...,...
75,5000,none,lbfgs,0.8193
76,5000,none,newton-cg,0.8239
77,5000,none,liblinear,
78,5000,none,sag,0.8004


In [21]:
logReg = metric = LogisticRegression(penalty = best_p_hc, 
                               C = best_C_hc, 
                               solver = best_solver_hc, 
                               max_iter = best_max_hc 
                               )

In [22]:
#Defining training and testing groups
x_train, x_test, y_train, y_test = train_test_split(features_hc, target, test_size = 0.5, random_state = 4)

#Training the model previously defined
logReg.fit(x_train, y_train)

#Obtaining and printing out results from the model (Confusion Matrix)
predicted = logReg.predict(x_test)
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix")
print(cm)

#Printing the numerical result of the confusion matrix
print("\n[Train/test split] score: {:.5f}".format(logReg.score(x_test, y_test)))

Confusion Matrix
[[3927   84    0    0    1    1    0    0    0    0    0    0    0    0
     0]
 [ 450  195    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [  64   95    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [  31   40    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   9   25    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   1   14    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0   10    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   2    2    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   2    8    0    0    0    0    0    0    0    0    1    0    0    0
     0]
 [   0   12    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0   17    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0    3    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0    3    0    0  

In [23]:
#Repeat with KFold - creating groups
kf = KFold(n_splits = 5, random_state = 4, shuffle = True)


#Repeat with KFold - Training model (previously defined) and obtaining its output
kFoldTotal = 0
for k, (train_index, test_index) in enumerate(kf.split(features_hc)):
    x_train, x_test = features_hc.iloc[train_index], features_hc.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    logReg.fit(x_train, y_train)
    kFoldTotal += logReg.score(x_test, y_test)
    print("[fold {0}] score: {1:.5f}".format(k, logReg.score(x_test, y_test)))

#Printing out the results
roundedTotal = round(kFoldTotal/5, 5)
print("\nLogistic Regression mean score [5 folds] = " + str(roundedTotal))

[fold 0] score: 0.83000
[fold 1] score: 0.81100
[fold 2] score: 0.83750
[fold 3] score: 0.81150
[fold 4] score: 0.83100

Logistic Regression mean score [5 folds] = 0.8242
