In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import math

In [2]:
#retrieving the data
parkingdata = pd.read_csv("parking-small.csv")
#parkingdata = parkingdata.sample(n=10000)
#parkingdata.size

In [3]:
#Converting all the feature attributes to uppercase for uniformity

parkingdata['Area Name'] = parkingdata['Area Name'].str.upper()
parkingdata['Street Name'] = parkingdata['Street Name'].str.upper()
parkingdata['Between Street 1'] = parkingdata['Between Street 1'].str.upper()
parkingdata['Between Street 2'] = parkingdata['Between Street 2'].str.upper()
parkingdata['Street Marker'] = parkingdata['Street Marker'].str.upper()
parkingdata['Sign'] = parkingdata['Sign'].str.upper()

parkingdata.head(5)

Unnamed: 0,Area Name,Street Name,Between Street 1,Between Street 2,Side Of Street,Street Marker,Arrival Time,Departure Time,Duration of Parking Event (in seconds),Sign,In Violation?,Street ID,Device ID
0,VICTORIA MARKET,THERRY STREET,QUEEN STREET,ELIZABETH STREET,4,7021S,24/08/2012 11:34:36 AM,24/08/2012 12:49:09 PM,4473,LZ 15M M-SAT 7:30-19:30,1,1346,3770
1,COURTNEY,PEEL STREET,O'CONNELL STREET,QUEENSBERRY STREET,2,5398E,17/03/2012 01:07:59 PM,17/03/2012 01:10:06 PM,127,1/2P A RPE M-SUN 7:30-23:00,0,1101,3472
2,VICTORIA MARKET,FRANKLIN STREET,QUEEN STREET,ELIZABETH STREET,1,C6624,17/02/2012 01:54:34 PM,17/02/2012 02:20:17 PM,1543,2P MTR M-SAT 7:30-20:30,0,681,2805
3,CHINATOWN,LONSDALE STREET,SWANSTON STREET,RUSSELL STREET,3,2888N,27/11/2011 03:03:19 PM,27/11/2011 04:02:41 PM,3562,1P SUN 7:30-18:30,0,894,1770
4,SOUTHBANK,GRANT STREET,WELLS STREET,ST KILDA ROAD,3,9870N,21/04/2012 03:08:32 PM,21/04/2012 05:06:00 PM,7048,2P TKT A M-SAT 7:30-18:30,0,728,4584


In [4]:
#Removing all redundant extra whitespaces
for x in parkingdata.columns:
    if parkingdata[x].dtype == object:
        parkingdata[x] = parkingdata[x].str.strip()

In [5]:
#converting object to datetime
parkingdata['Arrival Time'] = pd.to_datetime(parkingdata['Arrival Time'])
parkingdata['Departure Time'] = pd.to_datetime(parkingdata['Departure Time'])

In [6]:
#Target field identification
duration = parkingdata['Duration of Parking Event (in seconds)'].copy()

print('Minimum parking duration (seconds): ' + str(duration.min()))
print('Maximum parking duration (seconds): ' + str(duration.max()))
print('\nMinimum parking duration (hour): ' + str(duration.min()/3600))
print('Maximum parking duration (hour): ' + str(duration.max()/3600))
print('\nMinimum parking duration is under [' + str(math.trunc(duration.min()/3600) + 1) + '] hours')
print('Maximum parking duration is under [' + str(math.trunc(duration.max()/3600) + 1) + '] hours')

durList = []
static = 1
df = pd.DataFrame()

for record in duration.values:
    hour = 3600
    index = 0
    while static == 1:
        if record < hour:
#             print("\nDuration in seconds: " + str(record))
            data = math.trunc(record/3600) + 1
            durList.append(data)
#             print("Duration is under " + str(data) + " hours")
            break
        else:
            hour += 3600

df.insert(0, 'Duration (Hours)', durList, True)
target = df['Duration (Hours)']
target.head(10)

Minimum parking duration (seconds): 55
Maximum parking duration (seconds): 43193

Minimum parking duration (hour): 0.015277777777777777
Maximum parking duration (hour): 11.998055555555556

Minimum parking duration is under [1] hours
Maximum parking duration is under [12] hours


0    2
1    1
2    1
3    1
4    2
5    1
6    1
7    1
8    1
9    1
Name: Duration (Hours), dtype: int64

In [7]:
#Preliminary Target Features identification (Pre-Hill Climbing)

#Convert possible targets to dtype = int
intDf = parkingdata.copy()
#Drop useless fields
intDf.drop('Arrival Time', axis = 1, inplace = True)
intDf.drop('Departure Time', axis = 1, inplace = True)
intDf.drop('Street Marker', axis = 1, inplace = True)
#Drop target field
intDf.drop('Duration of Parking Event (in seconds)', axis = 1, inplace = True)
#Drop already int fields (Will add back after)
intDf.drop('In Violation?', axis = 1, inplace = True)
intDf.drop('Side Of Street', axis = 1, inplace = True)
intDf.drop('Street ID', axis = 1, inplace = True)
intDf.drop('Device ID', axis = 1, inplace = True)

#Transform the remainder non-int fields' values to unique int identifiers
for column in intDf:
    unique_vals = intDf[column].unique()
    intDf[column].replace(to_replace = unique_vals, value = list(range(len(unique_vals))), inplace = True)

#Create dataframe for all the relevant features
features = intDf.copy()

#Add back int fields (If not the target field)
features['In Violation?'] = parkingdata['In Violation?']
features['Side Of Street'] = parkingdata['Side Of Street']
features['Street ID'] = parkingdata['Street ID']
features['Device ID'] = parkingdata['Device ID']
#Add back datetime features with only the meaningful subset of data
features['Arrival Hour'] = parkingdata['Arrival Time'].dt.hour
features['Departure Hour'] = parkingdata['Departure Time'].dt.hour

#Table view of features dataframe
features.head(10)

Unnamed: 0,Area Name,Street Name,Between Street 1,Between Street 2,Sign,In Violation?,Side Of Street,Street ID,Device ID,Arrival Hour,Departure Hour
0,0,0,0,0,0,1,4,1346,3770,11,12
1,1,1,1,1,1,0,2,1101,3472,13,13
2,0,2,0,0,2,0,1,681,2805,13,14
3,2,3,2,2,3,0,3,894,1770,15,16
4,3,4,3,3,4,0,3,728,4584,15,17
5,4,5,4,4,5,0,2,1171,688,9,10
6,0,2,0,0,2,0,1,681,2804,19,19
7,5,6,5,5,6,0,3,856,1520,14,14
8,6,7,2,2,7,1,3,670,1123,18,19
9,7,8,6,6,8,0,1,647,258,14,15


In [8]:
#Parameter Tuning

#Define the parameters to tune and the values to tune to
params_logReg = [
                {'penalty': ['l1', 'l2', 'elasticnet', 'none'], 
                'C' : np.logspace(-4, 4, 20), 
                'solver' : ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],  
                'max_iter' : [100, 1000, 2500, 5000]}
            ]

In [9]:
cv_method = RepeatedStratifiedKFold(n_splits = 5, 
                                    n_repeats = 1, 
                                    random_state = 1)

gs_logReg = GridSearchCV( LogisticRegression(max_iter=30000), 
                      param_grid = params_logReg, 
                      cv = cv_method,
                      verbose = True,
                      scoring = 'accuracy',
                      n_jobs = -1,
                      return_train_score = True)

In [10]:
#Fit the model with the dataset
bestModel = gs_logReg.fit(features, target)

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 335 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 615 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1010 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1490 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 2064 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 2762 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 3560 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4458 tasks      | elapsed: 34.1min
[Parallel(n_jobs=-1)]: Done 5492 tasks      | elapsed: 44.0min
[Parallel(n_jobs=-1)]: Done 6614 tasks      | elapsed: 55.0min
[Parallel(n_jobs=-1)]: Done 7836 tasks      | elapsed: 66.6min
[Parallel(n_jobs=-1)]: Done 8000 out of 8000 | elapsed: 70.0min finished


In [11]:
gs_logReg.best_params_

{'C': 11.288378916846883,
 'max_iter': 1000,
 'penalty': 'l2',
 'solver': 'newton-cg'}

In [12]:
#Find the best values for the  parameters of the model (Formatted output)
best_p = bestModel.best_estimator_.get_params()['penalty']
best_C = bestModel.best_estimator_.get_params()['C']
best_max =  bestModel.best_estimator_.get_params()['max_iter']
best_solver =  bestModel.best_estimator_.get_params()['solver']

print('Best penalty:', best_p)
print('Best C:', best_C)
print('Best max iteration:', best_max)
print('Best solver:', best_solver)

Best penalty: l2
Best C: 11.288378916846883
Best max iteration: 1000
Best solver: newton-cg


In [13]:
#Visualise the parameter configurations from fitting the model with the dataset
results_logReg = pd.DataFrame(gs_logReg.cv_results_['params'])
results_logReg['test_score'] = gs_logReg.cv_results_['mean_test_score']
results_logReg

Unnamed: 0,C,max_iter,penalty,solver,test_score
0,0.0001,100,l1,lbfgs,
1,0.0001,100,l1,newton-cg,
2,0.0001,100,l1,liblinear,0.806809
3,0.0001,100,l1,sag,
4,0.0001,100,l1,saga,0.806809
...,...,...,...,...,...
1595,10000.0000,5000,none,lbfgs,0.834844
1596,10000.0000,5000,none,newton-cg,0.872899
1597,10000.0000,5000,none,liblinear,
1598,10000.0000,5000,none,sag,0.806809


In [14]:
#Re-Define model with the optimal parameter values AFTER HILL CLIMBING
logReg = metric = LogisticRegression(penalty = best_p, 
                               C = best_C, 
                               solver = best_solver, 
                               max_iter = best_max 
                               )

In [15]:
# Hill climbing w/ SVM
new_Ind = []
cur_MaxScore = 0.0
col_num = len(features.columns)
col_Ind_Random = shuffle(range(0, col_num), random_state = 1)
features_array = features.values

for cur_f in range(col_num):
    new_Ind.append(col_Ind_Random[cur_f])
    newData = features_array[:, new_Ind]
    x_train, x_test, y_train, y_test = train_test_split(newData, target, test_size=0.2, random_state=1)
    fit = logReg.fit(x_train, y_train)
    cur_Score = logReg.score(x_test, y_test)
    
    if cur_Score < cur_MaxScore:
        new_Ind.remove(col_Ind_Random[cur_f])
    else:
        cur_MaxScore = cur_Score
        print ("Score with " + str(len(new_Ind)) + " selected features: " + str(cur_Score))
print("\nIndexs of the desired features")
print(new_Ind)

Score with 1 selected features: 0.795
Score with 2 selected features: 0.795
Score with 3 selected features: 0.795
Score with 4 selected features: 0.795
Score with 5 selected features: 0.795
Score with 6 selected features: 0.795




Score with 7 selected features: 0.855




Score with 8 selected features: 0.855

Indexs of the desired features
[2, 3, 4, 9, 6, 0, 10, 5]


In [16]:
features_hc = pd.DataFrame()
for index in new_Ind:
    colName = features.columns[index]
    features_hc[colName] = features[colName]
features_hc.head(10)

Unnamed: 0,Between Street 1,Between Street 2,Sign,Arrival Hour,Side Of Street,Area Name,Departure Hour,In Violation?
0,0,0,0,11,4,0,12,1
1,1,1,1,13,2,1,13,0
2,0,0,2,13,1,0,14,0
3,2,2,3,15,3,2,16,0
4,3,3,4,15,3,3,17,0
5,4,4,5,9,2,4,10,0
6,0,0,2,19,1,0,19,0
7,5,5,6,14,3,5,14,0
8,2,2,7,18,3,6,19,1
9,6,6,8,14,1,7,15,0


In [17]:
bestModel = gs_logReg.fit(features_hc, target)

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 326 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 663 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 1076 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1666 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2352 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 4136 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 5154 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 6270 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 7486 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 8000 out of 8000 | elapsed: 13.6min finished


In [18]:
gs_logReg.best_params_

{'C': 0.00026366508987303583,
 'max_iter': 2500,
 'penalty': 'none',
 'solver': 'sag'}

In [19]:
best_p_hc = bestModel.best_estimator_.get_params()['penalty']
best_C_hc = bestModel.best_estimator_.get_params()['C']
best_max_hc =  bestModel.best_estimator_.get_params()['max_iter']
best_solver_hc =  bestModel.best_estimator_.get_params()['solver']

print('Best penalty:', best_p_hc)
print('Best C:', best_C_hc)
print('Best max iteration:', best_max_hc)
print('Best solver:', best_solver_hc)

Best penalty: none
Best C: 0.00026366508987303583
Best max iteration: 2500
Best solver: sag


In [20]:
#Visualise the parameter configurations from fitting the model with the dataset
results_logReg = pd.DataFrame(gs_logReg.cv_results_['params'])
results_logReg['test_score'] = gs_logReg.cv_results_['mean_test_score']
results_logReg

Unnamed: 0,C,max_iter,penalty,solver,test_score
0,0.0001,100,l1,lbfgs,
1,0.0001,100,l1,newton-cg,
2,0.0001,100,l1,liblinear,0.806809
3,0.0001,100,l1,sag,
4,0.0001,100,l1,saga,0.806809
...,...,...,...,...,...
1595,10000.0000,5000,none,lbfgs,0.872905
1596,10000.0000,5000,none,newton-cg,0.871910
1597,10000.0000,5000,none,liblinear,
1598,10000.0000,5000,none,sag,0.880915


In [21]:
logReg = metric = LogisticRegression(penalty = best_p_hc, 
                               C = best_C_hc, 
                               solver = best_solver_hc, 
                               max_iter = best_max_hc 
                               )

In [22]:
#Defining training and testing groups
x_train, x_test, y_train, y_test = train_test_split(features_hc, target, test_size = 0.5, random_state = 4)

#Training the model previously defined
logReg.fit(x_train, y_train)

#Obtaining and printing out results from the model (Confusion Matrix)
predicted = logReg.predict(x_test)
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix")
print(cm)

#Printing the numerical result of the confusion matrix
print("\n[Train/test split] score: {:.5f}".format(logReg.score(x_test, y_test)))



Confusion Matrix
[[393  13   0   0   1   0   0   0   0   0   0]
 [ 23  27   2   0   0   0   1   0   0   0   0]
 [  1  12   5   2   0   0   1   0   0   0   0]
 [  0   1   3   0   0   0   1   0   0   0   0]
 [  0   0   2   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   1   0]
 [  0   0   0   0   0   0   1   0   0   1   0]
 [  0   0   0   0   0   0   1   0   0   2   0]
 [  0   0   0   0   0   0   0   0   0   3   0]]

[Train/test split] score: 0.85400




In [23]:
#Repeat with KFold - creating groups
kf = KFold(n_splits = 5, random_state = 4, shuffle = True)


#Repeat with KFold - Training model (previously defined) and obtaining its output
kFoldTotal = 0
for k, (train_index, test_index) in enumerate(kf.split(features_hc)):
    x_train, x_test = features_hc.iloc[train_index], features_hc.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    logReg.fit(x_train, y_train)
    kFoldTotal += logReg.score(x_test, y_test)
    print("[fold {0}] score: {1:.5f}".format(k, logReg.score(x_test, y_test)))

#Printing out the results
roundedTotal = round(kFoldTotal/5, 5)
print("\nLogistic Regression mean score [5 folds] = " + str(roundedTotal))



[fold 0] score: 0.87500




[fold 1] score: 0.88000




[fold 2] score: 0.86000




[fold 3] score: 0.89000
[fold 4] score: 0.88945

Logistic Regression mean score [5 folds] = 0.87889


