In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC  
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn import linear_model, decomposition, datasets
from sklearn import svm
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [2]:
#retrieving the data
parkingdata = pd.read_csv("parking_duration_of_parking_event_vs_street_ID.csv")
parkingdata = parkingdata.sample(n=10000)
parkingdata.size

130000

In [3]:
#Converting all the feature attributes to uppercase for uniformity

parkingdata['Area Name'] = parkingdata['Area Name'].str.upper()
parkingdata['Street Name'] = parkingdata['Street Name'].str.upper()
parkingdata['Between Street 1'] = parkingdata['Between Street 1'].str.upper()
parkingdata['Between Street 2'] = parkingdata['Between Street 2'].str.upper()
parkingdata['Street Marker'] = parkingdata['Street Marker'].str.upper()
parkingdata['Sign'] = parkingdata['Sign'].str.upper()

parkingdata.head(5)

Unnamed: 0,Area Name,Street Name,Between Street 1,Between Street 2,Side Of Street,Street Marker,Arrival Time,Departure Time,Duration of Parking Event (in seconds),Sign,In Violation?,Street ID,Device ID
9945747,SUPREME,LONSDALE STREET,WILLIAM STREET,QUEEN STREET,1,C3038,24/07/2012 11:08:13 AM,24/07/2012 12:34:54 PM,5201,1P MTR M-SAT 7:30-19:30,1,894,2067
6055013,REGENCY,LA TROBE STREET,RUSSELL STREET,EXHIBITION STREET,4,3651S,26/03/2012 12:19:10 PM,26/03/2012 12:21:42 PM,152,2P MTR M-F 9:30-20:30,0,856,2074
2240916,BANKS,COLLINS STREET,QUEEN STREET,ELIZABETH STREET,3,1986N,01/12/2011 05:23:57 PM,01/12/2011 05:38:06 PM,849,1/2P MTR M-SAT 7:30-19:30,0,528,1091
6053093,REGENCY,SPRING STREET,LT LONSDALE STREET,LONSDALE STREET,1,C144,26/03/2012 11:45:05 AM,26/03/2012 12:54:04 PM,4139,2P MTR M-SAT 7:30-20:30,0,1288,111
5741755,PRINCES THEATRE,EXHIBITION STREET,LT BOURKE STREET,BOURKE STREET,1,C446,16/03/2012 11:53:07 AM,16/03/2012 02:24:05 PM,9058,1P MTR M-SAT 7:30-19:30,1,647,328


In [4]:
#Removing all redundant extra whitespaces
for x in parkingdata.columns:
    if parkingdata[x].dtype == object:
        parkingdata[x] = parkingdata[x].str.strip()

In [5]:
#converting object to datetime
parkingdata['Arrival Time'] = pd.to_datetime(parkingdata['Arrival Time'])
parkingdata['Departure Time'] = pd.to_datetime(parkingdata['Departure Time'])

In [6]:
#check which columns have missing values
parkingdata.columns[parkingdata.isnull().any()]

Index([], dtype='object')

In [7]:
#Target field identification
target = parkingdata['Street Name']

unique_vals = target.unique()
target.replace(to_replace = unique_vals, value = list(range(len(unique_vals))), inplace = True)

target.head(10)

9945747    0
6055013    1
2240916    2
6053093    3
5741755    4
7325173    5
4383887    6
1941937    7
4070767    1
2584738    8
Name: Street Name, dtype: int64

In [8]:
#Preliminary Target Features identification (Pre-Hill Climbing)

#Convert possible targets to dtype = int
intDf = parkingdata.copy()
#Drop useless fields
intDf.drop('Arrival Time', axis = 1, inplace = True)
intDf.drop('Departure Time', axis = 1, inplace = True)
intDf.drop('Street Marker', axis = 1, inplace = True)
#Drop target field
intDf.drop('Street Name', axis = 1, inplace = True)
#Drop already int fields (Will add back after)
intDf.drop('In Violation?', axis = 1, inplace = True)
intDf.drop('Side Of Street', axis = 1, inplace = True)
intDf.drop('Duration of Parking Event (in seconds)', axis = 1, inplace = True)
intDf.drop('Street ID', axis = 1, inplace = True)
intDf.drop('Device ID', axis = 1, inplace = True)

#Transform the remainder non-int fields' values to unique int identifiers
for column in intDf:
    unique_vals = intDf[column].unique()
    intDf[column].replace(to_replace = unique_vals, value = list(range(len(unique_vals))), inplace = True)

#Create dataframe for all the relevant features
features = intDf.copy()

#Add back int fields (If not the target field)
features['In Violation?'] = parkingdata['In Violation?']
features['Side Of Street'] = parkingdata['Side Of Street']
features['Duration of Parking Event (in seconds)'] = parkingdata['Duration of Parking Event (in seconds)']
features['Street ID'] = parkingdata['Street ID']
features['Device ID'] = parkingdata['Device ID']
#Add back datetime features with only the meaningful subset of data
features['Arrival Hour'] = parkingdata['Arrival Time'].dt.hour
features['Departure Hour'] = parkingdata['Departure Time'].dt.hour

#Table view of features dataframe
features.head(10)

Unnamed: 0,Area Name,Between Street 1,Between Street 2,Sign,In Violation?,Side Of Street,Duration of Parking Event (in seconds),Street ID,Device ID,Arrival Hour,Departure Hour
9945747,0,0,0,0,1,1,5201,894,2067,11,12
6055013,1,1,1,1,0,4,152,856,2074,12,12
2240916,2,2,2,2,0,3,849,528,1091,17,17
6053093,1,3,3,3,0,1,4139,1288,111,11,12
5741755,3,4,4,0,1,1,9058,647,328,11,14
7325173,4,5,5,4,0,1,1238,1171,732,12,13
4383887,5,6,6,2,0,3,161,123,1380,17,17
1941937,6,7,7,5,0,3,1043,1381,4028,18,18
4070767,4,0,0,6,0,3,243,856,2149,12,12
2584738,1,8,8,4,0,4,3666,926,1597,11,12


In [9]:
#Parameter Tuning

#Define the parameters to tune and the values to tune to
params_randomforest = [
                { 
                   'n_estimators' : (10,20,30,50,70,90,100),
                    'criterion' : ('gini', 'entropy'),
                    'max_depth' : (3, 5, 7, 9, 10, 15),
                    'max_features' : ('auto', 'sqrt'),
                    'min_samples_split' : (2, 4, 6, 8, 10, 14, 16)
                    
                }
            ]

In [10]:
cv_method = RepeatedStratifiedKFold(n_splits = 5, 
                                    n_repeats = 1, 
                                    random_state = 1)

gs_randomforest = GridSearchCV(RandomForestClassifier(), 
                      param_grid = params_randomforest, 
                      cv = cv_method,
                      verbose = True,
                      scoring = 'accuracy',
                      n_jobs = -1,
                      return_train_score = True)

In [11]:
#Fit the model with the dataset
bestModel = gs_randomforest.fit(features, target)

Fitting 5 folds for each of 1176 candidates, totalling 5880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 4976 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 5880 out of 5880 | elapsed: 14.6min finished


In [12]:
gs_randomforest.best_params_

{'criterion': 'entropy',
 'max_depth': 15,
 'max_features': 'sqrt',
 'min_samples_split': 4,
 'n_estimators': 100}

In [13]:
#Find the best values for the  parameters of the model (Formatted output)
best_e = bestModel.best_estimator_.get_params()['n_estimators']
best_C = bestModel.best_estimator_.get_params()['criterion']
best_maxd =  bestModel.best_estimator_.get_params()['max_depth']
best_maxf =  bestModel.best_estimator_.get_params()['max_features']
best_mins =  bestModel.best_estimator_.get_params()['min_samples_split']

print('Best estimator:', best_e)
print('Best criterion:', best_C)
print('Best max depth:', best_maxd)
print('Best max features:', best_maxf)
print('Best min_samples_split:', best_mins)

Best estimator: 100
Best criterion: entropy
Best max depth: 15
Best max features: sqrt
Best min_samples_split: 4


In [14]:
#Visualise the parameter configurations from fitting the model with the dataset
results_randomforest = pd.DataFrame(gs_randomforest.cv_results_['params'])
results_randomforest['test_score'] = gs_randomforest.cv_results_['mean_test_score']
results_randomforest

Unnamed: 0,criterion,max_depth,max_features,min_samples_split,n_estimators,test_score
0,gini,3,auto,2,10,0.4937
1,gini,3,auto,2,20,0.5050
2,gini,3,auto,2,30,0.5106
3,gini,3,auto,2,50,0.5123
4,gini,3,auto,2,70,0.5094
...,...,...,...,...,...,...
1171,entropy,15,sqrt,16,30,0.9975
1172,entropy,15,sqrt,16,50,0.9981
1173,entropy,15,sqrt,16,70,0.9986
1174,entropy,15,sqrt,16,90,0.9983


In [15]:
#Re-Define model with the optimal parameter values AFTER HILL CLIMBING
randomforest = metric = RandomForestClassifier(n_estimators = best_e, 
                               criterion = best_C, 
                               max_depth = best_maxd, 
                               max_features = best_maxf,
                               min_samples_split = best_mins                
                               )

In [16]:
# Hill climbing w/ Randomforest
new_Ind = []
cur_MaxScore = 0.0
col_num = len(features.columns)
col_Ind_Random = shuffle(range(0, col_num), random_state = 1)
features_array = features.values

for cur_f in range(col_num):
    new_Ind.append(col_Ind_Random[cur_f])
    newData = features_array[:, new_Ind]
    x_train, x_test, y_train, y_test = train_test_split(newData, target, test_size=0.2, random_state=1)
    fit = randomforest.fit(x_train, y_train)
    cur_Score = randomforest.score(x_test, y_test)
    
    if cur_Score < cur_MaxScore:
        new_Ind.remove(col_Ind_Random[cur_f])
    else:
        cur_MaxScore = cur_Score
        print ("Score with " + str(len(new_Ind)) + " selected features: " + str(cur_Score))
print("\nIndexs of the desired features")
print(new_Ind)

Score with 1 selected features: 0.354
Score with 2 selected features: 0.751
Score with 3 selected features: 0.7675
Score with 4 selected features: 0.912
Score with 5 selected features: 1.0
Score with 6 selected features: 1.0
Score with 7 selected features: 1.0
Score with 8 selected features: 1.0

Indexs of the desired features
[2, 3, 1, 0, 7, 10, 8, 5]


In [17]:
features_hc = pd.DataFrame()
for index in new_Ind:
    colName = features.columns[index]
    features_hc[colName] = features[colName]
features_hc.head(10)

Unnamed: 0,Between Street 2,Sign,Between Street 1,Area Name,Street ID,Departure Hour,Device ID,Side Of Street
9945747,0,0,0,0,894,12,2067,1
6055013,1,1,1,1,856,12,2074,4
2240916,2,2,2,2,528,17,1091,3
6053093,3,3,3,1,1288,12,111,1
5741755,4,0,4,3,647,14,328,1
7325173,5,4,5,4,1171,13,732,1
4383887,6,2,6,5,123,17,1380,3
1941937,7,5,7,6,1381,18,4028,3
4070767,0,6,0,4,856,12,2149,3
2584738,8,4,8,1,926,12,1597,4


In [18]:
bestModel = gs_randomforest.fit(features_hc, target)

Fitting 5 folds for each of 1176 candidates, totalling 5880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 450 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1250 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1800 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 2450 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 3200 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 4050 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 5000 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 5880 out of 5880 | elapsed: 13.4min finished


In [19]:
gs_randomforest.best_params_

{'criterion': 'entropy',
 'max_depth': 15,
 'max_features': 'auto',
 'min_samples_split': 2,
 'n_estimators': 50}

In [20]:
best_e_hc = bestModel.best_estimator_.get_params()['n_estimators']
best_C_hc = bestModel.best_estimator_.get_params()['criterion']
best_maxd_hc =  bestModel.best_estimator_.get_params()['max_depth']
best_maxf_hc =  bestModel.best_estimator_.get_params()['max_features']
best_mins_hc =  bestModel.best_estimator_.get_params()['min_samples_split']

print('Best estimators:', best_e_hc)
print('Best criterion:', best_C_hc)
print('Best max depth:', best_maxd_hc)
print('Best max features:', best_maxf_hc)
print('Best min_samples_split:', best_mins_hc)

Best estimators: 50
Best criterion: entropy
Best max depth: 15
Best max features: auto
Best min_samples_split: 2


In [21]:
#Visualise the parameter configurations from fitting the model with the dataset
results_randomforest = pd.DataFrame(gs_randomforest.cv_results_['params'])
results_randomforest['test_score'] = gs_randomforest.cv_results_['mean_test_score']
results_randomforest

Unnamed: 0,criterion,max_depth,max_features,min_samples_split,n_estimators,test_score
0,gini,3,auto,2,10,0.4853
1,gini,3,auto,2,20,0.4919
2,gini,3,auto,2,30,0.5082
3,gini,3,auto,2,50,0.5062
4,gini,3,auto,2,70,0.5127
...,...,...,...,...,...,...
1171,entropy,15,sqrt,16,30,0.9980
1172,entropy,15,sqrt,16,50,0.9980
1173,entropy,15,sqrt,16,70,0.9983
1174,entropy,15,sqrt,16,90,0.9989


In [22]:
randomforest = metric = RandomForestClassifier(n_estimators = best_e_hc, 
                               criterion = best_C_hc, 
                               max_depth = best_maxd_hc, 
                               max_features = best_maxf_hc,
                               min_samples_split = best_mins_hc                
                               )

In [28]:
#Defining training and testing groups
x_train, x_test, y_train, y_test = train_test_split(features_hc, target, test_size = 0.5, random_state = 4)

#Training the model previously defined
randomforest.fit(x_train, y_train)

#Obtaining and printing out results from the model (Confusion Matrix)
predicted = randomforest.predict(x_test)
cm = metrics.confusion_matrix(y_test,predicted)
print("Confusion Matrix")
print(cm)
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

#Printing the numerical result of the confusion matrix
print("\n Accuracy score: {:.5f}".format(randomforest.score(x_test, y_test)))
misclassification_error = (FP + FN) / float(TP + TN + FP + FN)
print("\n Misclassification Rate: {:.5f}".format(misclassification_error))
sensitivity = TP / float(FN + TP)
print("\n True Positive Rate: {:.5f}".format(sensitivity))
specificity = TN / (TN + FP)
print("\n True Negative Rate: {:.5f}".format(specificity))
precision = TP / float(TP + FP)
print("\n Precision : {:.5f}".format(precision))
prevalence =  float(FN + TP)/float(TP + TN + FP + FN)
print("\n Prevalence : {:.5f}".format(prevalence))
fscore = (2 * precision * sensitivity) / (precision + sensitivity)
print("\n F score : {:.5f}".format(fscore))
print("\n False Positive Rate: {:.5f}".format(1-specificity))

Confusion Matrix
[[490   0   0 ...   0   0   0]
 [  0 282   0 ...   0   0   0]
 [  0   0 281 ...   0   0   0]
 ...
 [  0   0   0 ...   3   0   0]
 [  0   0   0 ...   0   5   0]
 [  0   0   0 ...   0   0   0]]

 Accuracy score: 0.99720

 Misclassification Rate: 0.00000

 True Positive Rate: 1.00000

 True Negative Rate: 1.00000

 Precision : 1.00000

 Prevalence : 0.36528

 F score : 1.00000

 False Positive Rate: 0.00000


In [29]:
#Repeat with KFold - creating groups
kf = KFold(n_splits = 5, random_state = 4, shuffle = True)


#Repeat with KFold - Training model (previously defined) and obtaining its output
kFoldTotal = 0
for k, (train_index, test_index) in enumerate(kf.split(features_hc)):
    x_train, x_test = features_hc.iloc[train_index], features_hc.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    randomforest.fit(x_train, y_train)
    kFoldTotal += randomforest.score(x_test, y_test)
    print("[fold {0}] score: {1:.5f}".format(k, randomforest.score(x_test, y_test)))

#Printing out the results
roundedTotal = round(kFoldTotal/5, 5)
print("\nKNN mean score [5 folds] = " + str(roundedTotal))

[fold 0] score: 0.99800
[fold 1] score: 0.99850
[fold 2] score: 0.99900
[fold 3] score: 0.99950
[fold 4] score: 0.99850

KNN mean score [5 folds] = 0.9987
