In [1]:
!pip install mlxtend



In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import math
from sklearn import linear_model, decomposition, datasets
from sklearn import metrics
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix

In [3]:
#retrieving the data
parkingdata = pd.read_csv("parking_duration_of_parking_event_vs_street_ID.csv")
parkingdata = parkingdata.sample(n=10000)
parkingdata.size

130000

In [4]:
#Converting all the feature attributes to uppercase for uniformity

parkingdata['Area Name'] = parkingdata['Area Name'].str.upper()
parkingdata['Street Name'] = parkingdata['Street Name'].str.upper()
parkingdata['Between Street 1'] = parkingdata['Between Street 1'].str.upper()
parkingdata['Between Street 2'] = parkingdata['Between Street 2'].str.upper()
parkingdata['Street Marker'] = parkingdata['Street Marker'].str.upper()
parkingdata['Sign'] = parkingdata['Sign'].str.upper()

parkingdata.head(5)

Unnamed: 0,Area Name,Street Name,Between Street 1,Between Street 2,Side Of Street,Street Marker,Arrival Time,Departure Time,Duration of Parking Event (in seconds),Sign,In Violation?,Street ID,Device ID
6930146,HYATT,FLINDERS STREET,RUSSELL STREET,EXHIBITION STREET,4,1599S,23/04/2012 02:39:36 PM,23/04/2012 04:31:45 PM,6729,2P TKT A M-SAT 7:30-20:30,0,670,1058
717834,CITY SQUARE,COLLINS STREET,SWANSTON STREET,RUSSELL STREET,4,1943S,21/10/2011 09:46:54 AM,21/10/2011 10:18:36 AM,1902,1/2P MTR M-SAT 7:30-19:30,0,528,1172
4253549,SOUTHBANK,COVENTRY STREET,WELLS STREET,ST KILDA ROAD,4,9243S,01/02/2012 10:46:35 AM,01/02/2012 10:59:35 AM,780,1P MTR M-F 7:30-18:30,0,547,4087
11493453,RIALTO,KING STREET,FLINDERS LANE,FLINDERS STREET,2,1384E,08/09/2012 04:13:05 PM,08/09/2012 04:18:31 PM,326,2P SAT 7:30-19:30,0,839,869
1229486,REGENCY,EXHIBITION STREET,LA TROBE STREET,LT LONSDALE STREET,1,C588,04/11/2011 11:34:14 AM,04/11/2011 11:39:32 AM,318,2P MTR M-SAT 7:30-20:30,0,647,390


In [5]:
#Removing all redundant extra whitespaces
for x in parkingdata.columns:
    if parkingdata[x].dtype == object:
        parkingdata[x] = parkingdata[x].str.strip()

In [6]:
#converting object to datetime
parkingdata['Arrival Time'] = pd.to_datetime(parkingdata['Arrival Time'])
parkingdata['Departure Time'] = pd.to_datetime(parkingdata['Departure Time'])

In [7]:
#Target field identification
target = parkingdata['Street Name']

unique_vals = target.unique()
target.replace(to_replace = unique_vals, value = list(range(len(unique_vals))), inplace = True)

target.head(10)

6930146     0
717834      1
4253549     2
11493453    3
1229486     4
3375594     1
8027829     5
464768      6
8637966     7
921552      8
Name: Street Name, dtype: int64

In [8]:
#Preliminary Target Features identification (Pre-Hill Climbing)

#Convert possible targets to dtype = int
intDf = parkingdata.copy()
#Drop useless fields
intDf.drop('Arrival Time', axis = 1, inplace = True)
intDf.drop('Departure Time', axis = 1, inplace = True)
intDf.drop('Street Marker', axis = 1, inplace = True)
#Drop target field
intDf.drop('Street Name', axis = 1, inplace = True)
#Drop already int fields (Will add back after)
intDf.drop('In Violation?', axis = 1, inplace = True)
intDf.drop('Side Of Street', axis = 1, inplace = True)
intDf.drop('Duration of Parking Event (in seconds)', axis = 1, inplace = True)
intDf.drop('Street ID', axis = 1, inplace = True)
intDf.drop('Device ID', axis = 1, inplace = True)

#Transform the remainder non-int fields' values to unique int identifiers
for column in intDf:
    unique_vals = intDf[column].unique()
    intDf[column].replace(to_replace = unique_vals, value = list(range(len(unique_vals))), inplace = True)

#Create dataframe for all the relevant features
features = intDf.copy()

#Add back int fields (If not the target field)
features['In Violation?'] = parkingdata['In Violation?']
features['Side Of Street'] = parkingdata['Side Of Street']
features['Duration of Parking Event (in seconds)'] = parkingdata['Duration of Parking Event (in seconds)']
features['Street ID'] = parkingdata['Street ID']
features['Device ID'] = parkingdata['Device ID']
#Add back datetime features with only the meaningful subset of data
features['Arrival Hour'] = parkingdata['Arrival Time'].dt.hour
features['Departure Hour'] = parkingdata['Departure Time'].dt.hour

#Table view of features dataframe
features.head(10)

Unnamed: 0,Area Name,Between Street 1,Between Street 2,Sign,In Violation?,Side Of Street,Duration of Parking Event (in seconds),Street ID,Device ID,Arrival Hour,Departure Hour
6930146,0,0,0,0,0,4,6729,670,1058,14,16
717834,1,1,1,1,0,4,1902,528,1172,9,10
4253549,2,2,2,2,0,4,780,547,4087,10,10
11493453,3,3,3,3,0,2,326,839,869,16,16
1229486,4,4,4,4,0,1,318,647,390,11,11
3375594,3,5,5,5,0,4,81,528,1261,15,15
8027829,5,0,0,6,1,1,3998,894,5820,8,9
464768,4,6,6,4,0,1,2641,1288,92,11,12
8637966,6,7,7,1,0,4,367,681,2782,9,9
921552,7,8,8,4,0,5,552,1171,3045,17,17


In [9]:
#Parameter Tuning
         ####NOTE: random_split=1 for test_train
#Define the parameters to tune and the values to tune to
params_dtc = [
    {
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [None,1, 2, 4, 10],
        'max_depth': [None, 4, 10, 15],
        'splitter' : ['best', 'random'],
        'min_samples_split':[None,5, 8, 10, 12, 14, 16]
    }
]

In [10]:
cv_method = RepeatedStratifiedKFold(n_splits = 5, 
                                    n_repeats = 1, 
                                    random_state = 1)

gs_dtc = GridSearchCV(estimator = DecisionTreeClassifier(),
                      param_grid = params_dtc, 
                      cv = cv_method,
                      verbose = True,
                      scoring = 'accuracy',
                      n_jobs = -1,
                      return_train_score = True)

In [11]:
#Fit the model with the dataset
bestModel = gs_dtc.fit(features, target)

Fitting 5 folds for each of 560 candidates, totalling 2800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 1232 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 2800 out of 2800 | elapsed:   25.9s finished


In [12]:
#Find the best values for the  parameters of the model (Standard output)
gs_dtc.best_params_

{'criterion': 'entropy',
 'max_depth': 15,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'splitter': 'best'}

In [13]:
#Find the best values for the  parameters of the model (Formatted output)
best_criterion = bestModel.best_estimator_.get_params()['criterion']
best_max_depth =  bestModel.best_estimator_.get_params()['max_depth']
best_min_samples_leaf =  bestModel.best_estimator_.get_params()['min_samples_leaf']
best_min_samples_split =  bestModel.best_estimator_.get_params()['min_samples_split']
best_splitter =  bestModel.best_estimator_.get_params()['splitter']

print('Best criterion: ', best_criterion)
print('Best max_depth: ', best_max_depth )
print('Best min_samples_leaf: ', best_min_samples_leaf )
print('Best min_samples_split: ', best_min_samples_split)
print('Best splitter: ', best_splitter)

Best criterion:  entropy
Best max_depth:  15
Best min_samples_leaf:  1
Best min_samples_split:  10
Best splitter:  best


In [14]:
#Visualise the parameter configurations from fitting the model with the dataset
results_dtc = pd.DataFrame(gs_dtc.cv_results_['params'])
results_dtc['test_score'] = gs_dtc.cv_results_['mean_test_score']
results_dtc

Unnamed: 0,criterion,max_depth,min_samples_leaf,min_samples_split,splitter,test_score
0,gini,,,,best,
1,gini,,,,random,
2,gini,,,5.0,best,
3,gini,,,5.0,random,
4,gini,,,8.0,best,
...,...,...,...,...,...,...
555,entropy,15.0,10.0,12.0,random,0.9862
556,entropy,15.0,10.0,14.0,best,0.9962
557,entropy,15.0,10.0,14.0,random,0.9817
558,entropy,15.0,10.0,16.0,best,0.9964


In [15]:
#Re-Define model with the optimal parameter values AFTER HILL CLIMBING
dtc = metric = DecisionTreeClassifier(criterion = best_criterion, 
                               max_depth = best_max_depth, 
                               min_samples_leaf = best_min_samples_leaf,
                               min_samples_split = best_min_samples_split,
                               splitter = best_splitter,
                               random_state = 0
                               )

In [16]:
# Hill climbing
new_Ind = []
cur_MaxScore = 0.0
col_num = len(features.columns)
col_Ind_Random = shuffle(range(0, col_num), random_state = 1)
features_array = features.values

for cur_f in range(col_num):
    new_Ind.append(col_Ind_Random[cur_f])
    newData = features_array[:, new_Ind]
    x_train, x_test, y_train, y_test = train_test_split(newData, target, test_size=0.2, random_state=1)
    fit = dtc.fit(x_train, y_train)
    cur_Score = dtc.score(x_test, y_test)
    
    if cur_Score < cur_MaxScore:
        new_Ind.remove(col_Ind_Random[cur_f])
    else:
        cur_MaxScore = cur_Score
        print ("Score with " + str(len(new_Ind)) + " selected features: " + str(cur_Score))
print("\nIndexs of the desired features")
print(new_Ind)

Score with 1 selected features: 0.3585
Score with 2 selected features: 0.749
Score with 3 selected features: 0.749
Score with 4 selected features: 0.7705
Score with 5 selected features: 0.911
Score with 6 selected features: 0.9995
Score with 7 selected features: 0.9995
Score with 8 selected features: 0.9995
Score with 9 selected features: 0.9995

Indexs of the desired features
[2, 3, 4, 1, 0, 7, 10, 8, 5]


In [17]:
features_hc = pd.DataFrame()
for index in new_Ind:
    colName = features.columns[index]
    features_hc[colName] = features[colName]
features_hc.head(10)

Unnamed: 0,Between Street 2,Sign,In Violation?,Between Street 1,Area Name,Street ID,Departure Hour,Device ID,Side Of Street
6930146,0,0,0,0,0,670,16,1058,4
717834,1,1,0,1,1,528,10,1172,4
4253549,2,2,0,2,2,547,10,4087,4
11493453,3,3,0,3,3,839,16,869,2
1229486,4,4,0,4,4,647,11,390,1
3375594,5,5,0,5,3,528,15,1261,4
8027829,0,6,1,0,5,894,9,5820,1
464768,6,4,0,6,4,1288,12,92,1
8637966,7,1,0,7,6,681,9,2782,4
921552,8,4,0,8,7,1171,17,3045,5


In [18]:
bestModel = gs_dtc.fit(features_hc, target)

Fitting 5 folds for each of 560 candidates, totalling 2800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 2680 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 2800 out of 2800 | elapsed:   15.4s finished


In [19]:
gs_dtc.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'splitter': 'best'}

In [20]:
best_criterion_hc = bestModel.best_estimator_.get_params()['criterion']
best_max_depth_hc =  bestModel.best_estimator_.get_params()['max_depth']
best_min_samples_leaf_hc =  bestModel.best_estimator_.get_params()['min_samples_leaf']
best_min_samples_split_hc =  bestModel.best_estimator_.get_params()['min_samples_split']
best_splitter_hc =  bestModel.best_estimator_.get_params()['splitter']

print('Best criterion: ', best_criterion_hc)
print('Best max_depth: ', best_max_depth_hc )
print('Best min_samples_leaf: ', best_min_samples_leaf_hc )
print('Best min_samples_split: ', best_min_samples_split_hc)
print('Best splitter: ', best_splitter_hc)

Best criterion:  entropy
Best max_depth:  10
Best min_samples_leaf:  1
Best min_samples_split:  10
Best splitter:  best


In [21]:
#Visualise the parameter configurations from fitting the model with the dataset
results_dtc = pd.DataFrame(gs_dtc.cv_results_['params'])
results_dtc['test_score'] = gs_dtc.cv_results_['mean_test_score']
results_dtc

Unnamed: 0,criterion,max_depth,min_samples_leaf,min_samples_split,splitter,test_score
0,gini,,,,best,
1,gini,,,,random,
2,gini,,,5.0,best,
3,gini,,,5.0,random,
4,gini,,,8.0,best,
...,...,...,...,...,...,...
555,entropy,15.0,10.0,12.0,random,0.9875
556,entropy,15.0,10.0,14.0,best,0.9960
557,entropy,15.0,10.0,14.0,random,0.9830
558,entropy,15.0,10.0,16.0,best,0.9961


In [22]:
dtc = metric = DecisionTreeClassifier(criterion = best_criterion_hc, 
                               max_depth = best_max_depth_hc, 
                               min_samples_leaf = best_min_samples_leaf_hc,
                               min_samples_split = best_min_samples_split_hc,
                               splitter = best_splitter_hc,
                               random_state = 0
                               )

In [24]:
#Defining training and testing groups
x_train, x_test, y_train, y_test = train_test_split(features_hc, target, test_size = 0.5, random_state = 4)

#Training the model previously defined
dtc.fit(x_train, y_train)

#Obtaining and printing out results from the model (Confusion Matrix)
predicted = dtc.predict(x_test)
cm = metrics.confusion_matrix(y_test,predicted)
print (cm)
print (classification_report(y_test,predicted))


[[ 57   0   0 ...   0   0   0]
 [  0 278   0 ...   0   0   0]
 [  0   0  99 ...   0   0   0]
 ...
 [  0   0   0 ...   5   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        57
           1       1.00      1.00      1.00       278
           2       1.00      1.00      1.00        99
           3       1.00      1.00      1.00       118
           4       1.00      1.00      1.00       275
           5       1.00      1.00      1.00       444
           6       1.00      1.00      1.00       141
           7       1.00      1.00      1.00       287
           8       1.00      1.00      1.00       382
           9       1.00      1.00      1.00       267
          10       1.00      1.00      1.00        51
          11       1.00      1.00      1.00       116
          12       1.00      1.00      1.00        31
          13       1.00      1.00      1.00        

  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
#Repeat with KFold - creating groups
kf = KFold(n_splits = 5, random_state = 4, shuffle = True)


#Repeat with KFold - Training model (previously defined) and obtaining its output
kFoldTotal = 0
for k, (train_index, test_index) in enumerate(kf.split(features_hc)):
    x_train, x_test = features_hc.iloc[train_index], features_hc.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    dtc.fit(x_train, y_train)
    kFoldTotal += dtc.score(x_test, y_test)
    print("[fold {0}] score: {1:.5f}".format(k, dtc.score(x_test, y_test)))

#Printing out the results
roundedTotal = round(kFoldTotal/5, 5)
print("\nK-fold mean score [5 folds] = " + str(roundedTotal))

[fold 0] score: 0.99850
[fold 1] score: 0.99900
[fold 2] score: 0.99900
[fold 3] score: 0.99950
[fold 4] score: 0.99850

K-fold mean score [5 folds] = 0.9989


In [26]:
#visualise the tree
from sklearn import tree
with open('streetTarget.dot', 'w') as f:
    f = tree.export_graphviz(dtc, out_file=f, feature_names=None, class_names=None, filled=True, rounded=True, special_characters=True)
    


In [28]:
#Obtaining and printing out results from the model (Confusion Matrix)
predicted = dtc.predict(x_test)
cm = metrics.confusion_matrix(y_test,predicted)
print (cm)

[[ 25   0   0 ...   0   0   0]
 [  0 106   0 ...   0   0   0]
 [  0   0  33 ...   0   0   0]
 ...
 [  0   0   0 ...   2   0   0]
 [  0   0   0 ...   0   0   1]
 [  0   0   0 ...   0   0   0]]


In [29]:
print ("Accuracy: {:.5f}".format(dtc.score(x_test, y_test)))

Accuracy: 0.99850
