In [1]:
#PROJECT: PREDICTING TYPE OF CRIME IN TORONTO


#Notes: 

#Approach is to numeric encode the data set and run a RF model to see performance and then to create a second data set
#that is one-hot encoded to test it's performance on the same model. 

#--------------------------------------------------#

#1) IMPORT LIBRARIES

#Computation and Structuring:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#Modeling:

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Testing:

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#--------------------------------------------------#

#2) DATA IMPORT AND PRE-PROCESSING

#import full data set
df = pd.read_csv('F:\THesis\Dataset\New\Crime1.csv',sep=',') 

#list of relevant columns for model
col_list = ['occurrenceyear',	'occurrencemonth','occurrenceday','occurrencedayofyear','occurrencedayofweek','occurrencehour','MCI',	'Division',	'Neighbourhood','premisetype']

#dataframe created from list of relevant columns

df2 = df[col_list]
df2 = df2[df2['occurrenceyear'] > 2013] #drop "stale" crimes, where occurence is before 2014. Since data set is filtered based on reported date, we're ignoring these old crimes.

#Factorize dependent variable column:

crime_var = pd.factorize(df2['MCI']) #codes the list of crimes to a int64 variable
df2['MCI'] = crime_var[0]
definition_list_MCI = crime_var[1] #create an index reference so we know which crimes are coded to which factors

#factorize independent variables:

#factorize premisetype:

premise_var = pd.factorize(df2['premisetype'])
df2['premisetype'] = premise_var[0]
definition_list_premise = premise_var[1] 

#factorize occurenceyear:

year_var = pd.factorize(df2['occurrenceyear'])
df2['occurrenceyear'] = year_var[0]
definition_list_year = year_var[1] 

#factorize occurencemonth:

month_var = pd.factorize(df2['occurrencemonth'])
df2['occurrencemonth'] = month_var[0]
definition_list_month = month_var[1] 

#factorize occurenceday:

day_var = pd.factorize(df2['occurrenceday'])
df2['occurenceday'] = day_var[0]
definition_list_day = day_var[1] 

#factorize occurencedayofweek:

dayweek_var = pd.factorize(df2['occurrencedayofweek'])
df2['occurrencedayofweek'] = dayweek_var[0]
definition_list_day = dayweek_var[1] 

#factorize division:

division_var = pd.factorize(df2['Division'])
df2['Division'] = division_var[0]
definition_list_division = division_var[1] 

#factorize HOOD_ID:

hood_var = pd.factorize(df2['Neighbourhood'])
df2['Neighbourhood'] = hood_var[0]
definition_list_hood = hood_var[1] 

#factorize occurencehour:

hour_var = pd.factorize(df2['occurrencehour'])
df2['occurrencehour'] = hour_var[0]
definition_list_hour = hour_var[1] 

#factorize occurencedayofyear:

dayyear_var = pd.factorize(df2['occurrencedayofyear'])
df2['occurrencedayofyear'] = dayyear_var[0]
definition_list_dayyear = dayyear_var[1] 

In [3]:
#set X and Y:

X = df2.drop(['MCI'],axis=1).values #sets x and converts to an array


y = df2['MCI'].values #sets y and converts to an array

#split the data into train and test sets for numeric encoded dataset:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

#need to OneHotEncode all the X variables for input into the classification model:

enc = OneHotEncoder()
enc_X = enc.fit_transform(X)

X_train_OH, X_test_OH, y_train_OH, y_test_OH = train_test_split(enc_X, y, test_size = 0.25, random_state = 21)


print('#--------------------------------------------------#')



#--------------------------------------------------#


In [10]:

classifier = RandomForestClassifier(n_estimators = 120, criterion = 'entropy', random_state = 42, max_features = 'auto', max_depth=None, min_samples_split=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test) # Predicting the Test set results

print('Accuracy:')
print(accuracy_score(y_test, y_pred)) 
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test,y_pred, target_names=definition_list_MCI)) 


#theft over is pulling down results. Pretty good on Assault (largest sample size) and break and enter 


#One Hot Encoded Model w/ SKLEARN:

classifier = RandomForestClassifier(n_estimators = 120, criterion = 'entropy', random_state = 42, max_features = 'auto', max_depth=None)
classifier.fit(X_train_OH, y_train_OH)
y_pred_OH = classifier.predict(X_test_OH) # Predicting the Test set results
print('After Encoding:')
print('Accuracy:')
print(accuracy_score(y_test_OH, y_pred_OH))
print('Confusion Matrix:')
print(confusion_matrix(y_test_OH, y_pred_OH)) 
print('Classification Report:')
print(classification_report(y_test_OH,y_pred_OH, target_names=definition_list_MCI)) #modest improve|ment

#Balanced Class Weight doesn't make a big difference for results:

# classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42, class_weight='balanced')
# classifier.fit(X_train, y_train)
# y_pred = classifier.predict(X_test) 
# print(accuracy_score(y_test, y_pred)) #accuracy at 0.63
#print(confusion_matrix(y_test, y_pred)) 

#--------------------------------------------------#

#gradientboost performs poorly relative to randomforest



Accuracy:
0.759386133187
Confusion Matrix:
[[ 395  446    7    3   59]
 [ 217 1987   71    2  130]
 [  26  275  163    2   48]
 [  19   69    2    4   12]
 [  92  249   23    4 2993]]
Classification Report:
                 precision    recall  f1-score   support

Break and Enter       0.53      0.43      0.48       910
        Assault       0.66      0.83      0.73      2407
        Robbery       0.61      0.32      0.42       514
     Theft Over       0.27      0.04      0.07       106
     Auto Theft       0.92      0.89      0.91      3361

    avg / total       0.75      0.76      0.75      7298

After Encoding:
Accuracy:
0.77870649493
Confusion Matrix:
[[ 434  424    7    1   44]
 [ 178 2034   57    1  137]
 [  26  276  157    2   53]
 [  19   69    1    3   14]
 [  54  234   16    2 3055]]
Classification Report:
                 precision    recall  f1-score   support

Break and Enter       0.61      0.48      0.54       910
        Assault       0.67      0.85      0.75      24

In [22]:
from  sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(2, ), activation='tanh', solver='sgd', learning_rate='constant', random_state=11)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test) # Predicting the Test set results

print(accuracy_score(y_test, y_pred))

clf.fit(X_train_OH, y_train_OH)
y_pred_OH = clf.predict(X_test_OH) # Predicting the Test set results

print(accuracy_score(y_test_OH, y_pred_OH)) 

0.459577966566
0.73979172376


In [52]:
clf = GradientBoostingClassifier(learning_rate=0.1,n_estimators = 10, random_state = 42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test) # Predicting the Test set results

print('Accuracy:')
print(accuracy_score(y_test, y_pred)) 
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test,y_pred, target_names=definition_list_MCI)) 

# clf.fit(X_train_OH, y_train_OH)
# y_pred_OH = clf.predict(X_test_OH) 
# print('After Encoding:')
# print('Accuracy:')
# print(accuracy_score(y_test_OH, y_pred_OH))
# print('Confusion Matrix:')
# print(confusion_matrix(y_test_OH, y_pred_OH)) 
# print('Classification Report:')
# print(classification_report(y_test_OH,y_pred_OH, target_names=definition_list_MCI))

# print(classification_report(y_test_OH,y_pred_OH, target_names=definition_list_MCI)) 
# print(classification_report(y_test,y_pred, target_names=definition_list_MCI)) 


Accuracy:
0.686489449164
Confusion Matrix:
[[ 121  691    0    0   98]
 [  82 2008    0    0  317]
 [   3  375    0    0  136]
 [   7   87    0    0   12]
 [ 106  374    0    0 2881]]
Classification Report:
                 precision    recall  f1-score   support

Break and Enter       0.38      0.13      0.20       910
        Assault       0.57      0.83      0.68      2407
        Robbery       0.00      0.00      0.00       514
     Theft Over       0.00      0.00      0.00       106
     Auto Theft       0.84      0.86      0.85      3361

    avg / total       0.62      0.69      0.64      7298



In [49]:
from sklearn import tree
X = [[0, 0], [1, 1]]
Y = [0, 1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test) # Predicting the Test set results

print('Accuracy:')
print(accuracy_score(y_test, y_pred)) 
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test,y_pred, target_names=definition_list_MCI)) 

clf.fit(X_train_OH, y_train_OH)
y_pred_OH = clf.predict(X_test_OH) # Predicting the Test set results

print('After Encoding:')
print('Accuracy:')
print(accuracy_score(y_test_OH, y_pred_OH))
print('Confusion Matrix:')
print(confusion_matrix(y_test_OH, y_pred_OH)) 
print('Classification Report:')
print(classification_report(y_test_OH,y_pred_OH, target_names=definition_list_MCI))


Accuracy:
0.703480405591
Confusion Matrix:
[[ 398  318   59   31  104]
 [ 342 1612  208   49  196]
 [  45  204  181   14   70]
 [  24   44   10   11   17]
 [  93  231   77   28 2932]]
Classification Report:
                 precision    recall  f1-score   support

Break and Enter       0.44      0.44      0.44       910
        Assault       0.67      0.67      0.67      2407
        Robbery       0.34      0.35      0.35       514
     Theft Over       0.08      0.10      0.09       106
     Auto Theft       0.88      0.87      0.88      3361

    avg / total       0.71      0.70      0.71      7298

After Encoding:
Accuracy:
0.724582077281
Confusion Matrix:
[[ 431  348   32   19   80]
 [ 321 1691  152   46  197]
 [  40  210  180   13   71]
 [  23   57    4    8   14]
 [  87  215   67   14 2978]]
Classification Report:
                 precision    recall  f1-score   support

Break and Enter       0.48      0.47      0.48       910
        Assault       0.67      0.70      0.69      2

In [45]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=100)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test) 
print(accuracy_score(y_test, y_pred))



0.54014798575


In [53]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test) # Predicting the Test set results

print('Accuracy:')
print(accuracy_score(y_test, y_pred)) 
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test,y_pred, target_names=definition_list_MCI)) 

clf.fit(X_train_OH, y_train_OH)
y_pred_OH = clf.predict(X_test_OH) # Predicting the Test set results

print('After Encoding:')
print('Accuracy:')
print(accuracy_score(y_test_OH, y_pred_OH))
print('Confusion Matrix:')
print(confusion_matrix(y_test_OH, y_pred_OH)) 
print('Classification Report:')
print(classification_report(y_test_OH,y_pred_OH, target_names=definition_list_MCI))

Accuracy:
0.502740476843
Confusion Matrix:
[[ 179  709    1    0   21]
 [ 356 1966    5    0   80]
 [  54  438    3    0   19]
 [  12   90    1    0    3]
 [ 344 1491    5    0 1521]]
Classification Report:
                 precision    recall  f1-score   support

Break and Enter       0.19      0.20      0.19       910
        Assault       0.42      0.82      0.55      2407
        Robbery       0.20      0.01      0.01       514
     Theft Over       0.00      0.00      0.00       106
     Auto Theft       0.93      0.45      0.61      3361

    avg / total       0.60      0.50      0.49      7298

After Encoding:
Accuracy:
0.748287201973
Confusion Matrix:
[[ 366  506    4    0   34]
 [ 193 2089   28    0   97]
 [  33  396   34    0   51]
 [  21   72    1    0   12]
 [  89  291    9    0 2972]]
Classification Report:
                 precision    recall  f1-score   support

Break and Enter       0.52      0.40      0.45       910
        Assault       0.62      0.87      0.73      2