Project Prep

In [0]:
#IMPORT FILES FROM DRIVE INTO GOOGLE-COLAB:

#STEP-1: Import Libraries

# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

#STEP-2: Autheticate E-Mail ID

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

#STEP-3: Get File from Drive using file-ID

#2.1 Get the file
downloaded = drive.CreateFile({'id':'1dDC9SZCUCF4VWBcv-zqdwPU5Gn8yqTF7'}) # replace the id with id of file you want to access
downloaded.GetContentFile('Police_Incidents_2018.csv') 



In [0]:
import numpy as np
import pandas as pd
from numpy import array
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import statsmodels.api as sm

#load dataset

df = pd.read_csv("Police_Incidents_2018.csv")


#Create "Hours", "BeginDate" and "dayofyear" parameters
df["Hours"] = df.Time.str.split(':').str[0].astype(int)  
df['BeginDate'] = pd.to_datetime(df['BeginDate']) 
df['dayofyear'] = df['BeginDate'].dt.dayofyear 


#Create labels called "Night", "Morning", "Afternoon", and "Evening"
df.assign(session=pd.cut(df.Hours,[0,6,12,18,24],labels=['Night','Morning','Afternoon','Evening']))  
prods = pd.DataFrame({'Time':range(1, 25)}) 
b = [0,4,8,12,16,20,24] 
l = ['Late Night', 'Early Morning','Morning','Noon','Eve','Night'] 
df['Timeofday'] = pd.cut(df['Hours'], bins=b, labels=l) 


bins = [0, 91, 183, 275, 366] 
labels=['Winter', 'Spring', 'Summer', 'Fall'] 

df['SEASON'] = pd.cut(df['dayofyear'] + 11 - 366*(df['dayofyear'] > 355), bins=bins, labels=labels)


#16- Neighborhood
#23- Time of Day
#22 Season
#3-Precinct,
#11-GBSID,
#12-Lat
#13-Long
X = df.iloc[:,[16,23,22,3,11,12,13]].values

#9-UCR Code our Class Label
y = df.iloc[:,9].values

# SimpleImputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(X[:,:])
X[:,:]=imputer.transform(X[:,:])
print("SimpleImputer-Complete")


#Neighbourhood in one hot encoding 
data = X[:,0]
values = array(data)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False,categorical_features=[0])
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
#concatenate OHE to the features set X
X = np.concatenate((X,onehot_encoder.fit_transform(integer_encoded)),axis=1)
#drop the Neighborhood column
X = X[:,1:]
print("one hot encoding on Neighborhood-Complete")


# one hot encoding on TimeOfDay

data = X[:,0]
values = array(data)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False,categorical_features=[0])
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
#concatenate OHE to the features set X
X = np.concatenate((X,onehot_encoder.fit_transform(integer_encoded)),axis=1)
#drop the Timeofday column
X = X[:,1:]
print("one hot encoding on Time-Complete")


# one hot encoding on Season #Column index =0 #Neighbourhood in one hot encoding 
data = X[:,0] 
values = array(data) # integer encode 
label_encoder = LabelEncoder() 
integer_encoded = label_encoder.fit_transform(values) # binary encode 
onehot_encoder = OneHotEncoder(sparse=False,categorical_features=[0]) 
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) #concatenate OHE to the features set X 
X = np.concatenate((X,onehot_encoder.fit_transform(integer_encoded)),axis=1) #drop the Season column 
X = X[:,1:] 
print("one hot encoding on Season") 


# Splitting 
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y, test_size=0.3)


# Normalization
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)


#Applying PCA
from sklearn.decomposition import PCA
pcaObj = PCA(n_components=4)
X_train = pcaObj.fit_transform(X_train)
X_test = pcaObj.transform(X_test)
components_variance = pcaObj.explained_variance_ratio_

print(components_variance)


SimpleImputer-Complete
one hot encoding on Neighborhood-Complete
one hot encoding on Time-Complete
one hot encoding on Season


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


[0.02301507 0.02004342 0.01441111 0.01313143]


In [0]:
#this is SVM

#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj 

from sklearn.svm import SVC
classifierObj = SVC()


grid_param = { 
 
     'kernel': ['linear'],#'linear', 'poly', 'sigmoid'], 
      'C': [10],
    #'degree': [3,4,5],
    'gamma': [1],
    #'shrinking': [True, False],
    #'probability': [False, True],
    #'tol': [0.001],
    #'verbose': [False, True],
    #'max_iter': [-1],
    #'decision_function_shape':['ovr','ovo']
    
}
from sklearn.model_selection import GridSearchCV
gd_sr = GridSearchCV(estimator=classifierObj, param_grid=grid_param, scoring='accuracy', cv=5, n_jobs=-1) 
gd_sr.fit(X_train, y_train) 
print("Fitting SVM")
print(gd_sr.best_params_)  
print('The Accuracy Score is:', gd_sr.best_score_)



Fitting SVM
{'C': 10, 'gamma': 1, 'kernel': 'linear'}
The Accuracy Score is: 0.5628765792031099


In [0]:
#Fitting Naive Bayes
# GridSearch cannot be performed on GaussianNaiveBayes
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
gnb = GaussianNB()
print("naive bayes start")

GNB_trainedModel = gnb.fit(X_train, y_train)
print("naive bayes trained!")
gnb_pred = GNB_trainedModel.predict(X_test)
GNB_score = metrics.accuracy_score(y_test, gnb_pred)
print("naive bayes predicted!")
print("complete")
print('The Accuracy score is:', GNB_score)



naive bayes start
naive bayes trained!
naive bayes predicted!
complete
The Accuracy score is: 0.5569160997732426


In [0]:

#Fitting decision tree

from sklearn.tree import DecisionTreeClassifier
classifierObj = DecisionTreeClassifier()


grid_param = { 
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2,3],
    'min_samples_leaf': [1,2],
    'min_weight_fraction_leaf': [0],
    'max_features': ['auto'],
    'min_impurity_decrease': [0],
    #'min_impurity_split': [1e-7],deprecated 
    'class_weight': ['balanced'],
    'presort':[False]
}
from sklearn.model_selection import GridSearchCV
gd_sr = GridSearchCV(estimator=classifierObj, param_grid=grid_param, scoring='accuracy', cv=5, n_jobs=-1) 
gd_sr.fit(X_train, y_train) 
print("Fitting decision tree")
print(gd_sr.best_params_)  
print('The Accuracy Score is:', gd_sr.best_score_)

Fitting decision tree
{'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'auto', 'min_impurity_decrease': 0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0, 'presort': False}
The Accuracy Score is: 0.40641399416909624


In [0]:
from sklearn.ensemble import RandomForestClassifier
classifierObj = RandomForestClassifier()


grid_param = { 
    'n_estimators': [300]
   #  'criterion': ['gini'],
#     'max_depth': ['None'],
#     'min_samples_split': [2],
#     'min_samples_leaf': [1],
#     'min_weight_fraction_leaf': [0],
#     'max_features': ['auto'],
#     'max_leaf_nodes': ['None'],
#     'min_impurity_decrease': [0],
#     'min_impurity_split': [1e-7],
#     'bootstrap':[True],
#     'oob_score': [False],
#     'n_jobs':['None'],
#     #'random_state': ['None'],
#     'verbose':[0],
#     'warm_start': [False]
#     #'class_weight': ['None']
}
from sklearn.model_selection import GridSearchCV
gd_sr = GridSearchCV(estimator=classifierObj, param_grid=grid_param, scoring='accuracy', cv=6, n_jobs=-1) 
gd_sr.fit(X_train, y_train) 

print(gd_sr.best_params_)  
print('The Accuracy Score is:', gd_sr.best_score_)



{'n_estimators': 300}
The Accuracy Score is: 0.49523809523809526


In [0]:

# ---------- KNN
from sklearn.neighbors import KNeighborsClassifier
grid_param = {
    'n_neighbors':[43],
    'weights':['uniform', 'distance'],
    'metric': ['euclidean','manhattan','minkowski']

}
from sklearn.model_selection import GridSearchCV
gd_sr = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=grid_param, scoring='accuracy', cv=5, n_jobs=-1) 
gd_sr.fit(X_train, y_train) 
print("Fitting KNN")
print(gd_sr.best_params_)  
print(gd_sr.best_score_)


Fitting KNN
{'metric': 'manhattan', 'n_neighbors': 43, 'weights': 'uniform'}
0.5615160349854227


In [0]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
#2 classifierObjKNN_168 with 168 neighbors for K-NN
classifierObjKNN = KNeighborsClassifier(n_neighbors=43, p=2, metric='manhattan')

#3 classifierObjSVC_lr with a linear kernel for SVM
classifierObjSVC = SVC(kernel = 'linear')

#4 classifierObjNB for Naive Bayes
classifierObjGNB = GaussianNB()
#5 classifierObjRF_57 with 57 estimators for random forest
classifierObjRF = RandomForestClassifier(n_estimators=57, criterion='entropy')

classifierObjVC = VotingClassifier(estimators=[('kNN', classifierObjKNN),
                                               ('SVM', classifierObjSVC),
                                               ('NB', classifierObjGNB),
                                               ('RF', classifierObjRF)], voting='hard')
# fit the model
classifierObjVC.fit(X_train,y_train)

#Making predictions on the Test Set
y_pred_VC = classifierObjVC.predict(X_test)

#Test Error 
Test_Error = 1 - metrics.accuracy_score(y_test, y_pred_VC)

print('Accuracy using kNN is:' , cross_val_score(classifierObjKNN,X_train,y_train,scoring='accuracy',cv=6).mean())
print('Accuracy using SVC is:' , cross_val_score(classifierObjSVC,X_train,y_train,scoring='accuracy',cv=6).mean())
print('Accuracy using GNB is:' , cross_val_score(classifierObjGNB,X_train,y_train,scoring='accuracy',cv=6).mean())
print('Accuracy using Random Forest is:' , cross_val_score(classifierObjRF,X_train,y_train,scoring='accuracy',cv=6).mean())

print('The Accuracy Score from Ensemble is :', metrics.accuracy_score(y_test, y_pred_VC))



Accuracy using kNN is: 0.5587936155160306




Accuracy using SVC is: 0.5628792400317572
Accuracy using GNB is: 0.55472202524493




Accuracy using Random Forest is: 0.4925341733375925
The Accuracy Score from Ensemble is : 0.5578231292517006


In [0]:
#Creating a pipeline 
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import make_pipeline 
pipe_lr = make_pipeline(StandardScaler(), PCA(n_components=2), KNeighborsClassifier(n_neighbors=43, p=2, metric='minkowski'))  
pipe_lr.fit(X_train, y_train) 
y_pred = pipe_lr.predict(X_test) 
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))

Test Accuracy: 0.559
