# Implementing 4 Standard Machine Learning Algorithms

## Imports

In [79]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

## Data Cleaning and Processing

In [11]:
crimes = pd.read_csv("data/Crimes.csv").drop(["Unnamed: 0", "ID", "Case Number", "Description", "Block",
                                              "Location Description", "X Coordinate", "Y Coordinate", "IUCR",
                                              "Year", "Updated On", "Location", "District", "FBI Code"], axis = 1)

In [20]:
crimes["Date"] = crimes["Date"].apply(lambda x : datetime.strptime(x, "%m/%d/%Y %I:%M:%S %p"))

In [31]:
crimes["hour"] = crimes["Date"].apply(lambda x : x.hour)

crimes["Time_Day"] = crimes["hour"] \
    .mask((crimes["hour"] < 4), "12am-4am") \
    .mask((crimes["hour"] <= 8) & (crimes["hour"] > 4), "4am-8am") \
    .mask((crimes["hour"] <= 12) & (crimes["hour"] > 8), "8am-12pm") \
    .mask((crimes["hour"] <= 16) & (crimes["hour"] > 12), "12pm-4pm") \
    .mask((crimes["hour"] <= 20) & (crimes["hour"] > 16), "4pm-8pm") \
    .mask((crimes["hour"] <= 24) & (crimes["hour"] > 20), "8pm-12am")

In [39]:
crimes["Community Area"] = crimes["Community Area"].apply(lambda x : str(x))
crimes["Beat"] = crimes["Beat"].apply(lambda x : str(x))
crimes["Ward"] = crimes["Ward"].apply(lambda x : str(int(x)))
crimes.drop("hour", axis=1, inplace=True)

In [63]:
violent_crimes = ["HOMICIDE", "BATTERY", "CRIM SEXUAL ASSAULT", "CRIMINAL SEXUAL ASSAULT",
                  "ASSAULT", "ROBBERY", "HUMAN TRAFFICKING"]

crimes["is_violent"] = crimes["Primary Type"].apply(lambda x : True if x in violent_crimes else False)

In [82]:
final_data = pd.get_dummies(crimes.drop(["Date", "Primary Type", "Arrest"], axis = 1))
X = final_data.drop("is_violent", axis=1)
y = final_data["is_violent"]

In [83]:
imputer = SimpleImputer(strategy="mean")
X = imputer.fit_transform(X)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Random Forest

In [86]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.77      0.82      0.80     44612
        True       0.56      0.48      0.51     20851

    accuracy                           0.71     65463
   macro avg       0.66      0.65      0.65     65463
weighted avg       0.70      0.71      0.71     65463



## KNN

In [85]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
preds = knn.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.78      0.85      0.81     44612
        True       0.59      0.48      0.53     20851

    accuracy                           0.73     65463
   macro avg       0.68      0.66      0.67     65463
weighted avg       0.72      0.73      0.72     65463



## Naive Bayes

In [87]:
nb = GaussianNB()
nb.fit(X_train, y_train)
preds = nb.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.50      0.00      0.00     44612
        True       0.32      1.00      0.48     20851

    accuracy                           0.32     65463
   macro avg       0.41      0.50      0.24     65463
weighted avg       0.44      0.32      0.15     65463



## Support Vector Machine

In [88]:
svm = SVC()
svm.fit(X_train, y_train)
preds = svm.predict(X_test)
print(classification_report(y_test, preds))

## Summary

As we can see from these Model results, just using untuned models, the K Nearest Neighbors performs best when trying to assess whether or not the crime was considered a violent crime. The benefit of using a model like a Random Forest, is that you get automatic feature importances so, although the Random Forest did not perform as well, I would use the Random Forest over the KNN because the model tells us more about the data.