# Atlanta Crime - Supervised Learning

## Description
In this part of the project, we will use supervised learning algorithms, namely, Decision Trees/Random Forests, SVM, and Logistic Regression algorithms to predict crime score for a given location and time.

## Import Packages

In [44]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.tree import plot_tree
from sklearn.tree.export import export_text
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import datetime
import random
import numpy as np

## Load in Data

In [9]:
# data = pd.read_csv("data/examples/cobra_example_10.csv") #, sep=',', header=0, engine='python', usecols = lambda column : column in ["Neighborhood", "Month", "Day", "Shift"])
training_data = pd.read_csv("cobra-clean.csv", sep=',', header=0, usecols = lambda column : column in ["Occur Date", "Neighborhood", "Shift Occurence", "Shift", "UCR #"])

def day_of_week(row):
    cur_date = row['Occur Date']
    year, month, day = (int(x) for x in cur_date.split('-'))
    dt = datetime.date(year, month, day)
    return(dt.weekday())

def month(row):
    cur_date = row['Occur Date']
    year, month, day = (int(x) for x in cur_date.split('-'))
    return month

training_data['Day of Week'] = training_data.apply(lambda row: day_of_week (row), axis=1)
training_data['Month'] = training_data.apply(lambda row: month (row), axis=1)
training_data = training_data.drop(['Occur Date'], 1)

# range(0 - 1000)
# two decimal places
# training_data['Crime Score'] = training_data.apply(lambda row: random.randint(0 * 100, 1000 * 100), axis=1)

training_data.head()

Unnamed: 0,UCR #,Neighborhood,Shift Occurence,Day of Week,Month
0,630,Greenbriar,Day,3,1
1,630,Downtown,Day,3,1
2,630,Lenox,Day,3,1
3,630,Greenbriar,Evening,3,1
4,630,Edgewood,Evening,3,1


In [10]:
"""
# data = pd.read_csv("data/examples/cobra_example_10.csv") #, sep=',', header=0, engine='python', usecols = lambda column : column in ["Neighborhood", "Month", "Day", "Shift"])
test_data = pd.read_csv("asdf.csv", sep=',', header=0, usecols = lambda column : column in ["Occur Date", "Neighborhood", "Shift Occurence"])

def day_of_week(row):
    cur_date = row['Occur Date']
    year, month, day = (int(x) for x in cur_date.split('-'))
    dt = datetime.date(year, month, day)
    return(dt.weekday())

def month(row):
    cur_date = row['Occur Date']
    year, month, day = (int(x) for x in cur_date.split('-'))
    return month

test_data['Day of Week'] = test_data.apply(lambda row: day_of_week (row), axis=1)
test_data['Month'] = test_data.apply(lambda row: month (row), axis=1)
test_data = test_data.drop(['Occur Date'], 1)

# range(0 - 1000)
# two decimal places
test_data['Crime Score'] = test_data.apply(lambda row: random.randint(0 * 100, 1000 * 100), axis=1)
"""

test_data = training_data[int(0.10 * len(training_data)):]
training_data = training_data[:int(0.10 * len(training_data))]

test_data.head()

Unnamed: 0,UCR #,Neighborhood,Shift Occurence,Day of Week,Month
29918,640,Blandtown,Evening,3,10
29919,640,Downtown,Day,3,10
29920,640,Virginia Highland,Evening,3,10
29921,640,Lenox,Evening,3,10
29922,640,Buckhead Village,Evening,3,10


## Decision Tree Model

### Transforming the data

In [11]:
le = preprocessing.LabelEncoder()
le.fit(training_data["Neighborhood"])
training_data["Neighborhood"] = le.transform(training_data["Neighborhood"])
le.fit(training_data["Shift Occurence"])
training_data["Shift Occurence"] = le.transform(training_data["Shift Occurence"])

training_data.head()

Unnamed: 0,UCR #,Neighborhood,Shift Occurence,Day of Week,Month
0,630,84,0,3,1
1,630,63,0,3,1
2,630,108,0,3,1
3,630,84,1,3,1
4,630,69,1,3,1


In [12]:
le = preprocessing.LabelEncoder()
le.fit(test_data["Neighborhood"])
test_data["Neighborhood"] = le.transform(test_data["Neighborhood"])
le.fit(test_data["Shift Occurence"])
test_data["Shift Occurence"] = le.transform(test_data["Shift Occurence"])

test_data.head()

Unnamed: 0,UCR #,Neighborhood,Shift Occurence,Day of Week,Month
29918,640,24,1,3,10
29919,640,64,0,3,10
29920,640,182,1,3,10
29921,640,111,1,3,10
29922,640,36,1,3,10


In [16]:
# Reading data & split into training and testing sets
# X = data.drop(['Crime Score'], 1)
# print(X)
# y = data['Crime Score']
# print(y)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)
# Geeksforgeeks: "random_state" variable is a pseudo-random number generator state used for random sampling

# X_train = X[:int(len(X) * 0.7)]
# X_test = X[int(len(X) * 0.7):]
# y_train = y[:int(len(y) * 0.7)]
# y_test = y[int(len(y) * 0.7):]

X_train = training_data.drop(['UCR #'], 1)
y_train = training_data['UCR #']
X_test = test_data.drop(['UCR #'], 1)
y_test = test_data['UCR #']

In [21]:
# Model Definition & Fitting
dt_clf = DecisionTreeClassifier()
dt_clf = dt_clf.fit(X_train[:int(len(X_train))], y_train[:int(len(y_train))])

# Predict based on the given features for the test set
dt_y_pred = dt_clf.predict(X_test[:int(len(X_test))])
dt_predictions_probability = dt_clf.predict_proba(X_test[:int(len(X_test))])

### Decision Tree Metrics & Plots

In [46]:
# Accuracy within range
count = 0
i = 0
for keys in y_test:
    if np.absolute(keys - dt_y_pred[i]) <= 50:
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test)))

Accuracy: 0.34992683705832983


In [49]:
# Classification Metrics
print("Accuracy:",metrics.accuracy_score(y_test[:int(len(y_test))], dt_y_pred))
# print("Precision:",metrics.precision_score(y_test[:int(len(y_test) * 0.25)], dt_y_pred))
# print("Recall:",metrics.recall_score(y_test[:int(len(y_test) * 0.25)], dt_y_pred))

# # Compare to the actual labels
# '''Insert comparison methods here'''

# Plot the tree
# plot_tree(dt_clf)

# # Export a text file containing the rules for the decision tree
# '''decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) # 
# decision_tree = decision_tree.fit(X, Y)
# r = export_text(decision_tree, feature_names=iris['feature_names'])
# print(r)'''

Accuracy: 0.12738150945918844


## Random Forest Model

In [33]:
# Source: https://www.datacamp.com/community/tutorials/random-forests-classifier-python

# Keeps on prompting a restart of the kernel

# Create a Gaussian Classifier
rf_clf = RandomForestClassifier(n_estimators=100)

# Train the model using the training sets y_pred=clf.predict(X_test)
rf_clf = rf_clf.fit(X_train[:int(len(X_train))], y_train[:int(len(y_train))])

rf_y_pred = rf_clf.predict(X_test[:int(len(X_test))])
rf_predictions_probability = rf_clf.predict_proba(X_test[:int(len(X_test))])

In [51]:
# Accuracy within range
count = 0
i = 0
for keys in y_test:
    if np.absolute(keys - dt_y_pred[i]) <= 50:
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test)))

Accuracy: 0.34992683705832983


In [50]:
# Classification Metrics
print("Accuracy:",metrics.accuracy_score(y_test[:int(len(y_test))], rf_y_pred))

Accuracy: 0.14462865164783742


## NBC Model

In [55]:
gnb = GaussianNB()
gnb = gnb.fit(X_train[:int(len(X_train))], y_train[:int(len(y_train))])

nbc_y_pred = gnb.predict(X_test[:int(len(X_test))])
nbc_predictions_probability = gnb.predict_proba(X_test[:int(len(X_test))])

In [56]:
# Accuracy within range
count = 0
i = 0
for keys in y_test:
    if np.absolute(keys - nbc_y_pred[i]) <= 50:
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test)))

Accuracy: 0.42022639659513783


In [None]:
# Classification Metrics
print("Accuracy:",metrics.accuracy_score(y_test[:int(len(y_test))], nbc_y_pred))

## SVM Model

In [57]:
svm_clf = SVC(gamma='auto')
svm_clf = svm_clf.fit(X_train[:int(len(X_train))], y_train[:int(len(y_train))])

svm_y_pred = svm_clf.predict(X_test[:int(len(X_test))])
svm_predictions_probability = svm_clf.predict_proba(X_test[:int(len(X_test))])

AttributeError: predict_proba is not available when  probability=False

In [None]:
# Accuracy within range
count = 0
i = 0
for keys in y_test:
    if np.absolute(keys - svm_y_pred[i]) <= 50:
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test)))

In [None]:
# Classification Metrics
print("Accuracy:",metrics.accuracy_score(y_test[:int(len(y_test))], svm_y_pred))