# Atlanta Crime - Supervised Learning

## Description
In this part of the project, we will use supervised learning algorithms, namely, Decision Trees/Random Forests, SVM, and Logistic Regression algorithms to predict crime score for a given location and time.

## Import Packages

In [40]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.tree import plot_tree
from sklearn.tree.export import export_text
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing
import datetime
import random

## Load in Data

In [2]:
# data = pd.read_csv("data/examples/cobra_example_10.csv") #, sep=',', header=0, engine='python', usecols = lambda column : column in ["Neighborhood", "Month", "Day", "Shift"])
data = pd.read_csv("cobra-clean.csv", sep=',', header=0, usecols = lambda column : column in ["Occur Date", "Neighborhood", "Shift Occurence"])

def day_of_week(row):
    cur_date = row['Occur Date']
    year, month, day = (int(x) for x in cur_date.split('-'))
    dt = datetime.date(year, month, day)
    return(dt.weekday())

def month(row):
    cur_date = row['Occur Date']
    year, month, day = (int(x) for x in cur_date.split('-'))
    return month

data['Day of Week'] = data.apply(lambda row: day_of_week (row), axis=1)
data['Month'] = data.apply(lambda row: month (row), axis=1)
data = data.drop(['Occur Date'], 1)

# range(0 - 1000)
# two decimal places
data['Crime Score'] = data.apply(lambda row: random.randint(0 * 100, 1000 * 100), axis=1)

data.head()

Unnamed: 0,Neighborhood,Shift Occurence,Day of Week,Month,Crime Score
0,Greenbriar,Day,3,1,93912
1,Downtown,Day,3,1,20634
2,Lenox,Day,3,1,49766
3,Greenbriar,Evening,3,1,90549
4,Edgewood,Evening,3,1,69844


## Decision Tree Model

In [3]:
le = preprocessing.LabelEncoder()
le.fit(data["Neighborhood"])
data["Neighborhood"] = le.transform(data["Neighborhood"])
le.fit(data["Shift Occurence"])
data["Shift Occurence"] = le.transform(data["Shift Occurence"])
data.head()

Unnamed: 0,Neighborhood,Shift Occurence,Day of Week,Month,Crime Score
0,86,0,3,1,93912
1,64,0,3,1,20634
2,111,0,3,1,49766
3,86,1,3,1,90549
4,70,1,3,1,69844


In [69]:
# Reading data & split into training and testing sets
X = data.drop(['Crime Score'], 1)
# print(X)
y = data['Crime Score']
# print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)
# Geeksforgeeks: "random_state" variable is a pseudo-random number generator state used for random sampling

# X_train = X[:int(len(X) * 0.7)]
# X_test = X[int(len(X) * 0.7):]
# y_train = y[:int(len(y) * 0.7)]
# y_test = y[int(len(y) * 0.7):]

In [76]:
# Model Definition & Fitting
clf = DecisionTreeClassifier()
clf = clf.fit(X_train[:int(len(X_train) * 0.25)], y_train[:int(len(y_train) * 0.25)])

# Predict based on the given features for the test set
y_pred = clf.predict(X_test[:int(len(X_test) * 0.25)])
predictions_probability = clf.predict_proba(X_test[:int(len(X_test) * 0.25)])

In [71]:
X_test.head()

Unnamed: 0,Neighborhood,Shift Occurence,Day of Week,Month
164059,67,2,5,12
106352,64,0,0,2
236678,149,0,0,7
214246,133,1,2,9
65066,151,0,1,11


In [73]:
# Accuracy within range 5000 (50 crime score point difference)
count = 0
i = 0
for keys in y_test:
    if i >= int(len(y_test) * 0.25):
        break
    if (keys - y_pred[i] >= 0 and keys - y_pred[i] <= 5000) or (keys - y_pred[i] <= 0 and keys - y_pred[i] >= -5000):
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test) * 0.25))

Accuracy: 0.09376949817274267


## Decision Tree Metrics & Plots

In [77]:
# Classification Metrics
# print("Accuracy:",metrics.accuracy_score(y_test[:int(len(y_test) * 0.25)], y_pred))
# print("Precision:",metrics.precision_score(y_test[:int(len(y_test) * 0.25)], y_pred))
# print("Recall:",metrics.recall_score(y_test[:int(len(y_test) * 0.25)], y_pred))

# # Compare to the actual labels
# '''Insert comparison methods here'''

# Plot the tree
plot_tree(clf)

# # Export a text file containing the rules for the decision tree
# '''decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) # 
# decision_tree = decision_tree.fit(X, Y)
# r = export_text(decision_tree, feature_names=iris['feature_names'])
# print(r)'''

[Text(314.24801486606583, 366.35789473684207, 'X[0] <= 198.5\ngini = 1.0\nsamples = 52356\nvalue = [1, 1, 1 ... 1, 1, 1]'),
 Text(134.04418841453935, 359.8736842105263, 'X[0] <= 0.5\ngini = 1.0\nsamples = 52236\nvalue = [1, 1, 1 ... 1, 1, 1]'),
 Text(2.6544592773047353, 353.3894736842105, 'X[3] <= 6.5\ngini = 0.997\nsamples = 323\nvalue = [0, 0, 0 ... 0, 0, 0]'),
 Text(1.41776597877343, 346.9052631578947, 'X[3] <= 5.5\ngini = 0.993\nsamples = 148\nvalue = [0, 0, 0 ... 0, 0, 0]'),
 Text(0.7125757515013575, 340.4210526315789, 'X[1] <= 0.5\ngini = 0.991\nsamples = 115\nvalue = [0, 0, 0 ... 0, 0, 0]'),
 Text(0.21596291539426418, 333.9368421052631, 'X[3] <= 1.5\ngini = 0.979\nsamples = 48\nvalue = [0, 0, 0 ... 0, 0, 0]'),
 Text(0.06519635181713636, 327.4526315789473, 'X[2] <= 0.5\ngini = 0.857\nsamples = 7\nvalue = [0, 0, 0 ... 0, 0, 0]'),
 Text(0.03259817590856818, 320.9684210526315, 'gini = 0.0\nsamples = 1\nvalue = [0, 0, 0 ... 0, 0, 0]'),
 Text(0.09779452772570454, 320.9684210526315, 'X