# Atlanta Crime - Supervised Learning

## Description
In this part of the project, we will use supervised learning algorithms, namely, Decision Trees/Random Forests, SVM, and Logistic Regression algorithms to predict crime score for a given location and time.

## Import Packages

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.tree import plot_tree
from sklearn.tree.export import export_text
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import datetime
import random
import numpy as np

## Load in Data

In [2]:
# data = pd.read_csv("data/examples/cobra_example_10.csv") #, sep=',', header=0, engine='python', usecols = lambda column : column in ["Neighborhood", "Month", "Day", "Shift"])
data = pd.read_csv("cobra-counts.csv") #, sep=',', header=0, usecols = lambda column : column in ["Occur Date", "Neighborhood", "Shift Occurence"])

# def day_of_week(row):
#     cur_date = row['Occur Date']
#     year, month, day = (int(x) for x in cur_date.split('-'))
#     dt = datetime.date(year, month, day)
#     return(dt.weekday())

# def month(row):
#     cur_date = row['Occur Date']
#     year, month, day = (int(x) for x in cur_date.split('-'))
#     return month

# data['Day of Week'] = data.apply(lambda row: day_of_week (row), axis=1)
# data['Month'] = data.apply(lambda row: month (row), axis=1)
# data = data.drop(['Occur Date'], 1)

# range(0 - 1000)
# two decimal places
#data['Crime Score'] = data.apply(lambda row: random.randint(0 * 100, 1000 * 100), axis=1)
data['Crime Score'] = 1000*data['Category 1'] + 100*data['Category 2'] + 10*data['Category 3'] + 1*data['Category 4']
data.head()

Unnamed: 0,Neighborhood,Month,Day,Shift,Category 1,Category 2,Category 3,Category 4,Crime Score
0,Adair Park,1,0,Morning,0,0,1,1,11
1,Adair Park,1,0,Day,0,0,3,4,34
2,Adair Park,1,0,Evening,0,1,2,4,124
3,Adair Park,1,1,Morning,0,0,2,1,21
4,Adair Park,1,1,Day,0,4,3,2,432


## Decision Tree Model

### Transforming the data

In [3]:
le = preprocessing.LabelEncoder()
le.fit(data["Neighborhood"])
data["Neighborhood"] = le.transform(data["Neighborhood"])
le.fit(data["Shift"])
data["Shift"] = le.transform(data["Shift"])

data.head()

Unnamed: 0,Neighborhood,Month,Day,Shift,Category 1,Category 2,Category 3,Category 4,Crime Score
0,0,1,0,2,0,0,1,1,11
1,0,1,0,0,0,0,3,4,34
2,0,1,0,1,0,1,2,4,124
3,0,1,1,2,0,0,2,1,21
4,0,1,1,0,0,4,3,2,432


In [4]:
# Reading data & split into training and testing sets
X = data.drop(['Crime Score','Category 1','Category 2','Category 3','Category 4'], 1)
# print(X)
y = data['Crime Score']
# print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)
# Geeksforgeeks: "random_state" variable is a pseudo-random number generator state used for random sampling

# X_train = X[:int(len(X) * 0.7)]
# X_test = X[int(len(X) * 0.7):]
# y_train = y[:int(len(y) * 0.7)]
# y_test = y[int(len(y) * 0.7):]

In [5]:
# Model Definition & Fitting
clf = DecisionTreeClassifier()
clf = clf.fit(X_train[:int(len(X_train) * 0.25)], y_train[:int(len(y_train) * 0.25)])

# Predict based on the given features for the test set
y_pred = clf.predict(X_test[:int(len(X_test) * 0.25)])
predictions_probability = clf.predict_proba(X_test[:int(len(X_test) * 0.25)])

In [6]:
X_test.head()

Unnamed: 0,Neighborhood,Month,Day,Shift
19036,75,7,3,0
45882,182,1,6,2
2032,8,1,5,0
11314,44,11,5,0
31612,125,6,2,0


### Decision Tree Metrics & Plots

In [7]:
# Accuracy within range 5000 (50 crime score point difference)
count = 0
i = 0
for keys in y_test:
    if i >= int(len(y_test) * 0.25):
        break
    if (keys - y_pred[i] >= 0 and keys - y_pred[i] <= 5000) or (keys - y_pred[i] <= 0 and keys - y_pred[i] >= -5000):
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test) * 0.25))

Accuracy: 1.0


In [8]:
# Classification Metrics
# print("Accuracy:",metrics.accuracy_score(y_test[:int(len(y_test) * 0.25)], y_pred))
# print("Precision:",metrics.precision_score(y_test[:int(len(y_test) * 0.25)], y_pred))
# print("Recall:",metrics.recall_score(y_test[:int(len(y_test) * 0.25)], y_pred))

# # Compare to the actual labels
# '''Insert comparison methods here'''

# Plot the tree
plot_tree(clf)

# # Export a text file containing the rules for the decision tree
# '''decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) # 
# decision_tree = decision_tree.fit(X, Y)
# r = export_text(decision_tree, feature_names=iris['feature_names'])
# print(r)'''

[Text(361.1457158599588, 363.43999999999994, 'X[3] <= 1.5\nentropy = 0.903\nsamples = 8820\nvalue = [2555, 621, 215, 98, 51, 26, 20, 9, 7, 535, 289\n142, 71, 50, 26, 24, 10, 13, 11, 152, 134, 74\n52, 35, 15, 13, 12, 4, 6, 44, 65, 43, 35, 21\n14, 7, 10, 5, 5, 22, 23, 23, 20, 9, 9, 8, 9\n3, 5, 12, 14, 18, 15, 10, 7, 6, 2, 5, 4, 7\n6, 7, 3, 5, 2, 3, 1, 1, 4, 5, 3, 5, 5, 2\n4, 2, 2, 2, 1, 2, 1, 5, 4, 3, 1, 1, 4, 2\n2, 2, 1, 1, 1, 156, 85, 44, 23, 12, 6, 6, 3\n3, 2, 110, 80, 47, 23, 21, 15, 10, 6, 6, 2, 44\n56, 43, 35, 20, 13, 9, 6, 6, 8, 28, 37, 28, 27\n21, 3, 8, 2, 5, 4, 5, 19, 27, 12, 14, 17, 6\n7, 3, 5, 8, 10, 10, 15, 15, 5, 5, 6, 2, 2, 4\n4, 8, 9, 4, 4, 1, 5, 4, 5, 2, 3, 11, 6, 7\n5, 6, 3, 3, 6, 5, 2, 4, 2, 4, 1, 2, 1, 2\n1, 2, 3, 3, 1, 4, 2, 2, 3, 1, 19, 12, 9, 6\n1, 4, 1, 1, 30, 23, 15, 8, 11, 9, 3, 2, 20\n16, 17, 14, 12, 11, 5, 4, 13, 21, 10, 22, 12\n8, 6, 6, 1, 4, 8, 17, 16, 10, 7, 7, 3, 4, 2\n1, 5, 6, 11, 8, 9, 7, 6, 3, 2, 1, 1, 1, 9\n6, 4, 5, 5, 4, 3, 2, 4, 4, 3, 3, 2, 4, 4\n2, 2

## Random Forest Model

In [9]:
# Source: https://www.datacamp.com/community/tutorials/random-forests-classifier-python

# Keeps on prompting a restart of the kernel

# Create a Gaussian Classifier
rf_clf = RandomForestClassifier(n_estimators=50)

# Train the model using the training sets y_pred=clf.predict(X_test)
rf_clf = rf_clf.fit(X_train[:int(len(X_train) * 0.1)], y_train[:int(len(y_train) * 0.1)])

rf_y_pred = rf_clf.predict(X_test[:int(len(X_test) * 0.1)])
rf_predictions_probability = rf_clf.predict_proba(X_test[:int(len(X_test) * 0.1)])

In [10]:
# Accuracy within range 5000 (50 crime score point difference)
count = 0
i = 0
for keys in y_test:
    if i >= int(len(y_test) * 0.1):
        break
    if (keys - rf_y_pred[i] >= 0 and keys - rf_y_pred[i] <= 5000):
        count += 1
    elif (keys - rf_y_pred[i] <= 0 and keys - rf_y_pred[i] >= -5000):
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test) * 0.25))
# print("Precision:", metrics.precision_score(y_test[:int(len(y_test) * 0.25)], rf_y_pred))
# print([keys for keys in y_test[:int(len(y_test) * 0.25)]])
# print(rf_y_pred)
# print("Precision:", metrics.precision_score([keys for keys in y_test[:int(len(y_test) * 0.25)]], rf_y_pred))

Accuracy: 0.4


## NBC Model

In [11]:
gnb = GaussianNB()
gnb = gnb.fit(X_train[:int(len(X_train) * 0.25)], y_train[:int(len(y_train) * 0.25)])

nbc_y_pred = gnb.predict(X_test[:int(len(X_test) * 0.25)])
nbc_predictions_probability = gnb.predict_proba(X_test[:int(len(X_test) * 0.25)])

In [12]:
# Accuracy within range 5000 (50 crime score point difference)
count = 0
i = 0
for keys in y_test:
    if i >= int(len(y_test) * 0.25):
        break
    if (keys - nbc_y_pred[i] >= 0 and keys - nbc_y_pred[i] <= 5000):
        count += 1
    elif (keys - nbc_y_pred[i] <= 0 and keys - nbc_y_pred[i] >= -5000):
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test) * 0.25))

Accuracy: 1.0


## SVM Model

In [13]:
svm_clf = SVC(gamma='auto')
svm_clf = svm_clf.fit(X_train[:int(len(X_train) * 0.25)], y_train[:int(len(y_train) * 0.25)])

svm_y_pred = svm_clf.predict(X_test[:int(len(X_test) * 0.25)])
svm_predictions_probability = svm_clf.predict_proba(X_test[:int(len(X_test) * 0.25)])

AttributeError: predict_proba is not available when  probability=False

In [None]:
# Accuracy within range 5000 (50 crime score point difference)
count = 0
i = 0
for keys in y_test:
    if i >= int(len(y_test) * 0.25):
        break
    if (keys - svm_y_pred[i] >= 0 and keys - svm_y_pred[i] <= 5000):
        count += 1
    elif (keys - svm_y_pred[i] <= 0 and keys - svm_y_pred[i] >= -5000):
        count += 1
    i += 1
print("Accuracy:", count / int(len(y_test) * 0.25))