# Atlanta Crime - Supervised Learning

## Description
In this part of the project, we use supervised learning algorithms, namely Decision Trees/Random Forests, Naive Bayes, SVM, and Logistic Regression, to predict the crime score for a given location and time.

## Import Packages

In [None]:
import datetime
import random

import pandas as pd
from sklearn import metrics  # scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # Import train_test_split function
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree Classifier
from sklearn.tree import plot_tree

try:
    # Public location of export_text.
    from sklearn.tree import export_text
except ImportError:
    # Fallback for older scikit-learn releases that still ship sklearn.tree.export.
    from sklearn.tree.export import export_text

## Load in Data

In [None]:
# Load the crime-count table. The commented-out call reads a small example
# extract instead; the trailing comments show optional column filters.
# data = pd.read_csv("data/examples/cobra_example_10.csv") #, sep=',', header=0, engine='python', usecols = lambda column : column in ["Neighborhood", "Month", "Day", "Shift"])
data = pd.read_csv("cobra-counts.csv") #, sep=',', header=0, usecols = lambda column : column in ["Occur Date", "Neighborhood", "Shift Occurence"])

# Disabled feature engineering: derive weekday and month from 'Occur Date'.
# def day_of_week(row):
#     cur_date = row['Occur Date']
#     year, month, day = (int(x) for x in cur_date.split('-'))
#     dt = datetime.date(year, month, day)
#     return(dt.weekday())

# def month(row):
#     cur_date = row['Occur Date']
#     year, month, day = (int(x) for x in cur_date.split('-'))
#     return month

# data['Day of Week'] = data.apply(lambda row: day_of_week (row), axis=1)
# data['Month'] = data.apply(lambda row: month (row), axis=1)
# data = data.drop(['Occur Date'], 1)

# Disabled placeholder target: a random score.
# range(0 - 1000)
# two decimal places
#data['Crime Score'] = data.apply(lambda row: random.randint(0 * 100, 1000 * 100), axis=1)

# Target: weighted sum of the four category counts, with the weight dropping
# by a factor of ten from one category to the next.
category_weights = (
    ('Category 1', 1000),
    ('Category 2', 100),
    ('Category 3', 10),
    ('Category 4', 1),
)
data['Crime Score'] = sum(
    weight * data[column] for column, weight in category_weights
)
data.head()

## Decision Tree Model

### Transforming the data

In [None]:
# Replace the categorical columns with integer codes in place.
# NOTE: one encoder object is refit for each column, so after this cell `le`
# only holds the classes of the last column ("Shift").
le = preprocessing.LabelEncoder()
for column in ("Neighborhood", "Shift"):
    data[column] = le.fit_transform(data[column])

data.head()

In [None]:
# Build the feature matrix and target, then split into training and testing sets.
# The features are every column except the target and the four category
# counts the target is computed from (keeping those would leak the label).
# `columns=` is used because recent pandas no longer accepts the axis as a
# positional argument to DataFrame.drop.
X = data.drop(columns=['Crime Score', 'Category 1', 'Category 2', 'Category 3', 'Category 4'])
y = data['Crime Score']

# 70/30 shuffled split; the fixed random_state seeds the shuffle so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

# Disabled alternative: an unshuffled 70/30 split by row position.
# X_train = X[:int(len(X) * 0.7)]
# X_test = X[int(len(X) * 0.7):]
# y_train = y[:int(len(y) * 0.7)]
# y_test = y[int(len(y) * 0.7):]

In [None]:
# Decision tree: fit on the first quarter of the training rows and predict
# the first quarter of the test rows (a subset, presumably to keep the
# runtime manageable).
train_rows = int(len(X_train) * 0.25)
test_rows = int(len(X_test) * 0.25)

clf = DecisionTreeClassifier().fit(X_train[:train_rows], y_train[:train_rows])

# Hard predictions and per-class probabilities for the same test subset.
X_eval = X_test[:test_rows]
y_pred = clf.predict(X_eval)
predictions_probability = clf.predict_proba(X_eval)

In [None]:
# Preview the first rows of the held-out feature matrix.
X_test.head()

### Decision Tree Metrics & Plots

In [None]:
# Tolerance accuracy for the decision tree: a prediction counts as correct
# when it is within 5000 score units of the true value, in either direction.
# (The original note calls this a "50 crime score point difference".)
n_scored = int(len(y_test) * 0.25)  # same test subset the tree predicted on
count = sum(
    1
    for actual, predicted in zip(y_test[:n_scored], y_pred[:n_scored])
    if abs(actual - predicted) <= 5000
)
print("Accuracy:", count / n_scored)

In [None]:
# Classification Metrics
# Left disabled. NOTE(review): the target has many distinct values, so
# precision_score / recall_score would presumably need an explicit `average=`
# argument before these lines can be enabled; confirm against scikit-learn docs.
# print("Accuracy:",metrics.accuracy_score(y_test[:int(len(y_test) * 0.25)], y_pred))
# print("Precision:",metrics.precision_score(y_test[:int(len(y_test) * 0.25)], y_pred))
# print("Recall:",metrics.recall_score(y_test[:int(len(y_test) * 0.25)], y_pred))

# # Compare to the actual labels
# '''Insert comparison methods here'''

# Plot the fitted decision tree (`clf`) with scikit-learn's tree plotter.
plot_tree(clf)

# # Export a text file containing the rules for the decision tree
# Disabled sketch copied from an iris example: `X`, `Y` and `iris` would have
# to be replaced with this notebook's data before it could run.
# '''decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) # 
# decision_tree = decision_tree.fit(X, Y)
# r = export_text(decision_tree, feature_names=iris['feature_names'])
# print(r)'''

## Random Forest Model

In [None]:
# Source: https://www.datacamp.com/community/tutorials/random-forests-classifier-python

# Author's note: this cell keeps on prompting a restart of the kernel, hence
# the small 10% subsets below.
rf_fraction = 0.1

# Random forest with 50 trees, fitted on the leading slice of the training set.
rf_clf = RandomForestClassifier(n_estimators=50)
rf_clf = rf_clf.fit(
    X_train[:int(len(X_train) * rf_fraction)],
    y_train[:int(len(y_train) * rf_fraction)],
)

# Hard predictions and per-class probabilities for the leading test slice.
rf_X_eval = X_test[:int(len(X_test) * rf_fraction)]
rf_y_pred = rf_clf.predict(rf_X_eval)
rf_predictions_probability = rf_clf.predict_proba(rf_X_eval)

In [None]:
# Tolerance accuracy for the random forest: a prediction counts as correct
# when it is within 5000 score units of the true value, in either direction.
#
# The denominator is the number of predictions that were actually scored.
# The previous version scored the 10% subset but divided by the 25% subset
# size, which understated the accuracy by a factor of 2.5.
n_eval = len(rf_y_pred)
count = sum(
    1
    for actual, predicted in zip(y_test[:n_eval], rf_y_pred)
    if abs(actual - predicted) <= 5000
)
print("Accuracy:", count / n_eval)

# Disabled debugging aids, sliced to the scored subset:
# print("Precision:", metrics.precision_score(y_test[:n_eval], rf_y_pred))
# print([keys for keys in y_test[:n_eval]])
# print(rf_y_pred)
# print("Precision:", metrics.precision_score([keys for keys in y_test[:n_eval]], rf_y_pred))

## Naive Bayes Classifier (NBC) Model

In [None]:
# Gaussian Naive Bayes: fit on the first quarter of the training rows and
# predict the first quarter of the test rows.
nbc_train_rows = int(len(X_train) * 0.25)
nbc_test_rows = int(len(X_test) * 0.25)

gnb = GaussianNB().fit(X_train[:nbc_train_rows], y_train[:nbc_train_rows])

# Hard predictions and per-class probabilities for the same test subset.
nbc_X_eval = X_test[:nbc_test_rows]
nbc_y_pred = gnb.predict(nbc_X_eval)
nbc_predictions_probability = gnb.predict_proba(nbc_X_eval)

In [None]:
# Tolerance accuracy for Naive Bayes: a prediction counts as correct when it
# is within 5000 score units of the true value, in either direction.
n_scored = int(len(y_test) * 0.25)  # same test subset the model predicted on
count = sum(
    1
    for actual, predicted in zip(y_test[:n_scored], nbc_y_pred[:n_scored])
    if abs(actual - predicted) <= 5000
)
print("Accuracy:", count / n_scored)

## SVM Model

In [None]:
# Support vector classifier: fit on the first quarter of the training rows
# and predict the first quarter of the test rows.
#
# probability=True is required because predict_proba is called below; with
# the default (probability=False) that call raises. Enabling it makes fitting
# slower (scikit-learn runs an internal cross-validation to calibrate the
# probabilities), and the probabilities may not always agree with predict().
svm_clf = SVC(gamma='auto', probability=True)
svm_clf = svm_clf.fit(X_train[:int(len(X_train) * 0.25)], y_train[:int(len(y_train) * 0.25)])

# Hard predictions and per-class probabilities for the same test subset.
svm_y_pred = svm_clf.predict(X_test[:int(len(X_test) * 0.25)])
svm_predictions_probability = svm_clf.predict_proba(X_test[:int(len(X_test) * 0.25)])

In [None]:
# Tolerance accuracy for the SVM: a prediction counts as correct when it is
# within 5000 score units of the true value, in either direction.
n_scored = int(len(y_test) * 0.25)  # same test subset the model predicted on
count = sum(
    1
    for actual, predicted in zip(y_test[:n_scored], svm_y_pred[:n_scored])
    if abs(actual - predicted) <= 5000
)
print("Accuracy:", count / n_scored)