# Data 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier


In [2]:
# Show every column when a wide frame is displayed
pd.set_option('display.max_columns', 999)

# Read the cleaned data file and preview the first rows
df = pd.read_csv('terry-clean.csv')
df.head()

# Delete subject id

In [3]:
# Summary statistics for the numeric columns of the loaded frame
df.describe()

# Build a base model

First set the target: arrest flag

In [4]:
# Set target: encode the arrest flag as 0 ('N') / 1 ('Y').
# An explicit mapping produces a numeric column directly, instead of relying on
# replace() to downcast an object column (deprecated in recent pandas).
y = df['arrest_flag'].map({'N': 0, 'Y': 1})

# map() turns anything other than 'N'/'Y' into NaN; fail loudly rather than
# silently train on a corrupted target.
assert y.notna().all(), "arrest_flag contains values other than 'N'/'Y'"
y = y.astype(int)
y.head()

## Random Forest:
I will use tree-based models (a single decision tree as the baseline, followed by bagged trees and a random forest) built on the top attributes:
* frisk flag
* call type
* subject age group
* subject perceived gender
* subject perceived race
* reported time - will create categories manually

In [5]:
# Categorical columns used as the first set of predictors
feature_cols = [
    'frisk_flag',
    'call_type',
    'subject_age_group',
    'subject_perceived_gender',
    'subject_perceived_race',
]
X = df[feature_cols]
X.head()

### Create dummy variables

In [6]:
# One-hot encode the selected features: each category level becomes its own indicator column
data = pd.get_dummies(X)
display(data.head())


## Separate reported time

In [7]:
# Keep only the hour: the first two characters of the time string, cast to int.
# NOTE: this overwrites the column in place, so the cell cannot be re-run
# without reloading df (the .str accessor fails on the resulting integer column).
df['reported_time'] = df['reported_time'].str.slice(0, 2).astype('int')
df['reported_time'].head()


In [8]:
# Distribution of the reported hour.
# sns.distplot is deprecated (and removed in recent seaborn releases);
# histplot with kde=True and stat='density' draws the equivalent
# normalised histogram plus density curve.
sns.histplot(df['reported_time'], kde=True, stat='density')
plt.title('Times')
plt.xlabel('Reported hour')
plt.show()

In [9]:
# Create bins: four 6-hour blocks covering hours 0-23
bins = [0, 6, 12, 18, 24]

# Use pd.cut() with left-closed intervals: [0, 6), [6, 12), [12, 18), [18, 24).
# With the default right-closed intervals the first bin would be (0, 6], so
# hour 0 (midnight) would fall outside every bin, become NaN, and later be
# label-encoded as -1.
bins_time = pd.cut(df.reported_time, bins, right=False)

# Every hour must land in a bin; a NaN here means an hour outside 0-23.
assert bins_time.notna().all(), "some reported hours fall outside the 0-23 range"

# pd.cut() already returns ordered categories, so no extra conversion is needed.
bins_time.head()

In [10]:
# Bar chart of the number of rows in each time bin
bin_counts = bins_time.value_counts()
ax = bin_counts.plot.bar()
ax.set_title('Time Bins')

In [11]:
# Replace the existing hour column with its time-of-day bin
df['reported_time'] = bins_time

In [12]:
# Perform label encoding: swap each bin for its integer category code
df['reported_time'] = df['reported_time'].cat.codes
df['reported_time'].head()

In [13]:
# Append the encoded time-of-day column to the one-hot encoded features
data = data.assign(reported_time=df['reported_time'])
data.head()

In [14]:
# Persist the encoded feature table (without the index) so other notebooks can load it
data.to_csv('terry-cat-dummies.csv', index=False)

## Train/test the data

In [15]:
# Split the train and test sets (75% / 25%).
# random_state pins the shuffle so the split, and every score computed from
# it, is reproduced on a fresh Restart & Run All.
data_train, data_test, target_train, target_test = train_test_split(
    data, y, test_size=0.25, random_state=42
)

In [16]:
# Instantiate and fit a DecisionTreeClassifier.
# random_state makes the tree reproducible: features are permuted at each
# split, so equally good splits can otherwise be chosen differently per run.
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=42)
tree_clf.fit(data_train, target_train)

In [17]:
# Examine feature importance.
# Label each importance with its column name and sort, so the values can be
# read without counting positions in a raw array.
importances = pd.Series(tree_clf.feature_importances_, index=data_train.columns)
display(importances.sort_values(ascending=False))

## Plot the important features

In [18]:
# Plot of the important features
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : fitted estimator
        Any estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        One label per feature, in the order the model was trained on.
        Defaults to the columns of the notebook-level ``data_train``.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.
    """
    importances = model.feature_importances_
    if feature_names is None:
        # Fall back to the training frame used throughout this notebook.
        feature_names = data_train.columns.values
    n_features = len(importances)
    if len(feature_names) != n_features:
        raise ValueError(
            f"expected {n_features} feature names, got {len(feature_names)}"
        )

    plt.figure(figsize=(8, 13))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

plot_feature_importances(tree_clf)


## Model Performance
Generate predictions, print out confusion matrix, and a classification report.

In [19]:
# Predict on the held-out test set
pred = tree_clf.predict(data_test)

# Print the confusion matrix followed by the classification report
for report in (confusion_matrix(target_test, pred),
               classification_report(target_test, pred)):
    print(report)

# View a tree

In [20]:
# Needed imports
# sklearn.externals.six was removed from scikit-learn; StringIO is available
# directly from the standard library.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

In [30]:
# Get a tree
# tree_clf is a single DecisionTreeClassifier, which has no `estimators_`
# attribute (only ensembles do), so the fitted tree is the model itself.
tree_0 = tree_clf
tree_0

In [31]:
# Draw a tree
# out_file=None makes export_graphviz return the DOT source as a string, so no
# intermediate StringIO buffer is needed. Feature and class names label the
# nodes with readable text instead of positional indices.
dot_data = export_graphviz(
    tree_0,
    out_file=None,
    feature_names=list(data_train.columns),
    class_names=['no arrest', 'arrest'],  # target encoding: 0 = 'N', 1 = 'Y'
    filled=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

# Bagged trees

In [24]:
# Instantiate a BaggingClassifier of 20 depth-5 decision trees.
# random_state fixes the bootstrap samples (and seeds the base trees), so the
# scores below are reproducible.
bagged_tree = BaggingClassifier(
    DecisionTreeClassifier(criterion='gini', max_depth=5),
    n_estimators=20,
    random_state=42,
)

# Fit to training data
bagged_tree.fit(data_train, target_train)

In [25]:
# Mean accuracy of the bagged trees on the training data
train_acc = bagged_tree.score(data_train, target_train)
print('Training Accuracy: ' + str(train_acc))

# Mean accuracy of the bagged trees on the held-out test data
test_acc = bagged_tree.score(data_test, target_test)
print('Test Accuracy: ' + str(test_acc))

# Random Forest Model

In [26]:
# Instantiate and fit a RandomForestClassifier (100 trees, depth 5).
# random_state fixes the bootstrap samples and per-split feature subsets so
# the scores and importances are reproducible.
forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
forest.fit(data_train, target_train)

In [27]:
# Mean accuracy of the random forest on the training data
train_acc = forest.score(data_train, target_train)
print('Training Accuracy: ' + str(train_acc))

# Mean accuracy of the random forest on the held-out test data
test_acc = forest.score(data_test, target_test)
print('Test Accuracy: ' + str(test_acc))

In [28]:
# Feature importance of the fitted random forest, drawn with the helper defined earlier
plot_feature_importances(forest)

## Trees in the forest

In [29]:
# Instantiate and fit a small RandomForestClassifier (5 shallow trees, each
# split considering 10 features) so the individual trees can be inspected.
# random_state fixes which trees are grown, keeping the per-tree plots below
# stable across runs.
forest_2 = RandomForestClassifier(
    n_estimators=5, max_features=10, max_depth=2, random_state=42
)
forest_2.fit(data_train, target_train)

In [None]:
# First tree from forest_2 (index 0 of the fitted estimators)
rf_tree_1 = forest_2.estimators_[0]

# Feature importance of this single tree
plot_feature_importances(rf_tree_1)

In [None]:
# Second tree from forest_2 (index 1 of the fitted estimators)
rf_tree_2 = forest_2.estimators_[1]

# Feature importance of this single tree
plot_feature_importances(rf_tree_2)

In [None]:
# Third tree from forest_2 (index 2 of the fitted estimators)
rf_tree_3 = forest_2.estimators_[2]

# Feature importance of this single tree
plot_feature_importances(rf_tree_3)

In [None]:
# Fourth tree from forest_2 (index 3 of the fitted estimators)
rf_tree_4 = forest_2.estimators_[3]

# Feature importance of this single tree
plot_feature_importances(rf_tree_4)

In [None]:
# Fifth tree from forest_2 (index 4 of the fitted estimators)
rf_tree_5 = forest_2.estimators_[4]

# Feature importance of this single tree
plot_feature_importances(rf_tree_5)

### Summary
Significant features contributing to the model:
* Call type None
* Call type 911
* Subject age group 18 - 25
* Subject perceived race Hispanic