In [91]:
#Import dependencies
import pandas as pd
import sqlite3
import argparse
import json
import pprint
import requests
import sys


In [92]:
#Assign database file to a variable
db_file = "Data/crimedata.db"

#Connect to database file
conn = sqlite3.connect(db_file)


In [93]:
#Query the 2017 & 2018 LA crime data table and put into a pandas dataframe
crime_df = pd.read_sql_query("select * from la_crime_1718;", conn)
crime_df.shape

(174766, 26)

In [94]:
#There are too many detailed crime types to put through a model, so let's focus on only those
#crimes where there were 6,000 or more committed.  I chose 6,000 because that gives 
#us the top 11.

crime_types = pd.read_sql_query("select count(*) as cnt, CrimeCode, CrimeCodeDescription from la_crime_1718 group by CrimeCode, CrimeCodeDescription having cnt > 6000 order by cnt desc;", conn)
crime_types

Unnamed: 0,cnt,CrimeCode,CrimeCodeDescription
0,16820,624,BATTERY - SIMPLE ASSAULT
1,14673,330,BURGLARY FROM VEHICLE
2,13046,440,THEFT PLAIN - PETTY ($950 & UNDER)
3,11412,510,VEHICLE - STOLEN
4,10286,310,BURGLARY
5,8903,626,INTIMATE PARTNER - SIMPLE ASSAULT
6,8644,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT"
7,7344,210,ROBBERY
8,6942,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)
9,6670,354,THEFT OF IDENTITY


In [95]:
#In order to train a model, it would make sense to select crimes that greatly
#vary from one another.  Using both "Burglary From Vehicle" and "Burglary"
#would probably not produce good results.  I will only keep crime types that
#I want to focus on for this exercise.

#624 = Battery-Simple Assault
#440 = Theft Plain - Petty ($950 & Under)
#510 = Vehicle - Stolen
#310 = Burglary
#354 = Theft of Identity

new_crime_df = pd.read_sql_query("select * from la_crime_1718 where CrimeCode in ('624', '440', '510', '310', '354');", conn)
new_crime_df.shape

(58234, 26)

In [96]:
#Query the la weather history data table and put into a pandas dataframe
weather_df = pd.read_sql_query("select * from LA_Hourly_Weather_2016_to_2018;", conn)
weather_df.head()

Unnamed: 0,dt_iso,hour,farenheit,pressure,humidity,wind_speed,weather_code,weather_main
0,1/1/2016,1,58.982,1018,22,5,800,Clear
1,1/1/2016,3,52.016,1018,21,1,800,Clear
2,1/1/2016,5,50.414,1019,34,2,800,Clear
3,1/1/2016,6,47.93,1019,32,1,800,Clear
4,1/1/2016,7,45.662,1019,32,1,721,Haze


In [97]:
#Merge the weather history with the crime data

#First, strip the first two digits of the time occured column so that we can match it up with the weather
new_crime_df['hour'] = new_crime_df['TimeOccurred'].str.slice(0,2)
new_crime_df.shape


(58234, 27)

In [98]:
#Merge the crime and weather data on date and hour

new_df = pd.merge(new_crime_df, weather_df, left_on=['DateOccurred', 'hour'], right_on=['dt_iso', 'hour'])
new_df.shape

(50284, 34)

In [99]:
#Drop columns from the dataframe that will not be needed in our ML models

model_df = new_df.drop(['DRNumber', 'DateReported', 'DateOccurred', 'AreaName', 'CrimeCodeDescription', 'MOCodes',
                       'PremiseDescription', 'weather_main'], axis=1)

model_df = model_df.drop(['WeaponDescription', 'VictimSex', 'VictimDescent', 'StatusCode', 'StatusDescription',
                         'Address', 'CrossStreet', 'CrimeCode1', 'CrimeCode2', 'CrimeCode3', 'CrimeCode4','dt_iso'], axis=1)
model_df.head()

Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,WeaponUsedCode,Location,hour,farenheit,pressure,humidity,wind_speed,weather_code
0,1930,2,201,510,16,101,,"(34.0886, -118.2979)",19,80.492,1013,24,1,800
1,1915,1,153,440,48,502,,"(34.0467, -118.252)",19,80.492,1013,24,1,800
2,1900,8,836,310,64,502,,"(34.0573, -118.4206)",19,80.492,1013,24,1,800
3,1900,9,932,440,35,108,,"(34.1806, -118.4662)",19,80.492,1013,24,1,800
4,1930,9,971,440,16,210,,"(34.1564, -118.463)",19,80.492,1013,24,1,800


In [100]:
#Check the number of rows and columns in the model dataframe
#There are 13 possible features we can use and one field will be the target
model_df.shape

(50284, 14)

In [101]:
#Split the location into latitude and longitude columns
split_data = model_df['Location'].str.strip('()')
split_data = split_data.str.split(', ', expand=True)
split_data.rename(columns={0:'Latitude', 1:'Longitude'})

df1 = pd.DataFrame(split_data)

new_model_df = pd.merge(model_df, df1, right_index=True, left_index=True)
new_model_df.head()


Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,WeaponUsedCode,Location,hour,farenheit,pressure,humidity,wind_speed,weather_code,0,1
0,1930,2,201,510,16,101,,"(34.0886, -118.2979)",19,80.492,1013,24,1,800,34.0886,-118.2979
1,1915,1,153,440,48,502,,"(34.0467, -118.252)",19,80.492,1013,24,1,800,34.0467,-118.252
2,1900,8,836,310,64,502,,"(34.0573, -118.4206)",19,80.492,1013,24,1,800,34.0573,-118.4206
3,1900,9,932,440,35,108,,"(34.1806, -118.4662)",19,80.492,1013,24,1,800,34.1806,-118.4662
4,1930,9,971,440,16,210,,"(34.1564, -118.463)",19,80.492,1013,24,1,800,34.1564,-118.463


In [102]:
#Rename the lat and long columns
new_model_df = new_model_df.rename(columns={0:'lat', 1:'long'})
new_model_df.head()

Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,WeaponUsedCode,Location,hour,farenheit,pressure,humidity,wind_speed,weather_code,lat,long
0,1930,2,201,510,16,101,,"(34.0886, -118.2979)",19,80.492,1013,24,1,800,34.0886,-118.2979
1,1915,1,153,440,48,502,,"(34.0467, -118.252)",19,80.492,1013,24,1,800,34.0467,-118.252
2,1900,8,836,310,64,502,,"(34.0573, -118.4206)",19,80.492,1013,24,1,800,34.0573,-118.4206
3,1900,9,932,440,35,108,,"(34.1806, -118.4662)",19,80.492,1013,24,1,800,34.1806,-118.4662
4,1930,9,971,440,16,210,,"(34.1564, -118.463)",19,80.492,1013,24,1,800,34.1564,-118.463


In [103]:
#drop the original location field
new_model_df = new_model_df.drop(['Location'], axis=1)

#Confirm the number of rows is still correct
new_model_df.shape


(50284, 15)

In [119]:
#Copy this dataframe to use for one-hot encoding later

hot_df = new_model_df.copy()
hot_df.head()

Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,WeaponUsedCode,hour,farenheit,pressure,humidity,wind_speed,weather_code,lat,long
0,1930,2,201,510,16,101,,19,80.492,1013,24,1,800,34.0886,-118.2979
1,1915,1,153,440,48,502,,19,80.492,1013,24,1,800,34.0467,-118.252
2,1900,8,836,310,64,502,,19,80.492,1013,24,1,800,34.0573,-118.4206
3,1900,9,932,440,35,108,,19,80.492,1013,24,1,800,34.1806,-118.4662
4,1930,9,971,440,16,210,,19,80.492,1013,24,1,800,34.1564,-118.463


In [64]:
#Convert the lat and long columns to numeric
new_model_df['lat'] = new_model_df['lat'].apply(pd.to_numeric, errors='coerce')
new_model_df['long'] = new_model_df['long'].apply(pd.to_numeric, errors='coerce')

In [65]:
#Drop the WeaponUsedCode because it is not populated often enough (unreliable)
new_model_df = new_model_df.drop(['WeaponUsedCode'], axis=1)

In [66]:
#Convert the datatype of each column to numeric for machine learning
new_model_df['TimeOccurred'] = new_model_df['TimeOccurred'].apply(pd.to_numeric, errors='coerce')
new_model_df['AreaID'] = new_model_df['AreaID'].apply(pd.to_numeric, errors='coerce')
new_model_df['CrimeCode'] = new_model_df['CrimeCode'].apply(pd.to_numeric, errors='coerce')
new_model_df['ReportingDistrict'] = new_model_df['ReportingDistrict'].apply(pd.to_numeric, errors='coerce')
new_model_df['VictimAge'] = new_model_df['VictimAge'].apply(pd.to_numeric, errors='coerce')
new_model_df['PremiseCode'] = new_model_df['PremiseCode'].apply(pd.to_numeric, errors='coerce')
new_model_df['hour'] = new_model_df['hour'].apply(pd.to_numeric, errors='coerce')
new_model_df['farenheit'] = new_model_df['farenheit'].apply(pd.to_numeric, errors='coerce')
new_model_df['pressure'] = new_model_df['pressure'].apply(pd.to_numeric, errors='coerce')
new_model_df['humidity'] = new_model_df['humidity'].apply(pd.to_numeric, errors='coerce')
new_model_df['wind_speed'] = new_model_df['wind_speed'].apply(pd.to_numeric, errors='coerce')
new_model_df['weather_code'] = new_model_df['weather_code'].apply(pd.to_numeric, errors='coerce')


print(new_model_df.dtypes)
new_model_df.head()

TimeOccurred           int64
AreaID                 int64
ReportingDistrict      int64
CrimeCode              int64
VictimAge            float64
PremiseCode            int64
hour                   int64
farenheit            float64
pressure               int64
humidity               int64
wind_speed             int64
weather_code           int64
lat                  float64
long                 float64
dtype: object


Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,hour,farenheit,pressure,humidity,wind_speed,weather_code,lat,long
0,1930,2,201,510,16.0,101,19,80.492,1013,24,1,800,34.0886,-118.2979
1,1915,1,153,440,48.0,502,19,80.492,1013,24,1,800,34.0467,-118.252
2,1900,8,836,310,64.0,502,19,80.492,1013,24,1,800,34.0573,-118.4206
3,1900,9,932,440,35.0,108,19,80.492,1013,24,1,800,34.1806,-118.4662
4,1930,9,971,440,16.0,210,19,80.492,1013,24,1,800,34.1564,-118.463


In [67]:
#Drop NaN values.  I don't want to fill with zeros because that might be
#misinterpreted by the model.
new_model_df = new_model_df.dropna()
new_model_df.head()

Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,hour,farenheit,pressure,humidity,wind_speed,weather_code,lat,long
0,1930,2,201,510,16.0,101,19,80.492,1013,24,1,800,34.0886,-118.2979
1,1915,1,153,440,48.0,502,19,80.492,1013,24,1,800,34.0467,-118.252
2,1900,8,836,310,64.0,502,19,80.492,1013,24,1,800,34.0573,-118.4206
3,1900,9,932,440,35.0,108,19,80.492,1013,24,1,800,34.1806,-118.4662
4,1930,9,971,440,16.0,210,19,80.492,1013,24,1,800,34.1564,-118.463


In [68]:
#Reset the index on the dataframe and check the number of rows
#to make sure the drop na didn't get rid of too much data
new_model_df.reset_index()
new_model_df.shape

(48253, 14)

In [69]:
# Descriptive statistics for each column to get a better feel for the data
new_model_df.describe()

Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,hour,farenheit,pressure,humidity,wind_speed,weather_code,lat,long
count,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0,48253.0
mean,1614.090917,4.938677,541.321472,481.329202,35.797505,334.975297,15.986633,64.007857,1014.73353,67.692869,1.376018,690.900483,34.038955,-118.320041
std,381.501938,2.840431,281.888221,113.107853,18.049425,223.195606,3.805519,10.448191,7.342891,22.686316,1.412025,126.927669,0.10075,0.076448
min,1000.0,1.0,101.0,310.0,10.0,101.0,10.0,25.6226,943.0,2.0,0.0,200.0,33.706,-118.608
25%,1300.0,3.0,318.0,354.0,17.0,102.0,13.0,56.372,1013.0,54.0,1.0,500.0,34.0314,-118.3595
50%,1600.0,5.0,516.0,510.0,32.0,252.0,16.0,63.248,1015.0,72.0,1.0,721.0,34.0502,-118.3065
75%,1915.0,7.0,758.0,624.0,49.0,502.0,19.0,70.556,1018.0,87.0,2.0,800.0,34.0819,-118.2641
max,2359.0,21.0,2185.0,624.0,99.0,968.0,23.0,109.778,1033.0,100.0,16.0,804.0,34.2585,-118.1574


In [70]:
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict.  We're trying to predict the type of crime.
labels = np.array(new_model_df['CrimeCode'])

# Remove the crimecode from the dataframe to leave us with just the predictive features
features= new_model_df.drop('CrimeCode', axis = 1)

# Save feature names for later use
feature_list = list(features.columns)

# Convert the features dataframe to a numpy array
features = np.array(features)

print(labels)
print(feature_list)


[510 440 310 ... 354 624 510]
['TimeOccurred', 'AreaID', 'ReportingDistrict', 'VictimAge', 'PremiseCode', 'hour', 'farenheit', 'pressure', 'humidity', 'wind_speed', 'weather_code', 'lat', 'long']


In [71]:
# Use Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [72]:
#Check the shape of our features and labels for the training and testing data
#We need to make sure that they each contain the same amount of columns
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (36189, 13)
Training Labels Shape: (36189,)
Testing Features Shape: (12064, 13)
Testing Labels Shape: (12064,)


In [73]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 20 decision trees
rf = RandomForestRegressor(n_estimators = 20, max_depth = 4)

# Train the model on training data
rf.fit(train_features, train_labels);

In [82]:
#Determine which features held the most weight
feature_weight = rf.feature_importances_

zipped_list = zip(feature_list, feature_weight)

for item in zipped_list:
  print(item)

('TimeOccurred', 0.04885202643031349)
('AreaID', 0.0)
('ReportingDistrict', 0.017220513701361316)
('VictimAge', 0.11742307394843361)
('PremiseCode', 0.7464031528806984)
('hour', 0.00244285089223973)
('farenheit', 0.0006915241591186646)
('pressure', 0.0002963352485959139)
('humidity', 0.00010354965665630932)
('wind_speed', 0.0)
('weather_code', 2.482323665821067e-05)
('lat', 0.00516349591914639)
('long', 0.06137865392677799)


In [43]:
# Use the forest's predict method on the test data
# to see if the model can accurately predict the crime codes
predictions = rf.predict(test_features)


# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

#Crime codes range from 110 to 956.  These codes should probably be
#one-hot encoded because the model probably weights the numbers rather
#than classifies them.


Mean Absolute Error: 74.18 degrees.


In [44]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')


Accuracy: 83.28 %.


In [47]:
#Let's print out one of the decision trees to see which features our model uses to predict
#First, import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

#Extract one of the trees
tree = rf.estimators_[3]

#Save the tree as a png
export_graphviz(tree, out_file='crime_tree.dot', feature_names = feature_list, rounded=True, precision=1)

(graph, ) = pydot.graph_from_dot_file('crime_tree.dot')

graph.write_png('crime_tree.png')

In [46]:
#Conclusions based on tree printout:

#The model uses these features to predict: 

#PremiseCode
#VictimAge
#Lat/Lon
#TimeOccurred

#But, it definitely appears that the model is using math on the crime codes
#and the premise codes.  This is not correct so in my next attempt I will 
#use one-hot encoding.

In [50]:
predictions_df = pd.DataFrame(predictions)

test_labels_df = pd.DataFrame(test_labels)
test_labels_df

line_up_df = pd.merge(predictions_df, test_labels_df, left_index=True, right_index=True)
line_up_df

line_up_df.to_csv('random_forest_predictions_to_actuals.csv')

print(test_labels_df.head())
print(predictions_df.head())


     0
0  310
1  624
2  440
3  440
4  624
            0
0  406.139920
1  478.755918
2  425.818775
3  460.286065
4  527.679047


In [50]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 88.3 %.


In [120]:
hot_df.head()

Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,WeaponUsedCode,hour,farenheit,pressure,humidity,wind_speed,weather_code,lat,long
0,1930,2,201,510,16,101,,19,80.492,1013,24,1,800,34.0886,-118.2979
1,1915,1,153,440,48,502,,19,80.492,1013,24,1,800,34.0467,-118.252
2,1900,8,836,310,64,502,,19,80.492,1013,24,1,800,34.0573,-118.4206
3,1900,9,932,440,35,108,,19,80.492,1013,24,1,800,34.1806,-118.4662
4,1930,9,971,440,16,210,,19,80.492,1013,24,1,800,34.1564,-118.463


In [121]:
#Let's try the above using one-hot encoding

#Drop the lat, long, reporting district column.  One hot encoding will produce too many columns
hot_df = hot_df.drop(columns = ['lat', 'long', 'ReportingDistrict', 'WeaponUsedCode', 'TimeOccurred'])

#Convert features that should be looked at as numeric to numerics
hot_df['VictimAge'] = new_model_df['VictimAge'].apply(pd.to_numeric, errors='coerce')
hot_df['farenheit'] = new_model_df['farenheit'].apply(pd.to_numeric, errors='coerce')
hot_df['pressure'] = new_model_df['pressure'].apply(pd.to_numeric, errors='coerce')
hot_df['humidity'] = new_model_df['humidity'].apply(pd.to_numeric, errors='coerce')
hot_df['wind_speed'] = new_model_df['wind_speed'].apply(pd.to_numeric, errors='coerce')


#Get rid of any NaN values
hot_df = hot_df.dropna()

# Labels are the values we want to predict.  We're trying to predict the type of crime.
#labels_new = hot_df['CrimeCode']

# Remove the crimecode from the dataframe to leave us with just the predictive features
#features_new = hot_df.drop('CrimeCode', axis = 1)
hot_df.head()

Unnamed: 0,AreaID,CrimeCode,VictimAge,PremiseCode,hour,farenheit,pressure,humidity,wind_speed,weather_code
0,2,510,16.0,101,19,80.492,1013,24,1,800
1,1,440,48.0,502,19,80.492,1013,24,1,800
2,8,310,64.0,502,19,80.492,1013,24,1,800
3,9,440,35.0,108,19,80.492,1013,24,1,800
4,9,440,16.0,210,19,80.492,1013,24,1,800


In [122]:
#Define X and y

#Target column
targets = hot_df['CrimeCode']

#Let's try to predict the crime code
hot_df_features = hot_df.drop(columns=['CrimeCode'])



In [123]:
#Use one-hot encoding to turn all of the classifiers into binary numbers

features_hot = pd.get_dummies(hot_df_features)
print(features_hot.shape)
features_hot.head()


(48253, 275)


Unnamed: 0,VictimAge,farenheit,pressure,humidity,wind_speed,AreaID_1,AreaID_10,AreaID_11,AreaID_12,AreaID_13,...,hour_22,hour_23,weather_code_200,weather_code_300,weather_code_500,weather_code_711,weather_code_721,weather_code_761,weather_code_800,weather_code_804
0,16.0,80.492,1013,24,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,48.0,80.492,1013,24,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,64.0,80.492,1013,24,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,35.0,80.492,1013,24,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,16.0,80.492,1013,24,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [124]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(targets)

encoded_y.shape

(48253,)

In [125]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
one_hot_y = to_categorical(encoded_y)
one_hot_y

array([[0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [148]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_hot, one_hot_y, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(36189, 275)
(12064, 275)
(36189, 5)
(12064, 5)


In [150]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Instantiate model with 20 decision trees
rf = RandomForestClassifier(n_estimators = 20, max_depth = 4)

# Train the model on training data
rf.fit(X_train, y_train);
rf.score(X_test, y_test)

0.000663129973474801

In [166]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head(10)

Unnamed: 0,importance
VictimAge,0.21985
PremiseCode_101,0.215186
PremiseCode_501,0.123302
PremiseCode_102,0.115482
hour_12,0.055351
PremiseCode_108,0.036687
PremiseCode_104,0.02597
PremiseCode_119,0.025173
AreaID_4,0.022169
PremiseCode_221,0.01945


In [169]:
predictions = rf.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

print(predictions.shape)
print(errors.shape)
print(rf.score(X_train, y_train))


Mean Absolute Error: 0.2 degrees.
(12064, 5)
(12064, 5)
0.0005526541214181105


In [171]:
# Calculate mean absolute percentage error (MAPE)
#mape = 100 * (errors / predictions)
# Calculate and display accuracy
#accuracy = 100 - np.mean(mape)
#print('Accuracy:', round(accuracy, 2), '%.')

c = np.divide(errors, predictions, out=np.zeros_like(errors), where=predictions!=0)

mape = 100 * c
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')



Accuracy: 99.99 %.


In [172]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 20 decision trees
rf = RandomForestRegressor(n_estimators = 20, max_depth = 4)

# Train the model on training data
rf.fit(X_train, y_train);
rf.score(X_test, y_test)

0.3364173839197181

In [173]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head(10)

Unnamed: 0,importance
VictimAge,0.6729
PremiseCode_501,0.110049
hour_12,0.044701
PremiseCode_101,0.043133
PremiseCode_502,0.041511
PremiseCode_108,0.04049
PremiseCode_221,0.027252
PremiseCode_102,0.007519
AreaID_4,0.004461
AreaID_8,0.002062


In [174]:
predictions = rf.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

print(predictions.shape)
print(errors.shape)
print(rf.score(X_train, y_train))


Mean Absolute Error: 0.21 degrees.
(12064, 5)
(12064, 5)
0.3325740286344713


In [None]:
#It makes a HUGE difference when the features are one-hot encoded!