In [46]:
import pandas as pd

In [47]:
# Load the raw table from crimedata.csv (path is relative to the working directory).
df = pd.read_csv('crimedata.csv')
# Display the column index so the available column names can be inspected.
df.columns

Index(['communityName', 'state', 'countyCode', 'communityCode', 'population',
       'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian',
       'racePctHisp',
       ...
       'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop', 'autoTheft',
       'autoTheftPerPop', 'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop',
       'nonViolPerPop'],
      dtype='object', length=146)

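We need to clean the data, because some columns contain missing values. For `countyCode` and `communityCode`, we will replace missing values with zero. For the other columns we work with, we will replace each missing value with the rounded median of that column. Note that the next cell does not impute anything yet: it simply drops the rows where `PolicOperBudg` or `ViolentCrimesPerPop` is missing.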

In [48]:
# Columns used as model inputs.
feature_names = [
    "population",
    "householdsize",
    "racePctWhite",
    "PctPopUnderPov",
    "PctUnemployed",
    "PolicOperBudg",
]

# Keep only the rows in which both PolicOperBudg and ViolentCrimesPerPop are present.
updated_df = df.dropna(
    subset=['PolicOperBudg', 'ViolentCrimesPerPop'],
)

# Feature matrix and target series, both taken from the same filtered rows.
X = updated_df[feature_names]
y = updated_df["ViolentCrimesPerPop"]

In [49]:
# Summary statistics for the selected feature columns.
feature_summary = X.describe()
print(feature_summary)

         population  householdsize  racePctWhite  PctPopUnderPov  \
count  3.190000e+02     319.000000    319.000000      319.000000   
mean   1.919145e+05       2.629028     72.563292       15.153166   
std    4.798622e+05       0.296868     18.030164        7.369609   
min    1.074700e+04       1.880000      7.210000        2.270000   
25%    6.140600e+04       2.460000     62.185000        9.080000   
50%    9.444000e+04       2.590000     75.510000       15.320000   
75%    1.675830e+05       2.740000     87.115000       19.485000   
max    7.322564e+06       4.100000     98.230000       43.860000   

       PctUnemployed  PolicOperBudg  
count     319.000000   3.190000e+02  
mean        7.060972   2.896217e+07  
std         2.522005   9.899444e+07  
min         2.110000   2.380215e+06  
25%         5.305000   7.247164e+06  
50%         6.580000   1.075497e+07  
75%         8.610000   2.047340e+07  
max        16.600000   1.617293e+09  


In [50]:
# Peek at the first five target values.
print(y.head(5))

9     1544.24
13    1476.93
17     374.07
19     772.77
21    2097.71
Name: ViolentCrimesPerPop, dtype: float64


In [51]:
from sklearn.tree import DecisionTreeRegressor

# Build the regressor with a fixed random_state so that repeated runs
# produce the same model.
# NOTE(review): the name `iowa_model` looks carried over from another
# dataset (the data here is loaded from crimedata.csv); it is kept because
# later cells refer to it.
iowa_model = DecisionTreeRegressor(random_state=1)

# Train on every row of X / y.
iowa_model.fit(X=X, y=y)

In [52]:
# Predict on the very same rows the model was trained on (in-sample).
predictions = iowa_model.predict(X=X)
print(predictions)

[1544.24 1476.93  374.07  772.77 2097.71  689.42  512.19 1087.25  984.96
 1279.6   753.42  566.64 1693.42  969.6  1841.38  620.53 2127.02 1981.45
 1497.92 2414.77  795.08  871.02 1501.93 3235.45 1744.19 1195.53 3239.2
  448.84 1631.98  294.6  1389.87  530.57  136.48  488.86  687.49 1185.33
  266.81  458.28  874.69  797.66 1756.32 2105.73 1933.77 1363.32 1846.45
 2541.38  987.42   52.25 1290.43 1523.3   288.4   387.2   509.18 1093.15
 2423.47  772.3  1810.07  332.   1651.73  770.88 1038.27 1073.8   875.19
  889.17  845.63  423.52  152.07 1968.89  321.68 1311.2  1571.14  223.06
 2978.69 2078.85  435.6  1304.78  223.14 1627.33 3255.71  736.68  581.98
 2264.09 1818.82 1424.74 1629.43  366.34 1035.05  321.08 3047.66 1171.84
  230.   1274.62 1565.55  842.7  3530.78 1318.6   599.78  141.06  578.8
 2460.11  737.62  360.34  646.17 3414.57 1787.2  1960.67 2038.98  893.29
  514.99 1076.95 2186.47 2109.96  265.95 1118.78  572.76  984.34  985.62
  728.32 3081.26  739.5   373.88 2169.92 1633.81 1644

In [44]:
from sklearn.metrics import mean_absolute_error

# In-sample error: the model is scored on the same X / y it was fitted on,
# so this measures how well it reproduces its training data, not how well
# it generalizes.
predictions = iowa_model.predict(X=X)
mean_absolute_error(y_true=y, y_pred=predictions)

0.0

In [45]:
from sklearn.model_selection import train_test_split

# Split features and target into training and validation sets. The split is
# driven by a random number generator; passing a fixed random_state makes it
# identical on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define the model. random_state is fixed here as well: without it the tree
# may break ties between equally good splits differently from run to run,
# so the validation error printed below would not be reproducible.
crime_model = DecisionTreeRegressor(random_state=1)

# Fit on the training portion only.
crime_model.fit(train_X, train_y)

# Predict ViolentCrimesPerPop for the held-out rows and report the
# out-of-sample mean absolute error.
val_predictions = crime_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

557.389375
