In [1]:
# Import data science environment.
import math
import warnings

from IPython.display import display
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import neighbors
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import statsmodels.formula.api as smf

# Display preferences
% matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action='ignore',
    module='scipy',
    message='^internal gelsd'
)
warnings.filterwarnings('ignore')

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv', skiprows=3, header=1)
df.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0
3,Albany,97956,791,8.0,,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0.0,,3,4,16,223,53,165,5,


In [3]:
# Change long column names.
df.rename(columns={
    "Murder and\nnonnegligent\nmanslaughter":"Murder",
    "Rape\n(revised\ndefinition)1":"Rape_1",
    "Rape\n(legacy\ndefinition)2":"Rape_2",
    "Aggravated\nassault":"Aggravated_Assault",
    "Property\ncrime":"Property_Crime",
    "Larceny-\ntheft":"Larceny",
    "Motor\nvehicle\ntheft":"Motor_Vehicle","Arson3":"Arson"
})

Unnamed: 0,City,Population,Violent crime,Murder,Rape_1,Rape_2,Robbery,Aggravated_Assault,Property_Crime,Burglary,Larceny,Motor_Vehicle,Arson
0,Adams Village,1861,0,0.000,,0,0,0,12,2,10,0,0.000
1,Addison Town and Village,2577,3,0.000,,0,0,3,24,3,20,1,0.000
2,Akron Village,2846,3,0.000,,0,0,3,16,1,15,0,0.000
3,Albany,97956,791,8.000,,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0.000,,3,4,16,223,53,165,5,
5,Alfred Village,4089,5,0.000,,0,3,2,46,10,36,0,
6,Allegany Village,1781,3,0.000,,0,0,3,10,0,10,0,0.000
7,Amherst Town,118296,107,1.000,,7,31,68,2118,204,1882,32,3.000
8,Amityville Village,9519,9,0.000,,2,4,3,210,16,188,6,1.000
9,Amsterdam,18182,30,0.000,,0,12,18,405,99,291,15,0.000


In [4]:
# View column names, size, types, and tendencies.
print(df.columns)
print(df.shape)
print(df.dtypes)
df.describe()

Index(['City', 'Population', 'Violent\ncrime',
       'Murder and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property\ncrime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson3'],
      dtype='object')
(351, 13)
City                                       object
Population                                 object
Violent\ncrime                             object
Murder and\nnonnegligent\nmanslaughter    float64
Rape\n(revised\ndefinition)1              float64
Rape\n(legacy\ndefinition)2                object
Robbery                                    object
Aggravated\nassault                        object
Property\ncrime                            object
Burglary                                   object
Larceny-\ntheft                            object
Motor\nvehicle\ntheft                      object
Arson3                                    float64
dtype: object


Unnamed: 0,Murder and nonnegligent manslaughter,Rape (revised definition)1,Arson3
count,348.0,0.0,187.0
mean,1.566,,1.872
std,18.304,,10.693
min,0.0,,0.0
25%,0.0,,0.0
50%,0.0,,0.0
75%,0.0,,1.0
max,335.0,,132.0


In [5]:
# Eliminate commas from numbers > 999.
def convert_number(number):
    try:
        converted = float(number.replace(',', ''))
    except:
        converted = number
        
    return converted

In [6]:
# Convert object types to floats.
df['Population'] = df['Population'].apply(lambda x: convert_number(x))
df['Population^2'] = df['Population']**2
df['Violent_Crime'] = df['Violent\ncrime'].apply(lambda x: convert_number(x))
df['Murder'] = df['Murder and\nnonnegligent\nmanslaughter'].apply(lambda x: convert_number(x))
df['Rape_1'] = df['Rape\n(revised\ndefinition)1'].apply(lambda x: convert_number(x))
df['Rape_2'] = df['Rape\n(legacy\ndefinition)2'].apply(lambda x: convert_number(x))
df['Robbery'] = df['Robbery'].apply(lambda x: convert_number(x))
df['Aggravated_Assault'] = df['Aggravated\nassault'].apply(lambda x: convert_number(x))
df['Property_Crime'] = df['Property\ncrime'].apply(lambda x: convert_number(x))
df['Burglary'] = df['Burglary'].apply(lambda x: convert_number(x))
df['Larceny'] = df['Larceny-\ntheft'].apply(lambda x: convert_number(x))
df['Motor_Vehicle'] = df['Motor\nvehicle\ntheft'].apply(lambda x: convert_number(x))
df['Arson'] = df['Arson3'].apply(lambda x: convert_number(x))

In [7]:
df.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,...,Population^2,Violent_Crime,Murder,Rape_1,Rape_2,Aggravated_Assault,Property_Crime,Larceny,Motor_Vehicle,Arson
0,Adams Village,1861.0,0,0.0,,0,0.0,0,12,2.0,...,3463321.0,0.0,0.0,,0.0,0.0,12.0,10.0,0.0,0.0
1,Addison Town and Village,2577.0,3,0.0,,0,0.0,3,24,3.0,...,6640929.0,3.0,0.0,,0.0,3.0,24.0,20.0,1.0,0.0
2,Akron Village,2846.0,3,0.0,,0,0.0,3,16,1.0,...,8099716.0,3.0,0.0,,0.0,3.0,16.0,15.0,0.0,0.0
3,Albany,97956.0,791,8.0,,30,227.0,526,4090,705.0,...,9595377936.0,791.0,8.0,,30.0,526.0,4090.0,3243.0,142.0,
4,Albion Village,6388.0,23,0.0,,3,4.0,16,223,53.0,...,40806544.0,23.0,0.0,,3.0,16.0,223.0,165.0,5.0,


In [8]:
# Create new data frame with only relevant columns.
df_fbi = df[['Population', 'Population^2', 'Violent_Crime', 'Murder', 'Rape_1', 'Rape_2',
             'Property_Crime', 'Larceny', 'Aggravated_Assault', 'Motor_Vehicle', 'Arson']]

In [9]:
df_fbi.tail()

Unnamed: 0,Population,Population^2,Violent_Crime,Murder,Rape_1,Rape_2,Property_Crime,Larceny,Aggravated_Assault,Motor_Vehicle,Arson
346,199134.0,39654349956.0,1036.0,6.0,,25.0,2368.0,1662.0,615.0,236.0,10.0
347,36643.0,1342709449.0,15.0,0.0,,0.0,334.0,287.0,13.0,2.0,
348,,,,,,,,,,,
349,,,,,,,,,,,
350,,,,,,,,,,,


In [10]:
# Eliminate the last three lines of the dataframe.
df_fbi = df_fbi[:348]
df_fbi.tail()

Unnamed: 0,Population,Population^2,Violent_Crime,Murder,Rape_1,Rape_2,Property_Crime,Larceny,Aggravated_Assault,Motor_Vehicle,Arson
343,10685.0,114169225.0,3.0,0.0,,0.0,541.0,529.0,1.0,3.0,
344,829.0,687241.0,7.0,0.0,,0.0,17.0,9.0,7.0,0.0,0.0
345,5931.0,35176761.0,2.0,0.0,,0.0,58.0,45.0,2.0,0.0,
346,199134.0,39654349956.0,1036.0,6.0,,25.0,2368.0,1662.0,615.0,236.0,10.0
347,36643.0,1342709449.0,15.0,0.0,,0.0,334.0,287.0,13.0,2.0,


In [11]:
# Change NaN values to 0.
df_fbi.dropna(inplace=True)
df_fbi.head()

Unnamed: 0,Population,Population^2,Violent_Crime,Murder,Rape_1,Rape_2,Property_Crime,Larceny,Aggravated_Assault,Motor_Vehicle,Arson


In [12]:
# Show null column counts.
null_columns=df_fbi.columns[df_fbi.isnull().any()]
df_fbi[null_columns].isnull().sum()

Series([], dtype: float64)

In [13]:
# Eliminate two columns from dataframe.
df_fbi = df[['Population', 'Violent_Crime', 'Murder', 'Rape_2',
             'Property_Crime', 'Larceny', 'Aggravated_Assault', 'Motor_Vehicle']]

In [14]:
df_fbi.head()

Unnamed: 0,Population,Violent_Crime,Murder,Rape_2,Property_Crime,Larceny,Aggravated_Assault,Motor_Vehicle
0,1861.0,0.0,0.0,0.0,12.0,10.0,0.0,0.0
1,2577.0,3.0,0.0,0.0,24.0,20.0,3.0,1.0
2,2846.0,3.0,0.0,0.0,16.0,15.0,3.0,0.0
3,97956.0,791.0,8.0,30.0,4090.0,3243.0,526.0,142.0
4,6388.0,23.0,0.0,3.0,223.0,165.0,16.0,5.0


In [15]:
null_columns=df_fbi.columns[df_fbi.isnull().any()]
df_fbi[null_columns].isnull().sum()

Population            3
Violent_Crime         3
Murder                3
Rape_2                3
Property_Crime        3
Larceny               3
Aggravated_Assault    3
Motor_Vehicle         3
dtype: int64

In [16]:
df_fbi.tail()

Unnamed: 0,Population,Violent_Crime,Murder,Rape_2,Property_Crime,Larceny,Aggravated_Assault,Motor_Vehicle
346,199134.0,1036.0,6.0,25.0,2368.0,1662.0,615.0,236.0
347,36643.0,15.0,0.0,0.0,334.0,287.0,13.0,2.0
348,,,,,,,,
349,,,,,,,,
350,,,,,,,,


In [17]:
# Eliminate the last three lines of the dataframe.
df_fbi = df_fbi[:348]
df_fbi.tail()

Unnamed: 0,Population,Violent_Crime,Murder,Rape_2,Property_Crime,Larceny,Aggravated_Assault,Motor_Vehicle
343,10685.0,3.0,0.0,0.0,541.0,529.0,1.0,3.0
344,829.0,7.0,0.0,0.0,17.0,9.0,7.0,0.0
345,5931.0,2.0,0.0,0.0,58.0,45.0,2.0,0.0
346,199134.0,1036.0,6.0,25.0,2368.0,1662.0,615.0,236.0
347,36643.0,15.0,0.0,0.0,334.0,287.0,13.0,2.0


In [18]:
null_columns=df_fbi.columns[df_fbi.isnull().any()]
df_fbi[null_columns].isnull().sum()

Series([], dtype: float64)

In [19]:
# Set up timer.
from datetime import datetime
start_time = datetime.now()

# Move on to set up decision tree.
from sklearn import tree
from IPython.display import Image
import pydotplus
import graphviz

# Set up variables.
X = pd.DataFrame()
X = df_fbi.drop('Property_Crime', axis=1)
Y = pd.DataFrame()
Y = df_fbi['Property_Crime']

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor()
decision_tree.fit(X, Y)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:00.028729


In [20]:
cross_val_score(decision_tree, X, Y, cv=10)

array([ 0.91438614,  0.95846809,  0.98517069,  0.98831974,  0.97899869,
        0.90102349,  0.13945986,  0.86636733,  0.8379506 , -0.14629508])

In [21]:
start_time = datetime.now()

from sklearn import ensemble
rfc = ensemble.RandomForestRegressor()
rfc.fit(X, Y)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:00.042028


In [22]:
cross_val_score(rfc, X, Y, cv=10)

array([0.90150544, 0.91327775, 0.99019214, 0.99055202, 0.8100562 ,
       0.96556707, 0.12490706, 0.99626229, 0.80914011, 0.35389913])