# **Challenge: Validating a linear regression**

In [944]:
import math
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets, linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import cross_validation
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

In [945]:
df = pd.read_csv('ny_crime.csv')

df = df.fillna(0)

df.columns = ['index', 'city', 'population', 'violent_crime', 'murder', 'rape_1', 'rape_2', 'robbery', 'aggravated_assault', 'property_crime', 'burglary', 'larceny-theft', 'motor_vehicle_theft', 'arson', 'population_sq', 'murder_cat', 'robbery_cat']

df = df.drop('index', axis=1)

print(df.dtypes)

df.head()

city                    object
population             float64
violent_crime           object
murder                 float64
rape_1                 float64
rape_2                  object
robbery                float64
aggravated_assault      object
property_crime         float64
burglary                object
larceny-theft           object
motor_vehicle_theft     object
arson                  float64
population_sq          float64
murder_cat               int64
robbery_cat              int64
dtype: object


Unnamed: 0,city,population,violent_crime,murder,rape_1,rape_2,robbery,aggravated_assault,property_crime,burglary,larceny-theft,motor_vehicle_theft,arson,population_sq,murder_cat,robbery_cat
0,Adams Village,1861.0,0,0.0,0.0,0,0.0,0,12.0,2,10,0,0.0,3463321.0,0,0
1,Addison Town and Village,2577.0,3,0.0,0.0,0,0.0,3,24.0,3,20,1,0.0,6640929.0,0,0
2,Akron Village,2846.0,3,0.0,0.0,0,0.0,3,16.0,1,15,0,0.0,8099716.0,0,0
3,Albany,97956.0,791,8.0,0.0,30,227.0,526,4090.0,705,3243,142,0.0,9595377936.0,1,1
4,Albion Village,6388.0,23,0.0,0.0,3,4.0,16,223.0,53,165,5,0.0,40806544.0,0,1


In [946]:
data = df[['population', 'population_sq', 'murder_cat', 'robbery_cat', 'property_crime']].dropna()
data.head()

Unnamed: 0,population,population_sq,murder_cat,robbery_cat,property_crime
0,1861.0,3463321.0,0,0,12.0
1,2577.0,6640929.0,0,0,24.0
2,2846.0,8099716.0,0,0,16.0
3,97956.0,9595377936.0,1,1,4090.0
4,6388.0,40806544.0,0,1,223.0


# **Initial Model: Simple Linear Regression**

In [947]:
regr = linear_model.LinearRegression()
X = df[['population', 'population_sq', 'murder_cat', 'robbery_cat']]
y = df['property_crime']
regr.fit(X, y)

print('\nCoefficients:\n', regr.coef_)
print('\nIntercept:\n', regr.intercept_)
print('\nR-squared:\n', regr.score(X, y))


Coefficients:
 [  1.40081564e-02   1.13436742e-07   8.42743726e+01   9.99603923e+01]

Intercept:
 -20.6331275019

R-squared:
 0.842167048595


# **Cross Validation**

In [948]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

scores = cross_val_score(regr, X_train, y_train, cv=3)

print('Accuracy:\n', regr.score(X_test, y_test))
print('\nCross Validation Scores:\n', regr.coef_)
print('\nMean:\n', regr.intercept_)
print('\nStandard Deviation:\n', regr.score(X, Y))
print('\nRMSE:\n', np.sqrt(mean_squared_error(y_test, y_pred)))

Accuracy:
 0.74814561224

Cross Validation Scores:
 [  1.31341133e-02   1.16683085e-07   3.91563824e+01   1.07077750e+02]

Mean:
 -14.5082632531

Standard Deviation:
 0.841661122549

RMSE:
 310.793190468


# **Ordinary Least Squares Regression**

## **Iteration 1**

In [949]:
linear_formula = 'property_crime ~ population+population_sq+murder_cat+robbery_cat'

lm = smf.ols(formula=linear_formula, data=data).fit()

print('Parameters:\n', lm.params)
print('\nP-values:\n', lm.pvalues)
print('\nR-squared:\n', lm.rsquared)
print('\nConfidence Interval:')
lm.conf_int()

Parameters:
 Intercept       -20.633
population        0.014
population_sq     0.000
murder_cat       84.274
robbery_cat      99.960
dtype: float64

P-values:
 Intercept       0.569
population      0.000
population_sq   0.000
murder_cat      0.272
robbery_cat     0.054
dtype: float64

R-squared:
 0.842167048595

Confidence Interval:


Unnamed: 0,0,1
Intercept,-91.739,50.473
population,0.01,0.018
population_sq,0.0,0.0
murder_cat,-66.512,235.06
robbery_cat,-1.6,201.521


## **Iteration 2**

In [950]:
linear_formula = 'property_crime ~ population+murder_cat+robbery_cat'

lm = smf.ols(formula=linear_formula, data=data).fit()

print('Parameters:\n', lm.params)
print('\nP-values:\n', lm.pvalues)
print('\nR-squared:\n', lm.rsquared)
print('\nConfidence Interval:\n')
lm.conf_int()

Parameters:
 Intercept     -109.887
population       0.035
murder_cat    -192.262
robbery_cat    -67.261
dtype: float64

P-values:
 Intercept     0.007
population    0.000
murder_cat    0.020
robbery_cat   0.232
dtype: float64

R-squared:
 0.792267884045

Confidence Interval:



Unnamed: 0,0,1
Intercept,-189.021,-30.754
population,0.033,0.037
murder_cat,-354.301,-30.223
robbery_cat,-177.835,43.313


## **Iteration 3**

In [951]:
linear_formula = 'property_crime ~ population+robbery_cat'

lm = smf.ols(formula=linear_formula, data=data).fit()

print('Parameters:\n', lm.params)
print('\nP-values:\n', lm.pvalues)
print('\nR-squared:\n', lm.rsquared)
print('\nConfidence Interval:\n')
lm.conf_int()

Parameters:
 Intercept     -107.909
population       0.034
robbery_cat    -93.411
dtype: float64

P-values:
 Intercept     0.008
population    0.000
robbery_cat   0.093
dtype: float64

R-squared:
 0.788979001538

Confidence Interval:



Unnamed: 0,0,1
Intercept,-187.533,-28.286
population,0.032,0.036
robbery_cat,-202.461,15.639


# **Updated Model**

## **Remove population_sq feature**

In [952]:
regr = linear_model.LinearRegression()
X = df[['population', 'murder_cat', 'robbery_cat']]
y = df['property_crime']
regr.fit(X, y)

print('Intercept:\n', regr.coef_)
print('\nR-squared:\n', regr.coef_)
print('\nCoefficients:\n', regr.score(X, y))

Intercept:
 [  3.52053648e-02  -1.92261770e+02  -6.72605959e+01]

R-squared:
 [  3.52053648e-02  -1.92261770e+02  -6.72605959e+01]

Coefficients:
 0.792267884045


In [953]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

scores = cross_val_score(regr, X_train, y_train, cv=3)

print('Accuracy:\n', regr.score(X_test, y_test))
print('\nCross Validation Scores:\n', regr.coef_)
print('\nMean:\n', regr.intercept_)
print('\nStandard Deviation:\n', regr.score(X, y))
print('\nRMSE:\n', np.sqrt(mean_squared_error(y_test, y_pred)))

Accuracy:
 0.648811053746

Cross Validation Scores:
 [  3.61202828e-02  -2.85028829e+02  -7.28313204e+01]

Mean:
 -104.803925707

Standard Deviation:
 0.791285668056

RMSE:
 367.000920053


## **Remove population_sq and murder features**

In [954]:
regr = linear_model.LinearRegression()
X = df[['population', 'robbery_cat']]
y = df['property_crime']
regr.fit(X, y)

print('Intercept:\n', regr.coef_)
print('\nR-squared:\n', regr.coef_)
print('\nCoefficients:\n', regr.score(X, y))

Intercept:
 [  3.44661269e-02  -9.34108858e+01]

R-squared:
 [  3.44661269e-02  -9.34108858e+01]

Coefficients:
 0.788979001538


In [955]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

scores = cross_val_score(regr, X_train, y_train, cv=3)

print('Accuracy:\n', regr.score(X_test, y_test))
print('\nCross Validation Scores:\n', regr.coef_)
print('\nMean:\n', regr.intercept_)
print('\nStandard Deviation:\n', regr.score(X, y))
print('\nRMSE:\n', np.sqrt(mean_squared_error(y_test, y_pred)))

Accuracy:
 0.692031352078

Cross Validation Scores:
 [  3.51167136e-02  -1.10740237e+02]

Mean:
 -103.121658149

Standard Deviation:
 0.788696361995

RMSE:
 343.676618985


# **Validating Model with 2014 NY Crime Dataset**

In [956]:
df_2014 = pd.read_csv('Table_8_Offenses_Known_to_Law_Enforcement_by_New_York_by_City_2014.csv')

df_2014.columns = ['city', 'population', 'violent_crime', 'murder', 'rape_1', 'rape_2', 'robbery', 'aggravated_assault', 'property_crime', 'burglary', 'larceny-theft', 'motor_vehicle_theft', 'arson']

print(df_2014.dtypes)

df_2014.head(100)

city                    object
population              object
violent_crime           object
murder                 float64
rape_1                  object
rape_2                 float64
robbery                 object
aggravated_assault      object
property_crime          object
burglary                object
larceny-theft           object
motor_vehicle_theft     object
arson                  float64
dtype: object


Unnamed: 0,city,population,violent_crime,murder,rape_1,rape_2,robbery,aggravated_assault,property_crime,burglary,larceny-theft,motor_vehicle_theft,arson
0,Adams Village,1851,0,0.000,,0.000,0,0,11,1,10,0,0.000
1,Addison Town and Village,2568,2,0.000,,0.000,1,1,49,1,47,1,0.000
2,Afton Village4,820,0,0.000,0,,0,0,1,0,1,0,0.000
3,Akron Village,2842,1,0.000,,0.000,0,1,17,0,17,0,0.000
4,Albany4,98595,802,8.000,54,,237,503,3888,683,3083,122,12.000
5,Albion Village4,5872,26,0.000,3,,2,21,204,41,159,4,0.000
6,Alexandria Bay Village4,1107,0,0.000,0,,0,0,7,2,5,0,0.000
7,Alfred Village4,4032,11,1.000,1,,0,9,30,6,24,0,0.000
8,Altamont Village4,1723,1,0.000,0,,0,1,2,2,0,0,0.000
9,Amherst Town4,118860,128,1.000,16,,43,68,2066,176,1846,44,2.000


In [957]:
df_2014 = df_2014.fillna('0')

# Remove commas from numeric strings
df_2014['population'] = df_2014['population'].map(lambda x: int(x.replace(',', '')))

df_2014['murder'] = df_2014['murder'].map(lambda x: int(x))

df_2014['robbery'] = df_2014['robbery'].map(lambda x: int(x.replace(',', '')))

df_2014['property_crime'] = df_2014['property_crime'].map(lambda x: int(x.replace(',', '')))

print('df_2014.dtypes: ', df_2014.dtypes)

df_2014.dtypes:  city                   object
population              int64
violent_crime          object
murder                  int64
rape_1                 object
rape_2                 object
robbery                 int64
aggravated_assault     object
property_crime          int64
burglary               object
larceny-theft          object
motor_vehicle_theft    object
arson                  object
dtype: object


In [958]:
df_2014 = df_2014.fillna(0)

# Filter out any outliers over two standard deviations above the mean
pop_cutoff = df_2014['population'].mean() + 2*df_2014['population'].std()
mur_cutoff = df_2014['murder'].mean() + 2*df_2014['murder'].std()
rob_cutoff = df_2014['robbery'].mean() + 2*df_2014['robbery'].std()
prop_cutoff = df_2014['property_crime'].mean() + 2*df_2014['property_crime'].std()

df_2014['population'] = df_2014['population'].map(lambda x: x if x < pop_cutoff else None)
df_2014['murder'] = df_2014['murder'].map(lambda x: x if x < mur_cutoff else None)
df_2014['robbery'] = df_2014['robbery'].map(lambda x: x if x < rob_cutoff else None)
df_2014['property_crime'] = df_2014['property_crime'].map(lambda x: x if x < prop_cutoff else None)

df_2014.describe()

Unnamed: 0,population,murder,robbery,property_crime
count,370.0,369.0,370.0,370.0
mean,14810.086,0.388,15.684,327.703
std,26519.851,1.982,84.386,954.565
min,0.0,0.0,0.0,0.0
25%,2603.5,0.0,0.0,24.0
50%,6507.5,0.0,1.0,75.0
75%,15237.25,0.0,4.0,267.5
max,258419.0,27.0,1277.0,12449.0


In [959]:
# Create new feature
df_2014['population_sq'] = df_2014['population']**2

df_2014.head()

Unnamed: 0,city,population,violent_crime,murder,rape_1,rape_2,robbery,aggravated_assault,property_crime,burglary,larceny-theft,motor_vehicle_theft,arson,population_sq
0,Adams Village,1851.0,0,0.0,0,0.0,0.0,0,11.0,1,10,0,0.0,3426201.0
1,Addison Town and Village,2568.0,2,0.0,0,0.0,1.0,1,49.0,1,47,1,0.0,6594624.0
2,Afton Village4,820.0,0,0.0,0,0.0,0.0,0,1.0,0,1,0,0.0,672400.0
3,Akron Village,2842.0,1,0.0,0,0.0,0.0,1,17.0,0,17,0,0.0,8076964.0
4,Albany4,98595.0,802,8.0,54,0.0,237.0,503,3888.0,683,3083,122,12.0,9720974025.0


In [960]:
# Create categorical features
df_2014['murder_cat'] = df_2014['murder'].dropna().map(lambda x: 1 if x > 0 else 0)
df_2014['robbery_cat'] = df_2014['robbery'].dropna().map(lambda x: 1 if x > 0 else 0)

df_2014.head()

Unnamed: 0,city,population,violent_crime,murder,rape_1,rape_2,robbery,aggravated_assault,property_crime,burglary,larceny-theft,motor_vehicle_theft,arson,population_sq,murder_cat,robbery_cat
0,Adams Village,1851.0,0,0.0,0,0.0,0.0,0,11.0,1,10,0,0.0,3426201.0,0.0,0.0
1,Addison Town and Village,2568.0,2,0.0,0,0.0,1.0,1,49.0,1,47,1,0.0,6594624.0,0.0,1.0
2,Afton Village4,820.0,0,0.0,0,0.0,0.0,0,1.0,0,1,0,0.0,672400.0,0.0,0.0
3,Akron Village,2842.0,1,0.0,0,0.0,0.0,1,17.0,0,17,0,0.0,8076964.0,0.0,0.0
4,Albany4,98595.0,802,8.0,54,0.0,237.0,503,3888.0,683,3083,122,12.0,9720974025.0,1.0,1.0


In [961]:
data = df_2014[['population', 'robbery_cat', 'property_crime']].dropna()

linear_formula = 'property_crime ~ population+robbery_cat'

lm = smf.ols(formula=linear_formula, data=data).fit()

y_pred_2014 = lm.predict(data[['population', 'robbery_cat']])

print('RMSE (2014 Data): ', np.sqrt(mean_squared_error(data['property_crime'], y_pred_2014)))

RMSE (2014 Data):  447.451763791
