In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import neighbors

%matplotlib inline


## Data cleaning

In [2]:
# Read the 2013 New York by-city offenses table (as named by the CSV file).
csv_path = 'table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.csv'
df = (
    pd.read_csv(csv_path)
    # Some header cells contain embedded newlines; flatten them to spaces.
    .rename(columns=lambda name: name.replace('\n', ' '))
    # 'Unnamed: 13' is not used by the analysis (presumably an empty trailing column).
    .drop(columns='Unnamed: 13')
)
df.columns

Index(['City', 'Population', 'Violent crime',
       'Murder and nonnegligent manslaughter', 'Rape (revised definition)1',
       'Rape (legacy definition)2', 'Robbery', 'Aggravated assault',
       'Property crime', 'Burglary', 'Larceny- theft', 'Motor vehicle theft',
       'Arson3'],
      dtype='object')

In [3]:
# Columns whose values contain commas (presumably thousands separators) and so
# need to be converted to numbers explicitly.
fix = [
    'Population', 'Violent crime', 'Rape (legacy definition)2', 'Robbery',
    'Aggravated assault', 'Property crime', 'Burglary', 'Larceny- theft',
    'Motor vehicle theft',
]
for key in fix:
    # Strip the commas, then parse; anything that still is not a number becomes NaN.
    without_commas = df[key].astype(str).str.replace(',', '')
    df[key] = pd.to_numeric(without_commas, errors='coerce')

# Discard rows that are entirely empty, then rows that lack a population figure.
df = df.dropna(axis=0, how='all')
df = df.dropna(subset=['Population'])
# The revised-definition rape column is not used in this analysis.
df = df.drop(columns='Rape (revised definition)1')

In [4]:
# Treat the top decile of 'Property crime' as outliers and remove those rows.
# Note: this rebinds `df`, so re-running the cell trims a further top decile.
cutoff = df['Property crime'].quantile(0.9)
label = df[df['Property crime'] > cutoff]
df = df.drop(index=label.index)
df.shape

(313, 12)

## Feature engineering

In [5]:
# Squared population term.
df['Population^2'] = df['Population'] ** 2
# 0/1 indicators: any murder recorded, any robbery recorded.
df['Murder'] = df['Murder and nonnegligent manslaughter'].gt(0).astype(int)
df['Robbery_feature'] = df['Robbery'].gt(0).astype(int)
# 1 when the city's larceny count is below the mean of the (outlier-trimmed) sample.
larceny = df['Larceny- theft']
df['below_avg_larceny'] = larceny.lt(larceny.mean()).astype(int)

# Independent copy used by the OLS cells.
ndf = df.copy()

## Fitting OLS regression

In [6]:
# Design matrix and target for the scikit-learn OLS fit.
ols_features = ['Population^2', 'Murder', 'Robbery_feature', 'below_avg_larceny']
X = ndf[ols_features]
# Target as an (n, 1) column vector.
Y = ndf['Property crime'].to_numpy().reshape(-1, 1)

# fit() returns the fitted estimator, so construction and fitting can be chained.
regr = linear_model.LinearRegression().fit(X, Y)

# Report the fitted parameters and the in-sample R-squared.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))


Coefficients: 
 [[ 4.89200124e-08  1.34150969e+02  5.16283015e+01 -2.57870486e+02]]

Intercept: 
 [292.35045209]

R-squared:
0.7382864354875093


In [9]:
# 5-fold cross-validation of the OLS model. With no `scoring` argument the
# estimator's own score method is used, so these values are R-squared per fold.
ols_score = cross_val_score(regr, X, Y, cv=5)
ols_cv_mean = ols_score.mean()
ols_cv_spread = ols_score.std() * 2  # two standard deviations across folds
print("Avg. Accuracy: %0.2f (+/- %0.2f)" % (ols_cv_mean, ols_cv_spread))
print("Cross validation results:\n", ols_score)

Avg. Accuracy: 0.66 (+/- 0.29)
Cross validation results:
 [0.79756909 0.68956991 0.72566842 0.38376397 0.70008135]


In [10]:
# The formula interface needs an identifier-safe name, so alias the target column.
ndf['Property_crime'] = ndf['Property crime']

# NOTE(review): this formula uses the linear `Population` term, whereas the
# scikit-learn model above was fit on 'Population^2', so the two fits are not
# the same model. Confirm whether that difference is intended.
predictors = ['Population', 'Murder', 'Robbery_feature', 'below_avg_larceny']
linear_formula = 'Property_crime ~ ' + '+'.join(predictors)
lm = smf.ols(formula=linear_formula, data=ndf).fit()

# Coefficient estimates, their p-values, and the in-sample R-squared.
print("\nParameter coefficients\n", lm.params)
print("\nP-Values\n", lm.pvalues)
print("\nR squared value\n", lm.rsquared)


Parameter coefficients
 Intercept            233.889544
Population             0.005177
Murder               111.839853
Robbery_feature       40.304773
below_avg_larceny   -219.100318
dtype: float64

P-Values
 Intercept            3.278312e-29
Population           3.331712e-12
Murder               1.474027e-07
Robbery_feature      2.401201e-03
below_avg_larceny    2.238071e-34
dtype: float64

R squared value
 0.7647023866080331


## KNN regression

In [20]:
# Two 5-nearest-neighbour regressors on the same features as the OLS model:
# `knn` averages its neighbours uniformly, `knn_w` weights them by inverse distance.
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
knn_w = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
Y = df['Property crime']
X = df[['Population^2', 'Murder', 'Robbery_feature', 'below_avg_larceny']]
# NOTE(review): the features are not scaled. 'Population^2' is a squared count
# while the other three columns are 0/1 indicators, so neighbour distances are
# presumably dominated by 'Population^2' -- consider standardising the features.
knn.fit(X, Y)
# fit() returns the estimator itself, not a score, so its result is not stored.
knn_w.fit(X, Y)

# 5-fold cross-validation. With no `scoring` argument the estimator's own score
# method is used, so the values printed as "Accuracy" are R-squared per fold.
score = cross_val_score(knn, X, Y, cv=5)
score_w = cross_val_score(knn_w, X, Y, cv=5)
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print("Cross validation results:\n", score)
print("R-squared:\n", knn.score(X, Y))

print("\nWeighted Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))
print("Cross validation results:\n", score_w)
print("R-squared:\n", knn_w.score(X, Y))
# An in-sample R-squared of 1.0 is expected with distance weighting: each training
# point is its own zero-distance neighbour and so reproduces its own target.
# Judge overfitting by the cross-validated scores above, not by this value.

Unweighted Accuracy: 0.44 (+/- 0.31)
Cross validation results:
 [0.65004077 0.17397157 0.46542781 0.44492267 0.48970406]
R-squared:
 0.6413457457840361

Weighted Accuracy: 0.34 (+/- 0.39)
Cross validation results:
 [0.61161573 0.01021231 0.38281527 0.42244886 0.28855554]
R-squared:
 1.0


## Comparing results

In [17]:
# Side-by-side summary of the OLS and unweighted KNN results.
# Each model is scored against the frame it was fit on, selected explicitly, so
# the in-sample R-squared does not depend on which cell last rebound X and Y.
summary_features = ['Population^2', 'Murder', 'Robbery_feature', 'below_avg_larceny']

print("OLS results: \n")
print("Avg. Accuracy: %0.2f (+/- %0.2f)" % (ols_score.mean(), ols_score.std() * 2))
print("Cross validation results:\n", ols_score)
print("R-squared: ", regr.score(ndf[summary_features], ndf['Property crime']))

print("\nKNN results:\n")
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print("Cross validation results:\n", score)
print("R-squared:\n", knn.score(df[summary_features], df['Property crime']))

OLS results: 

Avg. Accuracy: 0.66 (+/- 0.29)
Cross validation results:
 [0.79756909 0.68956991 0.72566842 0.38376397 0.70008135]
R-squared:  0.7382864354875093

KNN results:

Unweighted Accuracy: 0.51 (+/- 0.27)
Cross validation results:
 [0.67258691 0.28498522 0.47702886 0.48245067 0.6138753 ]
R-squared:
 0.6126324994915151


OLS had the better results overall, though this comparison was run with the same data for both models. Outliers were cleaned, but observations which affect the error distribution are still in the OLS model, so problems of multivariate normality and heteroscedasticity are not addressed. The assumption here is that if the model is going to be flawed for one, it might as well be flawed for both, to see what works. K was set to 10, but then moved down to 5 for a slightly better model. The KNN model is easier to fit and has fewer assumptions to satisfy, but OLS, when properly prepared, will have better results. This might be because the data still contains outliers, even after data cleaning, which can affect the results. A distance-weighted model was used to try to address this, but it resulted in an extremely overfit model. OLS has the advantage of not being as badly affected by outliers during its regression, while KNN can have more drastic results: if the neighbors are outliers, then that specific prediction region is not generalizable. Because this dataset does not have many observations, it is wise not to set k too high for KNN. A higher k can somewhat counteract the effect of outliers but results in a slightly worse model. The biggest difference I see between the two models has to do with the non-parametric nature of KNN. With no assumptions to satisfy before the model can even begin, it can be easier to start off with this model; however, the complexity of OLS allows for more fine-tuned results if all assumptions are satisfied. The drawback to OLS is that observations have to be either deleted or imputed to run the model, which can take away some of the generalizability of the model.