In [1]:
import pandas as pd
import numpy as np
import locale
from sklearn.linear_model import LinearRegression

In [2]:
# Load the NYC crime table. The file's first column is an unnamed saved row
# index (it otherwise shows up as a stray "Unnamed: 0" data column), so use it
# as the DataFrame index instead of keeping it as data.
df = pd.read_csv("data/nyc crime.csv", index_col=0)

In [3]:
# Preview the first 12 rows of the loaded data.
df.head(12)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0
3,Albany,97956,791,8,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,
5,Alfred Village,4089,5,0,0,3,2,46,10,36,0,
6,Allegany Village,1781,3,0,0,0,3,10,0,10,0,0.0
7,Amherst Town,118296,107,1,7,31,68,2118,204,1882,32,3.0
8,Amityville Village,9519,9,0,2,4,3,210,16,188,6,1.0
9,Amsterdam,18182,30,0,0,12,18,405,99,291,15,0.0


Fit a linear regression model of the following form:

$$ \text{Property crime} = \alpha + \beta_1 \cdot \text{Population} + \beta_2 \cdot \text{Population}^2 + \beta_3 \cdot \text{Murder} + \beta_4 \cdot \text{Robbery} $$

In [4]:
# Set the process-wide locale to US English; this governs locale-aware number
# parsing such as locale.atoi(). The call returns the name of the locale set.
# NOTE(review): raises locale.Error when 'en_US.UTF-8' is not installed on the host.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

'en_US.UTF-8'

In [5]:
# Parse the count columns into integers. These are the columns the notebook
# treats as text (presumably with "," thousands separators — confirm against
# the raw CSV). Stripping the separator and calling pd.to_numeric is
# vectorized, needs no host locale, and also accepts values that are already
# numeric, so re-running this cell does not raise.
# NOTE(review): "Murder and nonnegligent manslaughter" is intentionally left
# untouched, matching the original cell where its conversion was disabled —
# presumably that column is already numeric; confirm with df.dtypes.
for column in ("Population", "Property crime", "Robbery"):
    df[column] = pd.to_numeric(
        df[column].astype(str).str.replace(",", "").str.strip()
    )

In [6]:
# Display the frame to inspect the columns converted in the previous cell.
df

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0
3,Albany,97956,791,8,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,
5,Alfred Village,4089,5,0,0,3,2,46,10,36,0,
6,Allegany Village,1781,3,0,0,0,3,10,0,10,0,0.0
7,Amherst Town,118296,107,1,7,31,68,2118,204,1882,32,3.0
8,Amityville Village,9519,9,0,2,4,3,210,16,188,6,1.0
9,Amsterdam,18182,30,0,0,12,18,405,99,291,15,0.0


In [7]:
# Squared-population term for the quadratic part of the model. Vectorized
# arithmetic replaces the per-row lambda and yields the same values for
# realistic city populations (assuming an int64 column, overflow would only
# occur above roughly 3 billion).
df["Population^2"] = df["Population"] ** 2

In [8]:
# Check that the Population^2 column is now present.
df.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson,Population^2
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0.0,3463321
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0,6640929
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0,8099716
3,Albany,97956,791,8,30,227,526,4090,705,3243,142,,9595377936
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,,40806544


In [11]:
# Predictors: population, its square, murders and robberies.
# Response: property crime.
feature_columns = [
    "Population",
    "Population^2",
    "Murder and nonnegligent manslaughter",
    "Robbery",
]
features = df[feature_columns]
target = df["Property crime"]

In [12]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=20,
)

In [17]:
# Fit a least-squares linear regression on the training split. fit() returns
# the estimator itself, so the trailing expression displays the same repr.
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
# Predict property crime for the held-out test rows.
predictions = model.predict(X_test)

In [24]:
# Show the predicted values.
predictions

array([ 2.64783570e+01,  2.60586893e+02,  9.56425615e+01,  6.24334342e+01,
        1.46111175e+03,  1.52944163e+02,  5.65736947e+02,  1.71274293e+02,
        2.12890034e+02,  3.88303222e+01,  1.16000687e+02,  3.46793438e+02,
        4.08315227e+01,  2.28206151e+01,  8.28309364e+01,  5.37988890e+03,
        3.21741701e+02,  5.65835107e+01,  4.58618971e+02,  1.04678366e+02,
        7.26922579e+01,  5.97498029e+02,  1.63993508e+01,  1.65964005e+01,
        1.75023575e+02,  4.74407925e+01,  7.37683668e+01,  1.18095069e+02,
        6.57850226e+02,  3.37390165e+02,  1.62064373e+02,  1.33354638e+03,
        2.40771323e+02,  1.99289098e+02,  6.50052000e+01,  2.85027487e+03,
        4.34246471e+02,  1.68342585e+02,  5.50163840e+02,  3.71795212e+01,
        1.56156591e+02,  3.64989249e+02,  2.71021959e+01,  1.41651944e+02,
        4.74092881e+02,  5.52588303e+01,  2.91545499e+02,  3.68803422e+01,
        3.90643501e+01,  1.27836345e+02,  5.42332157e+02,  1.46637903e+02,
        5.84195651e+01,  

In [22]:
# Fitted coefficients, one per feature in the order the feature columns were
# selected: Population, Population^2, Murder and nonnegligent manslaughter, Robbery.
model.coef_

array([ 1.81110199e-02, -7.78128531e-08,  8.59971849e+01,  6.46281926e+00])

In [23]:
# Fitted intercept (the alpha term of the model equation).
model.intercept_

-6.350742698679198