In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
from sklearn import preprocessing

In [3]:
# Load the FBI "Table 8" workbook (New York offenses by city, 2013).
# Forward slashes are used throughout: the previous '..\..\' form relied on
# the invalid escape sequence '\.' and only resolved on Windows.
fbi = pd.read_excel('../../Downloads/table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls')

In [4]:
# info about data
fbi.iloc[0:3, 0]

0                             NEW YORK
1    Offenses Known to Law Enforcement
2                        by City, 2013
Name: Table 8, dtype: object

In [5]:
# The first three rows only hold the table title (shown above); discard them.
df = fbi.drop(index=[0, 1, 2])

In [6]:
# The first remaining row carries the real column titles; map each
# placeholder column name to the title stored in that row.
df.rename(columns=df.iloc[0].to_dict(), inplace=True)

In [7]:
# Remove the header row itself (label 3) and renumber the rows from 0.
df = df.drop(index=3).reset_index(drop=True)

In [8]:
df.tail()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
346,Yonkers,199134.0,1036.0,6.0,,25.0,390.0,615.0,2368.0,470.0,1662.0,236.0,10.0
347,Yorktown Town,36643.0,15.0,0.0,,0.0,2.0,13.0,334.0,45.0,287.0,2.0,
348,1 The figures shown in this column for the off...,,,,,,,,,,,,
349,2 The figures shown in this column for the off...,,,,,,,,,,,,
350,3 The FBI does not publish arson data unless i...,,,,,,,,,,,,


In [9]:
# Keep the trailing footnote rows (they annotate the Rape and Arson columns).
note = df.iloc[:, 0].iloc[348:]

In [10]:
# The footnote rows are not city records; remove them from the frame.
df.drop(index=[348, 349, 350], inplace=True)

In [11]:
# 2 'Rape' columns; the count of the first column...
df.iloc[:, 4].count()

0

In [12]:
# The revised-definition Rape column has no non-null values (count above is 0).
df.drop(columns=['Rape\n(revised\ndefinition)1'], inplace=True)

In [13]:
# Replace the multi-line spreadsheet headers with short snake_case names.
df.rename(
    columns={
        'Violent\ncrime': 'violent_crime',
        'Murder and\nnonnegligent\nmanslaughter': 'murder',
        'Rape\n(legacy\ndefinition)2': 'rape',
        'Aggravated\nassault': 'aggrvt_asslt',
        'Property\ncrime': 'property_crime',
        'Larceny-\ntheft': 'larceny_theft',
        'Motor\nvehicle\ntheft': 'motor_veh_theft',
        'Robbery': 'robbery',
        'Arson3': 'arson',
        'Burglary': 'burglary',
    },
    inplace=True,
)

In [14]:
df.columns

Index(['City', 'Population', 'violent_crime', 'murder', 'rape', 'robbery',
       'aggrvt_asslt', 'property_crime', 'burglary', 'larceny_theft',
       'motor_veh_theft', 'arson'],
      dtype='object')

In [15]:
# Squared population, used later as a candidate non-linear predictor.
df['population_sqr'] = df['Population'] * df['Population']

In [16]:
# Count missing values per column.
# Arson data is not shown when a full calendar year was not reported.
# Assumption made in this notebook: a missing arson value is treated as 0.
df.isnull().sum()

City                 0
Population           0
violent_crime        0
murder               0
rape                 0
robbery              0
aggrvt_asslt         0
property_crime       0
burglary             0
larceny_theft        0
motor_veh_theft      0
arson              161
population_sqr       0
dtype: int64

In [17]:
# Apply the assumption stated above: missing arson counts become 0.
df['arson'] = df['arson'].fillna(0)

In [18]:
# Relabel each row with the value of its first column ('City').
df.rename(index=df.iloc[:, 0].to_dict(), inplace=True)

In [19]:
# The city names now live in the index, so the column is redundant.
df.drop(columns='City', inplace=True)

In [20]:
# Keep a copy of df in its raw(ish), unscaled state. It still carries the
# city-name index and is the source for the high/low population split later.
df_o = df.copy()

In [21]:
# describe() reports count/unique/top/freq here rather than mean/std/quantiles,
# which suggests Population is stored with object dtype.
# TODO: cast first for a numeric summary, e.g. pd.to_numeric(df_o.Population).describe()
df_o.Population.describe()

count      348
unique     347
top       9517
freq         2
Name: Population, dtype: int64

In [22]:
# Standardize every column with StandardScaler (zero mean, unit variance).
# This is not min-max scaling.
# NOTE: the frame is rebuilt without index=..., so the city-name index is
# replaced by a default 0..n-1 index (see the head() output below).
df = pd.DataFrame(preprocessing.StandardScaler().fit_transform(df), columns=df.columns)

In [23]:
df.head()

Unnamed: 0,Population,violent_crime,murder,rape,robbery,aggrvt_asslt,property_crime,burglary,larceny_theft,motor_veh_theft,arson,population_sqr
0,-0.084952,-0.071711,-0.085685,-0.097201,-0.07081,-0.071176,-0.102057,-0.127416,-0.098947,-0.089129,-0.127742,-0.053943
1,-0.083359,-0.070644,-0.085685,-0.097201,-0.07081,-0.069415,-0.100488,-0.126333,-0.097368,-0.086647,-0.127742,-0.053942
2,-0.08276,-0.070644,-0.085685,-0.097201,-0.07081,-0.069415,-0.101534,-0.128499,-0.098158,-0.089129,-0.127742,-0.053942
3,0.128882,0.209662,0.352015,0.399994,0.149675,0.237567,0.431104,0.63372,0.411237,0.263365,-0.127742,-0.051401
4,-0.074878,-0.063529,-0.085685,-0.047481,-0.066925,-0.061785,-0.074471,-0.072199,-0.074487,-0.076718,-0.127742,-0.053933


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

In [25]:
# Binarizer with its default threshold of 0.0 (see the repr below). df is
# standardized at this point, so a 1 marks a value above the column mean.
binarizer = preprocessing.Binarizer().fit(df)

In [26]:
binarizer

Binarizer(copy=True, threshold=0.0)

In [27]:
# 0/1 flags for every column of the standardized frame; column labels are
# positional integers until they are renamed in a later cell.
bin_df = pd.DataFrame(binarizer.transform(df))

In [28]:
# Drop the first and last flag columns (the positions of Population and
# population_sqr in df), keeping only the crime-count flags.
# Gotcha: re-running this cell trims two more columns each time.
bin_df = bin_df.iloc[:, 1:-1]

In [29]:
['feat_' + string for string in df.columns.tolist() ]

['feat_Population',
 'feat_violent_crime',
 'feat_murder',
 'feat_rape',
 'feat_robbery',
 'feat_aggrvt_asslt',
 'feat_property_crime',
 'feat_burglary',
 'feat_larceny_theft',
 'feat_motor_veh_theft',
 'feat_arson',
 'feat_population_sqr']

In [30]:
# Label each flag column 'feat_<source column>', skipping the first and last
# columns of df to match the trimmed bin_df.
bin_df.columns = ['feat_' + name for name in df.columns[1:-1]]

In [31]:
# Give bin_df the same row labels as df so the concat below lines up row by row.
bin_df.index = df.index.tolist()

In [32]:
# Append the feat_* flag columns to the standardized frame.
# Gotcha: re-running this cell appends the flag columns a second time.
df = pd.concat([df, bin_df], axis=1)

In [33]:
# Feature subset as a DataFrame. Not referenced again in the visible cells;
# the fit below uses the array `b` instead.
a = df[['Population', 'population_sqr', 'feat_murder', 'feat_robbery']]

In [34]:
# The same four features as a plain array, used as the design matrix for
# the scikit-learn fit.
b = df[['Population', 'population_sqr', 'feat_murder', 'feat_robbery']].values

In [35]:
# Ordinary least-squares regressor with scikit-learn defaults.
regr = LinearRegression()

In [36]:
# Fit standardized property_crime on the four selected features.
regr.fit(b, df.property_crime)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [37]:
regr.coef_

array([ 1.79513607, -0.80583276,  0.00321374,  0.09131733])

In [38]:
regr.intercept_

-0.0046271317990218015

In [39]:
regr.score(b, df.property_crime)

0.9963249434513605

In [40]:
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std


In [41]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# This model includes burglary, larceny_theft and motor_veh_theft, the
# components of property_crime, so a perfect fit is expected.
linear_formula = 'property_crime ~ Population+violent_crime+murder+rape+robbery+aggrvt_asslt+burglary+larceny_theft+motor_veh_theft+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [42]:
df.columns

Index(['Population', 'violent_crime', 'murder', 'rape', 'robbery',
       'aggrvt_asslt', 'property_crime', 'burglary', 'larceny_theft',
       'motor_veh_theft', 'arson', 'population_sqr', 'feat_violent_crime',
       'feat_murder', 'feat_rape', 'feat_robbery', 'feat_aggrvt_asslt',
       'feat_property_crime', 'feat_burglary', 'feat_larceny_theft',
       'feat_motor_veh_theft', 'feat_arson'],
      dtype='object')

In [43]:
lm.params

Intercept         -6.938894e-18
Population         9.853229e-16
violent_crime     -9.436896e-16
murder            -1.942890e-16
rape               1.457168e-15
robbery            4.440892e-15
aggrvt_asslt      -3.441691e-15
burglary           1.207548e-01
larceny_theft      8.284964e-01
motor_veh_theft    5.266819e-02
arson              9.714451e-17
dtype: float64

In [44]:
lm.pvalues

Intercept          0.872286
Population         0.630047
violent_crime      0.232979
murder             0.863249
rape               0.250696
robbery            0.320392
aggrvt_asslt       0.314046
burglary           0.000000
larceny_theft      0.000000
motor_veh_theft    0.000000
arson              0.167849
dtype: float64

In [45]:
lm.rsquared

1.0

In [46]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Same predictors as the previous model minus the three property-crime components.
linear_formula = 'property_crime ~ Population+violent_crime+murder+rape+robbery+aggrvt_asslt+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [47]:
lm.params

Intercept       -6.938894e-18
Population       6.514759e-01
violent_crime   -9.225959e-02
murder           7.417341e-02
rape             5.145890e-01
robbery          6.261875e-02
aggrvt_asslt    -2.090984e-01
arson            1.146454e-02
dtype: float64

In [48]:
lm.pvalues

Intercept        1.000000e+00
Population       5.159585e-31
violent_crime    1.657251e-03
murder           8.402272e-02
rape             8.002374e-41
robbery          6.564893e-01
aggrvt_asslt     7.279348e-02
arson            5.462439e-06
dtype: float64

In [49]:
lm.rsquared

0.9990612340053507

In [50]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Reduced model: Population, violent_crime, rape and arson only.
# (A stray doubled '+' before arson was removed; the fitted terms are unchanged.)
linear_formula = 'property_crime ~ Population+violent_crime+rape+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [51]:
lm.params

Intercept       -6.938894e-18
Population       5.780638e-01
violent_crime   -1.461078e-01
rape             5.689848e-01
arson            1.559857e-02
dtype: float64

In [52]:
lm.pvalues

Intercept        1.000000e+00
Population       4.317059e-31
violent_crime    1.787605e-02
rape             7.546606e-75
arson            5.331993e-14
dtype: float64

In [53]:
lm.rsquared

0.9990358727248779

In [133]:
df_o.head()

Unnamed: 0,Population,violent_crime,murder,rape,robbery,aggrvt_asslt,property_crime,burglary,larceny_theft,motor_veh_theft,arson,population_sqr
Adams Village,1861,0,0,0,0,0,12,2,10,0,0,3463321
Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0,6640929
Akron Village,2846,3,0,0,0,3,16,1,15,0,0,8099716
Albany,97956,791,8,30,227,526,4090,705,3243,142,0,9595377936
Albion Village,6388,23,0,3,4,16,223,53,165,5,0,40806544


In [55]:
# Note: 'property_crime' = 'burglary' + 'larceny_theft' + 'motor_veh_theft', so any model that includes all three components fits exactly (R^2 = 1.0).

In [56]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Reduced model: Population, violent_crime and arson.
linear_formula = 'property_crime ~ Population+violent_crime+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [57]:
lm.params

Intercept       -6.938894e-18
Population      -1.784884e-02
violent_crime    1.014425e+00
arson            3.596888e-02
dtype: float64

In [58]:
lm.pvalues

Intercept        1.000000e+00
Population       7.699446e-01
violent_crime    5.712103e-46
arson            4.140757e-29
dtype: float64

In [59]:
lm.rsquared

0.9974357426119952

In [60]:
df.columns

Index(['Population', 'violent_crime', 'murder', 'rape', 'robbery',
       'aggrvt_asslt', 'property_crime', 'burglary', 'larceny_theft',
       'motor_veh_theft', 'arson', 'population_sqr', 'feat_violent_crime',
       'feat_murder', 'feat_rape', 'feat_robbery', 'feat_aggrvt_asslt',
       'feat_property_crime', 'feat_burglary', 'feat_larceny_theft',
       'feat_motor_veh_theft', 'feat_arson'],
      dtype='object')

In [61]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Uses the individual violent-crime categories instead of the violent_crime total.
linear_formula = 'property_crime ~ Population+murder+robbery+rape+aggrvt_asslt+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [62]:
lm.params

Intercept      -6.938894e-18
Population      6.514759e-01
murder          7.357358e-02
robbery         2.883062e-02
rape            5.126088e-01
aggrvt_asslt   -2.650103e-01
arson           1.146454e-02
dtype: float64

In [63]:
lm.pvalues

Intercept       1.000000e+00
Population      5.159585e-31
murder          8.657733e-02
robbery         8.320834e-01
rape            6.624211e-41
aggrvt_asslt    4.259993e-02
arson           5.462439e-06
dtype: float64

In [64]:
lm.rsquared

0.9990612340053507

In [65]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Previous model with robbery removed.
linear_formula = 'property_crime ~ Population+murder+rape+aggrvt_asslt+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [66]:
lm.params

Intercept      -6.938894e-18
Population      6.525343e-01
murder          7.970755e-02
rape            5.104439e-01
aggrvt_asslt   -2.411672e-01
arson           1.137669e-02
dtype: float64

In [67]:
lm.pvalues

Intercept       1.000000e+00
Population      2.058682e-31
murder          1.191871e-02
rape            7.808686e-44
aggrvt_asslt    2.806406e-04
arson           4.627237e-06
dtype: float64

In [68]:
lm.rsquared

0.9990611100502965

In [69]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Previous model with murder removed.
linear_formula = 'property_crime ~ Population+rape+aggrvt_asslt+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [70]:
lm.params

Intercept      -6.938894e-18
Population      6.094669e-01
rape            5.698883e-01
aggrvt_asslt   -1.783775e-01
arson           1.494029e-02
dtype: float64

In [71]:
lm.pvalues

Intercept       1.000000e+00
Population      1.135951e-30
rape            1.966479e-85
aggrvt_asslt    3.853302e-03
arson           8.898150e-13
dtype: float64

In [72]:
lm.rsquared

0.9990435644998936

In [73]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Swaps rape, aggrvt_asslt and arson for their binarized feat_* flags.
linear_formula = 'property_crime ~ Population+feat_rape+feat_aggrvt_asslt+feat_arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [74]:
lm.params

Intercept           -0.015682
Population           0.980395
feat_rape            0.062050
feat_aggrvt_asslt    0.208530
feat_arson           0.011090
dtype: float64

In [75]:
lm.pvalues

Intercept            7.826619e-05
Population           0.000000e+00
feat_rape            1.400454e-04
feat_aggrvt_asslt    7.059753e-17
feat_arson           4.094333e-01
dtype: float64

In [76]:
lm.rsquared

0.9954596204126502

In [77]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Mixed model: continuous aggrvt_asslt with binarized rape and arson flags.
linear_formula = 'property_crime ~ Population+feat_rape+aggrvt_asslt+feat_arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [78]:
lm.params

Intercept      -0.015064
Population     -0.213847
feat_rape       0.115571
aggrvt_asslt    1.204311
feat_arson      0.049809
dtype: float64

In [79]:
lm.pvalues

Intercept       5.051545e-06
Population      3.732283e-03
feat_rape       3.789301e-22
aggrvt_asslt    3.947232e-45
feat_arson      8.042313e-06
dtype: float64

In [80]:
lm.rsquared

0.9968851538788961

In [81]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Three-predictor model: Population, rape and arson.
linear_formula = 'property_crime ~ Population+rape+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [82]:
lm.params

Intercept    -6.938894e-18
Population    4.766518e-01
rape          5.240285e-01
arson         1.628259e-02
dtype: float64

In [83]:
lm.pvalues

Intercept      1.000000e+00
Population    1.787074e-107
rape          6.123692e-118
arson          4.007607e-15
dtype: float64

In [84]:
lm.rsquared

0.9990199557423033

In [85]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Two-predictor model on the full dataset: Population and rape.
linear_formula = 'property_crime ~ Population+rape'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df).fit()

In [86]:
lm.params

Intercept    -6.938894e-18
Population    4.147150e-01
rape          5.869271e-01
dtype: float64

In [87]:
lm.pvalues

Intercept      1.000000e+00
Population     1.606270e-99
rape          1.293107e-140
dtype: float64

In [88]:
lm.rsquared

0.9988271358839578

In [89]:
np.percentile(df_o.Population, 90)

35034.10000000002

In [90]:
# Cities above the 90th percentile of population. The cut-off is computed
# from the data instead of pasting the value printed by the previous cell,
# so the split stays correct if the source table changes.
df_high = df_o[df_o['Population'] > np.percentile(df_o.Population, 90)]

In [91]:
# Standardize the high-population subset on its own (zero mean, unit variance
# within the subset). The city-name index is not carried over.
df_high_sc = pd.DataFrame(preprocessing.StandardScaler().fit_transform(df_high), columns=df_high.columns)

In [92]:
# Cities at or below the 90th percentile of population. The cut-off is
# computed from the data instead of a pasted literal.
df_low = df_o[df_o['Population'] <= np.percentile(df_o.Population, 90)]

In [93]:
# Binarize the low-population subset with the same threshold-0.0 binarizer.
# NOTE(review): df_low comes from df_o, the unscaled copy, so these flags mark
# non-zero values. The feat_* columns on the full dataset were built from
# standardized data (above-mean flags). Confirm the difference is intended.
bin_df_low = pd.DataFrame(binarizer.transform(df_low))

In [94]:
# Drop the first and last flag columns (Population and population_sqr positions).
# Gotcha: re-running this cell trims two more columns each time.
bin_df_low = bin_df_low.iloc[:, 1:-1]

In [95]:
# Name the flag columns after their source columns, reuse df_low's row
# labels, then append the flags to df_low.
bin_df_low.columns = ['feat_' + name for name in df_low.columns[1:-1]]
bin_df_low.index = df_low.index.tolist()
df_low = pd.concat(objs=[df_low, bin_df_low], axis=1)

In [96]:
# Standardize the low-population subset, including its feat_* flag columns.
# The city-name index is not carried over.
df_low_sc = pd.DataFrame(preprocessing.StandardScaler().fit_transform(df_low), columns=df_low.columns)

In [97]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# High-population subset: all non-component predictors plus population_sqr.
linear_formula = 'property_crime ~ Population+population_sqr+violent_crime+murder+rape+robbery+aggrvt_asslt+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_high_sc).fit()

In [98]:
lm.params

Intercept         0.000000
Population        0.167926
population_sqr    0.118076
violent_crime     0.079917
murder           -0.044445
rape              0.529983
robbery           0.091992
aggrvt_asslt      0.058129
arson             0.014183
dtype: float64

In [99]:
lm.pvalues

Intercept         1.000000
Population        0.497767
population_sqr    0.553956
violent_crime     0.457869
murder            0.747818
rape              0.000422
robbery           0.815239
aggrvt_asslt      0.867972
arson             0.035412
dtype: float64

In [100]:
lm.rsquared

0.999562998794713

In [101]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# High-population subset: rape and arson only.
linear_formula = 'property_crime ~ rape+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_high_sc).fit()

In [102]:
lm.params

Intercept    0.000000
rape         0.998810
arson       -0.018582
dtype: float64

In [103]:
lm.pvalues

Intercept    1.000000e+00
rape         1.233244e-41
arson        6.980326e-02
dtype: float64

In [104]:
lm.rsquared

0.9968632972321297

In [105]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# High-population subset: rape as the single predictor.
linear_formula = 'property_crime ~ rape'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_high_sc).fit()

In [106]:
lm.params

Intercept    0.000000
rape         0.998258
dtype: float64

In [107]:
lm.pvalues

Intercept    1.000000e+00
rape         3.798553e-42
dtype: float64

In [108]:
lm.rsquared

0.9965183225864596

In [109]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Sanity check on the high-population subset: the three components of
# property_crime should reproduce it exactly.
linear_formula = 'property_crime ~ burglary+larceny_theft+motor_veh_theft'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_high_sc).fit()

In [110]:
lm.params

Intercept          0.000000
burglary           0.118788
larceny_theft      0.830255
motor_veh_theft    0.052700
dtype: float64

In [111]:
lm.pvalues

Intercept          1.0
burglary           0.0
larceny_theft      0.0
motor_veh_theft    0.0
dtype: float64

In [112]:
lm.rsquared

1.0

In [113]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Low-population subset: continuous predictors plus their feat_* flags.
linear_formula = 'property_crime ~ Population+population_sqr+murder+rape+robbery+aggrvt_asslt+arson+feat_murder+feat_rape+feat_robbery+feat_aggrvt_asslt+feat_arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_low_sc).fit()

In [114]:
lm.params

Intercept           -2.428613e-17
Population           2.967912e-01
population_sqr       1.490811e-01
murder              -3.042864e-01
rape                 2.371856e-01
robbery              3.360290e-01
aggrvt_asslt         7.351516e-02
arson               -1.371470e-01
feat_murder          1.094812e-01
feat_rape            5.102656e-02
feat_robbery         1.238171e-01
feat_aggrvt_asslt    1.542568e-02
feat_arson           1.411734e-01
dtype: float64

In [115]:
lm.pvalues

Intercept            1.000000
Population           0.012132
population_sqr       0.184903
murder               0.000004
rape                 0.000107
robbery              0.001007
aggrvt_asslt         0.466078
arson                0.011247
feat_murder          0.018447
feat_rape            0.182660
feat_robbery         0.000744
feat_aggrvt_asslt    0.635076
feat_arson           0.001257
dtype: float64

In [116]:
lm.rsquared

0.7659015535110191

In [117]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Previous model without population_sqr, feat_rape and feat_aggrvt_asslt.
linear_formula = 'property_crime ~ Population+murder+rape+robbery+aggrvt_asslt+arson+feat_murder+feat_robbery+feat_arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_low_sc).fit()

In [118]:
lm.params

Intercept      -2.428613e-17
Population      4.529988e-01
murder         -3.078687e-01
rape            2.687916e-01
robbery         3.527748e-01
aggrvt_asslt    5.986408e-02
arson          -1.512766e-01
feat_murder     1.106294e-01
feat_robbery    1.236069e-01
feat_arson      1.540207e-01
dtype: float64

In [119]:
lm.pvalues

Intercept       1.000000e+00
Population      3.355635e-28
murder          2.900981e-06
rape            1.427781e-06
robbery         3.837726e-04
aggrvt_asslt    5.376402e-01
arson           4.691324e-03
feat_murder     1.695274e-02
feat_robbery    2.086612e-04
feat_arson      3.489567e-04
dtype: float64

In [120]:
lm.rsquared

0.7633211964354317

In [121]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Previous model without aggrvt_asslt and feat_murder.
linear_formula = 'property_crime ~ Population+murder+rape+robbery+arson+feat_robbery+feat_arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_low_sc).fit()

In [122]:
lm.params

Intercept      -2.428613e-17
Population      4.683360e-01
murder         -1.964691e-01
rape            3.160374e-01
robbery         3.142882e-01
arson          -1.300601e-01
feat_robbery    1.351136e-01
feat_arson      1.444760e-01
dtype: float64

In [123]:
lm.pvalues

Intercept       1.000000e+00
Population      4.131064e-30
murder          1.738177e-05
rape            8.285596e-13
robbery         7.363572e-06
arson           1.376763e-02
feat_robbery    4.720863e-05
feat_arson      8.013394e-04
dtype: float64

In [124]:
lm.rsquared

0.7585996746579508

In [125]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Low-population subset: continuous predictors only (no feat_* flags).
linear_formula = 'property_crime ~ Population+murder+rape+robbery+arson'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_low_sc).fit()

In [126]:
lm.params

Intercept    -2.428613e-17
Population    5.568189e-01
murder       -1.824415e-01
rape          3.518824e-01
robbery       2.206947e-01
arson         2.799059e-03
dtype: float64

In [127]:
lm.pvalues

Intercept     1.000000e+00
Population    7.105040e-43
murder        1.037257e-04
rape          1.527988e-14
robbery       7.477990e-04
arson         9.404485e-01
dtype: float64

In [128]:
lm.rsquared

0.7341987949748324

In [129]:
# Write out the model formula.
# The dependent variable goes on the LEFT of '~', the independent variables on the right.
# The '~' plays the role of '=' in the functional form.
# Low-population subset: Population and rape only.
linear_formula = 'property_crime ~ Population+rape'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=df_low_sc).fit()

In [130]:
lm.params

Intercept    -2.428613e-17
Population    5.766759e-01
rape          4.064297e-01
dtype: float64

In [131]:
lm.pvalues

Intercept     1.000000e+00
Population    8.550344e-46
rape          3.880354e-27
dtype: float64

In [132]:
lm.rsquared

0.7182889326145071

#### 'Population' and 'rape' appear to be the most significant predictors of 'property_crime' (not counting the individual categories that make up 'property_crime').
#### Running a regression on the dataset as a whole and on the higher-population cities returns a very high R^2 (above 0.99).
#### R^2 is noticeably lower for the lower-population cities (population of roughly 35,000 or less): about 0.72 to 0.77.
#### Possible next step: create a feature for high/low population.