### Notebook to run ols lag model on housing prices as a funciton of actual and perceived crime

In [4]:
##import packages
import pandas as pd
import numpy as np
import os
from plotnine import *
import scipy.stats as stats 
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

In [42]:
crime_csv = pd.read_csv("../../data/sentiment_and_crime.csv")
crime_csv.head()

Unnamed: 0.1,Unnamed: 0,OFFENSE,Name,YEAR,TotalNeighborhoodCrime,Name_lower,neighborhood,year,average_sentiment,date
0,0,THEFT/OTHER,Shaw,2024,1394,shaw,shaw,2024,5.664683,2024-06-24 18:40:00.000000000
1,1,THEFT/OTHER,Columbia Heights,2024,1545,columbia heights,columbia heights,2024,4.836393,2024-06-11 05:20:00.000000000
2,2,THEFT/OTHER,Capitol Hill,2024,1138,capitol hill,capitol hill,2024,5.838586,2024-06-14 18:40:00.000000000
3,3,THEFT/OTHER,Adams Morgan,2024,686,adams morgan,adams morgan,2024,4.875923,2024-06-14 18:40:00.000000000
4,4,MOTOR VEHICLE THEFT,Anacostia,2024,177,anacostia,anacostia,2024,4.352189,2024-05-11 18:40:00.000000000


In [44]:
home_value_csv = pd.read_csv("../../data/zillow_house_values.csv")
home_value_csv.head()

Unnamed: 0,RegionName,State,City,EndMonth,MedianHomeValue
0,Columbia Heights,DC,Washington,2019-01-31,599525.939267
1,Capitol Hill,DC,Washington,2019-01-31,819029.5454
2,Petworth,DC,Washington,2019-01-31,593648.793579
3,Adams Morgan,DC,Washington,2019-01-31,548296.6294
4,Shaw,DC,Washington,2019-01-31,757805.290735


In [58]:
home_value_csv['Name_lower'] = home_value_csv['RegionName'].str.lower()
home_value_csv['EndMonth'] = pd.to_datetime(home_value_csv['EndMonth'])
home_value_csv['Year'] = home_value_csv["EndMonth"].dt.year

In [62]:
home_value_csv.tail()

Unnamed: 0,RegionName,State,City,EndMonth,MedianHomeValue,Name_lower,Year
518,Adams Morgan,DC,Washington,2024-10-31,581161.059991,adams morgan,2024
519,Shaw,DC,Washington,2024-10-31,779773.578514,shaw,2024
520,Navy Yard,DC,Washington,2024-10-31,669250.473366,navy yard,2024
521,NoMa,DC,Washington,2024-10-31,461919.845801,noma,2024
522,Anacostia,DC,Washington,2024-10-31,387563.3008,anacostia,2024


In [84]:
all_data = crime_csv.merge(home_value_csv, left_on=['Name_lower', 'year'], right_on=['Name_lower', 'Year'])

In [86]:
all_data.tail()

Unnamed: 0.1,Unnamed: 0,OFFENSE,Name,YEAR,TotalNeighborhoodCrime,Name_lower,neighborhood,year,average_sentiment,date,RegionName,State,City,EndMonth,MedianHomeValue,Year
435146,38571,THEFT/OTHER,Capitol Hill,2019,1598,capitol hill,capitol hill,2019,6.5,2019-02-01 00:00:00.000000000,Capitol Hill,DC,Washington,2019-08-31,830874.09159,2019
435147,38571,THEFT/OTHER,Capitol Hill,2019,1598,capitol hill,capitol hill,2019,6.5,2019-02-01 00:00:00.000000000,Capitol Hill,DC,Washington,2019-09-30,830638.3189,2019
435148,38571,THEFT/OTHER,Capitol Hill,2019,1598,capitol hill,capitol hill,2019,6.5,2019-02-01 00:00:00.000000000,Capitol Hill,DC,Washington,2019-10-31,830928.932787,2019
435149,38571,THEFT/OTHER,Capitol Hill,2019,1598,capitol hill,capitol hill,2019,6.5,2019-02-01 00:00:00.000000000,Capitol Hill,DC,Washington,2019-11-30,831961.839447,2019
435150,38571,THEFT/OTHER,Capitol Hill,2019,1598,capitol hill,capitol hill,2019,6.5,2019-02-01 00:00:00.000000000,Capitol Hill,DC,Washington,2019-12-31,833991.363163,2019


In [100]:
yearly_average = all_data.groupby(['year', 'neighborhood'])['MedianHomeValue'].mean().reset_index()

yearly_average.tail()

Unnamed: 0,year,neighborhood,MedianHomeValue
35,2024,columbia heights,638632.69091
36,2024,navy yard,673385.16622
37,2024,noma,474726.304292
38,2024,petworth,672578.069226
39,2024,shaw,786305.343135


In [166]:
all_data_filtered = (all_data.filter(['neighborhood', 'year', 'average_sentiment', 'TotalNeighborhoodCrime',
                                      'MedianHomeValue']))

In [168]:
all_data_filtered.head()

Unnamed: 0,neighborhood,year,average_sentiment,TotalNeighborhoodCrime,MedianHomeValue
0,shaw,2024,5.664683,1394,789753.519493
1,shaw,2024,5.664683,1394,789798.788072
2,shaw,2024,5.664683,1394,791519.112031
3,shaw,2024,5.664683,1394,793128.619088
4,shaw,2024,5.664683,1394,791712.106424


In [170]:
all_data_avg = pd.merge(all_data_filtered, yearly_average, on=['year', 'neighborhood'], suffixes=('', '_avg'))
all_data_avg.head()

Unnamed: 0,neighborhood,year,average_sentiment,TotalNeighborhoodCrime,MedianHomeValue,MedianHomeValue_avg
0,shaw,2024,5.664683,1394,789753.519493,786305.343135
1,shaw,2024,5.664683,1394,789798.788072,786305.343135
2,shaw,2024,5.664683,1394,791519.112031,786305.343135
3,shaw,2024,5.664683,1394,793128.619088,786305.343135
4,shaw,2024,5.664683,1394,791712.106424,786305.343135


In [174]:
all_data_avg = all_data_avg.drop('MedianHomeValue', axis=1)

In [178]:
all_data_avg_no_duplicates = all_data_avg.drop_duplicates().reset_index()
all_data_avg_no_duplicates.head()

Unnamed: 0,index,neighborhood,year,average_sentiment,TotalNeighborhoodCrime,MedianHomeValue_avg
0,0,shaw,2024,5.664683,1394,786305.343135
1,10,columbia heights,2024,4.836393,1545,638632.69091
2,20,capitol hill,2024,5.838586,1138,907238.986881
3,30,adams morgan,2024,4.875923,686,581093.788565
4,40,anacostia,2024,4.352189,177,390788.644231


In [182]:
##create the lagged variables
all_data_avg_no_duplicates['HomeValue_lagged'] = all_data_avg_no_duplicates['MedianHomeValue_avg'].shift(1)
all_data_avg_no_duplicates.head()

Unnamed: 0,index,neighborhood,year,average_sentiment,TotalNeighborhoodCrime,MedianHomeValue_avg,HomeValue_lagged
0,0,shaw,2024,5.664683,1394,786305.343135,
1,10,columbia heights,2024,4.836393,1545,638632.69091,786305.343135
2,20,capitol hill,2024,5.838586,1138,907238.986881,638632.69091
3,30,adams morgan,2024,4.875923,686,581093.788565,907238.986881
4,40,anacostia,2024,4.352189,177,390788.644231,581093.788565
