In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# CD to project code directory
import os
os.chdir('/content/drive/MyDrive/Group_Project/Project_Code')

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from geopy import distance
import math

pd.set_option('display.max_columns', 3000)
pd.set_option('display.max_rows', 3000)

In [42]:
# import pittsburgh crime data
# documentation: https://data.wprdc.org/dataset/uniform-crime-reporting-data

crime_data = pd.read_csv('/content/drive/MyDrive/Group_Project/PGH_Crime_Data/archive-police-blotter-2006.csv')

# import redfin data
redfin_data = pd.read_csv('/content/drive/MyDrive/Group_Project/Redfin/Pittsburgh/redfin_new.csv')

In [43]:
print(len(redfin_data))
print(len(crime_data))

7420
297683


In [44]:
def clean_crime(crime_data):
  crime_loc = crime_data.copy()
  # Extract year, month from crime
  crime_loc['Year'] = crime_loc.INCIDENTTIME.str.split("-").str[0]
  crime_loc['Month'] = crime_loc.INCIDENTTIME.str.split("-").str[1]
  # Drop unnecessary columns
  crime_loc = crime_loc[['Year', 'Month', 'X', 'Y']]
  # Drop NAs
  crime_loc.dropna(inplace=True)
  # Combine location into one tuple
  crime_loc["Location"] = list(zip(crime_loc.X, crime_loc.Y))
  # Drop X and Y columns
  crime_loc.drop(columns = ['X', 'Y'], inplace = True)
  # Select years to keep - This we will only be looking at data past 2016 so
  # this can be hardcoded here
  crime_loc = crime_loc[crime_loc.Year > '2015']
  # Convert Year to integer
  crime_loc['Year'] = crime_loc['Year'].astype(int)
  # Convert Month to integer
  crime_loc['Month'] = crime_loc['Month'].astype(int)
  # Create datetime column
  crime_loc['dt'] = pd.to_datetime(crime_loc['Year'].astype(str) + crime_loc['Month'].astype(str), format = '%Y%m')
  # Drop old Year and Month Columns
  crime_loc.drop(columns = ['Year', 'Month'], inplace = True)
  # Return df
  return crime_loc


In [45]:
def clean_redfin(redfin_data):
  clean_red = redfin_data.copy()
  # Clean redfin data
  # Only look at single family homes for now
  clean_red = clean_red[clean_red['PROPERTY TYPE'] == 'Single Family Residential']
  # Rename "Location" column to "Neighborhood"
  clean_red['Neighborhood'] = clean_red['LOCATION'].copy()
  # Combine location into one tuple
  clean_red['Location'] = list(zip(clean_red.LONGITUDE, clean_red.LATITUDE))
  # Drop unnecassary columns
  clean_red = clean_red.drop(columns = ['SALE TYPE', 'DAYS ON MARKET', 'STATUS', 'NEXT OPEN HOUSE START TIME', 'NEXT OPEN HOUSE END TIME', 'SOURCE', 'FAVORITE', 'INTERESTED', 'PROPERTY TYPE',
                                      'ADDRESS', 'ZIP OR POSTAL CODE', 'HOA/MONTH', 'MLS#', 'CITY', 'STATE OR PROVINCE', '$/SQUARE FEET', 'LONGITUDE', 'LATITUDE', 'LOCATION'])
  # Eliminate rows missing sold date
  clean_red = clean_red[~clean_red['SOLD DATE'].isna()]
  # Eliminate or impute values with missing square footage values - eliminating for now - can impute later
  clean_red = clean_red[~clean_red['SQUARE FEET'].isna()]
  # Eliminate values without price
  clean_red = clean_red[~clean_red['PRICE'].isna()]
  # Eliminate or impute values without lot size
  clean_red = clean_red[~clean_red['LOT SIZE'].isna()]
  # Eliminate values without # Beds or # Baths (or impute with mode)
  clean_red = clean_red[~clean_red['BEDS'].isna()]
  clean_red = clean_red[~clean_red['BATHS'].isna()]
  # Eliminate values without neighborhood
  clean_red = clean_red[~clean_red['Neighborhood'].isna()]
  # Eliminate values without Year built
  clean_red = clean_red[~clean_red['YEAR BUILT'].isna()]
  # Extract Year Sold
  clean_red['Year'] = clean_red['SOLD DATE'].str.split("-").str[2]
  # Convert Year to integer
  clean_red['Year'] = clean_red['Year'].astype(int)
  # Extract Month Sold - convert to 1-12
  clean_red['Month'] = clean_red['SOLD DATE'].str.split("-").str[0]
  month_dict = {"January": 1, "February" : 2, "March" : 3, "April" : 4, "May" : 5, "June" : 6,
                "July" : 7, "August" : 8, "September" : 9, "October" : 10, "November" : 11, "December" : 12}
  clean_red.replace({'Month' : month_dict}, inplace=  True)
  # Drop Sold Date Column
  clean_red.drop(columns = 'SOLD DATE', inplace = True)
  # Create datetime columns
  clean_red['dt'] = pd.to_datetime(clean_red['Year'].astype(str) + clean_red['Month'].astype(str), format = '%Y%m')
  # Drop old Year and Month Columns
  clean_red.drop(columns = ['Year', 'Month'], inplace = True)
  # Create Crime Score Column
  clean_red['Crime Score'] = 0
  return clean_red

In [46]:
crime = clean_crime(crime_data)
redfin = clean_redfin(redfin_data)

In [47]:
display(crime.head())
display(redfin.head())
print(len(crime))
print(crime.dtypes)
print(redfin.dtypes)
print(crime.isna().sum())
print(len(redfin))

Unnamed: 0,Location,dt
0,"(-80.01233658, 40.44626305)",2016-01-01
1,"(-79.950295, 40.48228989)",2016-01-01
2,"(-80.00096629, 40.47865057)",2016-01-01
3,"(-80.00125059, 40.43891788)",2016-01-01
4,"(-80.00096629, 40.47865057)",2016-01-01


Unnamed: 0,PRICE,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),Neighborhood,Location,dt,Crime Score
2,275000.0,4.0,4.0,1785.0,1306.0,1900.0,https://www.redfin.com/PA/Pittsburgh/45-Greele...,South Side,"(-79.9743145, 40.4229551)",2021-10-01,0
3,302500.0,5.0,2.0,3034.0,2178.0,1900.0,https://www.redfin.com/PA/Pittsburgh/259-45th-...,Lawrenceville,"(-79.9576202, 40.4711406)",2021-06-01,0
4,301000.0,3.0,1.5,1972.0,1306.0,1890.0,https://www.redfin.com/PA/Pittsburgh/431-Taylo...,Bloomfield,"(-79.9523315, 40.4605557)",2021-10-01,0
9,379900.0,2.0,1.5,1226.0,1742.0,1890.0,https://www.redfin.com/PA/Pittsburgh/159-1-2-3...,Lawrenceville,"(-79.9655419, 40.4679636)",2021-09-01,0
11,390000.0,3.0,3.0,1519.0,2178.0,1910.0,https://www.redfin.com/PA/Pittsburgh/4919-Hatf...,Lawrenceville,"(-79.9573142, 40.4780842)",2020-12-01,0


261299
Location            object
dt          datetime64[ns]
dtype: object
PRICE                                                                                                 float64
BEDS                                                                                                  float64
BATHS                                                                                                 float64
SQUARE FEET                                                                                           float64
LOT SIZE                                                                                              float64
YEAR BUILT                                                                                            float64
URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)            object
Neighborhood                                                                                           object
Location                                     

In [55]:
def crimeScore(location1, location2, maxRadius = 0.25):
  d = distance.distance(location1, location2).miles
  score = max(0, 1 - (math.sqrt(d/maxRadius)))
  return score

In [49]:
sub_crime = crime.head(100)
sub_red = redfin.head(5)

In [50]:
display(sub_crime.tail())
display(sub_red)

Unnamed: 0,Location,dt
96,"(-79.93532738, 40.45770893)",2016-01-01
97,"(-80.03895426, 40.48567353)",2016-01-01
98,"(-79.98125718, 40.46574269)",2016-01-01
99,"(-79.99536382, 40.45000615)",2016-01-01
100,"(-80.03552648, 40.47178987)",2016-01-01


Unnamed: 0,PRICE,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),Neighborhood,Location,dt,Crime Score
2,275000.0,4.0,4.0,1785.0,1306.0,1900.0,https://www.redfin.com/PA/Pittsburgh/45-Greele...,South Side,"(-79.9743145, 40.4229551)",2021-10-01,0
3,302500.0,5.0,2.0,3034.0,2178.0,1900.0,https://www.redfin.com/PA/Pittsburgh/259-45th-...,Lawrenceville,"(-79.9576202, 40.4711406)",2021-06-01,0
4,301000.0,3.0,1.5,1972.0,1306.0,1890.0,https://www.redfin.com/PA/Pittsburgh/431-Taylo...,Bloomfield,"(-79.9523315, 40.4605557)",2021-10-01,0
9,379900.0,2.0,1.5,1226.0,1742.0,1890.0,https://www.redfin.com/PA/Pittsburgh/159-1-2-3...,Lawrenceville,"(-79.9655419, 40.4679636)",2021-09-01,0
11,390000.0,3.0,3.0,1519.0,2178.0,1910.0,https://www.redfin.com/PA/Pittsburgh/4919-Hatf...,Lawrenceville,"(-79.9573142, 40.4780842)",2020-12-01,0


In [57]:
def get_Crime_Scores(crime, homes, max_radius = 0.25, timeWindow = 52):
  for i in range(len(homes)):
    # consider only crimes that occured in the timeWindow around when the house was sold (in weeks)
    sub_crimes = crime[abs((crime.dt - homes.dt.iloc[i]) / pd.Timedelta(1, 'W')) < timeWindow]
    homes["Crime Score"].iloc[i] = sum(sub_crimes.Location.apply(lambda x: crimeScore(x, homes.Location.iloc[i], max_radius)))
    print("Score Calculated for Home", i+1)

In [58]:
# This takes about 6.5 hours, but generates a pickle with crime scores with your max radius and time window
get_Crime_Scores(sub_crime, sub_red)
#redfin.to_pickle('Redfin_pickle2')

Score Calculated for Home 1
Score Calculated for Home 2
Score Calculated for Home 3
Score Calculated for Home 4
Score Calculated for Home 5


In [59]:
display(sub_red.head())
print(sub_red['Crime Score'].max())
print(sub_red['Crime Score'].min())

Unnamed: 0,PRICE,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),Neighborhood,Location,dt,Crime Score
2,275000.0,4.0,4.0,1785.0,1306.0,1900.0,https://www.redfin.com/PA/Pittsburgh/45-Greele...,South Side,"(-79.9743145, 40.4229551)",2021-10-01,0.0
3,302500.0,5.0,2.0,3034.0,2178.0,1900.0,https://www.redfin.com/PA/Pittsburgh/259-45th-...,Lawrenceville,"(-79.9576202, 40.4711406)",2021-06-01,0.0
4,301000.0,3.0,1.5,1972.0,1306.0,1890.0,https://www.redfin.com/PA/Pittsburgh/431-Taylo...,Bloomfield,"(-79.9523315, 40.4605557)",2021-10-01,0.0
9,379900.0,2.0,1.5,1226.0,1742.0,1890.0,https://www.redfin.com/PA/Pittsburgh/159-1-2-3...,Lawrenceville,"(-79.9655419, 40.4679636)",2021-09-01,0.0
11,390000.0,3.0,3.0,1519.0,2178.0,1910.0,https://www.redfin.com/PA/Pittsburgh/4919-Hatf...,Lawrenceville,"(-79.9573142, 40.4780842)",2020-12-01,0.0


0.0
0.0


In [None]:
# Create a copy to play with
redfin_cr = redfin.copy()
# Normalize Crime Score between 0-100
redfin_cr['Crime Score'] = ((redfin_cr['Crime Score'] - redfin_cr['Crime Score'].min()) / (redfin_cr['Crime Score'].max() - redfin_cr['Crime Score'].min()))*100 

In [None]:
display(redfin_cr.head(25))

NameError: ignored