## Connect to drive / Access directory

In [None]:
# Mount Google Drive inside the Colab VM; the data and code paths used later
# in this notebook all live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Make the project code folder the working directory
import os

project_code_dir = '/content/drive/MyDrive/Group_Project/Project_Code'
os.chdir(project_code_dir)

## Import Packages

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt

# Raise pandas' display limits so wide / long frames are not truncated
# (up to 3000 columns and 3000 rows are rendered).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 3000)

## Load in Datasets

In [None]:
# Pittsburgh crime data (police blotter archive CSV)
# documentation: https://data.wprdc.org/dataset/uniform-crime-reporting-data
crime_csv_path = '/content/drive/MyDrive/Group_Project/PGH_Crime_Data/archive-police-blotter-2006.csv'
crime_data = pd.read_csv(crime_csv_path)

# Redfin export for Pittsburgh
redfin_csv_path = '/content/drive/MyDrive/Group_Project/Redfin/Pittsburgh/redfin_2022-11-03-08-46-51.csv'
redfin_data = pd.read_csv(redfin_csv_path)


In [None]:
# EDA for Pittsburgh crime
# NOTE: DataFrame.size is the total number of cells (rows x columns),
# not the number of rows.
print(crime_data.size, crime_data.dtypes, sep='\n')

4465245
PK                         int64
CCR                      float64
HIERARCHY                  int64
INCIDENTTIME              object
INCIDENTLOCATION          object
CLEAREDFLAG               object
INCIDENTNEIGHBORHOOD      object
INCIDENTZONE              object
INCIDENTHIERARCHYDESC     object
OFFENSES                  object
INCIDENTTRACT            float64
COUNCIL_DISTRICT         float64
PUBLIC_WORKS_DIVISION    float64
X                        float64
Y                        float64
dtype: object


In [None]:
# Preview the first five rows of the raw crime data
display(crime_data.head(n=5))

Unnamed: 0,PK,CCR,HIERARCHY,INCIDENTTIME,INCIDENTLOCATION,CLEAREDFLAG,INCIDENTNEIGHBORHOOD,INCIDENTZONE,INCIDENTHIERARCHYDESC,OFFENSES,INCIDENTTRACT,COUNCIL_DISTRICT,PUBLIC_WORKS_DIVISION,X,Y
0,2802309,16000001.0,10,2016-01-01T00:00:00,"400 Block North Shore DR Pittsburgh, PA 15212",Y,North Shore,1,HARRASSMENT/THREAT/ATTEMPT/PHY,2702 Aggravated Assault. / 2709(a) Harassment....,2205.0,1.0,6.0,-80.012337,40.446263
1,2803174,16004547.0,11,2016-01-01T00:01:00,"5400 Block Carnegie ST Pittsburgh, PA 15201",N,Upper Lawrenceville,2,THEFT BY DECEPTION,3922 Theft by Deception.,1011.0,7.0,2.0,-79.950295,40.48229
2,2801809,16000367.0,4,2016-01-01T00:10:00,"500 Block Mt Pleasant RD Pittsburgh, PA 15214",N,Northview Heights,1,DISCHARGE OF FIREARM INTO OCC.STRUCTURE,2707.1 Discharge of a Firearm into Occupied St...,2609.0,1.0,1.0,-80.000966,40.478651
3,2802315,16000035.0,10,2016-01-01T00:15:00,"300 Block Wood ST Pittsburgh, PA 15222",Y,Golden Triangle/Civic Arena,2,HARRASSMENT/THREAT/ATTEMPT/PHY,2709(a)(3) Harassment No Legitimate Purpose,201.0,6.0,6.0,-80.001251,40.438918
4,2802312,16000024.0,4,2016-01-01T00:16:00,"500 Block Mt Pleasant RD Pittsburgh, PA 15214",N,Northview Heights,1,PROP MISSILE INTO OCC VEHICLE/OR ROADWAY,2705 Recklessy Endangering Another Person. / 3...,2609.0,1.0,1.0,-80.000966,40.478651


In [None]:
# Crime Cleaning
# INCIDENTTIME values look like "2016-01-01T00:00:00", so splitting on "-"
# yields the year as the first piece and the month as the second.
# Both are kept as strings.
incident_time_pieces = crime_data.INCIDENTTIME.str.split("-")

# Keep only the date parts and the X / Y coordinates; drop unnecessary columns
crime_loc = crime_data.assign(
    Year=incident_time_pieces.str[0],
    Month=incident_time_pieces.str[1],
)[['Year', 'Month', 'X', 'Y']]

display(crime_loc.dtypes)
print(len(crime_loc))

Year      object
Month     object
X        float64
Y        float64
dtype: object

297683


In [None]:
# Keep incidents from 2017 onward. Year is stored as a string, so this is a
# lexicographic comparison (equivalent to numeric for four-digit years).
is_after_2016 = crime_loc['Year'] > '2016'
crime_loc = crime_loc.loc[is_after_2016]

In [None]:
# Preview the filtered crime locations, then report the row count and the
# number of incidents per year.
display(crime_loc.head(n=5))
print(crime_loc.shape[0])
print(crime_loc['Year'].value_counts())

Unnamed: 0,Year,Month,X,Y
62211,2017,1,-79.944664,40.467466
62329,2017,1,-79.888097,40.45547
62369,2017,1,-79.979979,40.443581
62370,2017,1,-79.90854,40.47046
62371,2017,1,-80.000844,40.450893


223940
2017    42186
2018    41539
2019    39492
2021    38309
2020    34644
2022    27770
Name: Year, dtype: int64


In [None]:
# Incident counts per neighborhood in the unfiltered crime data
neighborhood_counts = crime_data['INCIDENTNEIGHBORHOOD'].value_counts()
print(neighborhood_counts)

South Side Flats               16889
Central Business District      16161
Carrick                        10457
Bloomfield                      7854
Shadyside                       7654
East Liberty                    7250
Squirrel Hill South             6780
Brookline                       6712
Mount Washington                6701
Homewood South                  6528
Lincoln-Lemington-Belmar        5987
Homewood North                  5840
Knoxville                       5691
Brighton Heights                5531
Sheraden                        5307
East Allegheny                  5297
Beechview                       4975
Marshall-Shadeland              4915
Hazelwood                       4836
Allentown                       4692
Central Oakland                 4688
East Hills                      4513
Golden Triangle/Civic Arena     4278
North Oakland                   4194
Bluff                           4078
Highland Park                   4051
Central Lawrenceville           3948
S

In [None]:
# EDA for Redfin data: row count followed by the column dtypes
print(redfin_data.shape[0], redfin_data.dtypes, sep='\n')

10001
SALE TYPE                                                                                       object
SOLD DATE                                                                                       object
PROPERTY TYPE                                                                                   object
ADDRESS                                                                                         object
CITY                                                                                            object
STATE OR PROVINCE                                                                               object
ZIP OR POSTAL CODE                                                                             float64
PRICE                                                                                          float64
BEDS                                                                                           float64
BATHS                                                              

In [None]:
# Preview the first five rows of the raw Redfin export
display(redfin_data.head(n=5))

Unnamed: 0,SALE TYPE,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,DAYS ON MARKET,$/SQUARE FEET,HOA/MONTH,STATUS,NEXT OPEN HOUSE START TIME,NEXT OPEN HOUSE END TIME,URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),SOURCE,MLS#,FAVORITE,INTERESTED,LATITUDE,LONGITUDE
0,PAST SALE,June-23-2022,Multi-Family (2-4 Unit),141 S 22nd St,South Side,PA,15203.0,295000.0,3.0,2.0,South Side,,1441.0,1899.0,,,,Sold,,,https://www.redfin.com/PA/Pittsburgh/141-S-22n...,West Penn MLS,1549168.0,N,Y,40.426322,-79.974692
1,PAST SALE,July-31-2019,Condo/Co-op,144 N Dithridge St #510,Oakland,PA,15213.0,210000.0,2.0,1.0,Oakland,1557.0,,1940.0,,135.0,853.0,Sold,,,https://www.redfin.com/PA/Pittsburgh/144-N-Dit...,West Penn MLS,1395181.0,N,Y,40.448234,-79.950786
2,PAST SALE,August-15-2019,Single Family Residential,1144 Wightman St,Squirrel Hill,PA,15217.0,654000.0,6.0,3.5,Squirrel Hill,2678.0,,1901.0,,244.0,,Sold,,,https://www.redfin.com/PA/Pittsburgh/1144-Wigh...,West Penn MLS,1396759.0,N,Y,40.446259,-79.929086
3,PAST SALE,August-28-2020,Condo/Co-op,128 N Craig St #303,Oakland,PA,15213.0,258500.0,3.0,2.0,Oakland,1680.0,,1957.0,,154.0,922.0,Sold,,,https://www.redfin.com/PA/Pittsburgh/128-N-Cra...,West Penn MLS,1427065.0,N,Y,40.448078,-79.94832
4,PAST SALE,August-19-2022,Condo/Co-op,928 Bellefonte St,Shadyside,PA,15232.0,353000.0,3.0,1.5,Shadyside,1368.0,,1935.0,,258.0,258.0,Sold,,,https://www.redfin.com/PA/Pittsburgh/928-Belle...,West Penn MLS,1563273.0,N,Y,40.449387,-79.933005


In [None]:
# Clean redfin data
# Produces `clean_red`: single family sales with complete core attributes,
# plus `Year` (string) and `Month` (1-12) extracted from SOLD DATE.

# Unnecessary columns dropped from the working frame
unused_columns = ['SALE TYPE', 'DAYS ON MARKET', 'STATUS', 'NEXT OPEN HOUSE START TIME',
                  'NEXT OPEN HOUSE END TIME', 'SOURCE', 'FAVORITE', 'INTERESTED', 'PROPERTY TYPE',
                  'ADDRESS', 'ZIP OR POSTAL CODE', 'HOA/MONTH', 'MLS#', 'CITY', 'STATE OR PROVINCE']

# A row is kept only when every one of these is present.
# Rows are eliminated for now; square footage, lot size, beds and baths
# could be imputed later instead.
required_columns = ['SOLD DATE', 'SQUARE FEET', 'PRICE', 'LOT SIZE',
                    'BEDS', 'BATHS', 'LOCATION', 'YEAR BUILT']

month_dict = {"January": 1, "February" : 2, "March" : 3, "April" : 4, "May" : 5, "June" : 6,
              "July" : 7, "August" : 8, "September" : 9, "October" : 10, "November" : 11, "December" : 12}

# Only look at single family homes for now
is_single_family = redfin_data['PROPERTY TYPE'] == 'Single Family Residential'
clean_red = (
    redfin_data[is_single_family]
    .drop(columns=unused_columns)
    .dropna(subset=required_columns)
    .copy()  # independent frame, so the column assignments below are unambiguous
)

# SOLD DATE values look like "June-23-2022": month name, day, year
sold_date_pieces = clean_red['SOLD DATE'].str.split("-")

# Extract Year Sold (kept as a string)
clean_red['Year'] = sold_date_pieces.str[2]

# Extract Month Sold - convert to 1-12.
# The mapped values must be assigned back: the lookup returns a new Series
# and does not modify the frame. Names missing from month_dict become NaN.
clean_red['Month'] = sold_date_pieces.str[0].map(month_dict)

# Drop Sold Date Column
clean_red = clean_red.drop(columns='SOLD DATE')


In [None]:
# Average lot size across the cleaned listings
# NOTE(review): the recorded mean is far larger than the lot sizes visible in
# the preview rows, which suggests outliers - confirm before modelling.
mean_lot_size = clean_red['LOT SIZE'].mean()
print(mean_lot_size)

121022.75813295615


In [None]:
# Preview the first five rows of the cleaned Redfin data
display(clean_red.head(n=5))


Unnamed: 0,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,$/SQUARE FEET,URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),LATITUDE,LONGITUDE,Year,Month
6,379900.0,2.0,1.5,Lawrenceville,1226.0,1742.0,1890.0,310.0,https://www.redfin.com/PA/Pittsburgh/159-1-2-3...,40.467964,-79.965542,2021,September
8,390000.0,3.0,3.0,Lawrenceville,1519.0,2178.0,1910.0,257.0,https://www.redfin.com/PA/Pittsburgh/4919-Hatf...,40.478084,-79.957314,2020,December
14,302500.0,5.0,2.0,Lawrenceville,3034.0,2178.0,1900.0,100.0,https://www.redfin.com/PA/Pittsburgh/259-45th-...,40.471141,-79.95762,2021,June
16,301000.0,3.0,1.5,Bloomfield,1972.0,1306.0,1890.0,153.0,https://www.redfin.com/PA/Pittsburgh/431-Taylo...,40.460556,-79.952331,2021,October
24,396000.0,3.0,2.0,Highland Park,1462.0,4081.0,1926.0,271.0,https://www.redfin.com/PA/Pittsburgh/6529-Stan...,40.472322,-79.911563,2022,August


In [None]:
# Sanity checks on the cleaned Redfin data: row count, remaining missing
# values per column, and number of listings per location.
print(
    clean_red.shape[0],
    clean_red.isna().sum(),
    clean_red['LOCATION'].value_counts(),
    sep='\n',
)

1414
PRICE                                                                                          0
BEDS                                                                                           0
BATHS                                                                                          0
LOCATION                                                                                       0
SQUARE FEET                                                                                    0
LOT SIZE                                                                                       0
YEAR BUILT                                                                                     0
$/SQUARE FEET                                                                                  0
URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)    0
LATITUDE                                                                                       0
LONGITUDE                