Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Raise pandas' display caps so wide/long frames are shown with up to 500 rows and columns.
DISPLAY_LIMITS = {'display.max_rows': 500, 'display.max_columns': 500}
for option_name, limit in DISPLAY_LIMITS.items():
    pd.set_option(option_name, limit)

Import datasets

In [2]:
# Load the raw extracts with every column read as text; the parcel extract stays disabled.
csv_options = {'dtype': 'str'}
df_lookup = pd.read_csv('../../data/raw/EXTR_LookUp.csv', **csv_options)
#df_parcel = pd.read_csv('../../data/raw/EXTR_PARCEL.csv', **csv_options)
df_resbldg = pd.read_csv('../../data/raw/EXTR_ResBldg.csv', **csv_options)
df_rpsale = pd.read_csv('../../data/raw/EXTR_RPSale.csv', **csv_options)

Strip leading and trailing spaces

In [3]:
def strip_spaces(df):
    """Strip leading and trailing whitespace from every text column of ``df``.

    Columns whose dtype is neither object nor a pandas string dtype (e.g. ints,
    floats, datetimes) are left untouched; previously they made the ``.str``
    accessor raise ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be used in an assignment.
    """
    for col in df.columns:
        column = df[col]
        # Only text-like columns expose the .str accessor.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df[col] = column.str.strip()
    return df

# Clean whitespace in each loaded table, in load order.
#df_parcel = strip_spaces(df_parcel)
df_lookup, df_resbldg, df_rpsale = (
    strip_spaces(frame) for frame in (df_lookup, df_resbldg, df_rpsale)
)

Define a function that drops non-positive values and any values lying *x* or more standard deviations away from the median of an input

In [4]:
def remove_extremes(data, devct):
    """Return the positive values of ``data`` that lie close to their median.

    Each element of ``data`` is converted with ``float``. Values that are not
    strictly positive are discarded, then only values strictly inside
    ``median ± devct * std`` (both computed on the positive values) are kept.

    Parameters
    ----------
    data : iterable
        Values convertible to ``float``.
    devct : number
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.Series
        Float values that survived both filters. The index is the position of
        each value in ``data``; any index on the input is not carried over.
    """
    values = pd.Series(list(map(float, data)))
    positive = values[values > 0].copy()

    spread = positive.std() * devct
    center = positive.median()
    lower, upper = center - spread, center + spread

    # Strict inequalities: values sitting exactly on a bound are dropped.
    within_band = (positive > lower) & (positive < upper)
    return positive[within_band].copy()

## Refine *Sales* DataFrame

In [48]:
# Work on a copy so the raw sales extract (df_rpsale) is untouched and this cell can be re-run.
res_sales = df_rpsale.copy()

principal_use_codes_to_keep = ['6'] # Consider investigating '4' too 

property_class_codes_to_keep = ['8']

# Need to consider where to classify codes like 2,3 since there isn't an explicit statement
#    - Possible to disregard entirely, explore data first
# Consider looking into single-family and multiple-family separately
property_type_codes_to_keep = ['2', '3', '6', '10', '11', '12', '13', '18', '19']

# FLAG SALES WHOSE CODES ARE NOT ON A KEEP LIST
# Any code outside the keep list becomes NaN so the dropna() further down removes the row.
# The result of .where() is assigned back to the column: the earlier chained
# `res_sales[col].replace(..., inplace=True)` form does not update the parent frame under
# pandas Copy-on-Write, and np.setdiff1d over .unique() can fail when a column mixes
# strings with NaN.
codes_to_keep_by_column = {
    'PrincipalUse': principal_use_codes_to_keep,
    'PropertyClass': property_class_codes_to_keep,
    'PropertyType': property_type_codes_to_keep,
}
for column, codes_to_keep in codes_to_keep_by_column.items():
    res_sales[column] = res_sales[column].where(res_sales[column].isin(codes_to_keep))

# CREATE PARCEL ID
res_sales['ParcelID'] = res_sales['Major'] + '-' + res_sales['Minor']

# KEEP ONLY 2019 SALES
# pd.to_datetime replaces astype(np.datetime64), a unit-less cast that current pandas rejects.
res_sales['DocumentDate'] = pd.to_datetime(res_sales['DocumentDate'])
res_sales['SaleYear'] = res_sales['DocumentDate'].dt.year
res_sales = res_sales.loc[res_sales['SaleYear'] == 2019].copy()

# ELIMINATE SALES OF ZERO DOLLARS
# int64 keeps the width independent of the platform's default 'int'.
res_sales['SalePrice'] = res_sales['SalePrice'].astype('int64')
res_sales['SalePrice'] = res_sales['SalePrice'].replace(0, np.nan)

# DROP SALES DETERMINED TO BE INVALID
# NOTE(review): this drops a row if ANY column is NaN, not only the columns flagged above.
# Confirm that is intended; otherwise pass subset=[...] with the flagged columns.
res_sales = res_sales.dropna()

# CREATE COLUMN TO IDENTIFY DUPLICATES WHEN MAPPING TO RESBLDG DATAFRAME
# SaleCount = number of remaining 2019 sales that share the row's ParcelID.
res_sales['SaleCount'] = res_sales['ParcelID'].map(res_sales['ParcelID'].value_counts())
res_sales

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot,PlatBlock,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning,ParcelID,SaleYear,SaleCount
9,3027422,213043,0120,2019-12-20,560000.0,20191226000848,,,,,,,DOYLE REGAN M+STERLING C,SHAW HEATHER C,11,6,3,N,N,N,N,1,8,,213043-0120,2019,1
10,3002257,940652,0630,2019-07-22,435000.0,20190730001339,,,,,,,CRAMER NEIL C+PAM R+ET AL,HANSON BRYAN L+KAILI,11,6,3,N,N,N,N,1,8,,940652-0630,2019,1
21,2993601,140281,0020,2019-06-04,450000.0,20190614000489,,,,,,,BRASHER DAVID+MONIQUE A,SEITZ ANDREW+MODELSKI EVE,3,6,3,N,N,N,N,1,8,,140281-0020,2019,1
35,3015264,124550,0098,2019-09-27,193000.0,20191015000395,,,,,,,LEWIS WILBERT,TALOFA LLC,3,6,15,N,N,N,N,18,8,18 51 52,124550-0098,2019,1
36,2980648,797320,2320,2019-03-27,540000.0,,,,,,,,VONG SAM A+NGUYEN THO THI+PHAM KINH,MARTINEZ JAVIER JIMENEZ,3,6,3,N,N,N,N,1,8,,797320-2320,2019,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351036,2992593,405940,0095,2019-05-21,442000.0,20190607001670,,,,,,,PREMIER LOTS 2 LLC,RUSSELL ALBERT,11,6,3,N,N,N,N,1,8,,405940-0095,2019,1
351037,3003079,381670,0025,2019-07-22,680000.0,20190802000628,,,,,,,HIRAGA SHYOKO,STOECKMANN DOUGLAS A,11,6,3,N,N,N,N,1,8,,381670-0025,2019,1
351038,3025068,615020,0505,2019-11-18,745000.0,20191212000602,,,,,,,KYLE JASON C+HEATHER,SAYAH MICHAEL+PRIYA,11,6,3,N,N,N,N,1,8,,615020-0505,2019,1
351040,2986698,033310,0255,2019-04-30,6500000.0,20190508000847,,,,,,,MAYNE LAURA L,ARMSTRONG ANALISA L+JOHN MACY,11,6,3,N,N,N,N,1,8,,033310-0255,2019,1


In [47]:
#res_sales.loc[res_sales.ParcelID == '532310-0141']
#res_sales.loc[res_sales.PropertyType == '11']




[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 6,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 8,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [38]:
# Show every building record under Major '532310' (the extra Minor == '0141' filter is disabled).
df_resbldg[df_resbldg['Major'].eq('532310')]

Unnamed: 0,Major,Minor,BldgNbr,NbrLivingUnits,Address,BuildingNumber,Fraction,DirectionPrefix,StreetName,StreetType,DirectionSuffix,ZipCode,Stories,BldgGrade,BldgGradeVar,SqFt1stFloor,SqFtHalfFloor,SqFt2ndFloor,SqFtUpperFloor,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,DaylightBasement,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,BrickStone,ViewUtilization,Bedrooms,BathHalfCount,Bath3qtrCount,BathFullCount,FpSingleStory,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost
66570,532310,115,1,1,3032 61ST AVE SW,3032,,,61ST,AVE,SW,,2,8,0,630,0,630,0,0,0,1700,620,440,8,180,0,Y,0,0,110,3,2,0,N,3,1,0,3,1,0,0,0,2010,0,0,0,0,3,0
66599,532310,110,1,1,3024 A 61ST AVE SW,3024,A,,61ST,AVE,SW,,2,9,0,630,0,580,0,0,0,1780,570,570,8,0,0,Y,60,100,530,5,2,0,N,3,1,1,1,0,0,0,0,2016,0,0,0,0,3,0
66600,532310,134,1,1,3042 B 61ST AVE SW 98116,3042,B,,61ST,AVE,SW,98116.0,2,8,0,400,0,370,0,0,0,1040,270,270,7,0,0,Y,60,0,0,5,2,0,N,2,0,1,1,1,0,0,0,2006,0,0,0,0,3,0
66609,532310,130,1,1,3044 61ST AVE SW 98116,3044,,,61ST,AVE,SW,98116.0,1,7,0,890,0,0,0,0,0,1780,890,890,7,0,0,N,0,0,0,5,2,0,N,3,0,0,2,1,0,0,0,1941,0,0,0,0,4,0
66629,532310,122,1,1,3036 61ST AVE SW,3036,,,61ST,AVE,SW,,2,8,0,560,0,560,0,0,0,1360,540,240,7,300,0,Y,0,0,100,3,2,0,,3,1,0,2,1,0,0,0,2010,0,0,0,0,3,0
66807,532310,112,1,1,3024 C 61ST AVE SW,3024,C,,61ST,AVE,SW,,2,9,0,610,0,610,0,0,0,1620,400,400,8,0,0,Y,0,100,510,5,2,0,,3,1,1,1,0,0,0,0,2016,0,0,0,0,3,0
67054,532310,132,1,1,3042 A 61ST AVE SW 98116,3042,A,,61ST,AVE,SW,98116.0,2,8,0,400,0,370,0,0,0,970,310,200,7,110,0,Y,60,0,0,5,2,0,N,2,0,1,1,1,0,0,0,2006,0,0,0,0,3,0
84734,532310,111,1,1,3024 B 61ST AVE SW,3024,B,,61ST,AVE,SW,,2,9,0,630,0,580,0,0,0,1780,570,570,8,0,0,Y,60,100,530,5,2,0,,3,1,1,1,0,0,0,0,2016,0,0,0,0,3,0
84939,532310,117,1,1,3030 61ST AVE SW,3030,,,61ST,AVE,SW,,2,8,0,560,0,560,0,0,0,1360,540,240,8,300,0,Y,0,0,100,3,2,0,N,3,1,0,2,1,0,0,0,2010,0,0,0,0,3,0


**Create DataFrame for Porch metrics**

In [5]:
# Build the porch table: integer porch areas keyed by a 'Major-Minor' parcel identifier,
# with the identifier placed first.
porch = (
    df_resbldg[['SqFtOpenPorch', 'SqFtEnclosedPorch']]
    .astype('int')
    .assign(Parcel_ID=df_resbldg['Major'] + '-' + df_resbldg['Minor'])
    .loc[:, ['Parcel_ID', 'SqFtOpenPorch', 'SqFtEnclosedPorch']]
)
porch

Unnamed: 0,Parcel_ID,SqFtOpenPorch,SqFtEnclosedPorch
0,009800-0720,0,0
1,009802-0140,380,0
2,009830-0020,360,0
3,009830-0160,690,0
4,010050-0180,60,0
...,...,...,...
181505,197220-1902,0,0
181506,197220-2224,0,0
181507,197220-5172,0,0
181508,197220-5173,0,0


**Create DataFrame for Heating metrics**

In [6]:
from custom_functions import one_hot
from custom_functions import get_lookups

# One-hot encode HeatSystem using names from lookup type 108
# (presumably the heat-system code table; verify against EXTR_LookUp).
heating = one_hot(df_resbldg['HeatSystem'], name_lookup=get_lookups(108, df_lookup))
# Collapse the indicators into two summary columns: 'Forced Air' versus everything else.
not_forced_air = heating.drop(columns='Forced Air')
heating['binary_notforced'] = not_forced_air.sum(axis='columns')
heating['binary_forced'] = heating['Forced Air']

**Bedroom and garage specs**

In [7]:
# Cast the text columns to integers: bedroom count and both garage areas.
bedrooms = df_resbldg['Bedrooms'].astype('int')
garage = df_resbldg.loc[:, ['SqFtGarageAttached', 'SqFtGarageBasement']].astype('int')

**Aggregate Porch, Heating and Bedroom/Garage metrics**

In [8]:
# Place the four metric tables side by side (aligned on the row index), then summarise the columns.
metric_frames = [porch, heating, garage, bedrooms]
porch_heating_rooms = pd.concat(metric_frames, axis='columns')
porch_heating_rooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181510 entries, 0 to 181509
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Parcel_ID           181510 non-null  object
 1   SqFtOpenPorch       181510 non-null  int32 
 2   SqFtEnclosedPorch   181510 non-null  int32 
 3   Floor-Wall          181510 non-null  int32 
 4   Gravity             181510 non-null  int32 
 5   Radiant             181510 non-null  int32 
 6   Elec BB             181510 non-null  int32 
 7   Forced Air          181510 non-null  int32 
 8   Hot Water           181510 non-null  int32 
 9   Heat Pump           181510 non-null  int32 
 10  Other               181510 non-null  int32 
 11  binary_notforced    181510 non-null  int64 
 12  binary_forced       181510 non-null  int32 
 13  SqFtGarageAttached  181510 non-null  int32 
 14  SqFtGarageBasement  181510 non-null  int32 
 15  Bedrooms            181510 non-null  int32 
dtypes:

In [9]:
# Display the combined porch / heating / garage / bedroom table.
porch_heating_rooms

Unnamed: 0,Parcel_ID,SqFtOpenPorch,SqFtEnclosedPorch,Floor-Wall,Gravity,Radiant,Elec BB,Forced Air,Hot Water,Heat Pump,Other,binary_notforced,binary_forced,SqFtGarageAttached,SqFtGarageBasement,Bedrooms
0,009800-0720,0,0,0,0,0,0,1,0,0,0,0,1,750,0,4
1,009802-0140,380,0,0,0,0,0,1,0,0,0,0,1,660,0,4
2,009830-0020,360,0,0,0,0,0,1,0,0,0,0,1,1020,0,4
3,009830-0160,690,0,0,0,0,0,1,0,0,0,0,1,1000,0,4
4,010050-0180,60,0,0,0,0,0,1,0,0,0,0,1,440,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181505,197220-1902,0,0,0,0,0,0,1,0,0,0,0,1,240,0,2
181506,197220-2224,0,0,0,0,0,0,1,0,0,0,0,1,270,0,3
181507,197220-5172,0,0,0,0,0,0,1,0,0,0,0,1,290,0,3
181508,197220-5173,0,0,0,0,0,0,1,0,0,0,0,1,230,0,3


### Inspect a 2019 sale whose parcel has multiple building records

In [11]:
# Look up the refined sales rows for parcel '082211-9001'.
res_sales[res_sales['ParcelID'].eq('082211-9001')]

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot,PlatBlock,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning,ParcelID,SaleYear
134445,3017076,82211,9001,2019-10-21,70000,,,,,,,,GROVE PAUL,DEMATTEO JOSEPH,3,6,26,N,N,N,N,18,8,,082211-9001,2019


In [12]:
# Preview the building records whose Major/Minor match that same parcel.
df_resbldg[df_resbldg['Major'].eq('082211') & df_resbldg['Minor'].eq('9001')].head()

Unnamed: 0,Major,Minor,BldgNbr,NbrLivingUnits,Address,BuildingNumber,Fraction,DirectionPrefix,StreetName,StreetType,DirectionSuffix,ZipCode,Stories,BldgGrade,BldgGradeVar,SqFt1stFloor,SqFtHalfFloor,SqFt2ndFloor,SqFtUpperFloor,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,DaylightBasement,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,BrickStone,ViewUtilization,Bedrooms,BathHalfCount,Bath3qtrCount,BathFullCount,FpSingleStory,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost
59918,82211,9001,20,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,3,0,340,0,0,0,0,0,340,0,0,0,0,0,,100,0,0,0,0,0,,0,0,0,0,1,0,0,0,1940,0,0,25,0,4,0
59919,82211,9001,1,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,3,0,430,0,0,0,0,0,430,0,0,0,0,0,,0,0,0,0,0,0,,0,0,0,0,1,0,0,0,1957,0,0,25,0,3,0
59920,82211,9001,14,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,4,0,320,0,0,0,0,0,320,0,0,0,0,0,,25,0,0,0,0,0,,0,0,0,0,0,0,0,0,1940,0,0,25,0,3,0
59921,82211,9001,17,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,5,0,430,0,0,0,0,0,430,0,0,0,0,0,,70,0,0,0,0,0,,0,0,0,0,1,0,0,0,1946,0,0,25,0,4,0
59922,82211,9001,5,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.5,5,0,540,270,0,0,0,0,810,0,0,0,0,0,,200,0,0,0,0,0,,0,0,0,0,0,0,0,0,1940,0,0,25,0,3,0
