Import necessary packages

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Let pandas render up to 500 rows and 500 columns before truncating output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

Load the raw datasets

In [2]:
# Read every raw extract with all columns as strings so that codes keep
# their original text form (e.g. any leading zeros).
df_lookup, df_parcel, df_resbldg, df_rpsale = (
    pd.read_csv(raw_path, dtype='str')
    for raw_path in (
        '../../data/raw/EXTR_LookUp.csv',
        '../../data/raw/EXTR_PARCEL.csv',
        '../../data/raw/EXTR_ResBldg.csv',
        '../../data/raw/EXTR_RPSale.csv',
    )
)

Strip leading and trailing spaces

In [3]:
def strip_spaces(df):
    """Strip leading and trailing whitespace from the text columns of ``df``.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or in an assignment.

    Columns that do not expose the ``.str`` accessor (for example numeric or
    datetime columns) are left unchanged instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose text columns should be stripped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with text columns stripped.
    """
    for col in df.columns:
        try:
            df[col] = df[col].str.strip()
        except AttributeError:
            # Not a text column, so there is nothing to strip.
            continue
    return df

# Clean the whitespace padding in all four raw tables.
df_lookup, df_parcel, df_resbldg, df_rpsale = map(
    strip_spaces,
    (df_lookup, df_parcel, df_resbldg, df_rpsale),
)

Define function to drop non-positive values and values that lie *x* or more standard deviations away from the median of an input

In [4]:
def remove_extremes(data, devct):
    """Return the positive values of ``data`` that lie close to the median.

    Every element is converted with ``float`` and placed in a new Series
    (so the result carries a fresh 0..n-1 based index, not the input's).
    Values that are not strictly positive are discarded first; the standard
    deviation and median are then computed on what remains, and only values
    strictly inside ``median ± devct * std`` are kept.

    Parameters
    ----------
    data : iterable
        Values convertible with ``float``.
    devct : number
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.Series
        The retained float values.
    """
    values = pd.Series([float(item) for item in data])
    positive = values[values > 0].copy()

    spread = positive.std() * devct
    center = positive.median()
    lower, upper = center - spread, center + spread

    inside_band = (positive > lower) & (positive < upper)
    return positive[inside_band].copy()

Define function to retrieve a dictionary mapping lookup item codes to their descriptions for a given lookup type

In [5]:
def get_lookups(LUType):
    """Return a ``{LUItem: LUDescription}`` dict for one lookup type.

    Reads the module-level ``df_lookup`` frame, keeps the rows whose
    ``LUType`` equals ``str(LUType)``, orders them by ``LUItem`` (compared
    as stored in the frame) and pairs each stripped ``LUItem`` with its
    ``LUDescription``.

    Parameters
    ----------
    LUType : str or int
        Lookup type code; it is converted to ``str`` before comparison.

    Returns
    -------
    dict
        Mapping of item code to description; empty if nothing matches.
    """
    wanted = str(LUType)

    rows = df_lookup.loc[df_lookup['LUType'] == wanted].copy()
    rows = rows.sort_values(by='LUItem')

    item_codes = rows['LUItem'].str.strip()
    return {code: description for code, description in zip(item_codes, rows['LUDescription'])}

Function to one-hot encode and rename columns: `one_hot` is imported from `custom_functions` in the Heating metrics cell below

**Create DataFrame for Porch metrics**

In [6]:
# One row per building record: a 'Major-Minor' parcel key followed by the
# two porch areas converted from text to integers.
porch = pd.DataFrame({
    'Parcel_ID': df_resbldg['Major'] + '-' + df_resbldg['Minor'],
    'SqFtOpenPorch': df_resbldg['SqFtOpenPorch'].astype('int'),
    'SqFtEnclosedPorch': df_resbldg['SqFtEnclosedPorch'].astype('int'),
})
porch

Unnamed: 0,Parcel_ID,SqFtOpenPorch,SqFtEnclosedPorch
0,009800-0720,0,0
1,009802-0140,380,0
2,009830-0020,360,0
3,009830-0160,690,0
4,010050-0180,60,0
...,...,...,...
181505,197220-1902,0,0
181506,197220-2224,0,0
181507,197220-5172,0,0
181508,197220-5173,0,0


**Create DataFrame for Heating metrics**

In [7]:
from custom_functions import one_hot

# One-hot encode the heating system codes, labelling the columns with the
# descriptions returned for lookup type 108.
heating = one_hot(
    df_resbldg['HeatSystem'],
    name_lookup=get_lookups(108),
)
# Collapse to a forced-air / not-forced-air pair: the first flag sums every
# column except 'Forced Air', the second copies 'Forced Air' itself.
heating['binary_notforced'] = heating.drop(columns='Forced Air').sum(axis=1)
heating['binary_forced'] = heating['Forced Air']

**Bedroom and garage specs**

In [8]:
# Garage areas and bedroom counts are stored as text; convert them to integers.
garage = df_resbldg.loc[:, ['SqFtGarageAttached', 'SqFtGarageBasement']].astype('int')
bedrooms = df_resbldg.loc[:, 'Bedrooms'].astype('int')

**Aggregate Porch, Heating and Bedroom/Garage metrics**

In [9]:
# Place the four feature sets side by side; rows are matched on the index.
porch_heating_rooms = pd.concat(
    (porch, heating, garage, bedrooms),
    axis='columns',
)
porch_heating_rooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181510 entries, 0 to 181509
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Parcel_ID           181510 non-null  object
 1   SqFtOpenPorch       181510 non-null  int32 
 2   SqFtEnclosedPorch   181510 non-null  int32 
 3   Floor-Wall          181510 non-null  int32 
 4   Gravity             181510 non-null  int32 
 5   Radiant             181510 non-null  int32 
 6   Elec BB             181510 non-null  int32 
 7   Forced Air          181510 non-null  int32 
 8   Hot Water           181510 non-null  int32 
 9   Heat Pump           181510 non-null  int32 
 10  Other               181510 non-null  int32 
 11  binary_notforced    181510 non-null  int64 
 12  binary_forced       181510 non-null  int32 
 13  SqFtGarageAttached  181510 non-null  int32 
 14  SqFtGarageBasement  181510 non-null  int32 
 15  Bedrooms            181510 non-null  int32 
dtypes:

**Pricing data aggregation**

In [10]:
# Build the sale table: a 'Major-Minor' parcel key, a per-sale key that
# appends the excise tax number, the price as an integer and the document
# date as a real datetime.
Parcel_ID = df_rpsale['Major'] + '-' + df_rpsale['Minor']
Sale_ID = Parcel_ID + '-' + df_rpsale['ExciseTaxNbr']
sale = pd.DataFrame({
    'Parcel_ID': Parcel_ID,
    'Sale_ID': Sale_ID,
    'SalePrice': df_rpsale['SalePrice'].astype('int'),
    # Parse explicitly instead of casting to the unit-less np.datetime64,
    # which newer pandas versions refuse. The raw values displayed later in
    # this notebook look like MM/DD/YYYY (e.g. 07/14/2015).
    # NOTE(review): confirm that every row follows this format.
    'DocumentDate': pd.to_datetime(df_rpsale['DocumentDate'], format='%m/%d/%Y'),
})
sale

Unnamed: 0,Parcel_ID,Sale_ID,SalePrice,DocumentDate
0,198920-1430,198920-1430-2857854,0,2017-03-28
1,638580-0110,638580-0110-2743355,190000,2015-07-14
2,919715-0200,919715-0200-2999169,192000,2019-07-08
3,894677-0240,894677-0240-2841697,818161,2016-12-21
4,445872-0260,445872-0260-2826129,0,2016-10-03
...,...,...,...,...
351062,219331-0270,219331-0270-2935450,850000,2018-06-07
351063,886030-0550,886030-0550-2942886,900000,2016-07-16
351064,769791-0030,769791-0030-2845806,133000,2016-07-16
351065,924600-0650,924600-0650-2971374,0,2019-01-02


## Narrow to Residential Homes

In [11]:
# Lookup type 2 is used to determine that '6' is the code for 'RESIDENTIAL'.
principal_use = get_lookups(2)
principal_use_residential = '6'

# Keep only the sales whose principal use is residential.
res_sales = df_rpsale[df_rpsale['PrincipalUse'] == principal_use_residential].copy()
res_sales

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
1,2743355,638580,0110,07/14/2015,190000,20150715002686,,,,,...,3,6,3,N,N,N,N,1,8,15
3,2841697,894677,0240,12/21/2016,818161,20161228000896,,,,,...,2,6,3,N,N,N,N,1,8,
5,2860712,408330,4150,03/15/2017,0,20170426000922,,,,,...,3,6,15,N,N,N,N,11,8,18 31 38
6,2813396,510140,4256,07/25/2016,0,20160802000964,,,,,...,2,6,15,N,N,N,N,13,8,18 31 51
7,2899303,126320,0110,10/30/2017,375000,,,,,,...,3,6,3,N,N,N,N,1,8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351062,2935450,219331,0270,06/07/2018,850000,20180612000762,,,,,...,11,6,3,N,N,N,N,1,8,
351063,2942886,886030,0550,07/16/2016,900000,20180720000629,,,,,...,3,6,3,N,N,N,N,1,8,
351064,2845806,769791,0030,07/16/2016,133000,20170125000407,,,,,...,3,6,15,N,N,N,N,1,8,15 18 51
351065,2971374,924600,0650,01/02/2019,0,,,,,,...,11,6,3,N,N,N,N,1,8,


In [12]:
# Property type descriptions come from lookup type 1. They are bound only to
# `property_type_codes`; the previous chained assignment also overwrote
# `principal_use` (the lookup type 2 dictionary) with these codes.
property_type_codes = get_lookups(1)

# Decode each residential sale's PropertyType and count the descriptions;
# codes absent from the lookup decode to None and are not counted.
prop_types = res_sales[['PropertyType']].copy()
prop_types['decoded'] = res_sales['PropertyType'].map(property_type_codes.get)
prop_types.decoded.value_counts()

LAND WITH PREV USED BLDG                              140794
Household, single family units                         89288
LAND ONLY                                              12327
LAND WITH NEW BUILDING                                 11561
Land with new building                                  4509
Undeveloped land (land only)                            2365
LAND WITH MOBILE HOME                                   1780
Multiple family residence (Residential, 2-4 units)       712
Residential condominiums                                 467
TIMBER ONLY                                              339
Other undeveloped land                                   269
All other residential not elsewhere coded                125
Multiple family residence (Residential, 5+ units)         65
Open space land classified under chapter 84.34 RCW        61
Highway and street right of way                           53
BUILDING ONLY                                             41
Land with mobile home   

In [13]:
# Property type codes to retain; every other code is dropped from res_sales.
keep_these_property_types = ['2', '3', '6', '10', '11', '12', '13', '18', '19']

# Boolean mask (as a plain list), True where the sale's type is retained.
non_droppable_property_types = (
    res_sales['PropertyType'].isin(keep_these_property_types).tolist()
)
res_sales = res_sales.loc[non_droppable_property_types].copy()
res_sales

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
1,2743355,638580,0110,07/14/2015,190000,20150715002686,,,,,...,3,6,3,N,N,N,N,1,8,15
3,2841697,894677,0240,12/21/2016,818161,20161228000896,,,,,...,2,6,3,N,N,N,N,1,8,
5,2860712,408330,4150,03/15/2017,0,20170426000922,,,,,...,3,6,15,N,N,N,N,11,8,18 31 38
6,2813396,510140,4256,07/25/2016,0,20160802000964,,,,,...,2,6,15,N,N,N,N,13,8,18 31 51
7,2899303,126320,0110,10/30/2017,375000,,,,,,...,3,6,3,N,N,N,N,1,8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351060,2755549,438920,1058,08/20/2015,1000000,20150914001478,,,,,...,3,6,3,N,N,N,N,1,7,10 56
351062,2935450,219331,0270,06/07/2018,850000,20180612000762,,,,,...,11,6,3,N,N,N,N,1,8,
351063,2942886,886030,0550,07/16/2016,900000,20180720000629,,,,,...,3,6,3,N,N,N,N,1,8,
351064,2845806,769791,0030,07/16/2016,133000,20170125000407,,,,,...,3,6,15,N,N,N,N,1,8,15 18 51


In [22]:
# Prepend parcel and sale keys to the filtered residential sales. The two key
# Series are unnamed, so the concatenated frame labels them 0 and 1.
Parcel_ID = res_sales['Major'] + '-' + res_sales['Minor']
Sale_ID = Parcel_ID + '-' + res_sales['ExciseTaxNbr']
res_sales2 = pd.concat([Parcel_ID, Sale_ID, res_sales], axis=1)

# Inspect every sale recorded for a single parcel.
res_sales2[res_sales2[0] == '011410-0979']

Unnamed: 0,0,1,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot,PlatBlock,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
16953,011410-0979,011410-0979-3016251,3016251,11410,979,08/29/2019,720000,20191018001917,,,,,,,BASLT II LLC,DEL AGUILA ERNESTO+CARRILLO ERICK PAUL,2,6,3,N,N,N,N,1,8,
28334,011410-0979,011410-0979-2748616,2748616,11410,979,07/20/2015,700000,20150807001816,,,,,,,BENNETT CHRIS T-PR,LAKE HARRISON ESTATES LLC,3,6,3,N,N,N,N,1,7,
47479,011410-0979,011410-0979-3014914,3014914,11410,979,08/29/2019,699500,20191011000917,,,,,,,BASALT II LLC,YOUNG BRADLEY W III+ROBERTA M,2,6,3,N,N,N,N,18,8,52.0
72150,011410-0979,011410-0979-3030919,3030919,11410,979,01/10/2020,559500,20200123001000,,,,,,,BASALT II LLC,UNRATH CAMILLE+MATTHEW,10,6,2,N,N,N,N,1,8,
101475,011410-0979,011410-0979-3034887,3034887,11410,979,02/12/2020,550000,20200221001221,,,,,,,BASALT II LLC,HORTON TIMOTHY A+KRAFT JENNIFER ESTER,10,6,2,N,N,N,N,1,8,
102229,011410-0979,011410-0979-3022009,3022009,11410,979,08/29/2019,519500,20191122000793,,,,,,,BASALT II LLC,GARCIA CEBADA FRANCISCO ALEJANDRO+CARAMON VELI...,2,6,3,N,N,N,N,1,8,
120809,011410-0979,011410-0979-3033940,3033940,11410,979,02/11/2020,731000,20200214000957,,,,,,,BASALT II LLC,GIOK-HALIM ALICIA XIAOKU+TAHA BIN ABD,10,6,2,N,N,N,N,1,8,
129898,011410-0979,011410-0979-3028811,3028811,11410,979,12/26/2019,599500,20200103001228,,,,,,,BASALT II LLC,PARTINGTON JEANETTE,2,6,3,N,N,N,N,1,8,
163064,011410-0979,011410-0979-3013816,3013816,11410,979,08/29/2019,715000,20191004000943,,,,,,,BASALT II LLC,KIM YOON BAEK,2,6,3,N,N,N,N,1,8,
195721,011410-0979,011410-0979-3032191,3032191,11410,979,01/10/2020,557950,20200131002180,,,,,,,BASALT II LLC,VILLICANA GREGORY,10,6,2,N,N,N,N,1,8,


In [23]:
# Display the residential building table to inspect its columns and values.
df_resbldg

Unnamed: 0,Major,Minor,BldgNbr,NbrLivingUnits,Address,BuildingNumber,Fraction,DirectionPrefix,StreetName,StreetType,DirectionSuffix,ZipCode,Stories,BldgGrade,BldgGradeVar,SqFt1stFloor,SqFtHalfFloor,SqFt2ndFloor,SqFtUpperFloor,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,DaylightBasement,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,BrickStone,ViewUtilization,Bedrooms,BathHalfCount,Bath3qtrCount,BathFullCount,FpSingleStory,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost
0,009800,0720,1,1,27719 SE 26TH WAY 98075,27719,,SE,26TH,WAY,,98075,2,11,0,1970,0,2130,0,0,0,4100,0,0,0,0,750,,0,0,0,5,2,0,,4,1,0,3,3,0,0,0,2001,0,0,0,0,3,0
1,009802,0140,1,1,2829 277TH TER SE 98075,2829,,,277TH,TER,SE,98075,2,10,0,1610,0,1400,0,0,0,3010,0,0,0,0,660,,380,0,0,5,2,0,,4,1,2,2,3,0,0,0,2004,0,0,0,0,3,0
2,009830,0020,1,1,1715 298TH CRESENT SE,1715,,,298TH CRESENT,,SE,,2,10,0,2520,0,2560,0,0,0,5080,0,0,0,0,1020,,360,0,270,5,2,0,,4,0,0,5,1,0,0,0,2017,0,0,0,0,3,0
3,009830,0160,1,1,1861 297TH WAY SE 98024,1861,,,297TH,WAY,SE,98024,2,10,0,2210,0,1860,0,0,0,4070,0,0,0,0,1000,N,690,0,0,5,2,0,N,4,1,0,3,2,0,0,0,2013,0,0,0,0,3,0
4,010050,0180,1,1,35410 25TH PL S 98003,35410,,,25TH,PL,S,98003,2,7,0,910,0,700,0,0,0,1610,0,0,0,0,440,,60,0,0,5,2,0,,3,1,1,1,1,0,0,0,1994,0,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181505,197220,1902,1,1,3816 WINSLOW PL N 98103,3816,,,WINSLOW,PL,N,98103,3,8,0,150,0,450,430,0,0,1030,0,0,0,0,240,,0,0,0,5,2,0,,2,1,0,1,1,0,0,0,2003,0,0,0,0,3,0
181506,197220,2224,1,1,3841 B WOODLAND PARK AVE N 98103,3841,B,,WOODLAND PARK,AVE,N,98103,3,8,0,260,0,580,590,0,0,1430,0,0,0,0,270,N,0,0,50,5,2,0,Y,3,1,0,2,1,0,0,0,2005,0,0,0,0,3,0
181507,197220,5172,1,1,11 B W DRAVUS ST,11,B,W,DRAVUS,ST,,,3,8,0,220,0,510,380,0,0,1600,490,490,8,0,290,,0,0,0,5,2,0,,3,1,0,2,0,0,0,0,2007,0,0,0,0,3,0
181508,197220,5173,1,1,11 C W DRAVUS ST,11,C,W,DRAVUS,ST,,,3,8,0,200,0,490,380,0,0,1470,400,400,8,0,230,,0,0,70,5,2,0,,3,1,0,2,0,0,0,0,2007,0,0,0,0,3,0


In [25]:
# Count building records per parcel; counts above 1 mean the parcel key
# repeats across rows of the porch frame.
porch['Parcel_ID'].value_counts()

082211-9001    21
272205-9164    19
511940-0095    13
011410-0979    11
935290-0035     8
               ..
980863-0150     1
785660-0890     1
428740-0120     1
167200-0120     1
779645-0370     1
Name: Parcel_ID, Length: 179476, dtype: int64

In [29]:
# Show every building record for the parcel with the most rows (082211-9001).
df_resbldg[
    (df_resbldg['Major'] == '082211')
    & (df_resbldg['Minor'] == '9001')
]

Unnamed: 0,Major,Minor,BldgNbr,NbrLivingUnits,Address,BuildingNumber,Fraction,DirectionPrefix,StreetName,StreetType,DirectionSuffix,ZipCode,Stories,BldgGrade,BldgGradeVar,SqFt1stFloor,SqFtHalfFloor,SqFt2ndFloor,SqFtUpperFloor,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,DaylightBasement,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,BrickStone,ViewUtilization,Bedrooms,BathHalfCount,Bath3qtrCount,BathFullCount,FpSingleStory,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost
59918,82211,9001,20,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,3,0,340,0,0,0,0,0,340,0,0,0,0,0,,100,0,0,0,0,0,,0,0,0,0,1,0,0,0,1940,0,0,25,0,4,0
59919,82211,9001,1,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,3,0,430,0,0,0,0,0,430,0,0,0,0,0,,0,0,0,0,0,0,,0,0,0,0,1,0,0,0,1957,0,0,25,0,3,0
59920,82211,9001,14,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,4,0,320,0,0,0,0,0,320,0,0,0,0,0,,25,0,0,0,0,0,,0,0,0,0,0,0,0,0,1940,0,0,25,0,3,0
59921,82211,9001,17,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,5,0,430,0,0,0,0,0,430,0,0,0,0,0,,70,0,0,0,0,0,,0,0,0,0,1,0,0,0,1946,0,0,25,0,4,0
59922,82211,9001,5,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.5,5,0,540,270,0,0,0,0,810,0,0,0,0,0,,200,0,0,0,0,0,,0,0,0,0,0,0,0,0,1940,0,0,25,0,3,0
59923,82211,9001,15,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.5,4,0,430,190,0,0,0,0,620,0,0,0,0,0,,0,0,0,0,0,0,,0,0,0,0,1,0,0,0,1950,0,0,25,0,3,0
59924,82211,9001,11,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,4,0,330,0,0,0,0,0,330,0,0,0,0,0,,130,0,0,0,0,0,,0,0,0,0,0,0,0,0,1940,0,0,25,0,3,0
59925,82211,9001,9,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,3,0,480,0,0,0,0,0,480,0,0,0,0,0,,80,0,0,0,0,0,,0,0,0,0,0,0,0,0,1957,0,0,25,0,3,0
59926,82211,9001,12,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.0,4,0,300,0,0,0,0,0,300,0,0,0,0,0,,20,0,0,0,0,0,,0,0,0,0,1,0,0,0,1959,0,0,25,0,3,0
59927,82211,9001,13,1,20904 677TH PL SE 98045,20904,,,677TH,PL,SE,98045,1.5,4,0,440,150,0,0,0,0,590,0,0,0,0,0,,0,0,0,0,0,0,,0,0,0,0,1,0,0,0,1940,0,0,25,0,4,0
