Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

Import datasets and packages

In [2]:
df_lookup = pd.read_csv('../../data/raw/EXTR_LookUp.csv', dtype='str')
df_parcel = pd.read_csv('../../data/raw/EXTR_PARCEL.csv', dtype='str')
df_resbldg = pd.read_csv('../../data/raw/EXTR_ResBldg.csv', dtype='str')
df_rpsale = pd.read_csv('../../data/raw/EXTR_RPSale.csv', dtype='str')

Strip leading and trailing spaces

In [3]:
def strip_spaces(df):
    for col in df.columns:
        df[col] = df[col].str.strip()
    return df

df_lookup = strip_spaces(df_lookup)
df_parcel = strip_spaces(df_parcel)
df_resbldg = strip_spaces(df_resbldg)
df_rpsale = strip_spaces(df_rpsale)

Define function to remove *x* std deviations of data from an input

In [4]:
def remove_extremes(data, devct):
    data = pd.Series([float(num) for num in data])
    cleaned = data.loc[data>0].copy()

    std = cleaned.std()
    med = cleaned.median()

    cleaned = cleaned.loc[(cleaned > (med - std*devct)) & (cleaned < (med+std*devct))].copy() 
    return cleaned

**Create DataFrame for Porch metrics**

In [5]:
porch = df_resbldg[['SqFtOpenPorch', 'SqFtEnclosedPorch']].copy()
porch['Parcel_ID'] = df_resbldg['Major'] + '-' + df_resbldg['Minor']
porch = porch[['Parcel_ID', 'SqFtOpenPorch', 'SqFtEnclosedPorch']].copy()

porch['SqFtOpenPorch'] = porch['SqFtOpenPorch'].astype('int')
porch['SqFtEnclosedPorch'] = porch['SqFtEnclosedPorch'].astype('int')
porch

Unnamed: 0,Parcel_ID,SqFtOpenPorch,SqFtEnclosedPorch
0,009800-0720,0,0
1,009802-0140,380,0
2,009830-0020,360,0
3,009830-0160,690,0
4,010050-0180,60,0
...,...,...,...
181505,197220-1902,0,0
181506,197220-2224,0,0
181507,197220-5172,0,0
181508,197220-5173,0,0


**Create DataFrame for Heating metrics**

In [6]:
from custom_functions import one_hot
from custom_functions import get_lookups

heating = one_hot(df_resbldg['HeatSystem'], name_lookup = get_lookups(108, df_lookup))
heating['binary_notforced'] = heating.drop('Forced Air', axis=1).sum(axis=1)
heating['binary_forced'] = heating['Forced Air']

**Bedroom and garage specs**

In [7]:
garage = df_resbldg[['SqFtGarageAttached', 'SqFtGarageBasement']].astype('int')
bedrooms = df_resbldg['Bedrooms'].astype('int')

**Aggregate Porch, Heating and Bedroom/Garage metrics**

In [8]:
porch_heating_rooms = pd.concat([porch, heating, garage, bedrooms], axis=1)
porch_heating_rooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181510 entries, 0 to 181509
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Parcel_ID           181510 non-null  object
 1   SqFtOpenPorch       181510 non-null  int32 
 2   SqFtEnclosedPorch   181510 non-null  int32 
 3   Floor-Wall          181510 non-null  int32 
 4   Gravity             181510 non-null  int32 
 5   Radiant             181510 non-null  int32 
 6   Elec BB             181510 non-null  int32 
 7   Forced Air          181510 non-null  int32 
 8   Hot Water           181510 non-null  int32 
 9   Heat Pump           181510 non-null  int32 
 10  Other               181510 non-null  int32 
 11  binary_notforced    181510 non-null  int64 
 12  binary_forced       181510 non-null  int32 
 13  SqFtGarageAttached  181510 non-null  int32 
 14  SqFtGarageBasement  181510 non-null  int32 
 15  Bedrooms            181510 non-null  int32 
dtypes:

**Pricing data aggregation**

In [9]:
# Parcel_ID = df_rpsale['Major'].copy() + '-' + df_rpsale['Minor'].copy()
# Sale_ID   = Parcel_ID + '-' + df_rpsale['ExciseTaxNbr']
# sale = pd.concat([Parcel_ID, Sale_ID], axis=1)
# sale.columns = ['Parcel_ID', 'Sale_ID']
# sale['SalePrice'] = df_rpsale['SalePrice'].astype('int')
# sale['DocumentDate'] = df_rpsale['DocumentDate'].astype(np.datetime64)
# sale

## Narrow to Residential Homes

In [19]:
principal_use_codes = ['6']
property_class_codes = ['7', '8']
property_type_codes = ['2', '3', '6', '10', '11', '12', '13', '18', '19']

res_sales = df_rpsale.copy()

for code in principal_use_codes:
    res_sales['PrincipalUse'].replace(to_replace=code, value=np.nan, inplace=True)

for code in property_class_codes:
    res_sales['PropertyClass'].replace(to_replace=code, value=np.nan, inplace=True)

for code in property_type_codes:
    res_sales['PropertyType'].replace(to_replace=code, value=np.nan, inplace=True)

res_sales.dropna(inplace=True)
res_sales.reset_index(inplace=True, drop=True)
res_sales

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot,PlatBlock,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
0,2857301,516550,0560,03/30/2017,345000,20170406000260,,,,,,,BOZEMAN AARON,ARENDT CRAIG,14,2,3,,,,,1,3,
1,2963345,757480,0320,11/16/2018,0,20181126001127,,,,,,,ALSUBAYEE MOHANED,SADIK SAFAA T,14,2,15,N,N,N,N,1,3,
2,3031504,766620,3538,12/30/2019,0,20200128000956,,,,,,,NCEP V LLC,LEMMAN PETER R+DEAN+ET AL,51,7,15,N,N,N,N,18,2,
3,2937935,873178,0100,06/20/2018,185000,20180625000560,,,,,,,SIMONS LINDA J,PHAM ALISA,14,2,3,N,N,N,N,1,3,
4,2970575,866920,1280,01/16/2019,0,20190118000160,,,,,,,BARNETT KEVIN N+CHAPMAN CAROL H,BARNETT KEVIN N,14,2,3,N,N,N,N,1,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20324,2822785,069980,0400,09/11/2016,293700,20160916002231,,,,,,,JONES MEGAN,CHU ASHLEY L,14,2,3,,,,,1,3,
20325,2923804,720581,0240,04/06/2018,162955,20180410001307,,,,,,,ACCENT CONDO LLC,ROWLAND GWENDOLYN JEAN,14,2,3,N,N,N,N,1,3,
20326,2982719,389225,0180,04/15/2019,910000,,,,,,,,CHOUGULE TARUN+SHARMA PAVITRI,TANYERI MURAT+ESRA,14,2,3,N,N,N,N,1,3,
20327,3038910,026770,0310,03/12/2020,455000,20200317001112,,,,,,,MALIK ABDUL SATTAR,SHENDEROVICH JACOB M,14,2,2,N,N,N,N,1,3,


### Old work narrowing to residential homes

In [11]:
# principal_use = get_lookups(2, df_lookup) # Determine that '6' is the code for 'RESIDENTIAL'
# principal_use_residential = '6'
# res_sales = df_rpsale.loc[df_rpsale['PrincipalUse']==principal_use_residential].copy()
# res_sales

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot,PlatBlock,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
1,2743355,638580,0110,07/14/2015,190000,20150715002686,,,,,,,GINGRICH AUDREY B,NATION FRED I,3,6,3,N,N,N,N,1,8,15
3,2841697,894677,0240,12/21/2016,818161,20161228000896,,,,,,,QUADRANT CORPORATION,KANG JING,2,6,3,N,N,N,N,1,8,
5,2860712,408330,4150,03/15/2017,0,20170426000922,,,,,,,MALLAHAN MATTHEW JOSEPH,MALLAHAN CAROLYN S,3,6,15,N,N,N,N,11,8,18 31 38
6,2813396,510140,4256,07/25/2016,0,20160802000964,,,,,,,OBRANOVICH THOMAS M,OBRANOVICH THOMAS M+KAREN J,2,6,15,N,N,N,N,13,8,18 31 51
7,2899303,126320,0110,10/30/2017,375000,,,,,,,,ADAMS DANIEL J,SOLIS FRANSISCO FIGUEROA+MIRNA L MOLINA SANTOS,3,6,3,N,N,N,N,1,8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351062,2935450,219331,0270,06/07/2018,850000,20180612000762,,,,,,,LLANOS MIGUEL ANGEL III,NARAYANAN PRAKASH,11,6,3,N,N,N,N,1,8,
351063,2942886,886030,0550,07/16/2016,900000,20180720000629,,,,,,,PREUGSCHAT GARY N+SANDY K,POKAR NITIN+KINJABLBAHEN PATEL,3,6,3,N,N,N,N,1,8,
351064,2845806,769791,0030,07/16/2016,133000,20170125000407,,,,,,,SAY SORIN+ANG+DIEP JULIA,SAY SORIN,3,6,15,N,N,N,N,1,8,15 18 51
351065,2971374,924600,0650,01/02/2019,0,,,,,,,,LAMOURELLE REGINA REI+ALAIN P,LAMOURELLE ALAIN P+REGINA REI,11,6,3,N,N,N,N,1,8,


In [12]:
# property_type_codes = principal_use = get_lookups(1, df_lookup)

# prop_types = pd.DataFrame(res_sales['PropertyType'])
# prop_types['decoded'] = list(map(property_type_codes.get, res_sales['PropertyType']))
# prop_types.decoded.value_counts()

LAND WITH PREV USED BLDG                              140794
Household, single family units                         89288
LAND ONLY                                              12327
LAND WITH NEW BUILDING                                 11561
Land with new building                                  4509
Undeveloped land (land only)                            2365
LAND WITH MOBILE HOME                                   1780
Multiple family residence (Residential, 2-4 units)       712
Residential condominiums                                 467
TIMBER ONLY                                              339
Other undeveloped land                                   269
All other residential not elsewhere coded                125
Multiple family residence (Residential, 5+ units)         65
Open space land classified under chapter 84.34 RCW        61
Highway and street right of way                           53
Land with mobile home                                     41
BUILDING ONLY           

In [13]:
# keep_these_property_types = ['2', '3', '6', '10', '11', '12', '13', '18', '19']
# non_droppable_property_types = [prop_type in keep_these_property_types for prop_type in res_sales['PropertyType']]
# res_sales = res_sales.loc[non_droppable_property_types].copy()
# res_sales

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot,PlatBlock,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
1,2743355,638580,0110,07/14/2015,190000,20150715002686,,,,,,,GINGRICH AUDREY B,NATION FRED I,3,6,3,N,N,N,N,1,8,15
3,2841697,894677,0240,12/21/2016,818161,20161228000896,,,,,,,QUADRANT CORPORATION,KANG JING,2,6,3,N,N,N,N,1,8,
5,2860712,408330,4150,03/15/2017,0,20170426000922,,,,,,,MALLAHAN MATTHEW JOSEPH,MALLAHAN CAROLYN S,3,6,15,N,N,N,N,11,8,18 31 38
6,2813396,510140,4256,07/25/2016,0,20160802000964,,,,,,,OBRANOVICH THOMAS M,OBRANOVICH THOMAS M+KAREN J,2,6,15,N,N,N,N,13,8,18 31 51
7,2899303,126320,0110,10/30/2017,375000,,,,,,,,ADAMS DANIEL J,SOLIS FRANSISCO FIGUEROA+MIRNA L MOLINA SANTOS,3,6,3,N,N,N,N,1,8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351060,2755549,438920,1058,08/20/2015,1000000,20150914001478,,,,,,,PARKLAND HOMES INC,FALLING BROOK LLC,3,6,3,N,N,N,N,1,7,10 56
351062,2935450,219331,0270,06/07/2018,850000,20180612000762,,,,,,,LLANOS MIGUEL ANGEL III,NARAYANAN PRAKASH,11,6,3,N,N,N,N,1,8,
351063,2942886,886030,0550,07/16/2016,900000,20180720000629,,,,,,,PREUGSCHAT GARY N+SANDY K,POKAR NITIN+KINJABLBAHEN PATEL,3,6,3,N,N,N,N,1,8,
351064,2845806,769791,0030,07/16/2016,133000,20170125000407,,,,,,,SAY SORIN+ANG+DIEP JULIA,SAY SORIN,3,6,15,N,N,N,N,1,8,15 18 51


In [23]:
res_sales2 = res_sales
Parcel_ID = res_sales2['Major'].copy() + '-' + res_sales2['Minor'].copy()
Sale_ID   = Parcel_ID + '-' + res_sales2['ExciseTaxNbr']
# sale = pd.concat([Parcel_ID, Sale_ID], axis=1)
# sale.columns = ['Parcel_ID', 'Sale_ID']
# sale['SalePrice'] = df_rpsale['SalePrice'].astype('int')
# sale['DocumentDate'] = df_rpsale['DocumentDate'].astype(np.datetime64)
res_sales2 = pd.concat([Parcel_ID, Sale_ID, res_sales2], axis=1)
res_sales2

Unnamed: 0,0,1,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot,PlatBlock,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
1,638580-0110,638580-0110-2743355,2743355,638580,0110,07/14/2015,190000,20150715002686,,,,,,,GINGRICH AUDREY B,NATION FRED I,3,6,3,N,N,N,N,1,8,15
3,894677-0240,894677-0240-2841697,2841697,894677,0240,12/21/2016,818161,20161228000896,,,,,,,QUADRANT CORPORATION,KANG JING,2,6,3,N,N,N,N,1,8,
5,408330-4150,408330-4150-2860712,2860712,408330,4150,03/15/2017,0,20170426000922,,,,,,,MALLAHAN MATTHEW JOSEPH,MALLAHAN CAROLYN S,3,6,15,N,N,N,N,11,8,18 31 38
6,510140-4256,510140-4256-2813396,2813396,510140,4256,07/25/2016,0,20160802000964,,,,,,,OBRANOVICH THOMAS M,OBRANOVICH THOMAS M+KAREN J,2,6,15,N,N,N,N,13,8,18 31 51
7,126320-0110,126320-0110-2899303,2899303,126320,0110,10/30/2017,375000,,,,,,,,ADAMS DANIEL J,SOLIS FRANSISCO FIGUEROA+MIRNA L MOLINA SANTOS,3,6,3,N,N,N,N,1,8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351060,438920-1058,438920-1058-2755549,2755549,438920,1058,08/20/2015,1000000,20150914001478,,,,,,,PARKLAND HOMES INC,FALLING BROOK LLC,3,6,3,N,N,N,N,1,7,10 56
351062,219331-0270,219331-0270-2935450,2935450,219331,0270,06/07/2018,850000,20180612000762,,,,,,,LLANOS MIGUEL ANGEL III,NARAYANAN PRAKASH,11,6,3,N,N,N,N,1,8,
351063,886030-0550,886030-0550-2942886,2942886,886030,0550,07/16/2016,900000,20180720000629,,,,,,,PREUGSCHAT GARY N+SANDY K,POKAR NITIN+KINJABLBAHEN PATEL,3,6,3,N,N,N,N,1,8,
351064,769791-0030,769791-0030-2845806,2845806,769791,0030,07/16/2016,133000,20170125000407,,,,,,,SAY SORIN+ANG+DIEP JULIA,SAY SORIN,3,6,15,N,N,N,N,1,8,15 18 51
