Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)

Load the raw datasets (every column is read as text)

In [2]:
def _read_raw(path):
    """Read one raw extract, keeping every column as text."""
    return pd.read_csv(path, dtype='str')

# All four extracts are loaded the same way; values stay as strings so codes
# keep their exact on-disk form.
df_lookup = _read_raw('../../data/raw/EXTR_LookUp.csv')
df_parcel = _read_raw('../../data/raw/EXTR_PARCEL.csv')
df_resbldg = _read_raw('../../data/raw/EXTR_ResBldg.csv')
df_rpsale = _read_raw('../../data/raw/EXTR_RPSale.csv')

Strip leading and trailing spaces

In [3]:
def strip_spaces(df):
    """Return a copy of ``df`` with surrounding whitespace removed from text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose text columns should be cleaned. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index. Object/string columns have
        leading and trailing whitespace stripped; every other column is passed
        through unchanged (the ``.str`` accessor is only valid on text data).
    """
    # Work on a copy so re-running the calling cell never depends on a frame
    # that was already modified by a previous run.
    result = df.copy()
    for col in result.columns:
        column = result[col]
        is_text = pd.api.types.is_object_dtype(column) or isinstance(column.dtype, pd.StringDtype)
        if is_text:
            result[col] = column.str.strip()
    return result

# Clean all four raw frames in one pass; each name is rebound to the frame
# returned by strip_spaces.
df_lookup, df_parcel, df_resbldg, df_rpsale = [
    strip_spaces(frame)
    for frame in (df_lookup, df_parcel, df_resbldg, df_rpsale)
]

Define function that drops non-positive values and then keeps only the values lying within *x* standard deviations of the median

In [4]:
def remove_extremes(data, devct):
    """Drop non-positive values, then values far from the median.

    Parameters
    ----------
    data : iterable
        Values convertible with ``float()``.
    devct : number
        Width of the accepted band, in standard deviations either side of the
        median. Both bounds are exclusive.

    Returns
    -------
    pandas.Series
        Float series of the surviving values, indexed by their position in
        ``data``. The median and standard deviation are computed on the
        positive values only.
    """
    values = pd.Series(list(map(float, data)))
    positive = values[values > 0]

    center = positive.median()
    spread = positive.std() * devct

    lower, upper = center - spread, center + spread
    inside_band = positive.gt(lower) & positive.lt(upper)
    return positive.loc[inside_band].copy()

Define function to retrieve a dictionary mapping lookup item codes to their descriptions for a given lookup type

In [5]:
def get_lookups(LUType):
    """Return ``{LUItem: LUDescription}`` for one lookup type.

    Parameters
    ----------
    LUType : str or int
        Lookup type; it is converted to ``str`` before being compared with the
        ``LUType`` column of the module-level ``df_lookup`` frame.

    Returns
    -------
    dict
        Item codes (whitespace-stripped) mapped to descriptions, inserted in
        ascending ``LUItem`` order.
    """
    wanted = str(LUType)

    matches = df_lookup[df_lookup['LUType'] == wanted].sort_values(by='LUItem')
    codes = matches['LUItem'].str.strip()
    return {code: description for code, description in zip(codes, matches['LUDescription'])}

Define function to one-hot encode and rename columns

In [6]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` (the old keyword was
# removed in 1.4); fall back to the old spelling on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='first')

def onehot(srs, prefix='x', name_lookup=False):
    """One-hot encode ``srs`` and give the indicator columns readable names.

    The module-level ``ohe`` encoder is refit on every call. It is configured
    with ``drop='first'``, so the first category of the input gets no column.

    Parameters
    ----------
    srs : pandas.Series
        Categorical values to encode.
    prefix : str, default 'x'
        Used when no lookup is given: columns are named ``<prefix>_<code>``.
    name_lookup : mapping or False, default False
        Optional mapping from category code (as ``str``) to column name. Every
        retained code must be present in the mapping.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicator columns, one per retained category, on a fresh
        default integer index (the index of ``srs`` is not carried over).

    Raises
    ------
    KeyError
        If ``name_lookup`` is given and lacks a retained category code.
    """
    encoded = ohe.fit_transform(pd.DataFrame(srs))

    # Read the retained codes from the fitted encoder instead of slicing the
    # generated feature names: ``get_feature_names`` was removed in
    # scikit-learn 1.2 and its replacement uses a different name prefix.
    drop_idx = getattr(ohe, 'drop_idx_', None)
    codes = []
    for feature_pos, categories in enumerate(ohe.categories_):
        dropped = None if drop_idx is None else drop_idx[feature_pos]
        codes.extend(
            str(category)
            for cat_pos, category in enumerate(categories)
            if cat_pos != dropped
        )

    # Identity checks avoid the ambiguous ``== False`` comparison when the
    # lookup is an array-like object such as a pandas Series.
    if name_lookup is False or name_lookup is None:
        new_names = [prefix + '_' + code for code in codes]
    else:
        new_names = [name_lookup[code] for code in codes]

    return pd.DataFrame(encoded, columns=new_names).astype('int')

**Create DataFrame for Porch metrics**

In [7]:
# Build the porch table in one step: the parcel key joins Major and Minor with
# a hyphen, and both square-footage columns are converted from text to integers.
porch = pd.DataFrame({
    'Parcel_ID': df_resbldg['Major'] + '-' + df_resbldg['Minor'],
    'SqFtOpenPorch': df_resbldg['SqFtOpenPorch'].astype('int'),
    'SqFtEnclosedPorch': df_resbldg['SqFtEnclosedPorch'].astype('int'),
})
porch

Unnamed: 0,Parcel_ID,SqFtOpenPorch,SqFtEnclosedPorch
0,009800-0720,0,0
1,009802-0140,380,0
2,009830-0020,360,0
3,009830-0160,690,0
4,010050-0180,60,0
...,...,...,...
181505,197220-1902,0,0
181506,197220-2224,0,0
181507,197220-5172,0,0
181508,197220-5173,0,0


**Create DataFrame for Heating metrics**

In [8]:
# One indicator column per heating system, named through the LUType 108 lookups.
heating_flags = onehot(df_resbldg['HeatSystem'], name_lookup=get_lookups(108))

# Collapse the indicators into a forced-air / not-forced-air pair.
heating = heating_flags.assign(
    binary_notforced=heating_flags.drop(columns='Forced Air').sum(axis=1),
    binary_forced=heating_flags['Forced Air'],
)

**Bedroom and garage specs**

In [9]:
# Both garage areas and the bedroom count are stored as text; cast to integers.
garage = df_resbldg.loc[:, ['SqFtGarageAttached', 'SqFtGarageBasement']].astype('int')
bedrooms = df_resbldg.loc[:, 'Bedrooms'].astype('int')

**Aggregate Porch, Heating and Bedroom/Garage metrics**

In [10]:
# Place the four pieces side by side; concat matches rows on their index.
porch_heating_rooms = pd.concat(
    (porch, heating, garage, bedrooms),
    axis='columns',
)
porch_heating_rooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181510 entries, 0 to 181509
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Parcel_ID           181510 non-null  object
 1   SqFtOpenPorch       181510 non-null  int32 
 2   SqFtEnclosedPorch   181510 non-null  int32 
 3   Floor-Wall          181510 non-null  int32 
 4   Gravity             181510 non-null  int32 
 5   Radiant             181510 non-null  int32 
 6   Elec BB             181510 non-null  int32 
 7   Forced Air          181510 non-null  int32 
 8   Hot Water           181510 non-null  int32 
 9   Heat Pump           181510 non-null  int32 
 10  Other               181510 non-null  int32 
 11  binary_notforced    181510 non-null  int64 
 12  binary_forced       181510 non-null  int32 
 13  SqFtGarageAttached  181510 non-null  int32 
 14  SqFtGarageBasement  181510 non-null  int32 
 15  Bedrooms            181510 non-null  int32 
dtypes:

**Pricing data aggregation**

In [42]:
# NOTE(review): the previous version of this cell assigned into a `sale` frame
# that no earlier cell defines, so it failed on Restart & Run All. The table is
# now built directly from df_rpsale, which supplies every column used here --
# confirm that no row filtering was expected before this step.
sale_parcel_id = df_rpsale['Major'] + '-' + df_rpsale['Minor']

# Building a fresh frame (instead of assigning into a column subset) avoids
# SettingWithCopyWarning, and pd.to_datetime replaces the unit-less
# `astype(np.datetime64)` cast that pandas 2.x rejects.
sale = pd.DataFrame({
    'Sale_ID': sale_parcel_id + '-' + df_rpsale['ExciseTaxNbr'],
    'Parcel_ID': sale_parcel_id,
    'SalePrice': df_rpsale['SalePrice'].astype('int'),
    'DocumentDate': pd.to_datetime(df_rpsale['DocumentDate']),
})
sale

Unnamed: 0,Sale_ID,Parcel_ID,SalePrice,DocumentDate
0,198920-1430-2857854,198920-1430,0,2017-03-28
1,638580-0110-2743355,638580-0110,190000,2015-07-14
2,919715-0200-2999169,919715-0200,192000,2019-07-08
3,894677-0240-2841697,894677-0240,818161,2016-12-21
4,445872-0260-2826129,445872-0260,0,2016-10-03
...,...,...,...,...
351062,219331-0270-2935450,219331-0270,850000,2018-06-07
351063,886030-0550-2942886,886030-0550,900000,2016-07-16
351064,769791-0030-2845806,769791-0030,133000,2016-07-16
351065,924600-0650-2971374,924600-0650,0,2019-01-02
