Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import os, sys

path_to_src = os.path.join('..', '..', 'src')
sys.path.insert(1, path_to_src)
from custom_functions import *

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
%load_ext autoreload
%autoreload 2

Import datasets and packages

In [2]:
df_lookup = pd.read_csv(os.path.join('..','..', 'data', 'raw', 'EXTR_LookUp.csv'), dtype='str')
df_resbldg = pd.read_csv(os.path.join('..','..', 'data', 'raw', 'EXTR_ResBldg.csv'), dtype='str')
df_rpsale = pd.read_csv(os.path.join('..','..', 'data', 'raw', 'EXTR_RpSale.csv'), dtype='str')

Strip leading and trailing spaces

In [3]:
df_lookup = strip_spaces(df_lookup)
df_resbldg = strip_spaces(df_resbldg)
df_rpsale = strip_spaces(df_rpsale)

Drop columns that are obviously unnecessary

In [4]:
resbldg_desired_columns = ['Major', 'Minor', 'NbrLivingUnits', 'Stories', 'BldgGrade', 
                           'BldgGradeVar', 'SqFt1stFloor', 'SqFtHalfFloor', 'SqFt2ndFloor',
                           'SqFtUpperFloor', 'SqFtUnfinFull', 'SqFtUnfinHalf', 'SqFtTotLiving', 'SqFtTotBasement', 
                           'SqFtFinBasement', 'FinBasementGrade', 'SqFtGarageBasement', 'SqFtGarageAttached', 
                           'DaylightBasement','SqFtOpenPorch', 'SqFtEnclosedPorch', 'SqFtDeck', 'HeatSystem',
                           'HeatSource', 'BrickStone', 'ViewUtilization', 'Bedrooms','BathHalfCount', 
                           'Bath3qtrCount', 'BathFullCount', 'FpSingleStory','FpMultiStory', 'FpFreestanding', 
                           'FpAdditional', 'YrBuilt','YrRenovated', 'PcntComplete', 'Obsolescence', 
                           'PcntNetCondition','Condition']
df_resbldg = df_resbldg[resbldg_desired_columns].copy()

rpsale_desired_columns = ['ExciseTaxNbr', 'Major', 'Minor', 'DocumentDate', 'SalePrice', 'RecordingNbr', 'PropertyType', 
                          'PrincipalUse', 'SaleInstrument', 'AFForestLand', 'AFCurrentUseLand', 'AFNonProfitUse', 
                          'AFHistoricProperty', 'SaleReason', 'PropertyClass', 'SaleWarning']
df_rpsale = df_rpsale[rpsale_desired_columns]

# Create ParcelIDs
df_rpsale['Parcel_ID'] = df_rpsale.Major + '-' + df_rpsale.Minor
df_resbldg['Parcel_ID'] = df_resbldg.Major + '-' + df_resbldg.Minor

## Refine *Sales* DataFrame

**Eliminate irrelevant property types, non-2019 sales, and add necessary helper columns**

In [5]:
res_sales = df_rpsale.copy()

principal_use_codes_to_keep = ['6'] # Consider investigating '4' too 
principal_use_codes_to_drop = np.setdiff1d(res_sales.PrincipalUse.unique(), principal_use_codes_to_keep)

property_class_codes_to_keep = ['8']
property_class_codes_to_drop = np.setdiff1d(res_sales.PropertyClass.unique(), property_class_codes_to_keep)

# Need to consider where to classify codes like 2,3 since there isn't an explicit statement
#    - Possible to disregard entirely, explore data first
# Consider looking into single-family and multiple-family separately
property_type_codes_to_keep =['11']# ['2', '3', '6', '10', '11', '12', '13', '18', '19']
property_type_codes_to_drop = np.setdiff1d(res_sales.PropertyType.unique(), property_type_codes_to_keep)

for code in principal_use_codes_to_drop:
    res_sales['PrincipalUse'].replace(to_replace=code, value=np.nan, inplace=True)

for code in property_class_codes_to_drop:
    res_sales['PropertyClass'].replace(to_replace=code, value=np.nan, inplace=True)

for code in property_type_codes_to_drop:
    res_sales['PropertyType'].replace(to_replace=code, value=np.nan, inplace=True)

## CREATE PARCEL ID
#res_sales['Parcel_ID'] = res_sales.Major + '-' + res_sales.Minor



# KEEP ONLY 2019 SALES
res_sales['DocumentDate'] = res_sales.DocumentDate.astype(np.datetime64)
res_sales['SaleYear'] = [sale.year for sale in res_sales['DocumentDate']]
res_sales = res_sales.loc[res_sales['SaleYear']==2019].copy()


# ELIMINATE UNREALISTICALLY SMALL SALES
min_acceptable_sale_price = 25000
res_sales['SalePrice'] = res_sales.SalePrice.astype('int')
#res_sales['SalePrice'].replace(0, np.nan, inplace=True)
res_sales = res_sales.loc[res_sales.SalePrice > min_acceptable_sale_price].copy()

# DROP SALES DETERMINED TO BE INVALID
res_sales.dropna(inplace=True)


# CREATE COLUMN TO IDENTIFY DUPLICATES
res_sales['SaleCount'] = list(map(dict(res_sales.Parcel_ID.value_counts()).get, res_sales.Parcel_ID))

**Remove sales for a given property that are not the most recent**

This prevents re-sale of homes from counting for multiple entries, which would overrepresent homes that tend to be resold within the final model. More importantly, it is a necessary step in order to join the Sales database with the Residential Building database

In [6]:
def identify_latest_sale(docdates, parcel_ids):
    latest_parcel_sale = []
    data = pd.DataFrame([docdates, parcel_ids]).T
    data.DocumentDate = data.DocumentDate.astype('datetime64')
 
    for i, parcel_id in enumerate(data.Parcel_ID):
        relevant_docdates = data.loc[data.Parcel_ID == parcel_id, 'DocumentDate']
        max_docdate = relevant_docdates.values.max()
        
        this_datetime = np.datetime64(data.iloc[i, 0]) 
        latest_parcel_sale.append(this_datetime == max_docdate)

    return latest_parcel_sale

tf = identify_latest_sale(res_sales.DocumentDate, res_sales.Parcel_ID)
latest_sales = res_sales.loc[tf].copy()
latest_sales['SaleCount'] = list(map(dict(latest_sales.Parcel_ID.value_counts()).get, latest_sales.Parcel_ID))

**Determine average price for multiple sales on the same day on the same parcel**

This is a necessary step to joining the Sales database with the Residential Building database by removing duplicate entries without losing the valuable sales price data that would be lost by dropping a duplicate at random.

In [7]:
def avg_price_for_duped_parcels(data):
    dupes = data.loc[data.SaleCount > 1]
    for i, ind in enumerate(dupes.index):
        parcel_id = data.loc[ind, 'Parcel_ID']
        parcels_w_parcel_id = data.loc[data.Parcel_ID == parcel_id, 'SalePrice']

        avg_price_for_id = parcels_w_parcel_id.values.mean()
        for parcel_index in parcels_w_parcel_id.index:
            data.at[parcel_index, 'SalePrice'] = avg_price_for_id
    return data

# Average pricing for duplicates
latest_sales_averaged = avg_price_for_duped_parcels(latest_sales)
latest_sales_averaged['SaleCount'] = list(map(dict(latest_sales_averaged.Parcel_ID.value_counts()).get, latest_sales_averaged.Parcel_ID))


# Remove duplicates
latest_sales_averaged.index = latest_sales_averaged.Parcel_ID.values
latest_sales_averaged_deduped = latest_sales_averaged.drop_duplicates('Parcel_ID')
latest_sales_averaged_deduped.reset_index(inplace=True, drop=True)
latest_sales_averaged_deduped.index = latest_sales_averaged_deduped.Parcel_ID.values


# Drop unused columns as final step of cleaning before join
latest_sales_averaged_deduped_tokeep = ['SalePrice', 'Parcel_ID', 'PropertyType', 'PrincipalUse', 
                                        'SaleInstrument', 'AFForestLand', 'AFCurrentUseLand', 
                                        'AFNonProfitUse', 'AFHistoricProperty', 'SaleReason', 
                                        'PropertyClass', 'SaleWarning']
sales = latest_sales_averaged_deduped[latest_sales_averaged_deduped_tokeep].copy()
sales['SaleInstrument'] = sales.SaleInstrument.astype('int64')


sales.head(3)

Unnamed: 0,SalePrice,Parcel_ID,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
213043-0120,560000,213043-0120,11,6,3,N,N,N,N,1,8,
940652-0630,435000,940652-0630,11,6,3,N,N,N,N,1,8,
347050-0040,648500,347050-0040,11,6,3,N,N,N,N,1,8,


In [8]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18228 entries, 213043-0120 to 033310-0255
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   SalePrice           18228 non-null  int32 
 1   Parcel_ID           18228 non-null  object
 2   PropertyType        18228 non-null  object
 3   PrincipalUse        18228 non-null  object
 4   SaleInstrument      18228 non-null  int64 
 5   AFForestLand        18228 non-null  object
 6   AFCurrentUseLand    18228 non-null  object
 7   AFNonProfitUse      18228 non-null  object
 8   AFHistoricProperty  18228 non-null  object
 9   SaleReason          18228 non-null  object
 10  PropertyClass       18228 non-null  object
dtypes: int32(1), int64(1), object(10)
memory usage: 1.7+ MB


# Handle Residential Building dataframe

Basic preparation of residential building database

In [9]:
convert_to_int = ['SqFtOpenPorch', 'SqFtEnclosedPorch', 'Bedrooms', 'SqFtGarageAttached', 'SqFtGarageBasement', 'NbrLivingUnits', 'BldgGrade', 'SqFt1stFloor',
                  'SqFtHalfFloor', 'SqFt2ndFloor', 'SqFtUpperFloor', 'SqFtUnfinFull', 'SqFtUnfinHalf','SqFtTotLiving', 'SqFtTotBasement', 'SqFtFinBasement', 
                  'FinBasementGrade', 'SqFtGarageBasement', 'SqFtGarageAttached', 'SqFtOpenPorch', 'SqFtEnclosedPorch', 'SqFtDeck', 'BathHalfCount', 
                  'Bath3qtrCount', 'BathFullCount', 'FpSingleStory', 'FpMultiStory', 'FpFreestanding', 'FpAdditional', 'YrBuilt', 'YrRenovated', 'BrickStone']
for category in convert_to_int:
    df_resbldg[category] = df_resbldg[category].astype('int')
    
convert_to_float = ['Stories']
for category in convert_to_float:
    df_resbldg[category] = df_resbldg[category].astype('float')
    
# Nit-picky
df_resbldg['DaylightBasement'] = df_resbldg['DaylightBasement'].str.upper() # Data cleaning for inconsistent casing
df_resbldg = df_resbldg.loc[df_resbldg.PcntComplete.astype('str') == '0'].copy() # Remove buildings that aren't complete
df_resbldg = df_resbldg.loc[df_resbldg.Obsolescence.astype('str') == '0'].copy() # Remove buildings in obsolescence process
df_resbldg = df_resbldg.loc[df_resbldg.PcntNetCondition.astype('str') == '0'].copy() # Remove 6 outliers in abnormal condition

### Join with SQL and Export

In [11]:
path_to_db = os.path.join('..', '..', 'data', 'main.db')
conn = sqlite3.connect(path_to_db)
df_resbldg.to_sql('buildings', conn, if_exists='replace')
sales.to_sql('sales', conn, if_exists='replace')

q = ''' SELECT * FROM buildings
LEFT JOIN sales USING (Parcel_ID)'''

joined = pd.read_sql(q, conn)


keepers = ['SalePrice', 'NbrLivingUnits', 'Stories', 'BldgGrade', 'SqFt1stFloor', 'SqFtHalfFloor', 'SqFt2ndFloor', 
'SqFtUpperFloor', 'SqFtUnfinFull', 'SqFtUnfinHalf', 'SqFtTotLiving', 'SqFtTotBasement', 'SqFtFinBasement', 'FinBasementGrade', 'SqFtGarageBasement', 
'SqFtGarageAttached', 'DaylightBasement', 'SqFtOpenPorch', 'SqFtEnclosedPorch', 'SqFtDeck', 'HeatSystem', 'HeatSource', 'BrickStone', 'ViewUtilization', 
'Bedrooms', 'BathHalfCount', 'Bath3qtrCount', 'BathFullCount', 'FpSingleStory', 'FpMultiStory', 'FpFreestanding', 'FpAdditional', 'YrBuilt', 
 'YrRenovated', 'Condition', 'SaleInstrument']
df_main = joined[keepers].copy()


df_main.dropna(inplace=True)
df_main.reset_index(inplace=True, drop=True)



df_main['SalePrice'] = df_main['SalePrice'].astype('int64')
df_main['SaleInstrument'] = df_main['SaleInstrument'].astype('int64')

YN_converter = lambda x: 1 if ((x == 'Y')|(x==1)) else 0 if ((x == 'N')|(x==0)) else np.nan
df_main.DaylightBasement = df_main.DaylightBasement.apply(YN_converter) # NOTE THAT THIS CAUSES LOTS OF NA'S!
df_main.ViewUtilization = df_main.ViewUtilization.apply(YN_converter) # NOTE THAT THIS CAUSES LOTS OF NA'S!

# Store primary dataframe in SQL database
df_main.to_sql('step1_aggregated', conn, if_exists='replace')


# Store the lookup codes in the SQL database in case they are needed downstream
df_lookup.to_sql('lookups', conn, if_exists='replace')

conn.close()