##### Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import os, sys

path_to_src = os.path.join('..', '..', 'src')
sys.path.insert(1, path_to_src)
from custom_functions import *

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
%load_ext autoreload
%autoreload 2

##### Import datasets and packages

In [2]:
# Import lookups dictionary
path = os.path.join('..','..', 'data', 'raw', 'EXTR_LookUp.csv')
df_lookup = pd.read_csv(path, dtype='str')

# Import Buildings database
path = os.path.join('..','..', 'data', 'raw', 'EXTR_ResBldg.csv')
df_resbldg = pd.read_csv(path, dtype='str')

# Import Sales database
path = os.path.join('..','..', 'data', 'raw', 'EXTR_RpSale.csv')
df_rpsale = pd.read_csv(path, dtype='str')

##### Strip leading and trailing spaces

In [3]:
df_lookup = strip_spaces(df_lookup)
df_resbldg = strip_spaces(df_resbldg)
df_rpsale = strip_spaces(df_rpsale)

##### Drop columns that are obviously unnecessary

In [4]:
resbldg_desired = ['Major', 'Minor', 'NbrLivingUnits', 'Stories', 'BldgGrade', 
                   'BldgGradeVar', 'SqFt1stFloor', 'SqFtHalfFloor', 
                   'SqFt2ndFloor','SqFtUpperFloor', 'SqFtUnfinFull', 
                   'SqFtUnfinHalf', 'SqFtTotLiving', 'SqFtTotBasement', 
                   'SqFtFinBasement', 'FinBasementGrade', 'SqFtGarageBasement', 
                   'SqFtGarageAttached', 'DaylightBasement','SqFtOpenPorch', 
                   'SqFtEnclosedPorch', 'SqFtDeck', 'HeatSystem','HeatSource', 
                   'BrickStone', 'ViewUtilization', 'Bedrooms','BathHalfCount', 
                   'Bath3qtrCount', 'BathFullCount', 'FpSingleStory',
                   'FpMultiStory', 'FpFreestanding', 'FpAdditional', 'YrBuilt',
                   'YrRenovated', 'PcntComplete', 'Obsolescence', 
                   'PcntNetCondition','Condition']
df_resbldg = df_resbldg[resbldg_desired].copy()

rpsale_desired = ['ExciseTaxNbr', 'Major', 'Minor', 'DocumentDate', 
                  'RecordingNbr', 'PropertyType', 'PrincipalUse', 
                  'SaleInstrument', 'AFForestLand', 'AFCurrentUseLand', 
                  'AFNonProfitUse', 'AFHistoricProperty', 'SaleReason', 
                  'SalePrice', 'PropertyClass', 'SaleWarning']
df_rpsale = df_rpsale[rpsale_desired].copy()

# Create ParcelIDs
df_rpsale['Parcel_ID'] = df_rpsale.Major + '-' + df_rpsale.Minor
df_resbldg['Parcel_ID'] = df_resbldg.Major + '-' + df_resbldg.Minor

## Refine *Sales* DataFrame

##### Eliminate irrelevant property types, non-2019 sales, and add necessary helper columns

In [5]:
# Select only sales for "Residential" plots, corresponding to code #6, 
# as can be found in the data dictionary. This eliminates:
# Commerical, Condominium, Apartment, etc.
df_rpsale['PrincipalUse'] = elimination_by_code(df_rpsale['PrincipalUse'], '6')

# PropertyClass is another distinction between Commerical/Industrial
# and Residential, as well as other fundamental features. 
# Code #8 corresponds to Residential Improved property
df_rpsale['PropertyClass'] = elimination_by_code(df_rpsale['PropertyClass'], '8')

# Yet another classification of property type. Code #11 corresponds 
# to single family households. Here we eliminate multiple family residences, 
# alongside many commercial uses
df_rpsale['PropertyType'] = elimination_by_code(df_rpsale['PropertyType'], '11')
df_rpsale.dropna(inplace=True)

##### Eliminate non-2019 sales

In [6]:
res_sales['DocumentDate'] = res_sales.DocumentDate.astype(np.datetime64)
res_sales['SaleYear'] = [sale.year for sale in res_sales['DocumentDate']]
res_sales = res_sales.loc[res_sales['SaleYear']==2019].copy()

NameError: name 'res_sales' is not defined

##### Eliminate unrealistically small sales

In [None]:
min_acceptable_sale_price = 25000
res_sales['SalePrice'] = res_sales.SalePrice.astype('int')
res_sales = res_sales.loc[res_sales.SalePrice > min_acceptable_sale_price].copy()

##### Misc

In [None]:
# Drop sales determined to be invalid
res_sales.dropna(inplace=True)

# Create column to identify duplicates
res_sales['SaleCount'] = list(map(dict(res_sales.Parcel_ID.
                                       value_counts()).get, 
                                  res_sales.Parcel_ID))

**Remove sales for a given property that are not the most recent**

This prevents re-sale of homes from counting for multiple entries, which would overrepresent homes that tend to be resold within the final model. More importantly, it is a necessary step in order to join the Sales database with the Residential Building database

In [None]:
def identify_latest_sale(docdates, parcel_ids):
    latest_parcel_sale = []
    data = pd.DataFrame([docdates, parcel_ids]).T
    data.DocumentDate = data.DocumentDate.astype('datetime64')
 
    for i, parcel_id in enumerate(data.Parcel_ID):
        relevant_docdates = data.loc[data.Parcel_ID == parcel_id, 'DocumentDate']
        max_docdate = relevant_docdates.values.max()
        
        this_datetime = np.datetime64(data.iloc[i, 0]) 
        latest_parcel_sale.append(this_datetime == max_docdate)

    return latest_parcel_sale

tf = identify_latest_sale(res_sales.DocumentDate, res_sales.Parcel_ID)
latest_sales = res_sales.loc[tf].copy()
latest_sales['SaleCount'] = list(map(dict(latest_sales.Parcel_ID.
                                          value_counts()).get, 
                                     latest_sales.Parcel_ID))

**Determine average price for multiple sales on the same day on the same parcel**

This is a necessary step to joining the Sales database with the Residential Building database by removing duplicate entries without losing the valuable sales price data that would be lost by dropping a duplicate at random.

In [None]:
def avg_price_for_duped_parcels(data):
    dupes = data.loc[data.SaleCount > 1]
    for i, ind in enumerate(dupes.index):
        parcel_id = data.loc[ind, 'Parcel_ID']
        parcels_w_parcel_id = data.loc[data.Parcel_ID == parcel_id, 'SalePrice']

        avg_price_for_id = parcels_w_parcel_id.values.mean()
        for parcel_index in parcels_w_parcel_id.index:
            data.at[parcel_index, 'SalePrice'] = avg_price_for_id
    return data

# Average pricing for duplicates
latest_sales_averaged = avg_price_for_duped_parcels(latest_sales)
latest_sales_averaged['SaleCount'] = list(map(dict(latest_sales_averaged.
                                                   Parcel_ID.value_counts()).get, 
                                              latest_sales_averaged.Parcel_ID))


# Remove duplicates
latest_sales_averaged.index = latest_sales_averaged.Parcel_ID.values
latest_sales_averaged_deduped = latest_sales_averaged.drop_duplicates('Parcel_ID')
latest_sales_averaged_deduped.reset_index(inplace=True, drop=True)
latest_sales_averaged_deduped.index = latest_sales_averaged_deduped.Parcel_ID.values


# Drop unused columns as final step of cleaning before join
latest_sales_averaged_deduped_tokeep = ['SalePrice', 'Parcel_ID', 'PropertyType', 
                                        'PrincipalUse', 'SaleInstrument', 
                                        'AFForestLand', 'AFCurrentUseLand', 
                                        'AFNonProfitUse', 'AFHistoricProperty', 
                                        'SaleReason', 'PropertyClass', 'SaleWarning']
sales = latest_sales_averaged_deduped[latest_sales_averaged_deduped_tokeep].copy()
sales['SaleInstrument'] = sales.SaleInstrument.astype('int64')

sales.head(3)

In [None]:
sales.info()

# Handle Residential Building dataframe

Basic preparation of residential building database

In [None]:
convert_to_int = ['SqFtOpenPorch', 'SqFtEnclosedPorch', 'Bedrooms', 
                  'SqFtGarageAttached', 'SqFtGarageBasement', 'NbrLivingUnits', 
                  'BldgGrade', 'SqFt1stFloor','SqFtHalfFloor', 'SqFt2ndFloor', 
                  'SqFtUpperFloor', 'SqFtUnfinFull', 'SqFtUnfinHalf',
                  'SqFtTotLiving', 'SqFtTotBasement', 'SqFtFinBasement', 
                  'FinBasementGrade', 'SqFtGarageBasement', 'SqFtGarageAttached', 
                  'SqFtOpenPorch', 'SqFtEnclosedPorch', 'SqFtDeck', 
                  'BathHalfCount', 'Bath3qtrCount', 'BathFullCount', 
                  'FpSingleStory', 'FpMultiStory', 'FpFreestanding', 
                  'FpAdditional', 'YrBuilt', 'YrRenovated', 'BrickStone']
for category in convert_to_int:
    df_resbldg[category] = df_resbldg[category].astype('int')
    
convert_to_float = ['Stories']
for category in convert_to_float:
    df_resbldg[category] = df_resbldg[category].astype('float')
    
# Nit-picky
# Data cleaning for inconsistent casing
df_resbldg['DaylightBasement'] = df_resbldg['DaylightBasement'].str.upper() 

# Remove buildings that aren't complete
df_resbldg = df_resbldg.loc[df_resbldg.PcntComplete.astype('str') == '0'].copy() 

# Remove buildings in obsolescence process
df_resbldg = df_resbldg.loc[df_resbldg.Obsolescence.astype('str') == '0'].copy() 

# Remove 6 outliers in abnormal condition
df_resbldg = df_resbldg.loc[df_resbldg.PcntNetCondition.astype('str') == '0'].copy() 

### Join with SQL and Export

In [None]:
path_to_db = os.path.join('..', '..', 'data', 'processed', 'main.db')
conn = sqlite3.connect(path_to_db)
df_resbldg.to_sql('buildings', conn, if_exists='replace')
sales.to_sql('sales', conn, if_exists='replace')

q = ''' SELECT * FROM buildings
LEFT JOIN sales USING (Parcel_ID)'''

joined = pd.read_sql(q, conn)


keepers = ['SalePrice', 'NbrLivingUnits', 'Stories', 'BldgGrade', 
           'SqFt1stFloor', 'SqFtHalfFloor', 'SqFt2ndFloor', 'SqFtUpperFloor', 
           'SqFtUnfinFull', 'SqFtUnfinHalf', 'SqFtTotLiving', 'SqFtTotBasement', 
           'SqFtFinBasement', 'FinBasementGrade', 'SqFtGarageBasement', 
           'SqFtGarageAttached', 'DaylightBasement', 'SqFtOpenPorch', 
           'SqFtEnclosedPorch', 'SqFtDeck', 'HeatSystem', 'HeatSource', 
           'BrickStone', 'ViewUtilization', 'Bedrooms', 'BathHalfCount', 
           'Bath3qtrCount', 'BathFullCount', 'FpSingleStory', 'FpMultiStory', 
           'FpFreestanding', 'FpAdditional', 'YrBuilt',  'YrRenovated', 
           'Condition', 'SaleInstrument']
df_main = joined[keepers].copy()


df_main.dropna(inplace=True)
df_main.reset_index(inplace=True, drop=True)

df_main['SalePrice'] = df_main['SalePrice'].astype('int64')
df_main['SaleInstrument'] = df_main['SaleInstrument'].astype('int64')

YN_converter = lambda x: 1 if ((x == 'Y')|(x==1)) else 
                         0 if ((x == 'N')|(x==0)) else np.nan

# NOTE THAT THESE CAUSES LOTS OF NA'S!
df_main.DaylightBasement = df_main.DaylightBasement.apply(YN_converter) 
df_main.ViewUtilization = df_main.ViewUtilization.apply(YN_converter)

# Store primary dataframe in SQL database
df_main.to_sql('step1_aggregated', conn, if_exists='replace')


# Store the lookup codes in the SQL database in case they are needed downstream
df_lookup.to_sql('lookups', conn, if_exists='replace')

conn.close()