##### Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import os, sys

path_to_src = os.path.join("..", "..", "src")
sys.path.insert(1, path_to_src)
from custom_functions import *

pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
%load_ext autoreload
%autoreload 2

##### Import datasets

In [2]:
# Both raw extracts sit in the same directory and are read with every
# column as text (dtype="str"). Each frame is passed through strip_spaces
# (from custom_functions) right after loading to strip leading and
# trailing spaces.
raw_dir = os.path.join("..", "..", "data", "raw")

# Buildings database
path = os.path.join(raw_dir, "EXTR_ResBldg.csv")
df_resbldg = strip_spaces(pd.read_csv(path, dtype="str"))

# Sales database
path = os.path.join(raw_dir, "EXTR_RpSale.csv")
df_rpsale = strip_spaces(pd.read_csv(path, dtype="str"))

### The Sales Database

##### Eliminate unnecessary data. After close investigation, the columns below were deemed the most worthy of continued analysis.

In [3]:
# Hand-picked Sales columns that are carried forward
rpsale_desired = [
    "ExciseTaxNbr", "Major", "Minor", "DocumentDate",
    "SalePrice", "RecordingNbr", "PropertyType", "PrincipalUse",
    "SaleInstrument", "AFForestLand", "AFCurrentUseLand", "AFNonProfitUse",
    "AFHistoricProperty", "SaleReason", "PropertyClass", "SaleWarning",
]

# Keep only the columns named above; .copy() detaches the result from the
# full frame it was selected from
df_rpsale = df_rpsale.loc[:, rpsale_desired].copy()

##### Create identifier that will be used to connect the two dataframes. 
In this case, each database provides *Major* and *Minor*, which serve as location-specific identifiers. From here on, the combination of *Major* and *Minor* will simply be referred to as the *parcel*. Although there is often more than one sale associated with a parcel, this is a great place to start for narrowing down our search. The goal is to narrow down the *Sales* dataset to include only one sale per parcel. This allows for a connection with the second database, *Residential Buildings*. 

In [4]:
# Parcel_ID is Major and Minor joined with a hyphen (e.g. "213043-0120")
df_rpsale["Parcel_ID"] = df_rpsale["Major"] + "-" + df_rpsale["Minor"]

##### Some of the nitty gritty data selection

**PrincipalUse:** Select only sales for "Residential" plots, corresponding to code #6 in the data dictionary. This eliminates Commercial, Condominium, Apartment, etc.

**PropertyClass:** PropertyClass is another distinction between Commercial/Industrial and Residential, as well as other fundamental features. Code #8 corresponds to Residential Improved property.

**PropertyType:** Yet another classification of property type. Code #11 corresponds to single-family households. Here we eliminate multiple-family residences, alongside many commercial uses.


In [5]:
# Column -> code to keep, as described in the text box above.
# elimination_by_code comes from custom_functions; presumably it blanks out
# (NaN) every value that does not match the given code -- confirm there.
codes_to_keep = {
    "PrincipalUse": "6",
    "PropertyClass": "8",
    "PropertyType": "11",
}
for column, code in codes_to_keep.items():
    df_rpsale[column] = elimination_by_code(df_rpsale[column], code)

# Discard every row that has a missing value in any column
df_rpsale = df_rpsale.dropna()

##### Limit scope to 2019 sales

In [6]:
# Parse the document dates into datetimes. pd.to_datetime is used instead of
# .astype(np.datetime64): newer pandas releases reject casts to a unit-less
# datetime64 dtype, and to_datetime is the supported way to parse strings.
# NOTE(review): the date format is inferred by pandas; pass format=... here
# if the extract's format is known and fixed.
df_rpsale["DocumentDate"] = pd.to_datetime(df_rpsale["DocumentDate"])

# Isolate the sale year as its own column (vectorised .dt accessor rather
# than a Python-level loop over every timestamp)
df_rpsale["SaleYear"] = df_rpsale["DocumentDate"].dt.year

# Keep only the rows whose sale year is 2019
df_rpsale = df_rpsale.loc[df_rpsale["SaleYear"] == 2019].copy()

##### Eliminate unrealistically small sales

In [7]:
# Sales at or below this price are treated as unrealistic and removed
min_acceptable_sale_price = 25000

# SalePrice was loaded as text; convert it before comparing
df_rpsale["SalePrice"] = df_rpsale["SalePrice"].astype("int")

above_minimum = df_rpsale["SalePrice"] > min_acceptable_sale_price
df_rpsale = df_rpsale.loc[above_minimum].copy()

##### Create column to identify duplicates, a necessary process before combining the two datasets

In [8]:
# For every row, record how many rows share its Parcel_ID
sales_per_parcel = df_rpsale["Parcel_ID"].value_counts()
df_rpsale["SaleCount"] = df_rpsale["Parcel_ID"].map(sales_per_parcel)

##### Upon further inspection, there are still duplicates (cases of more than one sale of a given parcel in 2019)

In [9]:
# Number of rows at each SaleCount value
print(df_rpsale["SaleCount"].value_counts())

1    17884
2      682
6        6
3        6
Name: SaleCount, dtype: int64


##### Eliminate sales that are not the most recent
This eliminates the duplicated data issue of multiple sales in one year, bringing the dataset one step closer to a unique Parcel ID

In [10]:
# identify_latest_sale (custom_functions) is described as returning a
# True/False flag per row saying whether that sale is the most recent one
# for its parcel; only the flagged rows are kept.
true_false = identify_latest_sale(
    df_rpsale["DocumentDate"], df_rpsale["Parcel_ID"]
)
df_rpsale = df_rpsale.loc[true_false].copy()

# Refresh SaleCount now that older sales are gone, then show how many rows
# fall at each count
sales_per_parcel = df_rpsale["Parcel_ID"].value_counts()
df_rpsale["SaleCount"] = df_rpsale["Parcel_ID"].map(sales_per_parcel)
print(df_rpsale["SaleCount"].value_counts())

1    18152
2      152
Name: SaleCount, dtype: int64


##### The great majority of parcels are narrowed to one sale. For the remainders, take a simple approach: average the SalePrice for all sales on that parcel. 
Further inspection was done to verify that this is a valid way of dealing with outliers. For example, in many cases, the sales are of equal or nearly equal price. 

In [11]:
# avg_price_for_duped_parcels (custom_functions) is described as returning
# the average SalePrice for each parcel that still has more than one sale
df_rpsale = avg_price_for_duped_parcels(df_rpsale)

# Recount rows per parcel on the returned frame
sales_per_parcel = df_rpsale["Parcel_ID"].value_counts()
df_rpsale["SaleCount"] = df_rpsale["Parcel_ID"].map(sales_per_parcel)

# Index the frame by parcel, then keep only the first row for each Parcel_ID
df_rpsale.index = df_rpsale["Parcel_ID"].values
df_rpsale = df_rpsale.drop_duplicates(subset="Parcel_ID")

##### As a final step to cleaning *Sales* dataset, eliminate columns that are of no use going forward

In [12]:
# Last cleaning step before the join: keep only the columns still needed
cols_to_keep = [
    "SalePrice", "Parcel_ID", "PropertyType", "PrincipalUse",
    "SaleInstrument", "AFForestLand", "AFCurrentUseLand", "AFNonProfitUse",
    "AFHistoricProperty", "SaleReason", "PropertyClass", "SaleWarning",
]
df_rpsale = df_rpsale.loc[:, cols_to_keep].copy()

# Preview the first five rows
df_rpsale.head(5)

Unnamed: 0,SalePrice,Parcel_ID,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
213043-0120,560000,213043-0120,11,6,3,N,N,N,N,1,8,
940652-0630,435000,940652-0630,11,6,3,N,N,N,N,1,8,
347050-0040,648500,347050-0040,11,6,3,N,N,N,N,1,8,
345960-0230,1255000,345960-0230,11,6,3,N,N,N,N,1,8,40.0
030200-0645,380000,030200-0645,11,6,3,N,N,N,N,1,8,10.0


### The Buildings Database
This dataset requires far less cleaning than *Sales* did. In this case, it is mostly eliminating columns that have been determined not valuable as well as converting to correct datatypes.

##### Eliminate unnecessary data. After close investigation, the columns below were deemed the most worthy of continued analysis.

In [13]:
# Hand-picked Buildings columns that are carried forward
resbldg_desired = [
    "Major", "Minor", "NbrLivingUnits", "Stories",
    "BldgGrade", "BldgGradeVar", "SqFt1stFloor", "SqFtHalfFloor",
    "SqFt2ndFloor", "SqFtUpperFloor", "SqFtUnfinFull", "SqFtUnfinHalf",
    "SqFtTotLiving", "SqFtTotBasement", "SqFtFinBasement", "FinBasementGrade",
    "SqFtGarageBasement", "SqFtGarageAttached", "DaylightBasement", "SqFtOpenPorch",
    "SqFtEnclosedPorch", "SqFtDeck", "HeatSystem", "HeatSource",
    "BrickStone", "ViewUtilization", "Bedrooms", "BathHalfCount",
    "Bath3qtrCount", "BathFullCount", "FpSingleStory", "FpMultiStory",
    "FpFreestanding", "FpAdditional", "YrBuilt", "YrRenovated",
    "PcntComplete", "Obsolescence", "PcntNetCondition", "Condition",
]

# Keep only the columns named above; .copy() detaches the result from the
# full frame it was selected from
df_resbldg = df_resbldg.loc[:, resbldg_desired].copy()

##### Create Parcel ID, which will map to Sales database 

In [14]:
# Same key as in the Sales data: Major and Minor joined with a hyphen
df_resbldg["Parcel_ID"] = df_resbldg["Major"] + "-" + df_resbldg["Minor"]

##### Convert features to the appropriate data type

In [15]:
# Columns to convert from text to integers. Each column appears exactly
# once: the previous list repeated SqFtOpenPorch, SqFtEnclosedPorch,
# SqFtGarageAttached and SqFtGarageBasement, which only re-cast columns
# that were already converted.
convert_to_int = [
    "SqFtOpenPorch",
    "SqFtEnclosedPorch",
    "Bedrooms",
    "SqFtGarageAttached",
    "SqFtGarageBasement",
    "NbrLivingUnits",
    "BldgGrade",
    "SqFt1stFloor",
    "SqFtHalfFloor",
    "SqFt2ndFloor",
    "SqFtUpperFloor",
    "SqFtUnfinFull",
    "SqFtUnfinHalf",
    "SqFtTotLiving",
    "SqFtTotBasement",
    "SqFtFinBasement",
    "FinBasementGrade",
    "SqFtDeck",
    "BathHalfCount",
    "Bath3qtrCount",
    "BathFullCount",
    "FpSingleStory",
    "FpMultiStory",
    "FpFreestanding",
    "FpAdditional",
    "YrBuilt",
    "YrRenovated",
    "BrickStone",
]

# One astype call with a column -> dtype mapping replaces the per-column
# loop; Stories is the only column converted to float instead of int.
dtype_by_column = {column: "int" for column in convert_to_int}
dtype_by_column["Stories"] = "float"
df_resbldg = df_resbldg.astype(dtype_by_column)

# Data cleaning for inconsistent casing
df_resbldg["DaylightBasement"] = df_resbldg["DaylightBasement"].str.upper()

# Keep only rows where all three status columns equal "0". Per the original
# notes this removes buildings that aren't complete (PcntComplete),
# buildings in the obsolescence process (Obsolescence), and a handful of
# outliers in abnormal condition (PcntNetCondition). The three tests are
# independent per row, so one combined mask keeps the same rows as
# filtering three times in sequence.
must_be_zero = ["PcntComplete", "Obsolescence", "PcntNetCondition"]
all_zero = (df_resbldg[must_be_zero].astype("str") == "0").all(axis=1)
df_resbldg = df_resbldg.loc[all_zero].copy()

### Join with SQL and Export

In [16]:
def YN_converter(x):
    """Encode a yes/no flag as an integer.

    Returns 1 for "Y" or 1, 0 for "N" or 0, and NaN for any other value.
    """
    if (x == "Y") | (x == 1):
        return 1
    if (x == "N") | (x == 0):
        return 0
    return np.nan


path_to_db = os.path.join("..", "..", "data", "processed", "main.db")

# Every buildings row, with the matching sales row (if any) attached by parcel
q = """ SELECT * FROM buildings
LEFT JOIN sales USING (Parcel_ID)"""

# Columns of the joined table that make up the primary dataframe
keepers = [
    "SalePrice",
    "NbrLivingUnits",
    "Stories",
    "BldgGrade",
    "SqFt1stFloor",
    "SqFtHalfFloor",
    "SqFt2ndFloor",
    "SqFtUpperFloor",
    "SqFtUnfinFull",
    "SqFtUnfinHalf",
    "SqFtTotLiving",
    "SqFtTotBasement",
    "SqFtFinBasement",
    "FinBasementGrade",
    "SqFtGarageBasement",
    "SqFtGarageAttached",
    "DaylightBasement",
    "SqFtOpenPorch",
    "SqFtEnclosedPorch",
    "SqFtDeck",
    "HeatSystem",
    "HeatSource",
    "BrickStone",
    "ViewUtilization",
    "Bedrooms",
    "BathHalfCount",
    "Bath3qtrCount",
    "BathFullCount",
    "FpSingleStory",
    "FpMultiStory",
    "FpFreestanding",
    "FpAdditional",
    "YrBuilt",
    "YrRenovated",
    "Condition",
    "SaleInstrument",
]

conn = sqlite3.connect(path_to_db)
try:
    # Stage both cleaned frames as tables (replacing earlier versions) and
    # join them inside SQLite
    df_resbldg.to_sql("buildings", conn, if_exists="replace")
    df_rpsale.to_sql("sales", conn, if_exists="replace")
    joined = pd.read_sql(q, conn)

    # Rows with a missing value in any kept column are dropped; this includes
    # buildings for which the LEFT JOIN found no sale.
    df_main = joined[keepers].dropna().reset_index(drop=True)
    df_main = df_main.astype({"SalePrice": "int64", "SaleInstrument": "int64"})

    # NOTE: this conversion produces many NaNs (every value that is not
    # Y/N/1/0). It runs after the dropna above, so those NaNs are kept in the
    # stored table.
    df_main["DaylightBasement"] = df_main["DaylightBasement"].apply(YN_converter)
    df_main["ViewUtilization"] = df_main["ViewUtilization"].apply(YN_converter)

    # Store primary dataframe in SQL database
    df_main.to_sql("step1_aggregated", conn, if_exists="replace")
finally:
    # Release the database handle even if one of the steps above raised
    conn.close()