Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from statsmodels.formula.api import ols

# Show up to 100 rows and 100 columns when a DataFrame is displayed
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Load IPython's autoreload extension in mode 2 — presumably so edits to
# imported project modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# Open the processed SQLite database; `conn` is kept open because the optional
# write-back cell at the end of the notebook reuses it.
conn = sqlite3.connect('../../data/processed/main.db')
query = 'SELECT * FROM df_main'

# The stored 'index' column is read as the frame's index and then discarded,
# leaving a fresh 0..n-1 RangeIndex.
df_main = pd.read_sql(sql=query, con=conn, index_col='index')
df_main = df_main.reset_index(drop=True)

In [3]:
# Preview the first three rows to confirm the table loaded with the expected columns
df_main.head(3)

Unnamed: 0,SalePrice,NbrLivingUnits,Stories,BldgGrade,SqFt1stFloor,SqFtHalfFloor,SqFt2ndFloor,SqFtUpperFloor,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,DaylightBasement,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,BrickStone,ViewUtilization,Bedrooms,BathHalfCount,Bath3qtrCount,BathFullCount,FpSingleStory,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,Condition,SaleInstrument,forced_air_heating
0,445000,1,1.0,7,1480,0,0,0,0,0,1480,0,0,0,0,440,,80,0,120,5,2,0,,3,0,0,2,1,0,0,0,1994,0,5,3,1
1,188500,1,1.0,5,550,0,0,0,0,0,550,0,0,0,0,0,,0,0,0,1,1,0,,2,0,1,0,0,0,0,0,1942,0,3,3,0
2,730000,1,2.0,9,660,0,595,0,0,0,1540,380,285,8,0,0,1.0,0,0,155,1,3,0,1.0,3,1,1,1,0,1,0,0,1984,0,3,3,0


In [4]:
# List every column name currently present in df_main
df_main.columns

Index(['SalePrice', 'NbrLivingUnits', 'Stories', 'BldgGrade', 'SqFt1stFloor',
       'SqFtHalfFloor', 'SqFt2ndFloor', 'SqFtUpperFloor', 'SqFtUnfinFull',
       'SqFtUnfinHalf', 'SqFtTotLiving', 'SqFtTotBasement', 'SqFtFinBasement',
       'FinBasementGrade', 'SqFtGarageBasement', 'SqFtGarageAttached',
       'DaylightBasement', 'SqFtOpenPorch', 'SqFtEnclosedPorch', 'SqFtDeck',
       'HeatSystem', 'HeatSource', 'BrickStone', 'ViewUtilization', 'Bedrooms',
       'BathHalfCount', 'Bath3qtrCount', 'BathFullCount', 'FpSingleStory',
       'FpMultiStory', 'FpFreestanding', 'FpAdditional', 'YrBuilt',
       'YrRenovated', 'Condition', 'SaleInstrument', 'forced_air_heating'],
      dtype='object')

In [5]:
# Binarize the 'HeatSystem' variable to answer the question posed in the prompt.
def encode_heating_systems(x):
    """Return 1 when the heat-system code equals 5, otherwise 0.

    Code 5 is the value this notebook treats as "forced air" (see the name of
    the column it feeds); every other code, including missing values, maps to 0.
    """
    if x == 5:
        return 1
    return 0


# NOTE(review): the loaded table already has a `forced_air_heating` column whose
# values match this encoding in the previewed rows — confirm whether both are
# needed; two identical predictors would be perfectly collinear in a regression.
df_main['forced_air_vs_other'] = df_main['HeatSystem'].apply(encode_heating_systems)

In [6]:
# Isolate the columns related to the square footage of the house so the
# consistency checks below do not touch df_main itself.
sq_footage = df_main[[
    'SqFtTotLiving', 'SqFtTotBasement', 'SqFt1stFloor', 'SqFtHalfFloor',
    'SqFt2ndFloor', 'SqFtUpperFloor', 'SqFtUnfinFull', 'SqFtUnfinHalf',
    'SqFtFinBasement', 'SqFtGarageBasement',
]].copy()

# Helper column for inspecting whether total footage is just living area + basement area
sq_footage['*TotLiving+TotBasement*'] = df_main.SqFtTotLiving + df_main.SqFtTotBasement

# Check 1: total basement area should always be >= finished basement + basement garage.
# A row violates that when (finished + garage) exceeds the total, so that is the
# condition filtered on; any rows found indicate a data error.
# The count is printed because a bare expression in the middle of a cell is never displayed.
basement_violations = sq_footage.loc[
    (sq_footage['SqFtFinBasement'] + sq_footage['SqFtGarageBasement'])
    > sq_footage['SqFtTotBasement']
]
print('Rows where SqFtFinBasement + SqFtGarageBasement > SqFtTotBasement:',
      len(basement_violations))

# Check 2: SqFtTotLiving should include only finished space, i.e. it should equal
# the per-floor/finished-basement areas minus the unfinished areas.
finished = ['SqFt1stFloor', 'SqFtHalfFloor', 'SqFt2ndFloor', 'SqFtUpperFloor', 'SqFtFinBasement']
unfinished = ['SqFtUnfinFull', 'SqFtUnfinHalf']
sq_footage['livable'] = sq_footage[finished].sum(axis=1) - sq_footage[unfinished].sum(axis=1)
living_mismatches = sq_footage.loc[sq_footage['livable'] != sq_footage['SqFtTotLiving']]
print('Rows where computed livable area != SqFtTotLiving:', len(living_mismatches))

# The detailed SqFt features can now be replaced by three summary features:
# SqFtTotLiving, SqFtTotBasement and SqFtUnfinished. The last one is computed here,
# while its source columns still exist, and attached to df_main in the next cell.
SqFtUnfinished = df_main['SqFtUnfinFull'] + df_main['SqFtUnfinHalf']

In [7]:
# Detailed square-footage columns that the summary features make redundant
sq_ft_cols_to_drop = [
    'SqFt1stFloor', 'SqFtHalfFloor', 'SqFtUpperFloor', 'SqFtUnfinFull',
    'SqFtUnfinHalf', 'SqFt2ndFloor', 'SqFtGarageBasement', 'SqFtFinBasement',
]

# errors='ignore' skips labels that are no longer present, so this cell can be run
# repeatedly without raising once the columns have already been dropped.
df_main.drop(columns=sq_ft_cols_to_drop, errors='ignore', inplace=True)

# Attach the "unfinished area" metric (computed in the previous cell), since that
# information is not otherwise captured by the remaining columns.
df_main['SqFtUnfinished'] = SqFtUnfinished

In [9]:
# *****************************************************
##### UNCOMMENT TO WRITE TO SQL DB #####
#
# WARNING: if_exists='replace' overwrites the same `df_main` table that this
# notebook loads at the top. Once written, the stored table no longer contains
# the detailed SqFt columns dropped above, so a fresh Restart & Run All will fail
# at the square-footage column selection. Consider writing to a differently
# named table instead to keep the notebook re-runnable.

# df_main.to_sql('df_main', conn, if_exists='replace')
# conn.close()

# *****************************************************