# King County Home Price Predictors - An Analysis

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from statsmodels.formula.api import ols
import statsmodels.api as sm
import scipy.stats as stats

import os
import sys

# Make the project's `src` package importable.
# `from src.modeling import ...` needs the directory that CONTAINS `src` on
# sys.path, not `src` itself.  `__file__` is not defined inside a notebook
# kernel, so the location is derived from the working directory instead.
# Assumes the notebook runs two levels below the project root, as the
# '../../data/...' paths below imply -- TODO confirm for other layouts.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Kept for anything that imports modules from inside `src` directly.
module_path = os.path.join(project_root, 'src')

for extra_path in (project_root, module_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from src.modeling import modelfunctions as mf

# Raw King County extracts: sales, residential buildings and parcels.
RPSale = pd.read_csv('../../data/raw/EXTR_RPSale.csv')
ResBldg = pd.read_csv('../../data/raw/EXTR_ResBldg.csv')
Parcel = pd.read_csv('../../data/raw/EXTR_Parcel.csv')

ModuleNotFoundError: No module named 'src'

# Filtering the Data

### Filtering to The Last 3 Years

In [None]:
# Split 'DocumentDate' on '/'; piece 0 is stored as the month and piece 2 as the year.
Date = RPSale['DocumentDate'].str.split('/', expand=True)

# Add the two numeric columns and remove the original date column in one pass.
RPSale = RPSale.assign(
    MonthSold=Date[0].astype('float64'),
    YrSold=Date[2].astype('float64'),
).drop(columns=['DocumentDate'])

In [None]:
# Keep only the sales recorded in 2018, 2019 or 2020.
years = [2018, 2019, 2020]

sold_in_range = RPSale['YrSold'].isin(years)
properties = RPSale.loc[sold_in_range]

properties

In [None]:
properties['YrSold'].value_counts()

In [None]:
# Lower-case every column name; rename() returns a new DataFrame, so the result is re-assigned.
properties = properties.rename(columns=str.lower)

In [None]:
properties.head()

### Padding the 'major' and 'minor' columns and merging them together:

In [None]:
# Cast both parcel-id parts to strings so they can be zero-padded and concatenated.
properties = properties.astype({'major': str, 'minor': str})

def major_padded(row):
    """Left-pad a 'major' id string with zeros to exactly six characters.

    Parameters
    ----------
    row : str
        The 'major' value, already converted to a string.

    Returns
    -------
    str
        `row` padded on the left with '0' up to six characters.  A string
        longer than six characters is not truncated; '000000' is returned
        in its place.
    """
    width = 6
    if len(row) > width:
        # Over-long input collapses to the all-zero placeholder.
        return '0' * width
    return row.rjust(width, '0')

def minor_padded(row):
    """Left-pad a 'minor' id string with zeros to exactly four characters.

    Parameters
    ----------
    row : str
        The 'minor' value, already converted to a string.

    Returns
    -------
    str
        `row` padded on the left with '0' up to four characters.  A string
        longer than four characters is not truncated; '0000' is returned
        in its place.
    """
    width = 4
    if len(row) > width:
        # Over-long input collapses to the all-zero placeholder.
        return '0' * width
    return row.rjust(width, '0')
    
# Zero-pad each id part element-wise, then join them into one 10-character key.
properties['major'] = properties['major'].map(major_padded)
properties['minor'] = properties['minor'].map(minor_padded)
properties['major_minor'] = properties['major'] + properties['minor']

In [None]:
# Remove the separate id parts now that 'major_minor' holds the combined key.
to_drop = ['major','minor']
properties = properties.drop(columns=to_drop)

In [None]:
# Reposition 'major_minor' so that it becomes the second column (position 1).
col_name = 'major_minor'
last_col = properties[col_name]

reordered = [name for name in properties.columns if name != col_name]
reordered.insert(1, col_name)
properties = properties[reordered]

### Narrowing the property type
According to our LookUp file, the residential values are coded as follows:

**11** - Household, single family units

**12** - Multiple family residence (Residential, 2-4 units)

**14** - Residential condominiums

**18** - All other residential not elsewhere coded

In [None]:
# Keep only rows whose property-type code is 11, 12, 14 or 18.
residential_types = [11, 12, 14, 18]
is_residential = properties['propertytype'].isin(residential_types)
properties = properties.loc[is_residential]
properties

### Only using sale prices over 0:

In [None]:
# Discard sales recorded with a price of zero or less.
has_positive_price = properties['saleprice'].gt(0)
properties = properties.loc[has_positive_price]
properties

In [None]:
properties['saleprice'].describe()

### Filtering out based on principal use:

In [None]:
properties['principaluse'].unique()

According to our lookup, 

**6** - Residential

**2** - Condominium

**7** - Commercial

I will only focus on residential and condo properties.

In [None]:
# Keep principal-use codes 6 and 2 only.
kept_principal_uses = [6, 2]
has_kept_use = properties['principaluse'].isin(kept_principal_uses)
properties = properties.loc[has_kept_use]
properties

### Checking for 'propertyclass'

In [None]:
properties['propertyclass'].unique()

The following values and their definitions are found in our lookup file:

**7** - Res-Land only (I will remove this to only display residential property)

I will keep:

**8** - Residential- Improved Property

**3** - Condominium

**9** - Res/Condo mobile

In [None]:
# Keep property-class codes 8, 3 and 9 only.
kept_property_classes = [8, 3, 9]
has_kept_class = properties['propertyclass'].isin(kept_property_classes)
properties = properties.loc[has_kept_class]
properties.head()

### Checking for duplicates based on major_minor numbers

In [None]:
# Number of rows whose 'major_minor' key already appeared in an earlier row.
properties['major_minor'].duplicated().sum()

I will check to see if there are any trends in the duplicates, since it is possible a home could have been sold more than once during the set timeframe.

In [None]:
properties[properties['major_minor'].isin(['8843900445','8847500000'])]

In [None]:
# For each 'major_minor' key keep only its last row (in current row order).
properties = properties.drop_duplicates(subset='major_minor', keep='last')

In [None]:
properties.head()

In [None]:
properties.info()

In [None]:
# Drop the sale-record columns listed in `to_drop` from the frame.
to_drop = ['volume', 'page', 'platnbr', 'plattype', 'platlot', 'platblock', 'sellername', 'buyername']
properties = properties.drop(columns=to_drop)

In [None]:
properties.head()

Now I'll repeat everything I did for "properties" and apply it to residential buildings.

In [None]:
# Same preparation as for `properties`: lower-case headers, zero-padded id
# parts, and a combined 'major_minor' key placed in the first column.
ResBldg = ResBldg.rename(columns=str.lower)

for part, pad in (('major', major_padded), ('minor', minor_padded)):
    ResBldg[part] = ResBldg[part].astype(str).apply(pad)

col_name = 'major_minor'
last_col = (ResBldg['major'] + ResBldg['minor']).rename(col_name)
ResBldg.insert(0, col_name, last_col)

ResBldg.head(2)

In [None]:
sum(ResBldg.minor == '0000')

Since residential buildings don't have condominiums (**Minor=0000**), I will be forced to drop more data.

In [None]:
ResBldg.loc[1]

In [None]:
# Drop every row that has a missing value in any column (dropna defaults: axis=0, how='any').
ResBldg = ResBldg.dropna()
ResBldg.info()

### Converting 'zipcode' to int

In [None]:
# Keep only the text before the first '-' of each zipcode.
zip_prefix = ResBldg['zipcode'].str.split('-', n=1).str[0]
ResBldg['zipcode'] = zip_prefix
ResBldg['zipcode'].value_counts()

In [None]:
# Strip stray commas, spaces and periods from the zipcode strings.
# regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and as a regular expression '.' would match ANY character.
zipcode_text = ResBldg['zipcode'].astype(str)
for unwanted in (',', ' ', '.'):
    zipcode_text = zipcode_text.str.replace(unwanted, '', regex=False)
ResBldg['zipcode'] = zipcode_text

In [None]:
# Show the rows whose zipcode is not made up solely of numeric characters.
is_numeric_zip = ResBldg['zipcode'].str.isnumeric()
df = ResBldg.loc[~is_numeric_zip]
df

In [None]:
#Dropping non-numeric zipcode entries
# Filter on the values themselves instead of hard-coded index labels: fixed
# labels raise a KeyError (or leave bad rows behind) as soon as the input file
# or an upstream filter changes.
ResBldg = ResBldg[ResBldg['zipcode'].astype(str).str.isnumeric()]

In [None]:
ResBldg['zipcode'].value_counts()

In [None]:
# Cast the cleaned zipcode values to 64-bit integers.
ResBldg['zipcode']= ResBldg['zipcode'].astype(np.int64)

In [None]:
# Inner-join sales to building records on the parcel key.  Column names present in
# both frames keep their name on the left side and get a single-space suffix (" ")
# on the ResBldg side, as requested through `suffixes`.
prop_res = pd.merge(properties, ResBldg, how='inner', on='major_minor', suffixes=("", " "))
prop_res.describe()

### Converting the 'yrrenovated' column to a binary
Next, the 'yrrenovated' column has a large number of zero values (denoting that a house hasn't been renovated). Rather than dropping the data, I will turn it into a binary column indicating whether a house was sold within 10 years of a renovation or was built within the last five years. 

First, I will make a new column 'age' with the properties' ages as variables to get a better sense of how old a home is.

In [None]:
# Convert the build year to an age by subtracting it from 2020, the most recent sale year kept above.
prop_res['age'] = prop_res['yrbuilt'].rsub(2020)

Now I will convert the 'yrrenovated' column to a binary column.

In [None]:
prop_res['yrrenovated'].value_counts()

In [None]:
# Treat a missing renovation year as "never renovated" (0).
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and leaves `prop_res`
# unchanged when pandas Copy-on-Write is active.
prop_res['yrrenovated'] = prop_res['yrrenovated'].fillna(0.0)

In [None]:
# Difference between the sale year and the renovation year; where 'yrrenovated' is 0 this equals the sale year.
prop_res['renovated'] = prop_res['yrsold'] - prop_res['yrrenovated']

In [None]:
# Flag a home as renovated (1) when it sold within 10 years of its renovation
# (gap <= 10) or was built within the last 5 years (age <= 5); otherwise 0.
# The gap is recomputed from 'yrsold' and 'yrrenovated' instead of being read
# back from the 'renovated' column: this cell overwrites that column with the
# 0/1 flag, so reading it here would turn every row into 1 on a second run.
Renovated = (prop_res['yrsold'] - prop_res['yrrenovated']).values
Age = prop_res.age.values
values = np.where(Renovated <= 10, 1, 0)
prop_res['renovated'] = np.where(Age <= 5, 1, values)

Now I'll check again for duplicates:

In [None]:
prop_res.major_minor.value_counts()

In [None]:
prop_res[prop_res['major_minor'].isin(['0809003030', '6373000170', '2822059059'])]

In [None]:
# Keep the first row for each 'major_minor' key, then report the remaining row count.
prop_res = prop_res.drop_duplicates(subset='major_minor')
len(prop_res)

We are now left with 33,671 entries in our data frame.

### Now for our last file, 'Parcel':

In [None]:
# Same preparation as for the other two tables: lower-case headers, zero-padded
# id parts and a combined 'major_minor' key moved to the first column.
# `col_name` is defined here so the cell does not depend on a value left behind
# by an earlier cell.
col_name = 'major_minor'

Parcel = Parcel.rename(columns=str.lower)
Parcel['major'] = Parcel['major'].astype(str).apply(major_padded)
Parcel['minor'] = Parcel['minor'].astype(str).apply(minor_padded)
Parcel[col_name] = Parcel['major'] + Parcel['minor']

last_col = Parcel.pop(col_name)
Parcel.insert(0, col_name, last_col)

In [None]:
# Inner-join the parcel attributes on the parcel key; column names that clash get a
# single-space suffix (" ") on the Parcel side, as requested through `suffixes`.
prop_res = pd.merge(prop_res, Parcel, how='inner', on='major_minor', suffixes=("", " "))

In [None]:
prop_res.shape

In [None]:
prop_res['proptype'].unique()

Thankfully, we haven't lost any more entries *and* our property type column only contains 'R' (residential) values.

In [None]:
prop_res.describe()

**I will select features of a residential property that would have a bearing on its saleprice:**

In [None]:
# Columns carried forward into the modelling data set.
# NOTE(review): the engineered 'age' and 'renovated' columns are not in this
# list -- confirm that is intended.
selected_features = [
    'saleprice', 'sqfttotliving', 'yrbuilt', 'yrrenovated', 'bedrooms',
    'zipcode', 'sqfttotbasement', 'sqftfinbasement', 'sqftopenporch',
    'sqftenclosedporch', 'sqftdeck', 'heatsystem', 'heatsource', 'bathhalfcount',
    'bath3qtrcount', 'bathfullcount', 'condition', 'viewutilization', 'sqftgarageattached',
    'daylightbasement', 'bldggrade', 'finbasementgrade', 'hbuasifvacant', 'inadequateparking', 'township',
    'mtrainier', 'olympics', 'cascades', 'territorial', 'seattleskyline', 'pugetsound',
    'lakewashington', 'lakesammamish', 'smalllakerivercreek', 'otherview', 'wfntlocation',
    'trafficnoise', 'airportnoise', 'powerlines', 'othernuisances', 'adjacentgreenbelt',
]

property_data = prop_res.loc[:, selected_features]

In [None]:
property_data.head()

And lastly, I will limit the total living space to under 4,500 square feet (anything more is either a massive mansion or not a single family home, both of which are outside the scope of this analysis) and keep only properties with a sale price between 10 dollars and 2.5 million dollars.

In [None]:
# Keep homes with less than 4500 square feet of total living space.
property_data = property_data.loc[property_data['sqfttotliving'].lt(4500)]

In [None]:
# Keep sale prices strictly between 10 and 2,500,000.
sale_price = property_data['saleprice']
property_data = property_data.loc[sale_price.gt(10) & sale_price.lt(2500000)]

In [None]:
property_data.shape

In [None]:
# Write the cleaned modelling data where the next section reads it back from.
# DataFrame.to_csv(path) returns None, so its result is deliberately NOT
# assigned: the previous assignments replaced `property_data` with None.
processed_dir = '../../data/processed'
os.makedirs(processed_dir, exist_ok=True)
property_data.to_csv(processed_dir + '/cleaned_data.csv')

# Full merged table, written next to the notebook as before.
prop_res.to_csv('property_data.csv')

# Investigating the Target Variable: 'saleprice'

In [None]:
# index_col=0 reuses the first CSV column as the index, so no 'Unnamed: 0' column is created.
cleaned_data_path = '../../data/processed/cleaned_data.csv'
kc_data = pd.read_csv(cleaned_data_path, index_col=0)
kc_data.head()

In [None]:
kc_data.saleprice.describe()

In [None]:
# Histogram of the sale prices, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(kc_data['saleprice'], ax=ax)
ax.set_title('Histogram of 2018, 2019, & 2020 Sales Price Data', fontsize=18)
ax.set_xlabel('Sale Price', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.tick_params(axis='both', labelsize=15)
fig.tight_layout()

The filtered sales price data is right-skewed, which is to be expected since houses priced up to 2.5 million dollars are included in the data. We see that our mean house price is just over 700K at $709,031.

In [None]:
# Correlate only numeric/boolean columns.  From pandas 2.0 on, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead;
# selecting the dtypes up front behaves the same on every pandas version.
numeric_columns = kc_data.select_dtypes(include=['number', 'bool'])
data_corr = numeric_columns.corr()

# inspecting correlations with 'saleprice' in descending order to review the highest correlations first
sorted_corrs = data_corr['saleprice'].sort_values(ascending = False)
sorted_corrs

The top 2 highest correlated features with 'saleprice' are 'bldggrade' and 'sqfttotliving'. As is to be expected, houses with larger square footage are more expensive than smaller square foot houses. I will start by investigating this feature and work on building my first model.

## Total Living Square Footage ('sqfttotliving')
I'll start by examining the relationship between total living square footage and sale price.

In [None]:
# Scatter plot with a fitted regression line, drawn on the explicitly created Axes.
fig, ax = plt.subplots(figsize=(15, 8))
sns.regplot(
    x='sqfttotliving',
    y='saleprice',
    data=kc_data,
    scatter_kws={"color": "#0055AA"},
    line_kws={"color": "green"},
    ax=ax,
)
ax.set_title('Scatter Plot of Total Living Square Footage against Sale Price', fontsize=20)
ax.set_xlabel('Total Living Square Footage', fontsize=15)
ax.set_ylabel('Sale Price (Hundred Thousands)', fontsize=15)
ax.tick_params(axis='both', labelsize=15)
fig.tight_layout()

We can see there is a fairly strong correlation between total living space and sale price.
Let's examine the distribution of sqfttotliving:

In [None]:
# Distribution of the untransformed total living area.
fig, ax = plt.subplots(figsize=(11, 6))
sns.histplot(kc_data['sqfttotliving'], ax=ax)
ax.set_title('Distribution of Total Living Space', fontsize=25)
ax.set_xlabel('Living Space Square Foot', fontsize=20)
ax.tick_params(axis='both', labelsize=15)
fig.tight_layout()

I will log-transform the data and examine, since this appears to be slightly right-skewed.

In [None]:
# Distribution of the natural log of the total living area.
log_sqft = np.log(kc_data['sqfttotliving'])

fig, ax = plt.subplots(figsize=(11, 6))
sns.histplot(log_sqft, ax=ax)
ax.set_title('Log-Transformed Distribution of Total Living Square Foot', fontsize=25)
ax.set_xlabel('Living Square Footage', fontsize=20)
ax.tick_params(axis='both', labelsize=15)
fig.tight_layout()

The transformation happened to skew it to the left, I will try a square root transformation:

In [None]:
# Distribution of the square root of the total living area.
sqrt_sqft = np.sqrt(kc_data['sqfttotliving'])

fig, ax = plt.subplots(figsize=(11, 6))
sns.histplot(sqrt_sqft, ax=ax)
ax.set_title('Square Root-Transformed Distribution of Total Living Square Foot ', fontsize=25)
ax.set_xlabel('Living Square Footage', fontsize=20)
ax.tick_params(axis='both', labelsize=15)
fig.tight_layout()

There! Now the distribution is fairly normal and should work for modeling.

I'll save this as a column to use later:

In [None]:
# Store the square-root-transformed living area as its own column.
kc_data['sqrt_sqfttotliving'] = np.sqrt(kc_data.sqfttotliving)

# Building a Linear Regression Model

In [None]:
def linear_model(dataframe, target='price'):
    '''Fit a linear regression on `dataframe` and print its diagnostics.

    Every column except `target` is used as a feature.  The data is split
    80/20 into training and test sets (random_state=123), a LinearRegression
    is fitted on the training set, and its training score, test score and
    coefficients are printed.  An OLS model with an added constant is then
    fitted on the same training data and its summary is printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature columns plus the target column.
    target : str, default 'price'
        Name of the column to predict.  The default keeps existing callers
        working; pass e.g. 'saleprice' for frames that name it differently.

    Returns
    -------
    The fitted model returned by `sm.OLS(...).fit()`.

    Raises
    ------
    KeyError
        If `target` is not a column of `dataframe`.
    '''
    # NOTE(review): `train_test_split` and `LinearRegression` are not imported
    # anywhere in the visible part of this file -- presumably they come from
    # scikit-learn; confirm they are in scope before calling.

    #Create feature and target columns
    X = dataframe.drop(columns=[target])
    y = dataframe[target]

    #Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=123)

    #Fit the model
    model = LinearRegression().fit(X_train, y_train)

    #View model accuracy
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print('Training Score:', round(train_score, 2))
    print('Test Score:', round(test_score, 2))
    print('Coefficients:', model.coef_)

    #View model summary in statsmodels
    X_train = sm.add_constant(X_train)
    smmodel = sm.OLS(y_train, X_train).fit()
    print(smmodel.summary())
    return smmodel