# Housing Price Regression Modeling and Prediction
Author: Sandra Shtabnaya<br>
Course: DSCI 401 - Applied Machine Learning

In [125]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

## Import Data

In [126]:
# Load the two Ames housing extracts: set A becomes `model_data`,
# set B becomes `validation_data`.
model_data, validation_data = (
    pd.read_csv(csv_name)
    for csv_name in ("AmesHousingSetA.csv", "AmesHousingSetB.csv")
)

## Data Preparation
The data provided contains many missing (NaN) values. Most of them occur in columns of type "object" (string) that describe uncommon features of a house, such as a pool or a fence.

In [127]:
# Print the dtype and non-null count of every column, first for the
# modeling set and then for the validation set.
for frame in (model_data, validation_data):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2344 entries, 0 to 2343
Data columns (total 81 columns):
PID                2344 non-null int64
MS.SubClass        2344 non-null int64
MS.Zoning          2344 non-null object
Lot.Frontage       1946 non-null float64
Lot.Area           2344 non-null int64
Street             2344 non-null object
Alley              166 non-null object
Lot.Shape          2344 non-null object
Land.Contour       2344 non-null object
Utilities          2344 non-null object
Lot.Config         2344 non-null object
Land.Slope         2344 non-null object
Neighborhood       2344 non-null object
Condition.1        2344 non-null object
Condition.2        2344 non-null object
Bldg.Type          2344 non-null object
House.Style        2344 non-null object
Overall.Qual       2344 non-null int64
Overall.Cond       2344 non-null int64
Year.Built         2344 non-null int64
Year.Remod.Add     2344 non-null int64
Roof.Style         2344 non-null object
Roof.Matl          2

### Handling Missing Categorical Values
I will assume that, in most cases, NaN means "not applicable", so I will replace NaN with "None". The exceptions are the fireplace, pool, basement and garage attributes, whose values can be deduced. For example, where the number of fireplaces is 0, we can assume that all other fireplace attributes are "None".

#### Missing Basement Features

In [128]:
# Boolean masks selecting the houses whose total basement area is 0.
no_bsmt_m = (model_data["Total.Bsmt.SF"] == 0)
no_bsmt_v = (validation_data["Total.Bsmt.SF"] == 0)
bsmt_attributes = ["Bsmt.Qual", "Bsmt.Cond", "Bsmt.Exposure", "BsmtFin.Type.1", 
                   "BsmtFin.SF.1", "BsmtFin.Type.2", "BsmtFin.SF.2", "Bsmt.Unf.SF"]
bsmt_bath_counts = ["Bsmt.Half.Bath", "Bsmt.Full.Bath"]

# Where the basement area is 0, fill the missing basement attributes.
# The filler is chosen per column: text columns get the "None" category,
# numeric columns get 0.0. Writing the string "None" into a numeric column
# would turn it into a mixed object column and break later numeric handling.
for frame, no_bsmt in ((model_data, no_bsmt_m), (validation_data, no_bsmt_v)):
    for column in bsmt_attributes:
        filler = 0.0 if pd.api.types.is_numeric_dtype(frame[column]) else "None"
        frame.loc[no_bsmt, column] = frame.loc[no_bsmt, column].fillna(filler)

    # Missing basement bathroom counts are always filled with 0.0.
    for column in bsmt_bath_counts:
        frame.loc[no_bsmt, column] = frame.loc[no_bsmt, column].fillna(0.0)

#### Missing Garage Features

In [129]:
# Boolean masks selecting the houses whose garage area is 0. Each mask is
# built from its own data set so that it lines up with that set's rows.
no_garage_m = (model_data["Garage.Area"] == 0)
no_garage_v = (validation_data["Garage.Area"] == 0)
garage_attributes = ['Garage.Type', 'Garage.Finish', 'Garage.Qual', 'Garage.Cond']

# replace all NaN garage attributes to None, where garage area is 0.
# Each data set is filled from its own values only; nothing is copied
# from the modeling set into the validation set.
for column in garage_attributes:
    model_data.loc[no_garage_m, column] = model_data.loc[no_garage_m, column].fillna("None")
    validation_data.loc[no_garage_v, column] = validation_data.loc[no_garage_v, column].fillna("None")

# Missing garage year and car capacity are filled with 0.0 for those rows.
# NOTE(review): 0.0 is a sentinel for "Garage.Yr.Blt", not a real year.
model_data.loc[no_garage_m, "Garage.Yr.Blt"] = model_data.loc[no_garage_m, "Garage.Yr.Blt"].fillna(0.0)
model_data.loc[no_garage_m, "Garage.Cars"] = model_data.loc[no_garage_m, "Garage.Cars"].fillna(0.0)

validation_data.loc[no_garage_v, "Garage.Yr.Blt"] = validation_data.loc[no_garage_v, "Garage.Yr.Blt"].fillna(0.0)
validation_data.loc[no_garage_v, "Garage.Cars"] = validation_data.loc[no_garage_v, "Garage.Cars"].fillna(0.0)

#### Missing Fireplace and Pool Features

In [130]:
# replace all NaN fireplace attributes with None, where total fireplaces is 0.
# Masks for houses with zero fireplaces and for houses with zero pool area.
no_fireplc_m = model_data["Fireplaces"] == 0
no_fireplc_v = validation_data["Fireplaces"] == 0
no_pool_m = model_data["Pool.Area"] == 0
no_pool_v = validation_data["Pool.Area"] == 0

# For the masked rows, a missing quality rating is replaced with "None":
# fireplace quality where there are no fireplaces, pool quality where the
# pool area is 0.
for frame, absent, quality_col in (
    (model_data, no_fireplc_m, 'Fireplace.Qu'),
    (validation_data, no_fireplc_v, 'Fireplace.Qu'),
    (model_data, no_pool_m, 'Pool.QC'),
    (validation_data, no_pool_v, 'Pool.QC'),
):
    filled = frame.loc[absent, quality_col].fillna("None")
    frame.loc[absent, quality_col] = filled

#### Missing Masonry Features
There are no instances where the masonry veneer area is 0 but the masonry veneer type is NaN. Since most homes do not have masonry veneer, I will replace NaN areas with 0 and NaN types with "None".

In [131]:
# Sanity check: rows where the masonry veneer area is 0 but the type is NaN.
# NOTE(review): this is not the last expression of the cell, so its result
# is presumably not displayed; move it to its own cell to inspect it.
model_data[(model_data["Mas.Vnr.Area"] == 0) & (model_data["Mas.Vnr.Type"].isna())][["Mas.Vnr.Area", "Mas.Vnr.Type"]]

# Fill missing masonry veneer area with 0.0 and missing type with "None".
# The result is assigned back to the column instead of calling
# `fillna(inplace=True)` on a column selection: that chained in-place form
# may act on a temporary copy and leave the frame unchanged in newer pandas.
for frame in (model_data, validation_data):
    frame["Mas.Vnr.Area"] = frame["Mas.Vnr.Area"].fillna(0.0)
    frame["Mas.Vnr.Type"] = frame["Mas.Vnr.Type"].fillna("None")

#### Missing Miscellaneous Features
The remaining missing values cannot be deduced, but we can assume that if they are missing, they do not apply to the house.

In [132]:
# Columns where a missing value is taken to mean the feature is absent.
not_applicable_columns = ["Alley", "Fence", "Misc.Feature"]

# Replace NaN with the "None" category in both data sets. The filled column
# is assigned back explicitly instead of using `fillna(inplace=True)` on a
# column selection, which may modify a temporary copy in newer pandas.
for frame in (model_data, validation_data):
    for column in not_applicable_columns:
        frame[column] = frame[column].fillna("None")

### Checking Results
After making these deductions, there still remain some missing categorical values that we cannot deduce. This includes all the basement and garage attributes.

In [133]:
model_data.info()
validation_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2344 entries, 0 to 2343
Data columns (total 81 columns):
PID                2344 non-null int64
MS.SubClass        2344 non-null int64
MS.Zoning          2344 non-null object
Lot.Frontage       1946 non-null float64
Lot.Area           2344 non-null int64
Street             2344 non-null object
Alley              2344 non-null object
Lot.Shape          2344 non-null object
Land.Contour       2344 non-null object
Utilities          2344 non-null object
Lot.Config         2344 non-null object
Land.Slope         2344 non-null object
Neighborhood       2344 non-null object
Condition.1        2344 non-null object
Condition.2        2344 non-null object
Bldg.Type          2344 non-null object
House.Style        2344 non-null object
Overall.Qual       2344 non-null int64
Overall.Cond       2344 non-null int64
Year.Built         2344 non-null int64
Year.Remod.Add     2344 non-null int64
Roof.Style         2344 non-null object
Roof.Matl          

### Imputing Categorical Columns
For the remaining missing categorical values, we can use aggregate statistics to pick a replacement that is representative of the overall column. This can be done using sklearn's `SimpleImputer`. We will use the mode (most frequent value) of each column.

In [134]:
# Split each data set into features (everything except SalePrice) and target.
model_x = model_data.drop(["SalePrice"], axis=1)
model_y = model_data.SalePrice

validation_x = validation_data.drop(["SalePrice"], axis=1)
validation_y = validation_data.SalePrice
cat_imputer = SimpleImputer(strategy="most_frequent")

# The categorical (object-typed) columns are chosen from the modeling set,
# and the imputer learns each column's most frequent value from that set.
cat_columns = model_x.select_dtypes(include=['object']).columns
cat_mod_x = model_x[cat_columns]
cat_mod_x = cat_imputer.fit_transform(cat_mod_x)

# The validation set takes the same columns in the same order and is only
# transformed, so its gaps are filled with the modes learned from the
# modeling data. Re-fitting on the validation set would instead use
# statistics from the data the model is supposed to be evaluated on.
cat_valid_x = validation_x[cat_columns]
cat_valid_x = cat_imputer.transform(cat_valid_x)

### Imputing Numeric Columns
Finally, there are some numeric values that are missing. Rather than throw the incomplete rows out, we can use sklearn's SimpleImputer to replace those values with an aggregate statistic of the value's column. Here, we can use the median, since it tends to be unaffected by outliers.

In [135]:
# Numeric feature columns are chosen from the modeling set with the public
# `select_dtypes` API, and the same columns are taken from the validation
# set so that both matrices have identical column order.
num_columns = model_x.select_dtypes(include=["number", "bool"]).columns
num_mod_x = model_x[num_columns]
num_valid_x = validation_x[num_columns]
num_imputer = SimpleImputer(strategy="median", missing_values=np.nan)

# Learn the per-column medians from the modeling set only, then reuse them
# to fill the validation set (transform, not fit_transform).
# NOTE(review): `preprocessing.normalize` with default arguments rescales
# each row (sample) to unit L2 norm, not each column. Confirm this is
# intended rather than per-feature scaling.
num_mod_x = num_imputer.fit_transform(num_mod_x)
num_mod_x = preprocessing.normalize(num_mod_x)
num_valid_x = num_imputer.transform(num_valid_x)
num_valid_x = preprocessing.normalize(num_valid_x)

In [136]:
# Stitch the numeric block and the categorical block back together
# column-wise (numeric columns first) for each data set.
model_x, valid_x = (
    np.concatenate(parts, axis=1)
    for parts in ((num_mod_x, cat_mod_x), (num_valid_x, cat_valid_x))
)

## Exploratory Analysis