# Project 2: Helping realtors to predict sale prices in Ames 

Done by: Richelle-Joy Chia, data scientist and realtor at MyProperty

Problem statement: How can we help realtors effectively and efficiently predict the market value of houses in Ames, Iowa?

## Part 3: Preprocessing and Feature Engineering

In [1]:
# import relevant libraries 

import pandas as pd
import numpy as np
import scipy.stats as stats
from itertools import combinations
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from math import sqrt # alternative to (variable)**0.5
from sklearn import metrics

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later removed the bare 'seaborn' name, so use whichever name this
# installation actually provides.
_mpl_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(_mpl_style)

# seaborn theme settings, applied after the matplotlib style sheet
sns.set(style="white", color_codes=True)
# use the "GnBu_d" palette as the default colour cycle
colors_palette = sns.color_palette("GnBu_d")
sns.set_palette(colors_palette)


In [2]:
# load the train and test CSV files into dataframes

train_path = './data_clean_final.csv'
test_path = './test_clean_final.csv'
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [3]:
# report (rows, columns) for the train dataset, then the test dataset

for dataset in (train_data, test_data):
    print(dataset.shape)

(2051, 133)
(878, 128)


### Examine the correlation between features

In [4]:
# create matrix of absolute correlations between all pairs of columns

corr_matrix = train_data.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# pair is listed once and self-correlations are left out. The mask is cast with
# the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24.
upper_triangle = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)

sol = (corr_matrix.where(upper_triangle)
                  .stack()
                  .sort_values(ascending=False))

# convert the 20 strongest pairs to a dataframe and reset the multi-level index
corr_df = pd.DataFrame(sol.head(20)).reset_index()

# rename columns
corr_df.columns = ['v1', 'v2', 'pair_corr']

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  .astype(np.bool))


In [5]:
# helper that adds each feature's correlation with the target to a pair row
def corr_target(row):
    """Attach the correlation with ``SalePrice`` for both features of a pair.

    Reads the module-level ``train_data`` dataframe.

    Parameters
    ----------
    row : pandas.Series (or similar mapping)
        Must hold column names of ``train_data`` under ``'v1'`` and ``'v2'``.

    Returns
    -------
    The same ``row`` object, with ``'v1_y_corr'`` and ``'v2_y_corr'`` set.
    """
    # Build the correlation matrix once per call rather than once per lookup;
    # it is by far the most expensive step here.
    target_corr = train_data.corr()['SalePrice']
    row['v1_y_corr'] = target_corr[row['v1']]
    row['v2_y_corr'] = target_corr[row['v2']]
    return row

In [6]:
# extend every pair row with each feature's correlation to the target
corr_df = corr_df.apply(corr_target, axis='columns')

In [7]:
# display up to the first 20 rows of the pairwise-correlation table
corr_df.head(n=20)

Unnamed: 0,v1,v2,pair_corr,v1_y_corr,v2_y_corr
0,Exterior 1st_CemntBd,Exterior 2nd_CmentBd,0.988379,0.167513,0.156999
1,Exterior 1st_VinylSd,Exterior 2nd_VinylSd,0.977569,0.342146,0.337563
2,Exterior 1st_MetalSd,Exterior 2nd_MetalSd,0.976461,-0.149851,-0.139339
3,Exterior 1st_HdBoard,Exterior 2nd_HdBoard,0.885877,-0.114332,-0.102462
4,MS Zoning_FV,Neighborhood_Somerst,0.874854,0.106695,0.150078
5,Gr Liv Area,TotalSqFeet,0.863761,0.697038,0.783108
6,Fireplaces,Fireplace Qu,0.859734,0.471093,0.538925
7,Exterior 1st_Wd Sdng,Exterior 2nd_Wd Sdng,0.859205,-0.191734,-0.172971
8,Neighborhood_NPkVill,Exterior 2nd_Brk Cmn,0.822054,-0.047296,-0.047893
9,Exterior 1st_AsbShng,Exterior 2nd_AsbShng,0.819811,-0.12634,-0.110404


- Given the results above, I will be dropping the following features, which have a high correlation with either another feature or the target variable (SalePrice).

In [8]:
# drop features that are highly correlated with another feature or with the target

high_corr_features = [
    'Exterior 2nd_CmentBd',
    'Exterior 2nd_MetalSd',
    'Exterior 2nd_VinylSd',
    'Fireplaces',
    'Exterior 2nd_Wd Sdng',
    'Neighborhood_NPkVill',
    'Exterior 2nd_AsbShng',
    'MS Zoning_RL',
    'Overall Qual',
    'TotalSqFeet',
    'Foundation_CBlock',
    'MS SubClass_90',
    'Exter Qual',
    'AgeofHouse',
    'Kitchen Qual',
    'Exterior 1st_Plywood',
    'Garage Type_Attchd',
]
train_data = train_data.drop(columns=high_corr_features)

In [9]:
# rank the features by variance (largest first) and keep those below the
# 0.010 cut-off as candidates to drop

variances = train_data.var().sort_values(ascending=False)
low_var_list = variances[variances < 0.010]

In [10]:
# list of features with low variance (variance below 0.010), largest variance first

low_var_list

Neighborhood_BrDale     0.009182
Garage Type_2Types      0.009182
MS Zoning_C (all)       0.009182
Exterior 2nd_Brk Cmn    0.008224
Neighborhood_Veenker    0.008224
MS SubClass_75          0.007744
MS Zoning_RH            0.006783
Mas Vnr Type_BrkCmn     0.006301
MS SubClass_180         0.005337
Exterior 2nd_ImStucc    0.005337
Garage Type_CarPort     0.005337
MS SubClass_45          0.005337
Neighborhood_Blueste    0.002918
Exterior 2nd_Stone      0.002918
Foundation_Stone        0.002433
MS SubClass_40          0.001947
Exterior 1st_BrkComm    0.001461
Exterior 2nd_AsphShn    0.001461
Neighborhood_Greens     0.001461
Exterior 1st_Stone      0.000975
MS Zoning_A (agr)       0.000975
Exterior 1st_CBlock     0.000975
Neighborhood_GrnHill    0.000975
Foundation_Wood         0.000975
Exterior 2nd_CBlock     0.000975
MS SubClass_150         0.000488
Neighborhood_Landmrk    0.000488
MS Zoning_I (all)       0.000488
Exterior 1st_ImStucc    0.000488
Exterior 1st_AsphShn    0.000488
dtype: float64

In [11]:
# remove the low-variance features (variance < 0.010) from the train dataset

low_var_drop_list = list(low_var_list.index)
train_data = train_data.drop(columns=low_var_drop_list)

### Apply changes to test set

In [12]:
# drop the same highly correlated features from the test dataset

high_corr_features = [
    'Exterior 2nd_CmentBd',
    'Exterior 2nd_MetalSd',
    'Exterior 2nd_VinylSd',
    'Fireplaces',
    'Exterior 2nd_Wd Sdng',
    'Neighborhood_NPkVill',
    'Exterior 2nd_AsbShng',
    'MS Zoning_RL',
    'Overall Qual',
    'TotalSqFeet',
    'Foundation_CBlock',
    'MS SubClass_90',
    'Exter Qual',
    'AgeofHouse',
    'Kitchen Qual',
    'Exterior 1st_Plywood',
    'Garage Type_Attchd',
]
test_data = test_data.drop(columns=high_corr_features)

In [13]:
# function to apply changes to the test dataset

def apply_changes(df, low_var_features=None):
    """Return ``df`` without the low-variance feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to trim; it is not modified in place.
    low_var_features : iterable of column labels, optional
        Columns to remove. When omitted, the index of the module-level
        ``low_var_list`` is used, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the requested columns removed.
    """
    if low_var_features is None:
        low_var_features = low_var_list.index
    # errors='ignore' skips any label that is not a column of df
    # instead of raising KeyError
    return df.drop(list(low_var_features), axis=1, errors='ignore')

In [14]:
# run the test dataset through apply_changes

test_data = test_data.pipe(apply_changes)

In [15]:
# list the columns of the test dataset that the train dataset does not have

[col for col in test_data.columns if col not in train_data.columns]

['Mas Vnr Type_CBlock',
 'Exterior 1st_PreCast',
 'Exterior 2nd_Other',
 'Exterior 2nd_PreCast']

In [16]:
# remove the features that exist only in the test dataset

test_only_features = ['Mas Vnr Type_CBlock', 'Exterior 1st_PreCast',
                      'Exterior 2nd_PreCast', 'Exterior 2nd_Other']
test_data = test_data.drop(columns=test_only_features)

### Time to export the final datasets for the final notebook that contains the modeling and some insights!

In [17]:
# write the final train dataset to CSV without the row index
train_data.to_csv(path_or_buf='./datasets/train_data_final.csv', index=False)

In [18]:
# write the final test dataset to CSV without the row index
test_data.to_csv(path_or_buf='./datasets/test_data_final.csv', index=False)