In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Load the raw training set from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
#Make new dataframe from values that can be useful in a linear regression analysis

# Columns carried forward: size/area measures, quality and condition ratings, room and
# feature counts, and the SalePrice column.
selected_columns = [
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea',
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
    'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF',
    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenQual',
    'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageCars',
    'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscVal', 'SalePrice',
]

# .copy() makes housing_data an independent frame. Later cells assign columns on it, and doing
# that on a plain selection of `data` raises SettingWithCopyWarning and may not stick.
housing_data = data[selected_columns].copy()

housing_data.head(5)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal,SalePrice
0,65.0,8450,7,5,2003,196.0,Gd,TA,Gd,TA,...,0,61,0,0,0,0,,,0,208500
1,80.0,9600,6,8,1976,0.0,TA,TA,Gd,TA,...,298,0,0,0,0,0,,,0,181500
2,68.0,11250,7,5,2001,162.0,Gd,TA,Gd,TA,...,0,42,0,0,0,0,,,0,223500
3,60.0,9550,7,5,1915,0.0,TA,TA,TA,Gd,...,0,35,272,0,0,0,,,0,140000
4,84.0,14260,8,5,2000,350.0,Gd,TA,Gd,TA,...,192,84,0,0,0,0,,,0,250000


In [4]:
# Percentage of missing entries in each column (mean of the boolean missing-mask, scaled to 0-100).
housing_data.isna().mean().mul(100)

LotFrontage      17.739726
LotArea           0.000000
OverallQual       0.000000
OverallCond       0.000000
YearBuilt         0.000000
MasVnrArea        0.547945
ExterQual         0.000000
ExterCond         0.000000
BsmtQual          2.534247
BsmtCond          2.534247
BsmtExposure      2.602740
BsmtFinType1      2.534247
BsmtFinSF1        0.000000
BsmtFinType2      2.602740
BsmtFinSF2        0.000000
BsmtUnfSF         0.000000
TotalBsmtSF       0.000000
HeatingQC         0.000000
1stFlrSF          0.000000
2ndFlrSF          0.000000
LowQualFinSF      0.000000
GrLivArea         0.000000
BsmtFullBath      0.000000
BsmtHalfBath      0.000000
FullBath          0.000000
HalfBath          0.000000
KitchenQual       0.000000
TotRmsAbvGrd      0.000000
Functional        0.000000
Fireplaces        0.000000
FireplaceQu      47.260274
GarageType        5.547945
GarageYrBlt       5.547945
GarageCars        0.000000
GarageArea        0.000000
GarageQual        5.547945
GarageCond        5.547945
W

In [5]:
#Rename columns
# Old name -> new name for every rating/category column that gets a shorter label.
friendly_names = {
    'ExterQual': 'ExteriorQ',
    'ExterCond': 'ExteriorC',
    'HeatingQC': 'Heating',
    'KitchenQual': 'Kitchen',
    'BsmtQual': 'BasementQ',
    'BsmtCond': 'BasementC',
    'FireplaceQu': 'FireplaceQ',
    'GarageQual': 'GarageQ',
    'GarageCond': 'GarageC',
    'PoolQC': 'Pool',
    'BsmtExposure': 'BasementEx',
    'BsmtFinType1': 'Basement1',
    'BsmtFinType2': 'Basement2',
    'Functional': 'Function',
    'GarageType': 'Garage',
    'Fence': 'FenceType',
}
housing_data = housing_data.rename(columns=friendly_names)

housing_data.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,ExteriorQ,ExteriorC,BasementQ,BasementC,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Pool,FenceType,MiscVal,SalePrice
0,65.0,8450,7,5,2003,196.0,Gd,TA,Gd,TA,...,0,61,0,0,0,0,,,0,208500
1,80.0,9600,6,8,1976,0.0,TA,TA,Gd,TA,...,298,0,0,0,0,0,,,0,181500
2,68.0,11250,7,5,2001,162.0,Gd,TA,Gd,TA,...,0,42,0,0,0,0,,,0,223500
3,60.0,9550,7,5,1915,0.0,TA,TA,TA,Gd,...,0,35,272,0,0,0,,,0,140000
4,84.0,14260,8,5,2000,350.0,Gd,TA,Gd,TA,...,192,84,0,0,0,0,,,0,250000


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
housing_data.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SalePrice
count,1201.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,70.049958,10516.828082,6.099315,5.575342,1971.267808,103.685262,443.639726,46.549315,567.240411,1057.429452,...,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,180921.19589
std,24.284752,9981.264932,1.382997,1.112799,30.202904,181.066207,456.098091,161.319273,441.866955,438.705324,...,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,79442.502883
min,21.0,1300.0,1.0,1.0,1872.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34900.0
25%,59.0,7553.5,5.0,5.0,1954.0,0.0,0.0,0.0,223.0,795.75,...,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,129975.0
50%,69.0,9478.5,6.0,5.0,1973.0,0.0,383.5,0.0,477.5,991.5,...,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,163000.0
75%,80.0,11601.5,7.0,6.0,2000.0,166.0,712.25,0.0,808.0,1298.25,...,2.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,214000.0
max,313.0,215245.0,10.0,9.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,...,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,755000.0


In [7]:
#Convert Garage and FenceType columns to strings
# Missing entries are first given the explicit label 'NaN', which is the key the rating maps
# use for "none". A bare astype(str) would instead produce the lowercase text 'nan', which
# that key does not match, so those rows would silently stay missing after the mapping.
for column in ['Garage', 'FenceType']:
    housing_data[column] = housing_data[column].fillna('NaN').astype(str)

In [8]:
#Convert ratings in the following categories to numbers so they can be used in a linear regression: ExterQual, ExterCond,
#BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual, Functional,
#FireplaceQu, GarageType, GarageQual, GarageCond, PoolQC, Fence

# Spellings under which "feature not present" can reach this cell. read_csv parses the text 'NA'
# as a real missing value, and converting a column to str can turn a missing value into 'nan'.
# A dictionary key of 'NA' or 'NaN' therefore cannot be relied on to catch these rows.
ABSENT_LABELS = ['NA', 'NaN', 'nan']

# Ordered label -> score tables (higher is better). "Not present" is handled separately.
QUALITY_LEVELS = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
EXPOSURE_LEVELS = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
FINISH_LEVELS = {'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
FUNCTION_LEVELS = {'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}
# The dataset spells the attached-garage label 'Attchd' (not 'Attachd').
GARAGE_LEVELS = {'Detchd': 1, 'CarPort': 2, 'BuiltIn': 3, 'Basment': 4, 'Attchd': 5, '2Types': 6}
FENCE_LEVELS = {'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}


def encode_rating(labels, levels, absent_code=None):
    """Translate a column of rating labels into numeric scores.

    Parameters
    ----------
    labels : pandas.Series
        Column of text labels; may contain missing values.
    levels : dict
        Maps every valid label to its numeric score.
    absent_code : int, optional
        Score given to rows where the feature is absent (missing value or one of
        ABSENT_LABELS). When None, absent rows are left as missing values.

    Returns
    -------
    pandas.Series
        Numeric scores aligned with ``labels``; integer dtype when ``absent_code`` is given.

    Raises
    ------
    ValueError
        If the column contains a label that is neither in ``levels`` nor an absent marker.
        This catches misspelled keys instead of silently producing missing values.
    """
    absent = labels.isna() | labels.isin(ABSENT_LABELS)
    unexpected = set(labels[~absent].unique()) - set(levels)
    if unexpected:
        raise ValueError('{}: labels with no numeric code: {}'.format(
            labels.name, sorted(map(str, unexpected))))
    codes = labels.map(levels)
    if absent_code is None:
        return codes
    # After the check above, the only unmapped rows left are the absent ones.
    return codes.fillna(absent_code).astype(int)


# Ratings that every house has: no "absent" level.
for column in ['ExteriorQ', 'ExteriorC', 'Heating', 'Kitchen']:
    housing_data[column] = encode_rating(housing_data[column], QUALITY_LEVELS)

# Ratings of optional features: an absent feature scores 0.
for column in ['BasementQ', 'BasementC', 'FireplaceQ', 'GarageQ', 'GarageC', 'Pool']:
    housing_data[column] = encode_rating(housing_data[column], QUALITY_LEVELS, absent_code=0)

housing_data['BasementEx'] = encode_rating(housing_data['BasementEx'], EXPOSURE_LEVELS, absent_code=0)

for column in ['Basement1', 'Basement2']:
    housing_data[column] = encode_rating(housing_data[column], FINISH_LEVELS, absent_code=0)

housing_data['Function'] = encode_rating(housing_data['Function'], FUNCTION_LEVELS)

housing_data['Garage'] = encode_rating(housing_data['Garage'], GARAGE_LEVELS, absent_code=0)

housing_data['FenceType'] = encode_rating(housing_data['FenceType'], FENCE_LEVELS, absent_code=0)

In [9]:
# Rows with no rating are scored 0.
housing_data['FenceType'] = housing_data['FenceType'].fillna(0)
# Fill Pool from its own column; filling it from FenceType would replace every pool rating
# with the fence rating.
housing_data['Pool'] = housing_data['Pool'].fillna(0)
housing_data['FenceType'].value_counts()

0.0    1179
3.0     157
4.0      59
2.0      54
1.0      11
Name: FenceType, dtype: int64

In [10]:
#The year the house was built is not useful in a regression analysis, but the age of the house is. The dataset is from 2011,
#so to get the age, subtract the year built from 2011.
#NOTE(review): every house is aged against the fixed year 2011 rather than its own sale date -- confirm this is intended.

housing_data['HouseAge'] = 2011 - housing_data['YearBuilt']

In [11]:
# Preview the first five rows of the cleaned frame, including the new HouseAge column.
housing_data.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,ExteriorQ,ExteriorC,BasementQ,BasementC,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Pool,FenceType,MiscVal,SalePrice,HouseAge
0,65.0,8450,7,5,2003,196.0,4,3,4.0,3.0,...,61,0,0,0,0,0.0,0.0,0,208500,8
1,80.0,9600,6,8,1976,0.0,3,3,4.0,3.0,...,0,0,0,0,0,0.0,0.0,0,181500,35
2,68.0,11250,7,5,2001,162.0,4,3,4.0,3.0,...,42,0,0,0,0,0.0,0.0,0,223500,10
3,60.0,9550,7,5,1915,0.0,3,3,3.0,4.0,...,35,272,0,0,0,0.0,0.0,0,140000,96
4,84.0,14260,8,5,2000,350.0,4,3,4.0,3.0,...,84,0,0,0,0,0.0,0.0,0,250000,11


In [12]:
#export cleaned data as CSV
# The row index is written as the first (unnamed) column of the file.
housing_data.to_csv(path_or_buf='housing_cleaned.csv')