This file combines our housing dataset with various neighborhood-level datasets. The neighborhood-level datasets contain information on crime, income, unemployment, walkability and bikeability, school quality, etc.

This file also cleans the data and encodes all categorical variables as binary (dummy) variables.

In [9]:
import warnings

import pandas as pd

In [10]:
# Load data: the housing records plus the neighborhood-level tables that
# are merged onto them below. All paths are relative to the working
# directory. The files are read in the order listed.
(
    housing_data,
    income_unemp_data,
    walk_bike_score_data,
    school_quality_data,
) = map(
    pd.read_csv,
    (
        'ames_housing_data.csv',
        'income_unemp_data.csv',
        'walk_bike_score_data.csv',
        'school_quality_data.csv',
    ),
)

In [11]:
# Merge datasets: attach each neighborhood-level table to the housing
# records, one after another, matching rows on the 'Neighborhood' column.
# how='left' keeps housing rows whose neighborhood has no match in a table
# (the added columns are then missing for those rows).
# NOTE(review): assumes each lookup table has at most one row per
# neighborhood; duplicate keys would multiply housing rows -- confirm.
data = housing_data
for neighborhood_table in (
    income_unemp_data,
    walk_bike_score_data,
    school_quality_data,
):
    data = data.merge(neighborhood_table, how='left', on='Neighborhood')

In [12]:
# Drop unnecessary variables: the 'Id' column and 'Neighborhood', which was
# only needed as the merge key above.
data.drop(columns=['Id', 'Neighborhood'], inplace=True)

In [13]:
# Encode categorical variables as binary variables
#
# Each entry is (column, baseline category). Every column is expanded with
# pd.get_dummies, and the dummy column of the baseline category is then
# dropped so that it acts as the reference (base) category.
#
# A baseline of None (the Python object, not a string) means no dummy column
# is dropped. For Alley this is because the variable has an NA category:
# get_dummies creates no column for missing values by default, so NA is
# already the base category.
# NOTE(review): the other None entries are presumably the same situation;
# verify that each of those columns really contains NA values (PavedDrive
# in particular), otherwise its dummies are perfectly collinear.
#
# The order of this table determines the order of the dummy columns in the
# output, so keep it stable.
CATEGORICAL_BASELINES = [
    ('MSSubClass', '20'),
    ('MSZoning', 'RL'),
    ('Street', 'Pave'),
    ('Alley', None),
    ('LotShape', 'Reg'),
    ('LandContour', 'Lvl'),
    ('Utilities', 'AllPub'),
    ('LotConfig', 'Inside'),
    ('LandSlope', 'Gtl'),
    ('Condition1', 'Norm'),
    ('Condition2', 'Norm'),
    ('BldgType', '1Fam'),
    ('HouseStyle', '1Story'),
    ('RoofStyle', 'Gable'),
    ('RoofMatl', 'CompShg'),
    ('Exterior1st', 'VinylSd'),
    ('Exterior2nd', 'VinylSd'),
    # Here the baseline is the literal category string 'None'.
    ('MasVnrType', 'None'),
    ('ExterQual', 'TA'),
    ('ExterCond', 'TA'),
    ('Foundation', 'PConc'),
    ('BsmtQual', None),
    ('BsmtCond', None),
    ('BsmtExposure', None),
    ('BsmtFinType1', None),
    ('BsmtFinType2', None),
    ('Heating', 'GasA'),
    ('HeatingQC', 'TA'),
    ('CentralAir', 'N'),
    ('Electrical', 'SBrkr'),
    ('KitchenQual', 'TA'),
    ('Functional', 'Typ'),
    ('FireplaceQu', None),
    ('GarageType', None),
    ('GarageFinish', None),
    ('GarageQual', None),
    ('GarageCond', None),
    ('PavedDrive', None),
    ('PoolQC', None),
    ('Fence', None),
    ('MiscFeature', None),
    ('MoSold', '1'),
    ('YrSold', '2006'),
    ('SaleType', 'WD'),
    ('SaleCondition', 'Normal'),
]

for column, baseline in CATEGORICAL_BASELINES:
    # Encode one column at a time so the new dummy columns are appended in
    # table order.
    data = pd.get_dummies(data, columns=[column])
    if baseline is None:
        continue

    baseline_column = column + '_' + baseline
    if baseline_column in data.columns:
        data.drop([baseline_column], axis=1, inplace=True)
    else:
        # The baseline category never appeared as a value, so there is no
        # dummy column to drop. One way this can happen: recent pandas
        # versions can read the literal text "None" in a CSV as a missing
        # value, in which case 'MasVnrType_None' is never created. Warn
        # instead of failing with a KeyError so the run can finish and the
        # situation can still be reviewed.
        warnings.warn(
            "Baseline dummy column '" + baseline_column + "' was not created "
            "when encoding '" + column + "'; no column was dropped for it."
        )

In [14]:
# Write cleaned dataset to a CSV file. index=False keeps the DataFrame's row
# index out of the output, so the file contains only the data columns.
cleaned_csv_path = 'cleaned_data.csv'
data.to_csv(cleaned_csv_path, index=False)