In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [15]:
data = pd.read_csv('train.csv')

In [16]:
#data.head(5)
#data['PoolQC'].dtypes
data['MasVnrArea'].dtypes

dtype('float64')

In [17]:
#Make new dataframe from values that can be useful in a linear regression analysis

housing_data = data[['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', \
                    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', \
                    'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', \
                    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenQual', \
                    'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageCars', \
                    'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', \
                    'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscVal']]

housing_data.head(5)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,...,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal
0,65.0,8450,7,5,2003,196.0,Gd,TA,Gd,TA,...,TA,0,61,0,0,0,0,,,0
1,80.0,9600,6,8,1976,0.0,TA,TA,Gd,TA,...,TA,298,0,0,0,0,0,,,0
2,68.0,11250,7,5,2001,162.0,Gd,TA,Gd,TA,...,TA,0,42,0,0,0,0,,,0
3,60.0,9550,7,5,1915,0.0,TA,TA,TA,Gd,...,TA,0,35,272,0,0,0,,,0
4,84.0,14260,8,5,2000,350.0,Gd,TA,Gd,TA,...,TA,192,84,0,0,0,0,,,0


In [19]:
#Convert ratings in the following categories to numbers so they can be used in a linear regression: ExterQual, ExterCond,
#BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, TotalBsmtSF, HeatingQC, KitchenQual, Functional, 
#FireplaceQu, GarageType, GarageQual, GarageCond, PoolQC, Fence

housing_data['ExterQual'] = housing_data['ExterQual'].map({'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
housing_data['ExterCond'] = housing_data['ExterCond'].map({'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
housing_data['HeatingQC'] = housing_data['HeatingQC'].map({'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
housing_data['KitchenQual'] = housing_data['KitchenQual'].map({'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})

housing_data['BsmtQual'] = housing_data['BsmtQual'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
housing_data['BsmtCond'] = housing_data['BsmtCond'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
housing_data['FireplaceQu'] = housing_data['FireplaceQu'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
housing_data['GarageQual'] = housing_data['GarageQual'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
housing_data['GarageCond'] = housing_data['GarageCond'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
housing_data['PoolQC'] = housing_data['PoolQC'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})

housing_data['BsmtExposure'] = housing_data['BsmtExposure'].map({'NA': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4})

housing_data['BsmtFinType1'] = housing_data['BsmtFinType1'].map({'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, \
                                                                 'ALQ': 5, 'GLQ': 6})
housing_data['BsmtFinType2'] = housing_data['BsmtFinType2'].map({'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, \
                                                                 'ALQ': 5, 'GLQ': 6})

housing_data['Functional'] = housing_data['Functional'].map({'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 'Mod': 5, \
                                                             'Min2': 6, 'Min1': 7, 'Typ': 8})

housing_data['GarageType'] = housing_data['GarageType'].map({'NA': 0, 'Detchd': 1, 'CarPort': 2, 'BuiltIn': 3, \
                                                             'Basment': 4, 'Attachd': 5, '2Types': 6})

housing_data['Fence'] = housing_data['Fence'].map({'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

In [20]:
housing_data.head(5)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,...,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal
0,65.0,8450,7,5,2003,196.0,4,3,4.0,3.0,...,3.0,0,61,0,0,0,0,,,0
1,80.0,9600,6,8,1976,0.0,3,3,4.0,3.0,...,3.0,298,0,0,0,0,0,,,0
2,68.0,11250,7,5,2001,162.0,4,3,4.0,3.0,...,3.0,0,42,0,0,0,0,,,0
3,60.0,9550,7,5,1915,0.0,3,3,3.0,4.0,...,3.0,0,35,272,0,0,0,,,0
4,84.0,14260,8,5,2000,350.0,4,3,4.0,3.0,...,3.0,192,84,0,0,0,0,,,0


In [21]:
#The year the house was built is not useful in a regression analysis, but the age of the house is. The dataset is from 2011,
#so to get the age, subtract 2011 from the year built.

housing_data['HouseAge'] = 2011 - housing_data['YearBuilt']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [22]:
housing_data.head(5)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal,HouseAge
0,65.0,8450,7,5,2003,196.0,4,3,4.0,3.0,...,0,61,0,0,0,0,,,0,8
1,80.0,9600,6,8,1976,0.0,3,3,4.0,3.0,...,298,0,0,0,0,0,,,0,35
2,68.0,11250,7,5,2001,162.0,4,3,4.0,3.0,...,0,42,0,0,0,0,,,0,10
3,60.0,9550,7,5,1915,0.0,3,3,3.0,4.0,...,0,35,272,0,0,0,,,0,96
4,84.0,14260,8,5,2000,350.0,4,3,4.0,3.0,...,192,84,0,0,0,0,,,0,11


In [23]:
#export cleaned data as CSV
housing_data.to_csv('housing_cleaned.csv')