In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict


%matplotlib inline

In [2]:
train = pd.read_csv('../datasets/train.csv')
test = pd.read_csv('../datasets/test.csv')

In [3]:
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [111]:
def noval(col):
    print((col).isnull().sum())

In [4]:
train['Lot Frontage'].isnull().sum()

330

In [5]:
train['Lot Frontage'].fillna(0, inplace=True)
#filling NA's in this numerical column with 0

In [6]:
train['Lot Area'].isnull().value_counts()

False    2051
Name: Lot Area, dtype: int64

In [7]:
train['Alley'].fillna('No Access', inplace=True)
train['Alley'].value_counts()
#data description states "NA" values indicate no alley access, so I replace those blanks with "No Access"

No Access    1911
Grvl           85
Pave           55
Name: Alley, dtype: int64

In [8]:
train['Condition 2'].value_counts()

Norm      2025
Feedr       11
Artery       5
PosA         3
PosN         3
RRNn         2
RRAn         1
RRAe         1
Name: Condition 2, dtype: int64

In [9]:
train['Conditions']= train['Condition 1'].str.cat(train['Condition 2'], sep=",")
#Condition 1 and Condition 2 deal with the same values; I combined into one column (separating vals with a comma) for ease/to prevent future high multicollinearity

In [10]:
train['Mas Vnr Type'].isnull().sum()

22

In [11]:
train['Mas Vnr Area'].isnull().sum()

22

In [12]:
#making sure the 22 null rows match up with respective columns;
#rather than get rid of them, I'll replace them with relevant assumed values
train.loc[(train['Mas Vnr Type'].isnull()) & (train['Mas Vnr Area'].isnull())]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice,Conditions
22,2393,528142010,60,RL,103.0,12867,Pave,No Access,IR1,Lvl,...,0,,,,0,7,2006,New,344133,"Norm,Norm"
41,2383,528110050,20,RL,107.0,13891,Pave,No Access,Reg,Lvl,...,0,,,,0,9,2006,New,465000,"Norm,Norm"
86,539,531371050,20,RL,67.0,10083,Pave,No Access,Reg,Lvl,...,0,,,,0,8,2009,WD,184900,"Norm,Norm"
212,518,528458020,20,FV,90.0,7993,Pave,No Access,IR1,Lvl,...,0,,,,0,10,2009,New,225000,"Norm,Norm"
276,2824,908130020,20,RL,75.0,8050,Pave,No Access,Reg,Lvl,...,0,,,,0,4,2006,WD,117250,"Norm,Norm"
338,1800,528458150,60,FV,112.0,12217,Pave,No Access,IR1,Lvl,...,0,,,,0,12,2007,New,310013,"Norm,Norm"
431,1455,907251090,60,RL,75.0,9473,Pave,No Access,Reg,Lvl,...,0,,,,0,3,2008,WD,237000,"Norm,Norm"
451,1120,528439010,20,RL,87.0,10037,Pave,No Access,Reg,Lvl,...,0,,,,0,8,2008,WD,247000,"Feedr,Norm"
591,1841,533208040,120,FV,35.0,4274,Pave,Pave,IR1,Lvl,...,0,,,,0,11,2007,New,199900,"Norm,Norm"
844,1840,533208030,120,FV,30.0,5330,Pave,Pave,IR2,Lvl,...,0,,,,0,7,2007,New,207500,"Norm,Norm"


In [13]:
train['Mas Vnr Type'].fillna('None', inplace=True)
train['Mas Vnr Area'].fillna(0, inplace=True)

In [14]:
(train[train['Bsmt Cond'].isnull()].index) & (train[train['Bsmt Qual'].isnull()].index) & (train[train['Bsmt Exposure'].isnull()].index)

Int64Index([  12,   93,  114,  146,  183,  240,  249,  256,  390,  437,  485,
             499,  514,  580,  581,  616,  635,  642,  696,  700,  737,  772,
             790,  794,  924,  933,  954, 1007, 1022, 1049, 1089, 1098, 1125,
            1157, 1211, 1222, 1252, 1256, 1270, 1327, 1331, 1377, 1387, 1428,
            1481, 1594, 1620, 1681, 1684, 1686, 1859, 1875, 1889, 1933, 2010],
           dtype='int64')

In [15]:
#there are more null vals in 'Bsmt Exposure'
#after identifying indexes that do not match; replacing those appropriately
(train[train['Bsmt Exposure'].isnull()].index)

Int64Index([  12,   93,  114,  146,  183,  240,  249,  256,  390,  437,  485,
             499,  514,  580,  581,  616,  635,  642,  696,  700,  737,  772,
             790,  794,  924,  933,  954, 1007, 1022, 1049, 1089, 1098, 1125,
            1157, 1211, 1222, 1252, 1256, 1270, 1327, 1331, 1377, 1387, 1428,
            1456, 1481, 1547, 1594, 1620, 1681, 1684, 1686, 1859, 1875, 1889,
            1933, 1997, 2010],
           dtype='int64')

In [16]:
train.loc[1997, 'Bsmt Exposure']='No'
train.loc[1547, 'Bsmt Exposure']='No'
train.loc[1456, 'Bsmt Exposure']='No'

In [17]:
#'NB' for no basement
train['Bsmt Qual'].fillna('NB', inplace=True)
train['Bsmt Cond'].fillna('NB', inplace=True)
train['Bsmt Exposure'].fillna('NB', inplace=True)

In [20]:
train['BsmtFin Type 1'].fillna('NB', inplace=True)

In [22]:
train['BsmtFin SF 1'].fillna(0, inplace=True)

In [23]:
train['BsmtFin Type 2'].fillna('NB', inplace=True)

In [26]:
train['BsmtFin SF 2'].fillna(0, inplace=True)

In [30]:
train['Bsmt Unf SF'].fillna(0, inplace=True)

In [38]:
#rename to match pattern of other basement names/easier to call
train.rename(columns ={'Total Bsmt SF': 'Bsmt Total SF'}, inplace=True)
train['Bsmt Total SF'].fillna(0, inplace=True)

In [53]:
train[train['Bsmt Full Bath'].isnull()].index

Int64Index([616, 1327], dtype='int64')

In [61]:
train[train['Bsmt Half Bath'].isnull()].index

Int64Index([616, 1327], dtype='int64')

In [65]:
train.loc[:,'Bsmt Full Bath':'Bsmt Half Bath'] = train.loc[:,'Bsmt Full Bath':'Bsmt Half Bath'].fillna(0, inplace=True)

In [86]:
#another instance where data descr uses NA to mean no fireplace
train['Fireplace Qu'].fillna('NFP', inplace=True)

In [90]:
#Na = no garage
train['Garage Type'].fillna('NG',inplace=True)

In [95]:
train['Garage Yr Blt'].fillna(0,inplace=True)

In [98]:
train['Garage Finish'].fillna('NG',inplace=True)

In [104]:
train[train['Garage Cars'].isnull()].index

Int64Index([1712], dtype='int64')

In [110]:
#The null value in 'garage cars' corresponds to a instance where 'garage type' is full but other garage variables are also empty
#Therefore, I am dropping the row
train.drop(train.index[1712], inplace=True)

In [114]:
train['Garage Qual'].fillna('NG', inplace=True)

In [118]:
train['Garage Cond'].fillna('NG', inplace=True)

In [None]:
#train.loc[:,'Garage Qual':'Garage Cond'] = train.loc[:,'Garage Qual':'Garage Cond'].fillna('NG', inplace=True)

In [None]:
#NAs in 'Pool QC' correspond with 0's in Pool sq ft; therefore, filling as 'NP', no pool

In [130]:
train.rename({'Pool QC': 'Pool Qual'}, axis=1,inplace=True)
train['Pool Qual'].fillna('NP', inplace=True)

In [128]:
noval(train['Pool QC'])

2041


In [84]:
train['Functional'].value_counts()

Typ     1915
Min1      42
Min2      42
Mod       29
Maj1      12
Maj2       7
Sal        2
Sev        2
Name: Functional, dtype: int64