# OVERALL PROJECT STRUCTURE

## Data Cleaning
#### Understand each column
#### Clean each column
#### 



In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read train.csv and assign it to a dataframe 'train'.
# keep_default_na=False because, per the data description, a few object columns (e.g. 'Alley')
# use the literal string 'NA' as a meaningful value.
# Without it, pandas would read those 'NA' strings as empty NaN values by default.
# With it, genuinely blank CSV fields are read as '' (empty string); they are turned into NaN in later cells.

train = pd.read_csv('../datasets/train.csv', keep_default_na=False)

# Read train.csv a second time with pandas' default NA handling, only to learn which columns
# pandas infers as numeric. This list is used later to convert columns to their correct data types.
# select_dtypes() is the public API for picking columns by dtype (_get_numeric_data() is a private pandas method).

temp_train = pd.read_csv('../datasets/train.csv')
numeric_cols = list(temp_train.select_dtypes(include='number').columns)

In [3]:
numeric_cols

['Id',
 'PID',
 'MS SubClass',
 'Lot Frontage',
 'Lot Area',
 'Overall Qual',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Yr Blt',
 'Garage Cars',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Pool Area',
 'Misc Val',
 'Mo Sold',
 'Yr Sold',
 'SalePrice']

In [4]:
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
train.tail()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000
2050,10,527162130,60,RL,60.0,7500,Pave,,Reg,Lvl,...,0,0,,,,0,6,2010,WD,189000


In [6]:
train.shape

(2051, 81)

In [7]:
train.info()

# Notice that a lot of columns in list 'numeric_cols' have wrong data type object below, instead of int/float.
# We also see that reading the CSV file with keep_default_na=False doesn't give us ANY NaN (null/missing) values in the entire dataframe.

# This is because it reads the missing values in the CSV as '' (empty string) instead.
# This means that some columns with int/float types get converted to object type if they have any missing values.
# So, we need to replace all '' with np.nan in all the numerical columns ('numeric_cols' defined earlier), and convert remaining values to type float.
# And we need to replace all '' with np.nan in all non-numerical columns.
# Then we will proceed with data cleaning as usual, while also checking the types and values in each column.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Id               2051 non-null   int64 
 1   PID              2051 non-null   int64 
 2   MS SubClass      2051 non-null   int64 
 3   MS Zoning        2051 non-null   object
 4   Lot Frontage     2051 non-null   object
 5   Lot Area         2051 non-null   int64 
 6   Street           2051 non-null   object
 7   Alley            2051 non-null   object
 8   Lot Shape        2051 non-null   object
 9   Land Contour     2051 non-null   object
 10  Utilities        2051 non-null   object
 11  Lot Config       2051 non-null   object
 12  Land Slope       2051 non-null   object
 13  Neighborhood     2051 non-null   object
 14  Condition 1      2051 non-null   object
 15  Condition 2      2051 non-null   object
 16  Bldg Type        2051 non-null   object
 17  House Style      2051 non-null   

In [8]:
# Convert every column in 'numeric_cols' to float: '' (empty string) becomes NaN,
# and every other value is passed through float().

def _to_float_or_nan(value):
    """Return NaN for an empty string, otherwise the value converted to float."""
    if value == '':
        return np.nan
    return float(value)


for col in numeric_cols:
    train[col] = train[col].map(_to_float_or_nan)

In [9]:
# In every column that is NOT in 'numeric_cols', turn '' (empty string) into NaN.
# All other values (including the meaningful string 'NA') are left exactly as they are.

def _blank_to_nan(value):
    """Return NaN for an empty string, otherwise the value unchanged."""
    if value == '':
        return np.nan
    return value


non_numeric_cols = [name for name in train.columns if name not in numeric_cols]

for col in non_numeric_cols:
    train[col] = train[col].map(_blank_to_nan)

In [10]:
# Checking data types of columns again.
# Also notice that now some columns have a few NaN values.

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   float64
 1   PID              2051 non-null   float64
 2   MS SubClass      2051 non-null   float64
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   float64
 6   Street           2051 non-null   object 
 7   Alley            2051 non-null   object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [11]:
train.dtypes.unique()

array([dtype('float64'), dtype('O')], dtype=object)

In [12]:
# Count the NaN values in every column, then keep only the columns that have at least one.
# This is the true number of missing values in the dataframe.

null_counts = train.isna().sum()
null_cols = null_counts[null_counts != 0]
null_cols

Lot Frontage      330
Mas Vnr Type       22
Mas Vnr Area       22
Bsmt Qual           1
Bsmt Cond           1
Bsmt Exposure       4
BsmtFin Type 1      1
BsmtFin SF 1        1
BsmtFin Type 2      2
BsmtFin SF 2        1
Bsmt Unf SF         1
Total Bsmt SF       1
Bsmt Full Bath      2
Bsmt Half Bath      2
Garage Yr Blt     114
Garage Finish       1
Garage Cars         1
Garage Area         1
Garage Qual         1
Garage Cond         1
dtype: int64

### Re-think this

In order to make this dataset more manageable, I will look through it in smaller sections. Each section will comprise 16 columns and the 'SalePrice' column. For each section, I will do data cleaning, one-hot encoding of the ordinal columns, and identify any potential relationships between the columns and the 'SalePrice' column. If a potential relationship is identified, I will consider that column for potential inclusion in the regression model. If no relationship is found, the column will not be included in the model.

### train section 1 - columns 1-15

Looking at the columns 1-by-1 and cross-referencing with the data description found [here](http://jse.amstat.org/v19n3/decock/DataDocumentation.txt):

- **Id**: Type *int*, discrete. Shows observation number. Wouldn't make logical sense to include it in the regression model.
- **PID**: Type *int*, nominal. Parcel identification number. Wouldn't make logical sense to include it in the regression model.
- **MS SubClass**: Type *int*, nominal (16 unique values). Identifies the type of dwelling involved in the sale. Would need to be one-hot encoded if it is to be added into the regression model.
- **MS Zoning**: Type *object*, nominal. Identifies the general zoning classification of the sale. Would need to be one-hot encoded if it is to be added into the regression model.

In [13]:
# Checking all unique values of 'MS SubClass' column.

sorted(train['MS SubClass'].unique())

[20.0,
 30.0,
 40.0,
 45.0,
 50.0,
 60.0,
 70.0,
 75.0,
 80.0,
 85.0,
 90.0,
 120.0,
 150.0,
 160.0,
 180.0,
 190.0]

In [14]:
# Checking all unique values of 'MS Zoning' column.

sorted(train['MS Zoning'].unique())

['A (agr)', 'C (all)', 'FV', 'I (all)', 'RH', 'RL', 'RM']

In [15]:
# Filtering all rows with missing values in 'Lot Frontage' column.
# May need to impute this missing data if column is to be used in the model.

train[train['Lot Frontage'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109.0,533352170.0,60.0,RL,,13517.0,Pave,,IR1,Lvl,...,0.0,0.0,,,,0.0,3.0,2010.0,WD,130500.0
7,145.0,535154050.0,20.0,RL,,12160.0,Pave,,IR1,Lvl,...,0.0,0.0,,MnPrv,,0.0,5.0,2010.0,COD,142000.0
8,1942.0,535353130.0,20.0,RL,,15783.0,Pave,,Reg,Lvl,...,0.0,0.0,,MnPrv,Shed,400.0,6.0,2007.0,WD,112500.0
23,12.0,527165230.0,20.0,RL,,7980.0,Pave,,IR1,Lvl,...,0.0,0.0,,GdPrv,Shed,500.0,3.0,2010.0,WD,185000.0
27,1534.0,909277040.0,50.0,RL,,11700.0,Pave,Grvl,IR1,HLS,...,0.0,0.0,,,,0.0,7.0,2008.0,WD,198000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016,560.0,532376110.0,20.0,RL,,7791.0,Pave,,IR1,Lvl,...,0.0,0.0,,GdWo,,0.0,10.0,2009.0,WD,129000.0
2022,2872.0,909475020.0,20.0,RL,,16381.0,Pave,,IR1,Lvl,...,0.0,0.0,,,,0.0,12.0,2006.0,WD,223000.0
2028,2526.0,534127190.0,20.0,RL,,20781.0,Pave,,IR2,Lvl,...,0.0,0.0,,,,0.0,6.0,2006.0,WD,262500.0
2031,25.0,527402250.0,20.0,RL,,12537.0,Pave,,IR1,Lvl,...,0.0,0.0,,,,0.0,4.0,2010.0,WD,149900.0


In [16]:
# Checking for any values in 'Lot Frontage' column which are negative.

train[train['Lot Frontage']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [17]:
# Checking for any values in 'Lot Area' column which are negative.

train[train['Lot Area']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [18]:
# Checking all unique values of 'Street' column.

sorted(train['Street'].unique())

['Grvl', 'Pave']

In [19]:
# Checking all unique values of 'Alley' column.

sorted(train['Alley'].unique())

['Grvl', 'NA', 'Pave']

In [20]:
# Checking all unique values of 'Lot Shape' column.

sorted(train['Lot Shape'].unique())

['IR1', 'IR2', 'IR3', 'Reg']

In [21]:
# Converting ordinal values in 'Lot Shape' column to discrete values of type float.
# 'Reg' gets the highest rank (4.0) and 'IR3' the lowest (1.0).

ordinal_lot_shape = {'Reg':4.0, 'IR1':3.0, 'IR2':2.0, 'IR3':1.0}

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage first.
# This also makes an accidental re-run of this cell (column already numeric) fail loudly
# instead of overwriting the column with NaN.
unmapped = set(train['Lot Shape'].dropna()) - set(ordinal_lot_shape)
assert not unmapped, f"Unmapped 'Lot Shape' values: {unmapped}"

train['Lot Shape'] = train['Lot Shape'].map(ordinal_lot_shape)

In [22]:
# Checking type of 'Lot Shape' column again to verify successful conversion.

train['Lot Shape'].dtypes

dtype('float64')

In [23]:
# Checking all unique values of 'Lot Shape' column.

sorted(train['Lot Shape'].unique())

[1.0, 2.0, 3.0, 4.0]

In [24]:
# Checking all unique values of 'Land Contour' column.

sorted(train['Land Contour'].unique())

['Bnk', 'HLS', 'Low', 'Lvl']

In [25]:
# Checking all unique values of 'Utilities' column.

sorted(train['Utilities'].unique())

['AllPub', 'NoSeWa', 'NoSewr']

In [26]:
# Converting ordinal values in 'Utilities' column to discrete values of type float.
# 'AllPub' gets the highest rank (4.0) and 'ELO' the lowest (1.0).

ordinal_utilities = {'AllPub':4.0, 'NoSewr':3.0, 'NoSeWa':2.0, 'ELO':1.0}

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage first.
# This also makes an accidental re-run of this cell (column already numeric) fail loudly
# instead of overwriting the column with NaN.
unmapped = set(train['Utilities'].dropna()) - set(ordinal_utilities)
assert not unmapped, f"Unmapped 'Utilities' values: {unmapped}"

train['Utilities'] = train['Utilities'].map(ordinal_utilities)

In [27]:
# Checking type of 'Utilities' column again to verify successful conversion.

train['Utilities'].dtypes

dtype('float64')

In [28]:
# Checking all unique values of 'Utilities' column.

sorted(train['Utilities'].unique())

[2.0, 3.0, 4.0]

In [29]:
# Checking all unique values of 'Lot Config' column.

sorted(train['Lot Config'].unique())

['Corner', 'CulDSac', 'FR2', 'FR3', 'Inside']

In [30]:
# Checking all unique values of 'Land Slope' column.

sorted(train['Land Slope'].unique())

['Gtl', 'Mod', 'Sev']

In [31]:
# Converting ordinal values in 'Land Slope' column to discrete values of type float.
# 'Gtl' gets the highest rank (3.0) and 'Sev' the lowest (1.0).

ordinal_land_slope = {'Gtl':3.0, 'Mod':2.0, 'Sev':1.0}

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage first.
# This also makes an accidental re-run of this cell (column already numeric) fail loudly
# instead of overwriting the column with NaN.
unmapped = set(train['Land Slope'].dropna()) - set(ordinal_land_slope)
assert not unmapped, f"Unmapped 'Land Slope' values: {unmapped}"

train['Land Slope'] = train['Land Slope'].map(ordinal_land_slope)

In [32]:
# Checking type of 'Land Slope' column again to verify successful conversion.

train['Land Slope'].dtypes

dtype('float64')

In [33]:
# Checking all unique values of 'Land Slope' column.

sorted(train['Land Slope'].unique())

[1.0, 2.0, 3.0]

In [34]:
# Checking all unique values of 'Neighborhood' column.

sorted(train['Neighborhood'].unique())

['Blmngtn',
 'Blueste',
 'BrDale',
 'BrkSide',
 'ClearCr',
 'CollgCr',
 'Crawfor',
 'Edwards',
 'Gilbert',
 'Greens',
 'GrnHill',
 'IDOTRR',
 'Landmrk',
 'MeadowV',
 'Mitchel',
 'NAmes',
 'NPkVill',
 'NWAmes',
 'NoRidge',
 'NridgHt',
 'OldTown',
 'SWISU',
 'Sawyer',
 'SawyerW',
 'Somerst',
 'StoneBr',
 'Timber',
 'Veenker']

In [35]:
# Checking all unique values of 'Condition 1' column.

sorted(train['Condition 1'].unique())

['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe', 'RRNn']

In [36]:
# Checking all unique values of 'Condition 2' column.

sorted(train['Condition 2'].unique())

['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn']

In [37]:
# Checking all unique values of 'Bldg Type' column.

sorted(train['Bldg Type'].unique())

['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE']

In [38]:
# Checking all unique values of 'House Style' column.

sorted(train['House Style'].unique())

['1.5Fin', '1.5Unf', '1Story', '2.5Fin', '2.5Unf', '2Story', 'SFoyer', 'SLvl']

In [39]:
# Checking all unique values of 'Overall Qual' column.

sorted(train['Overall Qual'].unique())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]

In [40]:
# Checking all unique values of 'Overall Cond' column.

sorted(train['Overall Cond'].unique())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

In [41]:
# Checking for any values in 'Year Built' column which are negative.

train[train['Year Built']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [42]:
# Checking for any values in 'Year Remod/Add' column which are negative.

train[train['Year Remod/Add']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [43]:
# Checking all unique values of 'Roof Style' column.

sorted(train['Roof Style'].unique())

['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed']

In [44]:
# Checking all unique values of 'Roof Matl' column.

sorted(train['Roof Matl'].unique())

['ClyTile', 'CompShg', 'Membran', 'Tar&Grv', 'WdShake', 'WdShngl']

In [45]:
# Checking all unique values of 'Exterior 1st' column.

sorted(train['Exterior 1st'].unique())

['AsbShng',
 'AsphShn',
 'BrkComm',
 'BrkFace',
 'CBlock',
 'CemntBd',
 'HdBoard',
 'ImStucc',
 'MetalSd',
 'Plywood',
 'Stone',
 'Stucco',
 'VinylSd',
 'Wd Sdng',
 'WdShing']

In [46]:
# Checking all unique values of 'Exterior 2nd' column.

sorted(train['Exterior 2nd'].unique())

['AsbShng',
 'AsphShn',
 'Brk Cmn',
 'BrkFace',
 'CBlock',
 'CmentBd',
 'HdBoard',
 'ImStucc',
 'MetalSd',
 'Plywood',
 'Stone',
 'Stucco',
 'VinylSd',
 'Wd Sdng',
 'Wd Shng']

In [47]:
# Three labels in 'Exterior 2nd' are spelled differently from the data description
# (and from the same categories in 'Exterior 1st'):
#   'Wd Shng' should be 'WdShing'
#   'Brk Cmn' should be 'BrkComm'
#   'CmentBd' should be 'CemntBd'
# Replacing the wrong spellings with the correct ones; every other label is left unchanged.

exterior_2nd_fixes = {'Wd Shng': 'WdShing', 'Brk Cmn': 'BrkComm', 'CmentBd': 'CemntBd'}

train['Exterior 2nd'] = train['Exterior 2nd'].map(lambda label: exterior_2nd_fixes.get(label, label))

In [48]:
# Checking all unique values of 'Exterior 2nd' column again.

sorted(train['Exterior 2nd'].unique())

['AsbShng',
 'AsphShn',
 'BrkComm',
 'BrkFace',
 'CBlock',
 'CemntBd',
 'HdBoard',
 'ImStucc',
 'MetalSd',
 'Plywood',
 'Stone',
 'Stucco',
 'VinylSd',
 'Wd Sdng',
 'WdShing']

In [49]:
# Checking all unique values of 'Mas Vnr Type' column.

train['Mas Vnr Type'].unique()

array(['BrkFace', 'None', nan, 'Stone', 'BrkCmn'], dtype=object)

In [50]:
# Checking to see if any rows left in 'Mas Vnr Type' column with missing values.

train['Mas Vnr Type'].isna().sum()

22

In [51]:
# Filtering all rows with missing values in 'Mas Vnr Area' column.

train[train['Mas Vnr Area'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
22,2393.0,528142010.0,60.0,RL,103.0,12867.0,Pave,,3.0,Lvl,...,0.0,0.0,,,,0.0,7.0,2006.0,New,344133.0
41,2383.0,528110050.0,20.0,RL,107.0,13891.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,9.0,2006.0,New,465000.0
86,539.0,531371050.0,20.0,RL,67.0,10083.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,8.0,2009.0,WD,184900.0
212,518.0,528458020.0,20.0,FV,90.0,7993.0,Pave,,3.0,Lvl,...,0.0,0.0,,,,0.0,10.0,2009.0,New,225000.0
276,2824.0,908130020.0,20.0,RL,75.0,8050.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,4.0,2006.0,WD,117250.0
338,1800.0,528458150.0,60.0,FV,112.0,12217.0,Pave,,3.0,Lvl,...,0.0,0.0,,,,0.0,12.0,2007.0,New,310013.0
431,1455.0,907251090.0,60.0,RL,75.0,9473.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,3.0,2008.0,WD,237000.0
451,1120.0,528439010.0,20.0,RL,87.0,10037.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,8.0,2008.0,WD,247000.0
591,1841.0,533208040.0,120.0,FV,35.0,4274.0,Pave,Pave,3.0,Lvl,...,0.0,0.0,,,,0.0,11.0,2007.0,New,199900.0
844,1840.0,533208030.0,120.0,FV,30.0,5330.0,Pave,Pave,2.0,Lvl,...,0.0,0.0,,,,0.0,7.0,2007.0,New,207500.0


In [52]:
# Checking for any values in 'Mas Vnr Area' column which are negative.

train[train['Mas Vnr Area']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [53]:
# Checking all unique values of 'Exter Qual' column.

train['Exter Qual'].unique()

array(['Gd', 'TA', 'Ex', 'Fa'], dtype=object)

In [54]:
# Checking all unique values of 'Exter Cond' column.

train['Exter Cond'].unique()

array(['TA', 'Gd', 'Fa', 'Ex', 'Po'], dtype=object)

In [55]:
# Converting ordinal values in 'Exter Qual' & 'Exter Cond' columns to discrete values of type float.
# Both columns share one scale: 'Ex' gets the highest rank (5.0) and 'Po' the lowest (1.0).

ordinal_exter = {'Ex':5.0, 'Gd':4.0, 'TA':3.0, 'Fa':2.0, 'Po':1.0}

exter_cols = ['Exter Qual', 'Exter Cond']

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage of
# both columns before changing either. This also makes an accidental re-run of this cell
# (columns already numeric) fail loudly instead of overwriting the columns with NaN.
for exter_col in exter_cols:
    unmapped = set(train[exter_col].dropna()) - set(ordinal_exter)
    assert not unmapped, f"Unmapped {exter_col!r} values: {unmapped}"

for exter_col in exter_cols:
    train[exter_col] = train[exter_col].map(ordinal_exter)

In [56]:
# Checking type of 'Exter Qual' column again to verify successful conversion.

train['Exter Qual'].dtypes

dtype('float64')

In [57]:
# Checking type of 'Exter Cond' column again to verify successful conversion.

train['Exter Cond'].dtypes

dtype('float64')

In [58]:
# Checking all unique values of 'Exter Qual' column.

sorted(train['Exter Qual'].unique())

[2.0, 3.0, 4.0, 5.0]

In [59]:
# Checking all unique values of 'Exter Cond' column.

sorted(train['Exter Cond'].unique())

[1.0, 2.0, 3.0, 4.0, 5.0]

In [60]:
# Checking all unique values of 'Foundation' column.

sorted(train['Foundation'].unique())

['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood']

In [61]:
# Checking all unique values of 'Bsmt Qual' column.

train['Bsmt Qual'].unique()

array(['TA', 'Gd', 'Fa', 'NA', 'Ex', nan, 'Po'], dtype=object)

In [62]:
# Checking all unique values of 'Bsmt Cond' column.

train['Bsmt Cond'].unique()

array(['TA', 'Gd', 'NA', 'Fa', 'Po', 'Ex', nan], dtype=object)

In [63]:
# Checking to see if any rows left in 'Bsmt Qual' column with missing values.

train['Bsmt Qual'].isna().sum()

1

In [64]:
# Checking to see if any rows left in 'Bsmt Cond' column with missing values.

train['Bsmt Cond'].isna().sum()

1

In [65]:
# Converting ordinal values in 'Bsmt Qual' & 'Bsmt Cond' columns to discrete values of type float.
# Both columns share one scale: 'Ex' gets the highest rank (5.0) and the string 'NA' gets 0.0.
# Rows that are genuinely missing (NaN) are not in the mapping and stay NaN.

ordinal_bsmt = {'Ex':5.0, 'Gd':4.0, 'TA':3.0, 'Fa':2.0, 'Po':1.0, 'NA':0.0}

bsmt_cols = ['Bsmt Qual', 'Bsmt Cond']

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage of
# both columns (ignoring existing NaN) before changing either. This also makes an accidental
# re-run of this cell (columns already numeric) fail loudly instead of overwriting them with NaN.
for bsmt_col in bsmt_cols:
    unmapped = set(train[bsmt_col].dropna()) - set(ordinal_bsmt)
    assert not unmapped, f"Unmapped {bsmt_col!r} values: {unmapped}"

for bsmt_col in bsmt_cols:
    train[bsmt_col] = train[bsmt_col].map(ordinal_bsmt)

In [66]:
# Checking type of 'Bsmt Qual' column again to verify successful conversion.

train['Bsmt Qual'].dtypes

dtype('float64')

In [67]:
# Checking type of 'Bsmt Cond' column again to verify successful conversion.

train['Bsmt Cond'].dtypes

dtype('float64')

In [68]:
# Checking all unique values of 'Bsmt Qual' column.
# The column still contains NaN. Python's sorted() has no guaranteed order when NaN is present
# (every comparison with NaN is False), so np.sort() is used instead: it always places NaN last.

np.sort(train['Bsmt Qual'].unique()).tolist()

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, nan]

In [69]:
# Checking all unique values of 'Bsmt Cond' column.
# The column still contains NaN. Python's sorted() has no guaranteed order when NaN is present
# (every comparison with NaN is False), so np.sort() is used instead: it always places NaN last.

np.sort(train['Bsmt Cond'].unique()).tolist()

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, nan]

In [70]:
# Checking all unique values of 'Bsmt Exposure' column.

train['Bsmt Exposure'].unique()

array(['No', 'Gd', 'Av', 'NA', 'Mn', nan], dtype=object)

In [71]:
# Checking to see if any rows left in 'Bsmt Exposure' column with missing values.

train['Bsmt Exposure'].isna().sum()

4

In [72]:
# Converting ordinal values in 'Bsmt Exposure' column to discrete values of type float.
# 'Gd' gets the highest rank (4.0) and the string 'NA' gets 0.0.
# Rows that are genuinely missing (NaN) are not in the mapping and stay NaN.

ordinal_bsmt_exposure = {'Gd':4.0, 'Av':3.0, 'Mn':2.0, 'No':1.0, 'NA':0.0}

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage first
# (ignoring existing NaN). This also makes an accidental re-run of this cell (column already
# numeric) fail loudly instead of overwriting the column with NaN.
unmapped = set(train['Bsmt Exposure'].dropna()) - set(ordinal_bsmt_exposure)
assert not unmapped, f"Unmapped 'Bsmt Exposure' values: {unmapped}"

train['Bsmt Exposure'] = train['Bsmt Exposure'].map(ordinal_bsmt_exposure)

In [73]:
# Checking type of 'Bsmt Exposure' column again to verify successful conversion.

train['Bsmt Exposure'].dtypes

dtype('float64')

In [74]:
# Checking all unique values of 'Bsmt Exposure' column.
# The column still contains NaN. Python's sorted() has no guaranteed order when NaN is present
# (every comparison with NaN is False), so np.sort() is used instead: it always places NaN last.

np.sort(train['Bsmt Exposure'].unique()).tolist()

[0.0, 1.0, 2.0, 3.0, 4.0, nan]

In [75]:
# Checking all unique values of 'BsmtFin Type 1' column.

train['BsmtFin Type 1'].unique()

array(['GLQ', 'Unf', 'ALQ', 'Rec', 'NA', 'BLQ', 'LwQ', nan], dtype=object)

In [76]:
# Checking all unique values of 'BsmtFin Type 2' column.

train['BsmtFin Type 2'].unique()

array(['Unf', 'Rec', 'NA', 'BLQ', 'GLQ', 'LwQ', 'ALQ', nan], dtype=object)

In [77]:
# Checking to see if any rows left in 'BsmtFin Type 1' column with missing values.

train['BsmtFin Type 1'].isna().sum()

1

In [78]:
# Checking to see if any rows left in 'BsmtFin Type 2' column with missing values.

train['BsmtFin Type 2'].isna().sum()

2

In [79]:
# Converting ordinal values in 'BsmtFin Type 1' & 'BsmtFin Type 2' columns to discrete values of type float.
# Both columns share one scale: 'GLQ' gets the highest rank (6.0) and the string 'NA' gets 0.0.
# Rows that are genuinely missing (NaN) are not in the mapping and stay NaN.

ordinal_bsmtfin_type = {'GLQ':6.0, 'ALQ':5.0, 'BLQ':4.0, 'Rec':3.0, 'LwQ':2.0, 'Unf':1.0, 'NA':0.0}

bsmtfin_type_cols = ['BsmtFin Type 1', 'BsmtFin Type 2']

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage of
# both columns (ignoring existing NaN) before changing either. This also makes an accidental
# re-run of this cell (columns already numeric) fail loudly instead of overwriting them with NaN.
for bsmtfin_col in bsmtfin_type_cols:
    unmapped = set(train[bsmtfin_col].dropna()) - set(ordinal_bsmtfin_type)
    assert not unmapped, f"Unmapped {bsmtfin_col!r} values: {unmapped}"

for bsmtfin_col in bsmtfin_type_cols:
    train[bsmtfin_col] = train[bsmtfin_col].map(ordinal_bsmtfin_type)

In [80]:
# Checking type of 'BsmtFin Type 1' column again to verify successful conversion.

train['BsmtFin Type 1'].dtypes

dtype('float64')

In [81]:
# Checking type of 'BsmtFin Type 2' column again to verify successful conversion.

train['BsmtFin Type 2'].dtypes

dtype('float64')

In [82]:
# Checking all unique values of 'BsmtFin Type 1' column.
# The column still contains NaN. Python's sorted() has no guaranteed order when NaN is present
# (every comparison with NaN is False), so np.sort() is used instead: it always places NaN last.

np.sort(train['BsmtFin Type 1'].unique()).tolist()

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]

In [83]:
# Checking all unique values of 'BsmtFin Type 2' column.
# The column still contains NaN. Python's sorted() has no guaranteed order when NaN is present
# (every comparison with NaN is False), so np.sort() is used instead: it always places NaN last.

np.sort(train['BsmtFin Type 2'].unique()).tolist()

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]

In [84]:
# Filtering all rows with missing values in 'BsmtFin SF 1' column.

train[train['BsmtFin SF 1'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
1327,1342.0,903230120.0,20.0,RM,99.0,5940.0,Pave,,3.0,Lvl,...,0.0,0.0,,MnPrv,,0.0,4.0,2008.0,ConLD,79000.0


In [85]:
# Checking for any values in 'BsmtFin SF 1' column which are negative.

train[train['BsmtFin SF 1']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [86]:
# Filtering all rows with missing values in 'BsmtFin SF 2' column.

train[train['BsmtFin SF 2'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
1327,1342.0,903230120.0,20.0,RM,99.0,5940.0,Pave,,3.0,Lvl,...,0.0,0.0,,MnPrv,,0.0,4.0,2008.0,ConLD,79000.0


In [87]:
# Checking for any values in 'BsmtFin SF 2' column which are negative.

train[train['BsmtFin SF 2']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [88]:
# Filtering all rows with missing values in 'Bsmt Unf SF' column.

train[train['Bsmt Unf SF'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
1327,1342.0,903230120.0,20.0,RM,99.0,5940.0,Pave,,3.0,Lvl,...,0.0,0.0,,MnPrv,,0.0,4.0,2008.0,ConLD,79000.0


In [89]:
# Checking for any values in 'Bsmt Unf SF' column which are negative.

train[train['Bsmt Unf SF']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [90]:
# Filtering all rows with missing values in 'Total Bsmt SF' column.

train[train['Total Bsmt SF'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
1327,1342.0,903230120.0,20.0,RM,99.0,5940.0,Pave,,3.0,Lvl,...,0.0,0.0,,MnPrv,,0.0,4.0,2008.0,ConLD,79000.0


In [91]:
# Checking for any values in 'Total Bsmt SF' column which are negative.

train[train['Total Bsmt SF']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [92]:
# Checking all unique values of 'Heating' column.

sorted(train['Heating'].unique())

['GasA', 'GasW', 'Grav', 'OthW', 'Wall']

In [93]:
# Checking all unique values of 'Heating QC' column.

train['Heating QC'].unique()

array(['Ex', 'TA', 'Gd', 'Fa', 'Po'], dtype=object)

In [94]:
# Converting ordinal values in 'Heating QC' column to discrete values of type float.
# 'Ex' gets the highest rank (5.0) and 'Po' the lowest (1.0).

ordinal_heating_qc = {'Ex':5.0, 'Gd':4.0, 'TA':3.0, 'Fa':2.0, 'Po':1.0}

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage first.
# This also makes an accidental re-run of this cell (column already numeric) fail loudly
# instead of overwriting the column with NaN.
unmapped = set(train['Heating QC'].dropna()) - set(ordinal_heating_qc)
assert not unmapped, f"Unmapped 'Heating QC' values: {unmapped}"

train['Heating QC'] = train['Heating QC'].map(ordinal_heating_qc)

In [95]:
# Checking type of 'Heating QC' column again to verify successful conversion.

train['Heating QC'].dtypes

dtype('float64')

In [96]:
# Checking all unique values of 'Heating QC' column.

sorted(train['Heating QC'].unique())

[1.0, 2.0, 3.0, 4.0, 5.0]

In [97]:
# Checking all unique values of 'Central Air' column.

train['Central Air'].unique()

array(['Y', 'N'], dtype=object)

In [98]:
# Encoding the binary 'Central Air' column as type float: 'Y' becomes 1.0 and 'N' becomes 0.0.

ordinal_central_air = {'Y':1.0, 'N':0.0}

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage first.
# This also makes an accidental re-run of this cell (column already numeric) fail loudly
# instead of overwriting the column with NaN.
unmapped = set(train['Central Air'].dropna()) - set(ordinal_central_air)
assert not unmapped, f"Unmapped 'Central Air' values: {unmapped}"

train['Central Air'] = train['Central Air'].map(ordinal_central_air)

In [99]:
# Checking type of 'Central Air' column again to verify successful conversion.

train['Central Air'].dtypes

dtype('float64')

In [100]:
# Checking all unique values of 'Central Air' column.

sorted(train['Central Air'].unique())

[0.0, 1.0]

In [101]:
# Checking all unique values of 'Electrical' column.

train['Electrical'].unique()

array(['SBrkr', 'FuseF', 'FuseA', 'FuseP', 'Mix'], dtype=object)

In [102]:
# Converting ordinal values in 'Electrical' column to discrete values of type float.
# 'SBrkr' gets the highest rank (5.0) and 'Mix' the lowest (1.0).

ordinal_electrical = {'SBrkr':5.0, 'FuseA':4.0, 'FuseF':3.0, 'FuseP':2.0, 'Mix':1.0}

# Series.map() silently turns any label missing from the dict into NaN, so verify coverage first.
# This also makes an accidental re-run of this cell (column already numeric) fail loudly
# instead of overwriting the column with NaN.
unmapped = set(train['Electrical'].dropna()) - set(ordinal_electrical)
assert not unmapped, f"Unmapped 'Electrical' values: {unmapped}"

train['Electrical'] = train['Electrical'].map(ordinal_electrical)

In [103]:
# Checking type of 'Electrical' column again to verify successful conversion.

train['Electrical'].dtypes

dtype('float64')

In [104]:
# Checking all unique values of 'Electrical' column.

sorted(train['Electrical'].unique())

[1.0, 2.0, 3.0, 4.0, 5.0]

In [105]:
# Checking for any values in '1st Flr SF' column which are negative.

train[train['1st Flr SF']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [106]:
# Checking for any values in '2nd Flr SF' column which are negative.

train[train['2nd Flr SF']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [107]:
# Checking for any values in 'Low Qual Fin SF' column which are negative.

train[train['Low Qual Fin SF']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [108]:
# Checking for any values in 'Gr Liv Area' column which are negative.

train[train['Gr Liv Area']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [109]:
# Filtering all rows with missing values in 'Bsmt Full Bath' column.

train[train['Bsmt Full Bath'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
616,1498.0,908154080.0,20.0,RL,123.0,47007.0,Pave,,3.0,Lvl,...,0.0,0.0,,,,0.0,7.0,2008.0,WD,284700.0
1327,1342.0,903230120.0,20.0,RM,99.0,5940.0,Pave,,3.0,Lvl,...,0.0,0.0,,MnPrv,,0.0,4.0,2008.0,ConLD,79000.0


In [110]:
# Checking for any values in 'Bsmt Full Bath' column which are negative.

train[train['Bsmt Full Bath']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [111]:
# Filtering all rows with missing values in 'Bsmt Half Bath' column.

train[train['Bsmt Half Bath'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
616,1498.0,908154080.0,20.0,RL,123.0,47007.0,Pave,,3.0,Lvl,...,0.0,0.0,,,,0.0,7.0,2008.0,WD,284700.0
1327,1342.0,903230120.0,20.0,RM,99.0,5940.0,Pave,,3.0,Lvl,...,0.0,0.0,,MnPrv,,0.0,4.0,2008.0,ConLD,79000.0


In [112]:
# Checking for any values in 'Bsmt Half Bath' column which are negative.

train[train['Bsmt Half Bath']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [113]:
# Checking for any values in 'Full Bath' column which are negative.

train[train['Full Bath']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [114]:
# Checking for any values in 'Half Bath' column which are negative.

train[train['Half Bath']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [115]:
# Checking for any values in 'Bedroom AbvGr' column which are negative.

train[train['Bedroom AbvGr']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [116]:
# Checking for any values in 'Kitchen AbvGr' column which are negative.

train[train['Kitchen AbvGr']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [117]:
# Checking all unique values of 'Kitchen Qual' column.

train['Kitchen Qual'].unique()

array(['Gd', 'TA', 'Fa', 'Ex'], dtype=object)

In [118]:
# Encode the ordinal 'Kitchen Qual' ratings as floats ('Po' lowest = 1.0 ... 'Ex' highest = 5.0).
# 'Po' is kept in the mapping even though it does not occur in train.
# Series.map turns any value that is not a key of the mapping into NaN.

ordinal_kitchen_qual = {
    'Po': 1.0,
    'Fa': 2.0,
    'TA': 3.0,
    'Gd': 4.0,
    'Ex': 5.0,
}

train['Kitchen Qual'] = train['Kitchen Qual'].map(ordinal_kitchen_qual)

In [119]:
# Checking type of 'Kitchen Qual' column again to verify successful conversion.

train['Kitchen Qual'].dtypes

dtype('float64')

In [120]:
# Checking all unique values of 'Kitchen Qual' column.

sorted(train['Kitchen Qual'].unique())

[2.0, 3.0, 4.0, 5.0]

In [121]:
# Checking for any values in 'TotRms AbvGrd' column which are negative.

train[train['TotRms AbvGrd']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [122]:
# Checking all unique values of 'Functional' column.

train['Functional'].unique()

array(['Typ', 'Mod', 'Min2', 'Maj1', 'Min1', 'Sev', 'Sal', 'Maj2'],
      dtype=object)

In [123]:
# Encode the ordinal 'Functional' ratings as floats ('Sal' lowest = 1.0 ... 'Typ' highest = 8.0).
# Series.map turns any value that is not a key of the mapping into NaN.

ordinal_functional = {
    'Sal': 1.0,
    'Sev': 2.0,
    'Maj2': 3.0,
    'Maj1': 4.0,
    'Mod': 5.0,
    'Min2': 6.0,
    'Min1': 7.0,
    'Typ': 8.0,
}

train['Functional'] = train['Functional'].map(ordinal_functional)

In [124]:
# Checking type of 'Functional' column again to verify successful conversion.

train['Functional'].dtypes

dtype('float64')

In [125]:
# Checking all unique values of 'Functional' column.

sorted(train['Functional'].unique())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

In [126]:
# Checking for any values in 'Fireplaces' column which are negative.

train[train['Fireplaces']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [127]:
# Checking all unique values of 'Fireplace Qu' column.

train['Fireplace Qu'].unique()

array(['NA', 'TA', 'Gd', 'Po', 'Ex', 'Fa'], dtype=object)

In [128]:
# Encode the ordinal 'Fireplace Qu' ratings as floats.
# The literal string 'NA' is a category of its own in this column and is encoded as 0.0;
# the remaining ratings run from 'Po' = 1.0 up to 'Ex' = 5.0.

ordinal_fireplace_qu = {
    'NA': 0.0,
    'Po': 1.0,
    'Fa': 2.0,
    'TA': 3.0,
    'Gd': 4.0,
    'Ex': 5.0,
}

train['Fireplace Qu'] = train['Fireplace Qu'].map(ordinal_fireplace_qu)

In [129]:
# Checking type of 'Fireplace Qu' column again to verify successful conversion.

train['Fireplace Qu'].dtypes

dtype('float64')

In [130]:
# Checking all unique values of 'Fireplace Qu' column.

sorted(train['Fireplace Qu'].unique())

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0]

In [131]:
# Checking all unique values of 'Garage Type' column.

sorted(train['Garage Type'].unique())

['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd', 'NA']

In [132]:
# Filtering all rows with missing values in 'Garage Yr Blt' column.

train[train['Garage Yr Blt'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
28,2243.0,911103060.0,190.0,C (all),75.0,8250.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,6.0,2007.0,WD,119600.0
53,330.0,923226250.0,160.0,RM,21.0,1476.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,3.0,2010.0,WD,76000.0
65,2278.0,923202134.0,20.0,RL,70.0,8402.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,12.0,2007.0,New,147000.0
79,2235.0,910201050.0,50.0,RM,50.0,7288.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,9.0,2007.0,WD,129850.0
101,2084.0,905476170.0,30.0,RL,58.0,9098.0,Pave,,3.0,Lvl,...,0.0,0.0,,,,0.0,7.0,2007.0,WD,86000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1991,325.0,923204150.0,90.0,RL,94.0,9400.0,Pave,,4.0,Low,...,0.0,0.0,,,,0.0,4.0,2010.0,WD,139000.0
2010,2880.0,911175410.0,30.0,C (all),69.0,12366.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,10.0,2006.0,WD,51689.0
2027,2628.0,535456010.0,90.0,RL,75.0,8512.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,9.0,2006.0,WD,119000.0
2039,2288.0,923228220.0,160.0,RM,21.0,1495.0,Pave,,4.0,Lvl,...,0.0,0.0,,,,0.0,5.0,2007.0,WD,93900.0


In [133]:
# Checking for any values in 'Garage Yr Blt' column which are negative.

train[train['Garage Yr Blt']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [134]:
# Checking all unique values of 'Garage Finish' column.

train['Garage Finish'].unique()

array(['RFn', 'Unf', 'Fin', 'NA', nan], dtype=object)

In [135]:
# Checking to see if any rows left in 'Garage Finish' column with missing values.

train['Garage Finish'].isna().sum()

1

In [136]:
# Encode the ordinal 'Garage Finish' values as floats.
# The literal string 'NA' is a category of its own and is encoded as 0.0; 'Unf' = 1.0, 'RFn' = 2.0, 'Fin' = 3.0.
# The one genuinely missing value (NaN) in this column is not a key of the mapping, so it stays NaN.

ordinal_garage_finish = {
    'NA': 0.0,
    'Unf': 1.0,
    'RFn': 2.0,
    'Fin': 3.0,
}

train['Garage Finish'] = train['Garage Finish'].map(ordinal_garage_finish)

In [137]:
# Checking type of 'Garage Finish' column again to verify successful conversion.

train['Garage Finish'].dtypes

dtype('float64')

In [138]:
# Checking all unique values of 'Garage Finish' column.

sorted(train['Garage Finish'].unique())

[0.0, 1.0, 2.0, 3.0, nan]

In [139]:
# Filtering all rows with missing values in 'Garage Cars' column.

train[train['Garage Cars'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
1712,2237.0,910201180.0,70.0,RM,50.0,9060.0,Pave,,4.0,Lvl,...,0.0,0.0,,MnPrv,,0.0,3.0,2007.0,WD,150909.0


In [140]:
# Checking for any values in 'Garage Cars' column which are negative.

train[train['Garage Cars']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [141]:
# Filtering all rows with missing values in 'Garage Area' column.

train[train['Garage Area'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
1712,2237.0,910201180.0,70.0,RM,50.0,9060.0,Pave,,4.0,Lvl,...,0.0,0.0,,MnPrv,,0.0,3.0,2007.0,WD,150909.0


In [142]:
# Checking for any values in 'Garage Area' column which are negative.

train[train['Garage Area']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [143]:
# Checking all unique values of 'Garage Qual' column.

train['Garage Qual'].unique()

array(['TA', 'Fa', 'NA', 'Gd', 'Ex', 'Po', nan], dtype=object)

In [144]:
# Checking all unique values of 'Garage Cond' column.

train['Garage Cond'].unique()

array(['TA', 'Fa', 'NA', 'Po', 'Gd', 'Ex', nan], dtype=object)

In [145]:
# Checking to see if any rows left in 'Garage Qual' column with missing values.

train['Garage Qual'].isna().sum()

1

In [146]:
# Checking to see if any rows left in 'Garage Cond' column with missing values.

train['Garage Cond'].isna().sum()

1

In [147]:
# Encode the ordinal 'Garage Qual' and 'Garage Cond' ratings as floats using one shared mapping.
# The literal string 'NA' is a category of its own and is encoded as 0.0; ratings run from 'Po' = 1.0 to 'Ex' = 5.0.
# The one genuinely missing value (NaN) in each column is not a key of the mapping, so it stays NaN.

ordinal_garage = {
    'NA': 0.0,
    'Po': 1.0,
    'Fa': 2.0,
    'TA': 3.0,
    'Gd': 4.0,
    'Ex': 5.0,
}

for garage_col in ['Garage Qual', 'Garage Cond']:
    train[garage_col] = train[garage_col].map(ordinal_garage)

In [148]:
# Checking type of 'Garage Qual' column again to verify successful conversion.

train['Garage Qual'].dtypes

dtype('float64')

In [149]:
# Checking type of 'Garage Cond' column again to verify successful conversion.

train['Garage Cond'].dtypes

dtype('float64')

In [150]:
# Checking all unique values of 'Garage Qual' column.

sorted(train['Garage Qual'].unique())

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, nan]

In [151]:
# Checking all unique values of 'Garage Cond' column.

sorted(train['Garage Cond'].unique())

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, nan]

In [152]:
# Checking all unique values of 'Paved Drive' column.

train['Paved Drive'].unique()

array(['Y', 'N', 'P'], dtype=object)

In [153]:
# Encode the ordinal 'Paved Drive' values as floats: 'N' = 1.0, 'P' = 2.0, 'Y' = 3.0.
# Series.map turns any value that is not a key of the mapping into NaN.

ordinal_paved_drive = {
    'N': 1.0,
    'P': 2.0,
    'Y': 3.0,
}

train['Paved Drive'] = train['Paved Drive'].map(ordinal_paved_drive)

In [154]:
# Checking type of 'Paved Drive' column again to verify successful conversion.

train['Paved Drive'].dtypes

dtype('float64')

In [155]:
# Checking all unique values of 'Paved Drive' column.

sorted(train['Paved Drive'].unique())

[1.0, 2.0, 3.0]

In [156]:
# Checking for any values in 'Wood Deck SF' column which are negative.

train[train['Wood Deck SF']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [157]:
# Checking for any values in 'Open Porch SF' column which are negative.

train[train['Open Porch SF']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [158]:
# Checking for any values in 'Enclosed Porch' column which are negative.

train[train['Enclosed Porch']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [159]:
# Checking for any values in '3Ssn Porch' column which are negative.

train[train['3Ssn Porch']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [160]:
# Checking for any values in 'Screen Porch' column which are negative.

train[train['Screen Porch']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [161]:
# Checking for any values in 'Pool Area' column which are negative.

train[train['Pool Area']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [162]:
# Checking all unique values of 'Pool QC' column.

train['Pool QC'].unique()

array(['NA', 'Fa', 'Gd', 'Ex', 'TA'], dtype=object)

In [163]:
# Encode the ordinal 'Pool QC' ratings as floats.
# The literal string 'NA' is a category of its own and is encoded as 0.0; ratings run from 'Fa' = 1.0 to 'Ex' = 4.0.

ordinal_pool_qc = {
    'NA': 0.0,
    'Fa': 1.0,
    'TA': 2.0,
    'Gd': 3.0,
    'Ex': 4.0,
}

train['Pool QC'] = train['Pool QC'].map(ordinal_pool_qc)

In [164]:
# Checking type of 'Pool QC' column again to verify successful conversion.

train['Pool QC'].dtypes

dtype('float64')

In [165]:
# Checking all unique values of 'Pool QC' column.

sorted(train['Pool QC'].unique())

[0.0, 1.0, 2.0, 3.0, 4.0]

In [166]:
# Checking all unique values of 'Fence' column.

train['Fence'].unique()

array(['NA', 'MnPrv', 'GdPrv', 'GdWo', 'MnWw'], dtype=object)

In [167]:
# Encode the ordinal 'Fence' values as floats.
# The literal string 'NA' is a category of its own and is encoded as 0.0;
# the other values are 'MnWw' = 1.0, 'GdWo' = 2.0, 'MnPrv' = 3.0, 'GdPrv' = 4.0.

ordinal_fence = {
    'NA': 0.0,
    'MnWw': 1.0,
    'GdWo': 2.0,
    'MnPrv': 3.0,
    'GdPrv': 4.0,
}

train['Fence'] = train['Fence'].map(ordinal_fence)

In [168]:
# Checking type of 'Fence' column again to verify successful conversion.

train['Fence'].dtypes

dtype('float64')

In [169]:
# Checking all unique values of 'Fence' column.

sorted(train['Fence'].unique())

[0.0, 1.0, 2.0, 3.0, 4.0]

In [170]:
# Checking all unique values of 'Misc Feature' column.

sorted(train['Misc Feature'].unique())

['Elev', 'Gar2', 'NA', 'Othr', 'Shed', 'TenC']

In [171]:
# Checking for any values in 'Misc Val' column which are negative.

train[train['Misc Val']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [172]:
# Checking for any values in 'Mo Sold' column which fall outside the valid month range 1-12 (i.e. <= 0 or >= 13).
# The two conditions must be combined with | (or): no value can be both <= 0 and >= 13 at the same time,
# so combining them with & would return an empty frame no matter what the data contains.

train[(train['Mo Sold']<=0) | (train['Mo Sold']>=13)]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [173]:
# Checking for any values in 'Yr Sold' column which are <= 0.

train[train['Yr Sold']<=0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [174]:
# Checking all unique values of 'Sale Type' column.

sorted(train['Sale Type'].unique())

['COD', 'CWD', 'Con', 'ConLD', 'ConLI', 'ConLw', 'New', 'Oth', 'WD ']

In [175]:
# Checking for any values in 'SalePrice' column which are negative.

train[train['SalePrice']<0]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


In [176]:
train.shape

(2051, 81)

In [177]:
# Count the np.nan values per column and keep only the columns that have at least one.
# This is the true number of missing values in the dataframe.

null_cols = train.isna().sum().loc[lambda na_counts: na_counts != 0]
null_cols

Lot Frontage      330
Mas Vnr Type       22
Mas Vnr Area       22
Bsmt Qual           1
Bsmt Cond           1
Bsmt Exposure       4
BsmtFin Type 1      1
BsmtFin SF 1        1
BsmtFin Type 2      2
BsmtFin SF 2        1
Bsmt Unf SF         1
Total Bsmt SF       1
Bsmt Full Bath      2
Bsmt Half Bath      2
Garage Yr Blt     114
Garage Finish       1
Garage Cars         1
Garage Area         1
Garage Qual         1
Garage Cond         1
dtype: int64

In [178]:
# We notice that majority of the above mentioned columns have <=4 rows with missing data.
# For each of those columns, print the index labels of the rows where the column is missing.
# The cut-off is named and set to 4 so that the filter agrees with the "<=4" stated in the printed header.

max_missing_rows = 4

print("Columns with <=4 rows of missing data : Index of rows with missing data")
for col, val in null_cols.items():
    if val <= max_missing_rows:
        print(f"{col} : {list(train[train[col].isna()].index)}")

Columns with <=4 rows of missing data : Index of rows with missing data
Bsmt Qual : [1327]
Bsmt Cond : [1327]
Bsmt Exposure : [1327, 1456, 1547, 1997]
BsmtFin Type 1 : [1327]
BsmtFin SF 1 : [1327]
BsmtFin Type 2 : [1147, 1327]
BsmtFin SF 2 : [1327]
Bsmt Unf SF : [1327]
Total Bsmt SF : [1327]
Bsmt Full Bath : [616, 1327]
Bsmt Half Bath : [616, 1327]
Garage Finish : [1712]
Garage Cars : [1712]
Garage Area : [1712]
Garage Qual : [1712]
Garage Cond : [1712]


In [179]:
# The listing above involves only 7 distinct rows, with index labels 616, 1147, 1327, 1456, 1547, 1712 & 1997.
# 7 rows is only about 0.3% of the total number of rows (2051), so these rows are dropped before proceeding.

drop_rows_index = [616, 1147, 1327, 1456, 1547, 1712, 1997]

train = train.drop(index=drop_rows_index)

In [180]:
train.shape

(2044, 81)

In [181]:
# Count the np.nan values per column again and keep only the columns that still have at least one.
# This is the true number of missing values in the dataframe.
#
# The remaining columns with larger numbers of rows of missing data will be dealt with later
# (depending on whether they're crucial in model training or not).
# Since the numbers of rows of missing data are larger (esp. 113 and 330), we will need to look at
# other methods of data imputation later, if necessary.

null_cols = train.isna().sum().loc[lambda na_counts: na_counts != 0]
null_cols

Lot Frontage     330
Mas Vnr Type      22
Mas Vnr Area      22
Garage Yr Blt    113
dtype: int64

In [182]:
# Exporting the cleaned train dataframe to a CSV file using a relative path.
# index=False leaves the dataframe index out of the file.
# NOTE(review): some object columns still hold the literal string 'NA' as a category (see the unique values
# shown above for 'Garage Type' and 'Misc Feature'). A reader that applies pandas' default NA parsing will
# turn those into NaN - TODO confirm how downstream notebooks read this file.

train.to_csv("../datasets/train_cleaned.csv", index=False)

## Data dictionary

A data dictionary was compiled in MS Excel while going through the above data cleaning process. It is imported below to make it easier to select the nominal columns that need to be one-hot encoded.

In [183]:
data_dict = pd.read_csv("../datasets/data_dictionary.csv")

In [184]:
data_dict.head(10)

Unnamed: 0,Column name,Variable type,Col dtype (initial),Col dtype (converted to),Col dtype (final),Needs one-hot encoding,Possible unique values,Unique values in train,Rows of missing data,Description
0,Id,discrete,float,,float,no,,,0,Observation number.
1,PID,nominal,float,,float,no,,,0,Parcel identification number.
2,MS SubClass,nominal,float,,float,yes,16.0,16.0,0,Identifies the type of dwelling involved in th...
3,MS Zoning,nominal,object,,object,yes,8.0,7.0,0,Identifies the general zoning classification o...
4,Lot Frontage,continuous,float,,float,no,,,330,Linear feet of street connected to property.
5,Lot Area,continuous,float,,float,no,,,0,Lot size in square feet.
6,Street,nominal,object,,object,yes,2.0,2.0,0,Type of road access to property.
7,Alley,nominal,object,,object,yes,3.0,3.0,0,Type of alley access to property.
8,Lot Shape,ordinal,object,float,float,no,4.0,4.0,0,"General shape of property. 'Reg' = 4, 'IR1' = ..."
9,Land Contour,nominal,object,,object,yes,4.0,4.0,0,Flatness of the property.


## One-hot Encoding of Nominal Columns

21 nominal columns were identified that need to be one-hot encoded before they can be used as features in regression models.

In [185]:
# Filtering columns from data_dict which require one-hot encoding.

data_dict[data_dict['Needs one-hot encoding']=='yes']

Unnamed: 0,Column name,Variable type,Col dtype (initial),Col dtype (converted to),Col dtype (final),Needs one-hot encoding,Possible unique values,Unique values in train,Rows of missing data,Description
2,MS SubClass,nominal,float,,float,yes,16.0,16.0,0,Identifies the type of dwelling involved in th...
3,MS Zoning,nominal,object,,object,yes,8.0,7.0,0,Identifies the general zoning classification o...
6,Street,nominal,object,,object,yes,2.0,2.0,0,Type of road access to property.
7,Alley,nominal,object,,object,yes,3.0,3.0,0,Type of alley access to property.
9,Land Contour,nominal,object,,object,yes,4.0,4.0,0,Flatness of the property.
11,Lot Config,nominal,object,,object,yes,5.0,5.0,0,Lot configuration.
13,Neighborhood,nominal,object,,object,yes,28.0,28.0,0,Physical locations within Ames city limits (ma...
14,Condition 1,nominal,object,,object,yes,9.0,9.0,0,Proximity to various conditions.
15,Condition 2,nominal,object,,object,yes,9.0,8.0,0,Proximity to various conditions (if more than ...
16,Bldg Type,nominal,object,,object,yes,5.0,5.0,0,Type of dwelling.


In [186]:
# Build the list of train columns to one-hot encode: every data_dict row flagged 'yes'
# in 'Needs one-hot encoding' contributes its 'Column name'.

train_enc_list = data_dict.loc[data_dict['Needs one-hot encoding'] == 'yes', 'Column name'].tolist()
print(len(train_enc_list))
train_enc_list

21


['MS SubClass',
 'MS Zoning',
 'Street',
 'Alley',
 'Land Contour',
 'Lot Config',
 'Neighborhood',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Foundation',
 'Heating',
 'Garage Type',
 'Misc Feature',
 'Sale Type']

In [187]:
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109.0,533352170.0,60.0,RL,,13517.0,Pave,,3.0,Lvl,...,0.0,0.0,0.0,0.0,,0.0,3.0,2010.0,WD,130500.0
1,544.0,531379050.0,60.0,RL,43.0,11492.0,Pave,,3.0,Lvl,...,0.0,0.0,0.0,0.0,,0.0,4.0,2009.0,WD,220000.0
2,153.0,535304180.0,20.0,RL,68.0,7922.0,Pave,,4.0,Lvl,...,0.0,0.0,0.0,0.0,,0.0,1.0,2010.0,WD,109000.0
3,318.0,916386060.0,60.0,RL,73.0,9802.0,Pave,,4.0,Lvl,...,0.0,0.0,0.0,0.0,,0.0,4.0,2010.0,WD,174000.0
4,255.0,906425045.0,50.0,RL,82.0,14235.0,Pave,,3.0,Lvl,...,0.0,0.0,0.0,0.0,,0.0,3.0,2010.0,WD,138500.0


In [None]:
train.shape

In [188]:
# One-hot encoding the nominal columns in train dataframe.

# 'Mas Vnr Type' still contains 22 rows with missing values (NaN), which is only about 1% of the 2044 remaining rows,
# so these null values are ignored while one-hot encoding (dummy_na=False: no separate NaN indicator column).
# Those rows get 0 in every 'Mas Vnr Type' dummy column; because drop_first=True also removes the first category's
# column, they end up encoded the same way as that dropped reference category.
# NOTE: 'Sale Type' contains the value 'WD ' with a trailing space (see its unique values above), and the dummy
# column name is built from that value as-is.
# dtype is set explicitly so the dummy columns are 0/1 uint8 integers on every pandas version
# (pandas 2.0 changed the default dummy dtype from uint8 to bool).

train = pd.get_dummies(train, columns=train_enc_list, drop_first=True, dummy_na=False, dtype=np.uint8)
train.head()

Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Year Built,...,Misc Feature_Shed,Misc Feature_TenC,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109.0,533352170.0,,13517.0,3.0,4.0,3.0,6.0,8.0,1976.0,...,0,0,0,0,0,0,0,0,0,1
1,544.0,531379050.0,43.0,11492.0,3.0,4.0,3.0,7.0,5.0,1996.0,...,0,0,0,0,0,0,0,0,0,1
2,153.0,535304180.0,68.0,7922.0,4.0,4.0,3.0,5.0,7.0,1953.0,...,0,0,0,0,0,0,0,0,0,1
3,318.0,916386060.0,73.0,9802.0,4.0,4.0,3.0,5.0,5.0,2006.0,...,0,0,0,0,0,0,0,0,0,1
4,255.0,906425045.0,82.0,14235.0,3.0,4.0,3.0,6.0,8.0,1900.0,...,0,0,0,0,0,0,0,0,0,1


In [189]:
train.shape

(2044, 213)

In [190]:
# Number of columns flagged in data_dict as needing one-hot encoding.

len(data_dict.loc[data_dict['Needs one-hot encoding'] == 'yes', 'Column name'])

21

In [191]:
# Sum of 'Unique values in train' over all columns flagged as needing one-hot encoding.

data_dict.loc[data_dict['Needs one-hot encoding'] == 'yes', 'Unique values in train'].sum()

174.0

In [192]:
# Verifying final number of columns in train.
#
#    81  total no. of columns before one-hot encoding
#  - 21  no. of columns to be one-hot encoded (replaced by their dummy columns)
#  + 174 total no. of unique values in all columns to be one-hot encoded
#  - 21  no. of reference columns dropped because of drop_first=True
#
# The result matches the column count reported by train.shape after encoding,
# so train has been one-hot encoded correctly.

(81 - 21) + (174 - 21)

213

In [193]:
# Count the np.nan values per column once more and keep only the columns that still have at least one.
# This is the number of remaining missing values in the dataframe.
# I will consider imputing this missing data if these columns are absolutely required in the regression model.

null_cols = train.isna().sum().loc[lambda na_counts: na_counts != 0]
null_cols

Lot Frontage     330
Mas Vnr Area      22
Garage Yr Blt    113
dtype: int64

In [194]:
# Looking at unique dtypes in train dataframe to ensure that no columns remain with type object.

train.dtypes.unique()

array([dtype('float64'), dtype('uint8')], dtype=object)

In [195]:
# Exporting the cleaned & encoded train dataframe to a CSV file using a relative path.

train.to_csv("../datasets/train_cleaned_encoded.csv", index=False)