# House Price Prediction

#### Step 1: Reading And Understanding Data


In [1]:
# Import Libraries 

import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

# Import Libraries for EDA

import seaborn as sns
import matplotlib
import plotly.graph_objs as go
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.lines import Line2D
%matplotlib inline

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import os

In [2]:
# Suppress warnings
# NOTE: the 'ignore' action is installed without a category filter, so every
# warning is silenced for the rest of the session, including deprecation and
# future warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Will create the separator for better data visualization among each variable
def Line_Separator():
    """Print a 50-asterisk divider followed by a blank line."""
    divider = '*' * 50
    print(f"{divider} \n")
    
def Line_Separator1():
    """Print a 100-asterisk divider followed by a blank line."""
    divider = '*' * 100
    print(f"{divider} \n")

In [4]:
class datastyle:
    """ANSI escape sequences used to wrap printed headings.

    ``start`` switches the terminal/notebook output to bold and ``end``
    resets all text attributes.
    """
    start, end = '\033[1m', '\033[0m'

In [6]:
# Load the housing data from 'train.csv' (a path relative to the working
# directory) into `df`, then preview the first two rows.
df = pd.read_csv('train.csv')
df.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,...,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,...,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,...,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500


In [7]:
# Report the dataset dimensions: df.shape is (row count, column count)
Number_of_raw, Number_of_column = df.shape

print(datastyle.start + 'Number of raws in housing dataset of the Australian market    :' + datastyle.end, Number_of_raw)
print(datastyle.start + 'Number of columns in housing dataset of the Australian market :' + datastyle.end, Number_of_column)
Line_Separator()

[1mNumber of raws in housing dataset of the Australian market    :[0m 1460
[1mNumber of columns in housing dataset of the Australian market :[0m 81
************************************************** 



In [8]:
# Check the number of categorical and numerical features
def data_features(data):
    """Print the categorical and numerical column names of a DataFrame.

    Columns are split with ``select_dtypes``: anything that is not a NumPy
    numeric dtype is reported as categorical, the rest as numerical.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame whose columns are inspected. (Previously this argument was
        ignored and the global ``df`` was always used.)

    Returns
    -------
    None
        The function only prints; it has no return value.
    """
    categorical_features = data.select_dtypes(exclude=[np.number]).columns
    numerical_features = data.select_dtypes(include=[np.number]).columns
    print(datastyle.start+"Categorical features :\n \n"+ datastyle.end,categorical_features); Line_Separator1()
    print(datastyle.start+"Numerical features:\n \n"+ datastyle.end,numerical_features)

# Call directly: the function returns None, so wrapping the call in print()
# only appended a stray "None" line to the output.
data_features(df)

[1mCategorical features :
 
[0m Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')
**************************************************************************************************** 

[1mNumerical features:
 
[0m Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2',

In [9]:
# Print each column's dtype and non-null count; columns whose non-null count
# is below the total number of entries contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [10]:
# Percentage of missing values per column, rounded to 2 decimals, largest first
print(f"{datastyle.start}Missing values in Percentage :\n{datastyle.end}")
Line_Separator()
percent_missing_housing_data_of_Australian_market = (100 * df.isnull().sum() / len(df)).round(2)
percent_missing_housing_data_of_Australian_market.sort_values(ascending=False)

[1mMissing values in Percentage :
[0m
************************************************** 



PoolQC         99.52
MiscFeature    96.30
Alley          93.77
Fence          80.75
FireplaceQu    47.26
               ...  
ExterQual       0.00
Exterior2nd     0.00
Exterior1st     0.00
RoofMatl        0.00
SalePrice       0.00
Length: 81, dtype: float64

We have detected missing values in 19 variables: 16 categorical and 3 numerical. We start with the categorical variables, analyzing each one individually and applying an appropriate treatment for its missing values; the numerical variables are handled afterwards.

Let's examine the presence of null values in each categorical variable by referring to the data definition provided, addressing them one column at a time.

In [12]:
# Detailed study of the categorical columns that contain NaN values:
# print describe() (count / unique / top / freq) for each one.

nan_categorical_columns = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
    'MasVnrType', 'Electrical',
]

for position, column in enumerate(nan_categorical_columns, start=1):
    # Heading layout: "<n>." padded to 4 chars, column name padded to 13, then ':'
    label = f"{position}.".ljust(4) + column.ljust(13) + ':'
    print(datastyle.start + label + datastyle.end)
    print(df[column].describe())
    # A separator follows every summary except the last one
    if position < len(nan_categorical_columns):
        Line_Separator()

[1m1.  PoolQC       :[0m
count      7
unique     3
top       Gd
freq       3
Name: PoolQC, dtype: object
************************************************** 

[1m2.  MiscFeature  :[0m
count       54
unique       4
top       Shed
freq        49
Name: MiscFeature, dtype: object
************************************************** 

[1m3.  Alley        :[0m
count       91
unique       2
top       Grvl
freq        50
Name: Alley, dtype: object
************************************************** 

[1m4.  Fence        :[0m
count       281
unique        4
top       MnPrv
freq        157
Name: Fence, dtype: object
************************************************** 

[1m5.  FireplaceQu  :[0m
count     770
unique      5
top        Gd
freq      380
Name: FireplaceQu, dtype: object
************************************************** 

[1m6.  GarageCond   :[0m
count     1379
unique       5
top         TA
freq      1326
Name: GarageCond, dtype: object
*****************************************

In [13]:
# List the distinct values (including NaN) of every categorical column that
# has missing values, to see how the missing entries are represented.

nan_categorical_columns = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
    'MasVnrType', 'Electrical',
]

for position, column in enumerate(nan_categorical_columns, start=1):
    # Heading layout: "<n>." padded to 4 chars, column name padded to 13, then ':'
    label = f"{position}.".ljust(4) + column.ljust(13) + ':'
    print(datastyle.start + label + datastyle.end, df[column].unique())

[1m1.  PoolQC       :[0m [nan 'Ex' 'Fa' 'Gd']
[1m2.  MiscFeature  :[0m [nan 'Shed' 'Gar2' 'Othr' 'TenC']
[1m3.  Alley        :[0m [nan 'Grvl' 'Pave']
[1m4.  Fence        :[0m [nan 'MnPrv' 'GdWo' 'GdPrv' 'MnWw']
[1m5.  FireplaceQu  :[0m [nan 'TA' 'Gd' 'Fa' 'Ex' 'Po']
[1m6.  GarageCond   :[0m ['TA' 'Fa' nan 'Gd' 'Po' 'Ex']
[1m7.  GarageType   :[0m ['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types']
[1m8.  GarageFinish :[0m ['RFn' 'Unf' 'Fin' nan]
[1m9.  GarageQual   :[0m ['TA' 'Fa' 'Gd' nan 'Ex' 'Po']
[1m10. BsmtExposure :[0m ['No' 'Gd' 'Mn' 'Av' nan]
[1m11. BsmtFinType2 :[0m ['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ']
[1m12. BsmtFinType1 :[0m ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
[1m13. BsmtCond     :[0m ['TA' 'Gd' nan 'Fa' 'Po']
[1m14. BsmtQual     :[0m ['Gd' 'TA' 'Ex' nan 'Fa']
[1m15. MasVnrType   :[0m ['BrkFace' 'None' 'Stone' 'BrkCmn' nan]
[1m16. Electrical   :[0m ['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]


In [14]:
# Replace NaN in the categorical columns with an explicit category label.
#
# Each entry is (message to print, replacement label, columns it applies to).
# Assigning the filled columns back to `df` avoids calling an in-place method
# on a column selection (`df[col].replace(..., inplace=True)`), which newer
# pandas versions warn about and which leaves `df` unchanged once
# Copy-on-Write is enabled.
#
# NOTE(review): the Electrical label is spelled 'Uknown' while the message
# says "unknown". The original spelling is kept so the resulting category
# value does not change; confirm whether later cells depend on it before
# correcting it.
nan_category_fills = [
    # 1. PoolQC --- Nan = No Pool
    ("Replaced Nan data from PoolQC to No Pool",
     'No Pool', ['PoolQC']),
    # 2. MiscFeature and 15. MasVnrType --- Nan = None
    ("Replaced Nan data from MiscFeature &  MasVnrType to none",
     'None', ['MiscFeature', 'MasVnrType']),
    # 3. Alley --- Nan = No Alley
    ("Replaced Nan data from Alley to No Alley",
     'No Alley', ['Alley']),
    # 4. Fence --- Nan = No Fence
    ("Replaced Nan data from Fence to No Fence",
     'No Fence', ['Fence']),
    # 5. FireplaceQu --- Nan = No Fireplace
    ("Replaced Nan data from FireplaceQu to No Fireplace",
     'No Fireplace', ['FireplaceQu']),
    # 6.-9. Garage columns --- Nan = No Garage
    ("Replaced Nan data from GarageCond, GarageType, GarageFinish,& GarageQual to No Garage",
     'No Garage', ['GarageCond', 'GarageType', 'GarageFinish', 'GarageQual']),
    # 10.-14. Basement columns --- Nan = No Basement
    ("Replaced Nan data from BsmtExposure, BsmtFinType2, BsmtFinType1, BsmtCond & BsmtQual to No Basement",
     'No Basement', ['BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual']),
    # 16. Electrical --- Nan = unknown
    ("Replaced Nan data from Electrical to unknown",
     'Uknown', ['Electrical']),
]

for step, (message, fill_value, columns) in enumerate(nan_category_fills, start=1):
    print(datastyle.start + f"{step}." + datastyle.end, message)
    df[columns] = df[columns].fillna(fill_value)

[1m1.[0m Replaced Nan data from PoolQC to No Pool
[1m2.[0m Replaced Nan data from MiscFeature &  MasVnrType to none
[1m3.[0m Replaced Nan data from Alley to No Alley
[1m4.[0m Replaced Nan data from Fence to No Fence
[1m5.[0m Replaced Nan data from FireplaceQu to No Fireplace
[1m6.[0m Replaced Nan data from GarageCond, GarageType, GarageFinish,& GarageQual to No Garage
[1m7.[0m Replaced Nan data from BsmtExposure, BsmtFinType2, BsmtFinType1, BsmtCond & BsmtQual to No Basement
[1m8.[0m Replaced Nan data from Electrical to unknown


In [15]:
# After replacing the NaN values, list the distinct values of the same
# categorical columns again to confirm that no NaN remains in them.

filled_categorical_columns = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
    'MasVnrType', 'Electrical',
]

for position, column in enumerate(filled_categorical_columns, start=1):
    # Heading layout: "<n>." padded to 4 chars, column name padded to 13, then ':'
    label = f"{position}.".ljust(4) + column.ljust(13) + ':'
    print(datastyle.start + label + datastyle.end, df[column].unique())

[1m1.  PoolQC       :[0m ['No Pool' 'Ex' 'Fa' 'Gd']
[1m2.  MiscFeature  :[0m ['None' 'Shed' 'Gar2' 'Othr' 'TenC']
[1m3.  Alley        :[0m ['No Alley' 'Grvl' 'Pave']
[1m4.  Fence        :[0m ['No Fence' 'MnPrv' 'GdWo' 'GdPrv' 'MnWw']
[1m5.  FireplaceQu  :[0m ['No Fireplace' 'TA' 'Gd' 'Fa' 'Ex' 'Po']
[1m6.  GarageCond   :[0m ['TA' 'Fa' 'No Garage' 'Gd' 'Po' 'Ex']
[1m7.  GarageType   :[0m ['Attchd' 'Detchd' 'BuiltIn' 'CarPort' 'No Garage' 'Basment' '2Types']
[1m8.  GarageFinish :[0m ['RFn' 'Unf' 'Fin' 'No Garage']
[1m9.  GarageQual   :[0m ['TA' 'Fa' 'Gd' 'No Garage' 'Ex' 'Po']
[1m10. BsmtExposure :[0m ['No' 'Gd' 'Mn' 'Av' 'No Basement']
[1m11. BsmtFinType2 :[0m ['Unf' 'BLQ' 'No Basement' 'ALQ' 'Rec' 'LwQ' 'GLQ']
[1m12. BsmtFinType1 :[0m ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' 'No Basement' 'LwQ']
[1m13. BsmtCond     :[0m ['TA' 'Gd' 'No Basement' 'Fa' 'Po']
[1m14. BsmtQual     :[0m ['Gd' 'TA' 'Ex' 'No Basement' 'Fa']
[1m15. MasVnrType   :[0m ['BrkFace' 'None' 'Stone' 

Once we have handled the missing values in the categorical variables, we will proceed to review the remaining variables that still contain NaN values. The list of these variables is provided below.

In [16]:
# Recompute the percentage of missing values per column (2 decimals), largest first
print(f"{datastyle.start}Missing values in Percentage :\n{datastyle.end}")
Line_Separator()
percent_missing_housing_data_of_Australian_market = (100 * df.isnull().sum() / len(df)).round(2)
percent_missing_housing_data_of_Australian_market.sort_values(ascending=False)

[1mMissing values in Percentage :
[0m
************************************************** 



LotFrontage     17.74
GarageYrBlt      5.55
MasVnrArea       0.55
Id               0.00
KitchenAbvGr     0.00
                ...  
ExterQual        0.00
MasVnrType       0.00
Exterior2nd      0.00
Exterior1st      0.00
SalePrice        0.00
Length: 81, dtype: float64

We have identified three numerical variables that contain missing values. We will examine each variable individually and take appropriate actions to address the missing values.

In [17]:
# Summary statistics for the numerical columns that still contain NaN
for label, column in (
    ('1.  LotFrontage       :', 'LotFrontage'),
    ('2.  GarageYrBlt       :', 'GarageYrBlt'),
    ('3.  MasVnrArea        :', 'MasVnrArea'),
):
    print(datastyle.start + label + datastyle.end)
    print(df[column].describe())
    Line_Separator()

[1m1.  LotFrontage       :[0m
count    1201.000000
mean       70.049958
std        24.284752
min        21.000000
25%        59.000000
50%        69.000000
75%        80.000000
max       313.000000
Name: LotFrontage, dtype: float64
************************************************** 

[1m2.  GarageYrBlt       :[0m
count    1379.000000
mean     1978.506164
std        24.689725
min      1900.000000
25%      1961.000000
50%      1980.000000
75%      2002.000000
max      2010.000000
Name: GarageYrBlt, dtype: float64
************************************************** 

[1m3.  MasVnrArea        :[0m
count    1452.000000
mean      103.685262
std       181.066207
min         0.000000
25%         0.000000
50%         0.000000
75%       166.000000
max      1600.000000
Name: MasVnrArea, dtype: float64
************************************************** 



In [18]:
# Distinct values of the numerical columns that still contain NaN
for label, column in (
    ('1.  LotFrontage       :', 'LotFrontage'),
    ('2.  GarageYrBlt       :', 'GarageYrBlt'),
    ('3.  MasVnrArea        :', 'MasVnrArea'),
):
    print(datastyle.start + label + datastyle.end)
    print(df[column].unique())
    Line_Separator1()

[1m1.  LotFrontage       :[0m
[ 65.  80.  68.  60.  84.  85.  75.  nan  51.  50.  70.  91.  72.  66.
 101.  57.  44. 110.  98.  47. 108. 112.  74. 115.  61.  48.  33.  52.
 100.  24.  89.  63.  76.  81.  95.  69.  21.  32.  78. 121. 122.  40.
 105.  73.  77.  64.  94.  34.  90.  55.  88.  82.  71. 120. 107.  92.
 134.  62.  86. 141.  97.  54.  41.  79. 174.  99.  67.  83.  43. 103.
  93.  30. 129. 140.  35.  37. 118.  87. 116. 150. 111.  49.  96.  59.
  36.  56. 102.  58.  38. 109. 130.  53. 137.  45. 106. 104.  42.  39.
 144. 114. 128. 149. 313. 168. 182. 138. 160. 152. 124. 153.  46.]
**************************************************************************************************** 

[1m2.  GarageYrBlt       :[0m
[2003. 1976. 2001. 1998. 2000. 1993. 2004. 1973. 1931. 1939. 1965. 2005.
 1962. 2006. 1960. 1991. 1970. 1967. 1958. 1930. 2002. 1968. 2007. 2008.
 1957. 1920. 1966. 1959. 1995. 1954. 1953.   nan 1983. 1977. 1997. 1985.
 1963. 1981. 1964. 1999. 1935. 1990. 1945. 1987. 1

In [19]:
# Fill the missing LotFrontage and MasVnrArea values with zero.
# The filled column is assigned back to `df`: calling fillna(..., inplace=True)
# on a column selection returns None (which was being printed) and does not
# update `df` once pandas Copy-on-Write is enabled.
# NOTE(review): zero is kept as the fill value to preserve the existing
# results; confirm that a missing LotFrontage really means "no frontage".
print(datastyle.start+"Missing Value Filled With Zero:\n"+ datastyle.end)
df["LotFrontage"] = df["LotFrontage"].fillna(0)
print(datastyle.start+"LotFrontage: Filled empty raws with zero:"+ datastyle.end)
df["MasVnrArea"] = df["MasVnrArea"].fillna(0)
print(datastyle.start+"MasVnrArea : Filled empty raws with zero:"+ datastyle.end); Line_Separator()

# Fill GarageYrBlt with its median, computed from the data instead of being
# hard-coded (describe() in the earlier step reported 1980 for this dataset).
garage_year_median = int(df['GarageYrBlt'].median())
print(datastyle.start+f"Replaced nan value from GarageYrBlt with median value of {garage_year_median}"+ datastyle.end); Line_Separator1()

def replace_name(x, y):
    """Replace value ``x`` with ``y`` in the GarageYrBlt column of the global ``df``."""
    df['GarageYrBlt'] = df['GarageYrBlt'].replace(x, y)

replace_name(np.nan, garage_year_median)

# Convert the value to int64 (possible now that the columns hold no NaN)
print(datastyle.start+"Converted values of LotFrontage, MasVnrArea, & GarageYrBlt to Integer"+ datastyle.end); Line_Separator1()
for filled_column in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    df[filled_column] = df[filled_column].astype(np.int64)


[1mMissing Value Filled With Zero:
[0m
[1mLotFrontage: Filled empty raws with zero:[0m None
[1mMasVnrArea : Filled empty raws with zero:[0m None
************************************************** 

[1mReplaced nan value from GarageYrBlt with median value of 1980[0m
**************************************************************************************************** 

[1mConverted values of LotFrontage, MasVnrArea, & GarageYrBlt to Integer[0m
**************************************************************************************************** 



In [20]:
# Final check: percentage of missing values per column (2 decimals), largest first
print(f"{datastyle.start}Missing values in Percentage :\n{datastyle.end}")
Line_Separator()
percent_missing_housing_data_of_Australian_market = (100 * df.isnull().sum() / len(df)).round(2)
percent_missing_housing_data_of_Australian_market.sort_values(ascending=False)

[1mMissing values in Percentage :
[0m
************************************************** 



Id             0.0
CentralAir     0.0
GarageYrBlt    0.0
GarageType     0.0
FireplaceQu    0.0
              ... 
MasVnrArea     0.0
MasVnrType     0.0
Exterior2nd    0.0
Exterior1st    0.0
SalePrice      0.0
Length: 81, dtype: float64

Upon addressing the missing values in both categorical and numerical variables, we have successfully resolved all instances of missing values. Consequently, the dataset now does not contain any missing values.

Let's review the categorical variables for the presence of any outliers and conduct an exploratory data analysis (EDA) specifically focusing on the sales price variable.