In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the Training Data

In [None]:
train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
train_df.head()

# Reading the Testing Data

In [None]:
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
test_df.head()

# Dropping the Id column, as it is not needed for modelling

In [None]:
train_df.drop('Id', axis=1, inplace=True)
test_df.drop('Id', axis=1, inplace=True)

# Combining both the Training and Testing dataframes

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the rows the same way. Columns that test_df lacks
# (presumably SalePrice — verify) are filled with NaN for the appended rows.
# The index is not renumbered here.
train_df = pd.concat([train_df, test_df])
train_df

In [None]:
train_df.reset_index(drop=True, inplace=True)

In [None]:
train_df.head()

In [None]:
train_df.info()

# Extracting the names of the numerical features

In [None]:
# Names of the columns whose dtype is float64 or int64, in frame order.
# Note: despite its name, columns_objects holds the numeric columns.
columns_objects = [
    column
    for column, dtype in train_df.dtypes.items()
    if dtype in ['float64', 'int64']
]

# Processing the Categorical Features

# Visualizing the MSZoning feature against SalePrice to assign a rank to each category

In [None]:
result = train_df.groupby(['MSZoning'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='MSZoning', y='SalePrice', data=train_df, order=result['MSZoning'])

In [None]:
train_df.info()

# Visualizing Neighborhood with SalePrice by Box Plot

In [None]:
sns.set(rc={'figure.figsize':(25.7,15.27)})
sns.set_style("whitegrid")
sns.boxplot(x='Neighborhood', y='SalePrice', data=train_df)

# Visualizing Neighborhood feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['Neighborhood'])['SalePrice'].median().reset_index().sort_values('SalePrice')
#norm = plt.Normalize(train_df['SalePrice'].values.min(), train_df['SalePrice'].values.max())
#colors = plt.cm.Reds(norm(train_df['SalePrice']))
sns.barplot(x='Neighborhood', y="SalePrice", data=train_df, order=result['Neighborhood'])

In [None]:
train_df.info()

# Visualizing Exterior1st feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['Exterior1st'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='Exterior1st', y='SalePrice', data=train_df, order=result['Exterior1st'])

# Visualizing Exterior2nd feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['Exterior2nd'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='Exterior2nd', y='SalePrice', data=train_df, order=result['Exterior2nd'])

# Visualizing RoofStyle feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['RoofStyle'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='RoofStyle', y='SalePrice', data=train_df, order=result['RoofStyle'])

# Visualizing RoofMatl feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['RoofMatl'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='RoofMatl', y='SalePrice', data=train_df, order=result['RoofMatl'])

# Visualizing MasVnrType feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['MasVnrType'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='MasVnrType', y='SalePrice', data=train_df, order=result['MasVnrType'])

# Visualizing LotConfig feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['LotConfig'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='LotConfig', y='SalePrice', data=train_df, order=result['LotConfig'])

# Visualizing Condition1 feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['Condition1'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='Condition1', y='SalePrice', data=train_df, order=result['Condition1'])

# Visualizing LandContour feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['LandContour'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='LandContour', y='SalePrice', data=train_df, order=result['LandContour'])

# Visualizing Condition2 feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['Condition2'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='Condition2', y='SalePrice', data=train_df, order=result['Condition2'])

# Visualizing Foundation feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['Foundation'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='Foundation', y='SalePrice', data=train_df, order=result['Foundation'])

# Visualizing SaleCondition feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['SaleCondition'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='SaleCondition', y='SalePrice', data=train_df, order=result['SaleCondition'])

# Visualizing SaleType feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['SaleType'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='SaleType', y='SalePrice', data=train_df, order=result['SaleType'])

# Visualizing MiscFeature feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['MiscFeature'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='MiscFeature', y='SalePrice', data=train_df, order=result['MiscFeature'])

# Visualizing Fence feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['Fence'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='Fence', y='SalePrice', data=train_df, order=result['Fence'])

# Visualizing PavedDrive feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['PavedDrive'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='PavedDrive', y='SalePrice', data=train_df, order=result['PavedDrive'])

# Visualizing GarageFinish feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['GarageFinish'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='GarageFinish', y='SalePrice', data=train_df, order=result['GarageFinish'])

# Visualizing GarageType feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['GarageType'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='GarageType', y='SalePrice', data=train_df, order=result['GarageType'])

# Visualizing Functional feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
result = train_df.groupby(['Functional'])['SalePrice'].median().reset_index().sort_values('SalePrice')
sns.barplot(x='Functional', y='SalePrice', data=train_df, order=result['Functional'])

# Visualizing Electrical feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
sns.barplot(x='Electrical', y='SalePrice', data=train_df)

# Visualizing Heating feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
sns.barplot(x='Heating', y='SalePrice', data=train_df)

# Visualizing RoofMatl feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
sns.barplot(x='RoofMatl', y='SalePrice', data=train_df, hue='RoofStyle')

# Visualizing RoofStyle feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
sns.barplot(x='RoofStyle', y='SalePrice', data=train_df)

# Visualizing LandSlope feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
sns.barplot(x='LandSlope', y='SalePrice', data=train_df)

# Visualizing LandContour feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
sns.barplot(x='LandContour', y='SalePrice', data=train_df)

# Visualizing Street feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
sns.barplot(x='Street', y='SalePrice', data=train_df)

# Visualizing Alley feature with SalePrice by Bar Plot to assign ranks to each category

In [None]:
sns.barplot(x='Alley', y='SalePrice', data=train_df)

# Assigning ranks to each distinct category of categorical feature

In [None]:
# Hand-written category -> rank lookups used to ordinal-encode the categorical
# columns. A category that is missing from its dictionary becomes NaN when the
# column is mapped, so the keys must match the spellings in the CSV exactly.

# Access / lot / location
alley_dict = {'Grvl':1, 'Pave':2}
landslope_dict = {'Gtl':1, 'Mod':2, 'Sev':3}
lotshape_dict = {'IR3':1, 'IR2':2, 'IR1':3, 'Reg':4}
utilities_dict = {'ELO':1, 'NoSeWa':2, 'NoSewr':3, 'AllPub':4}
neighbor_dict = {'MeadowV':1, 'IDOTRR':2, 'BrDale':3, 'BrkSide':4, 'Edwards':5, 'OldTown':6, 'Sawyer':7, 'Blueste':8, 'SWISU':9, 'NPkVill':10, 'NAmes':11, 'Mitchel':12, 'SawyerW':13, 'NWAmes':14, 'Gilbert':15, 'Blmngtn':16, 'CollgCr':17, 'Crawfor':18, 'ClearCr':19, 'Somerst':20, 'Veenker':21, 'Timber':22, 'StoneBr':23, 'NridgHt':24, 'NoRidge':25}

# NOTE(review): the CSV appears to spell some building types differently from
# data_description.txt ('2fmCon', 'Duplex', 'Twnhs'). Both spellings are listed
# so neither is silently turned into NaN — confirm with
# train_df['BldgType'].unique() before mapping.
bldg_dict = {'1Fam':1, '2FmCon':2, '2fmCon':2, 'Duplx':3, 'Duplex':3, 'TwnhsE':4, 'TwnhsI':5, 'Twnhs':5}
housestyle_dict = {'1Story':1, '1.5Fin':2, '1.5Unf':3, '2Story':4, '2.5Fin':5, '2.5Unf':6, 'SFoyer':7, 'SLvl':8}

# Shared quality scale, reused for every Po/Fa/TA/Gd/Ex column
qual_dict = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
exposure_dict = {'No':1, 'Mn':2, 'Av':3, 'Gd':4}
bsmtfintype_dict = {'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
heat_dict = {'Floor':1, 'Grav':2, 'Wall':3, 'OthW':4, 'GasW':5, 'GasA':6}
centralac_dict = {'Y':1, 'N':0}
electric_dict = {'Mix':1, 'FuseP':2, 'FuseF':3, 'FuseA':4, 'SBrkr':5}
functional_dict = {'Maj2':1, 'Sev':2, 'Min2':3, 'Min1':4, 'Maj1':5, 'Mod':6, 'Typ':7}
garagetype_dict = {'CarPort':1, 'Detchd':2, '2Types':3, 'Basment':4, 'Attchd':5, 'BuiltIn':6}
garagefin_dict = {'Unf':1, 'RFn':2, 'Fin':3}
pavedrive_dict = {'N':1, 'P':2, 'Y':3}
fence_dict = {'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4}
misc_dict = {'Othr':1, 'Shed':2, 'Gar2':3, 'TenC':4}
saletype_dict = {'Oth':1, 'ConLD':2, 'ConLw':3, 'COD':4, 'WD':5, 'ConLI':6, 'CWD':7, 'Con':8, 'New':9}
salecond_dict = {'AdjLand':1, 'Abnorml':2, 'Family':3, 'Alloca':4, 'Normal':5, 'Partial':6}

# NOTE(review): the CSV appears to use 'BrkTil' (capital T); both spellings are
# kept — confirm with train_df['Foundation'].unique().
foundation_dict = {'Slab':1, 'Brktil':2, 'BrkTil':2, 'CBlock':3, 'Stone':4, 'Wood':5, 'PConc':6}
exterior1st_dict = {'BrkComm':1, 'AsphShn':2, 'CBlock':3, 'AsbShng':4, 'MetalSd':5, 'Wd Sdng':6, 'WdShing':7, 'HdBoard':8, 'Stucco': 9, 'Plywood':10, 'BrkFace':11, 'VinylSd':12, 'CemntBd':13, 'Stone':14, 'ImStucc':15}
exterior2nd_dict = {'CBlock':1, 'AsbShng':2, 'Brk Cmn':3, 'AsphShn':4, 'Wd Sdng':5, 'MetalSd':6,'Stucco': 7, 'Stone':8, 'Wd Shng':9, 'HdBoard':10, 'Plywood':11, 'BrkFace':12, 'VinylSd':13, 'CmentBd':14, 'ImStucc':15, 'Other':16}
roofstyle_dict = {'Gambrel':1, 'Gable':2, 'Mansard':3, 'Flat':4, 'Hip':5, 'Shed':6}
roofmatl_dict = {'Roll':1, 'ClyTile':2, 'CompShg':3, 'Metal':4, 'Tar&Grv':5, 'Membran':6, 'WdShake':7, 'WdShngl':8}
masvnrtype_dict = {'BrkCmn':1, 'None':2, 'BrkFace':3, 'Stone':4}
landcontour_dict = {'Bnk':1, 'Lvl':2, 'Low':3, 'HLS':4}
lotconfig_dict = {'Inside':1, 'FR2':2, 'Corner':3, 'FR3':4, 'CulDSac':5}
con2_dict = {'RRNn':1, 'Artery':2, 'Feedr':3, 'RRAn':4, 'Norm':5, 'RRAe':6, 'PosN':7, 'PosA':8}
con1_dict = {'Artery':1, 'RRAe':2, 'Feedr':3, 'Norm':4, 'RRAn':5, 'RRNe':6, 'RRNn':7, 'PosN':8, 'PosA':9}
mszoning_dict = {'C (all)':1, 'RM':2, 'RH':3, 'RL':4, 'FV':5}


# Applying Target Ordinal Encoding to all the categorical features

In [None]:
# Column -> rank dictionary used to encode it. Each column appears exactly once:
# mapping a column a second time would look up the already-numeric ranks in a
# string-keyed dictionary and turn every value into NaN (this previously
# happened to BsmtExposure, which was mapped twice). For the same reason this
# cell must not be re-run on an already-encoded frame.
ordinal_maps = {
    'Alley': alley_dict,
    'Street': alley_dict,  # reuses alley_dict; presumably the same Grvl/Pave categories
    'LandSlope': landslope_dict,
    'MSZoning': mszoning_dict,
    'MasVnrType': masvnrtype_dict,
    'LotShape': lotshape_dict,
    'Utilities': utilities_dict,
    'Neighborhood': neighbor_dict,
    'BldgType': bldg_dict,
    'HouseStyle': housestyle_dict,
    'ExterQual': qual_dict,
    'ExterCond': qual_dict,
    'BsmtExposure': exposure_dict,
    'BsmtFinType1': bsmtfintype_dict,
    'BsmtFinType2': bsmtfintype_dict,
    'BsmtCond': qual_dict,
    'BsmtQual': qual_dict,
    'Heating': heat_dict,
    'CentralAir': centralac_dict,
    'Electrical': electric_dict,
    'Functional': functional_dict,
    'GarageType': garagetype_dict,
    'GarageFinish': garagefin_dict,
    'GarageQual': qual_dict,
    'GarageCond': qual_dict,
    'PavedDrive': pavedrive_dict,
    'Fence': fence_dict,
    'SaleType': saletype_dict,
    'SaleCondition': salecond_dict,
    'MiscFeature': misc_dict,
    'Foundation': foundation_dict,
    'Exterior1st': exterior1st_dict,
    'Exterior2nd': exterior2nd_dict,
    'RoofStyle': roofstyle_dict,
    'RoofMatl': roofmatl_dict,
    'LandContour': landcontour_dict,
    'LotConfig': lotconfig_dict,
    'Condition1': con1_dict,
    'Condition2': con2_dict,
    'HeatingQC': qual_dict,
    'KitchenQual': qual_dict,
    'FireplaceQu': qual_dict,
}

# Categories absent from a dictionary (and existing NaNs) come out as NaN.
for column, mapping in ordinal_maps.items():
    train_df[column] = train_df[column].map(mapping)

# Handling NA values

In [None]:
train_df['LotFrontage'].isna().sum()

In [None]:
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(train_df['LotFrontage'].mode()[0])

In [None]:
train_df['BldgType'] = train_df['BldgType'].fillna(train_df['BldgType'].mode()[0])

In [None]:
train_df['MasVnrType'].unique()

In [None]:
train_df['MasVnrType'] = train_df['MasVnrType'].fillna(0)

In [None]:
train_df['MasVnrArea'] = train_df['MasVnrArea'].fillna(0)

In [None]:
train_df['Exterior1st'] = train_df['Exterior1st'].fillna(train_df['Exterior1st'].mode()[0])

In [None]:
train_df['Exterior2nd'] = train_df['Exterior2nd'].fillna(train_df['Exterior2nd'].mode()[0])


In [None]:
train_df['Foundation'].isna().sum()

In [None]:
train_df['Foundation'].unique()

In [None]:
train_df['Foundation'] = train_df['Foundation'].fillna(train_df['Foundation'].mode()[0])

In [None]:
train_df['Electrical'] = train_df['Electrical'].fillna(train_df['Electrical'].mode()[0])

In [None]:
train_df['BsmtUnfSF'].value_counts()

In [None]:
train_df['BsmtUnfSF'] = train_df['BsmtUnfSF'].fillna(train_df['BsmtUnfSF'].mode()[0])

In [None]:
train_df['BsmtFinSF2'].value_counts()

In [None]:
train_df['BsmtFinSF2'] = train_df['BsmtFinSF2'].fillna(train_df['BsmtFinSF2'].mode()[0])


In [None]:
train_df['BsmtFinType2'].value_counts()

In [None]:
train_df['BsmtFinType2'].unique()

In [None]:
train_df['BsmtFinType2'].isna().sum()

In [None]:
train_df['BsmtFinType2'] = train_df['BsmtFinType2'].fillna(0)

In [None]:
train_df['BsmtFinType2'].value_counts()

In [None]:
train_df['BsmtFinSF1'] = train_df['BsmtFinSF1'].fillna(train_df['BsmtFinSF1'].mode()[0])

In [None]:
train_df['BsmtFinType1'].unique()

In [None]:
train_df['FireplaceQu'].unique()

In [None]:
train_df['FireplaceQu'] = train_df['FireplaceQu'].fillna(0)

In [None]:
train_df['Fence'] = train_df['Fence'].fillna(0)

In [None]:
train_df['SaleType'] = train_df['SaleType'].fillna(train_df['SaleType'].mode()[0])

In [None]:
train_df['MiscFeature'] = train_df['MiscFeature'].fillna(0)

In [None]:
train_df.drop('PoolArea', axis=1, inplace=True)
train_df.drop('PoolQC', axis=1, inplace=True)

In [None]:
train_df['GarageArea'].unique()

In [None]:
train_df['GarageCars'].unique()

In [None]:
train_df['GarageType'].value_counts()

In [None]:
train_df['GarageQual'] = train_df['GarageQual'].fillna(0)
train_df['GarageCond'] = train_df['GarageCond'].fillna(0)
train_df['GarageFinish'] = train_df['GarageFinish'].fillna(0)
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(0)
train_df['GarageType'] = train_df['GarageType'].fillna(0)

In [None]:
train_df['GarageArea'] = train_df['GarageArea'].fillna(train_df['GarageArea'].mode()[0])
train_df['GarageCars'] = train_df['GarageCars'].fillna(train_df['GarageCars'].mode()[0])

In [None]:
train_df['Functional'].unique()

In [None]:
train_df['Functional'].value_counts()

In [None]:
train_df['Functional'] = train_df['Functional'].fillna(train_df['Functional'].mode()[0])

In [None]:
train_df['KitchenQual'].unique()

In [None]:
train_df['KitchenQual'].value_counts()

In [None]:
train_df['KitchenQual'] = train_df['KitchenQual'].fillna(train_df['KitchenQual'].mode()[0])


In [None]:
train_df['BsmtQual'] = train_df['BsmtQual'].fillna(0)
train_df['BsmtCond'] = train_df['BsmtCond'].fillna(0)

In [None]:
train_df['BsmtFinType1'].value_counts()

In [None]:
train_df['BsmtFinType1'] = train_df['BsmtFinType1'].fillna(0)

In [None]:
train_df['BsmtExposure'] = train_df['BsmtExposure'].fillna(0)

In [None]:
train_df['BsmtFullBath'].value_counts()

In [None]:
train_df['BsmtFullBath'].unique()

In [None]:
train_df['BsmtFullBath'] = train_df['BsmtFullBath'].fillna(train_df['BsmtFullBath'].mode()[0])

In [None]:
train_df['BsmtHalfBath'].unique()

In [None]:
train_df['BsmtHalfBath'] = train_df['BsmtHalfBath'].fillna(train_df['BsmtHalfBath'].mode()[0])


In [None]:
train_df['MSZoning'] = train_df['MSZoning'].fillna(train_df['MSZoning'].mode()[0])

In [None]:
train_df['TotalBsmtSF'].unique()

In [None]:
train_df['TotalBsmtSF'].value_counts()

In [None]:
train_df['TotalBsmtSF'].isna().sum()

In [None]:
train_df['TotalBsmtSF'] = train_df['TotalBsmtSF'].fillna(train_df['TotalBsmtSF'].mode()[0])

In [None]:
train_df['Alley'] = train_df['Alley'].fillna(0)

In [None]:
train_df['Utilities'].isna().sum()

In [None]:
train_df['Utilities'].unique()

In [None]:
train_df['Utilities'] = train_df['Utilities'].fillna(train_df['Utilities'].mode()[0])

In [None]:
train_df['Alley'].unique()

In [None]:
train_df.info()

# Removing from columns_objects the columns that are no longer present in the dataframe

In [None]:
# Keep only the names that still exist as columns of train_df. The list is
# rebuilt rather than calling .remove() inside a loop over the same list, which
# skips the element after each removal. Slice assignment updates the existing
# list object in place.
existing_columns = set(train_df.columns)
columns_objects[:] = [name for name in columns_objects if name in existing_columns]

# Checking the Data Distribution of all Numerical features of the dataset

In [None]:
train_df.hist(columns_objects)

# From the histograms above, only GarageCars, GarageArea, MoSold and TotRmsAbvGrd appear to follow a roughly Gaussian distribution

# Removing the above mentioned features from the rest of the Numerical features

In [None]:
# Columns set aside for standardization; take each one out of columns_objects
# if it is listed there.
normalized_cols = ['GarageCars', 'GarageArea', 'MoSold', 'TotRmsAbvGrd']
for gaussian_col in normalized_cols:
    try:
        columns_objects.remove(gaussian_col)
    except ValueError:
        # Not present in columns_objects: nothing to remove.
        pass

# Removing the SalePrice target feature

In [None]:
columns_objects.remove('SalePrice')

# Feature Scaling

# Applying Normalization i.e. Min Max Scaling on Numerical features which do not follow Gaussian Distribution

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max scale the columns listed in columns_objects.
# The fitted scaler is kept under its own name instead of the generic `model`,
# which this notebook later uses for the CatBoost estimator.
# NOTE(review): train_df holds the training and test rows together at this point,
# so the scaler's min/max are learned from both — confirm this is intended.
min_max_scaler = MinMaxScaler()
min_max_scaled_data = min_max_scaler.fit_transform(train_df[columns_objects])


In [None]:
min_max_scaled_data.shape

# Applying Standardization i.e. Z-Score Normalization on Numerical features which follow Gaussian Distribution

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the columns listed in normalized_cols.
# The fitted scaler is kept under its own name instead of the generic `model`,
# which this notebook later uses for the CatBoost estimator.
# NOTE(review): train_df holds the training and test rows together at this point,
# so the mean/std are learned from both — confirm this is intended.
standard_scaler = StandardScaler()
standard_scaled_data = standard_scaler.fit_transform(train_df[normalized_cols])

In [None]:
standard_scaled_data.shape

# Assigning labels to scaled data and converting to Data Frame

In [None]:
min_max_scaled_df = pd.DataFrame(min_max_scaled_data, columns=columns_objects)
standard_scaled_df = pd.DataFrame(standard_scaled_data, columns=normalized_cols)

In [None]:
min_max_scaled_df

In [None]:
standard_scaled_df

# Updating the dataframe with standardized values

In [None]:
train_df[normalized_cols] = standard_scaled_df[normalized_cols]

In [None]:
train_df[normalized_cols]

# Updating the dataframe with normalized values

In [None]:
train_df[columns_objects] = min_max_scaled_df[columns_objects]

In [None]:
train_df

# Separating the train dataframe

In [None]:
# Training rows are the ones that carry a target value. Selecting on SalePrice
# avoids hard-coding the row count of train.csv.
# NOTE(review): assumes every training row has a non-null SalePrice and every
# appended test row has NaN there — confirm against the raw files.
train = train_df[train_df['SalePrice'].notna()]
train

In [None]:
train

# Separating the test dataframe

In [None]:
# Test rows are the ones without a target value; the empty SalePrice column is
# dropped. Selecting on SalePrice avoids hard-coding the row offsets of the two
# CSV files.
# NOTE(review): assumes every appended test row has NaN SalePrice and no
# training row does — confirm against the raw files.
test = train_df[train_df['SalePrice'].isna()].drop('SalePrice', axis=1)
test

# Feature Selection

# Splitting the labelled data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

X = train.drop('SalePrice', axis=1)
y = train['SalePrice']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
X_train.shape, X_validation.shape

# Visualizing the correlated features using Heatmap

In [None]:
cor = X_train.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()

# Separating the correlated features

In [None]:
def find_correlation(df, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``df`` is computed with ``df.corr()``.
    A column is reported when the absolute correlation between it and at least
    one column positioned before it in the matrix is strictly greater than
    ``threshold``. The earlier column of such a pair is not reported on account
    of that pair.

    Parameters
    ----------
    df : DataFrame whose columns are compared.
    threshold : absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    correlation_matrix = df.corr()
    flagged = set()
    for position, column_name in enumerate(correlation_matrix.columns):
        # Entries left of the diagonal: this column paired with each earlier one.
        earlier_pairs = correlation_matrix.iloc[position, :position]
        if (earlier_pairs.abs() > threshold).any():
            flagged.add(column_name)
    return flagged

In [None]:
correlated_features = find_correlation(X_train, 0.85)
correlated_features

In [None]:
# Remove the highly correlated columns from both splits. The frames are
# reassigned instead of dropped in place, and errors='ignore' makes re-running
# this cell a no-op rather than a KeyError on the already-removed columns.
features_to_drop = list(correlated_features)
X_train = X_train.drop(columns=features_to_drop, errors='ignore')
X_validation = X_validation.drop(columns=features_to_drop, errors='ignore')

In [None]:
X_train.shape, X_validation.shape

# Model Training and Testing

# XGBoost Regressor

In [None]:
import xgboost

predictor_model = xgboost.XGBRegressor()
predictor_model.fit(X_train,y_train)

In [None]:
y_predict = predictor_model.predict(X_train)
y_predict_validation = predictor_model.predict(X_validation)

# Checking the Mean Absolute Percentage Error

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

mean_absolute_percentage_error(y_validation, y_predict_validation)

# CatBoost Regressor

In [None]:
import catboost as cb

In [None]:
model = cb.CatBoostRegressor(loss_function='RMSE')

In [None]:
model.fit(X_train, y_train)

In [None]:
y_predict_cb = model.predict(X_validation)

# Testing with Test data

In [None]:
# Predict on exactly the columns the model was fitted on, in the same order.
# `test` is sliced from train_df, so it can still contain columns that were
# removed from X_train (e.g. the correlated features).
y_pred_test = model.predict(test[X_train.columns])

# Checking the Mean Absolute Percentage Error

In [None]:
mean_absolute_percentage_error(y_validation, y_predict_cb)

In [None]:
y_pred_test

# Preparing the sample_submission.csv for Kaggle Submission

In [None]:
test_ids = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
test_ids = test_ids['Id']

In [None]:
test_ids

In [None]:
pred = pd.DataFrame(y_pred_test, columns=['SalePrice'])
pred

In [None]:
pred = pd.concat([test_ids, pred], axis=1)

In [None]:
pred

In [None]:
# index=False keeps the pandas row index out of the file, so it contains only
# the columns of `pred` (Id and SalePrice).
pred.to_csv('sample_submission.csv', index=False)