In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Load the training dataset
train_file = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/train.csv"
)
# Load the testing dataset
test_file = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/test.csv"
)

In [None]:
# Show every column when a frame is displayed (wide frames are not truncated)
pd.set_option('display.max_columns', None)
# Print the (rows, columns) shape, then display the first three rows
print(train_file.shape)
train_file.head(3)

## Data Information
- Null value finding
- Table Information
- 5pt Summary

In [None]:
# Count the missing values per column and keep only the columns that have any
train_file.isnull().sum().loc[lambda null_counts: null_counts > 0]

In [None]:
# Table information: per-column non-null counts and dtypes
train_file.info()

In [None]:
# Summary statistics (the "5pt Summary" announced in the section header)
train_file.describe()

## Feature Engineering - Data Cleaning and Creating an ADS for Analysis

In [None]:
# Remove the Id column together with the sparsely populated columns.
# NOTE(review): the original note says these columns have more than 50% missing
# values - confirm against the null counts shown earlier.
dropped_columns = ['Id', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
train_file = train_file.drop(columns=dropped_columns)

In [None]:

# In this dataset a missing value in these columns means the feature does not exist:
# basement columns -> "No Basement", Alley -> "No Alley Access", garage columns -> "No Garage".
# NOTE(review): GarageYrBlt looks like a year column; mixing a text label into it changes
# its dtype - confirm this is intended (the column is not selected for modelling later).
absent_feature_labels = {
    'BsmtQual': "No Basement",
    'BsmtCond': "No Basement",
    'BsmtExposure': "No Basement",
    'BsmtFinType1': "No Basement",
    'BsmtFinType2': "No Basement",
    #--------------------
    'Alley': "No Alley Access",
    #--------------------
    'GarageType': "No Garage",
    'GarageYrBlt': "No Garage",
    'GarageFinish': "No Garage",
    'GarageQual': "No Garage",
    'GarageCond': "No Garage",
}
for column_name, absent_label in absent_feature_labels.items():
    is_missing = train_file[column_name].isnull()
    train_file[column_name] = np.where(is_missing, absent_label, train_file[column_name])

In [None]:
# Fill MasVnrType and Electrical with the most frequent value among houses built in the
# same year, and MasVnrArea with the mean of houses built in the same year.
def _fill_with_group_mode(frame, column, by='YearBuilt'):
    """Return `frame[column]` with NaNs replaced by the mode of its `by` group.

    A group whose values are all missing has no mode; it falls back to the
    column-wide mode, and is returned unchanged if the whole column is empty.
    """
    overall_mode = frame[column].mode()

    def _fill(group):
        group_mode = group.mode()
        if not group_mode.empty:
            return group.fillna(group_mode.iloc[0])
        if not overall_mode.empty:
            return group.fillna(overall_mode.iloc[0])
        return group

    # transform() returns a result indexed like the input, so it aligns on assignment;
    # apply() may prepend the group key to the index depending on the pandas version.
    return frame.groupby(by, sort=False)[column].transform(_fill)


train_file['MasVnrType'] = _fill_with_group_mode(train_file, 'MasVnrType')
train_file['Electrical'] = _fill_with_group_mode(train_file, 'Electrical')

overall_mean_area = train_file['MasVnrArea'].mean()
year_mean_area = train_file.groupby(['YearBuilt'])['MasVnrArea'].transform('mean')
train_file['MasVnrArea'] = train_file['MasVnrArea'].fillna(year_mean_area)
# A year in which every MasVnrArea is missing has no mean; use the overall mean there.
train_file['MasVnrArea'] = train_file['MasVnrArea'].fillna(overall_mean_area)

#### Treating the Null Values for LotFrontage 
- Our dataset has 259 null values for LotFrontage. Replacing them with the mean, the median or random values might hurt accuracy and model performance. To make the model more effective, I predict the missing values from the features that are relevant to LotFrontage.
- Here LotFrontage is the dependent variable and the selected features are the independent variables. All rows where LotFrontage is null are treated as the test dataset, so that the missing values can be predicted.

In [None]:
# Rows with a known LotFrontage train the imputation model;
# rows where it is missing are the ones to be predicted.
lotfrontage_missing = train_file['LotFrontage'].isnull()
train_LotFrontage_main = train_file[~lotfrontage_missing]
test_LotFrontage_main = train_file[lotfrontage_missing]
#--------------------
# Predictor columns used for the LotFrontage model
lotfrontage_predictors = ['LotArea', 'Street', 'LotShape', 'LandSlope', 'LotConfig', 'HouseStyle', 'GarageArea']
train_LotFrontage_xtrain = train_LotFrontage_main[lotfrontage_predictors]
train_LotFrontage_ytrain = train_LotFrontage_main['LotFrontage']
#--------------------
test_LotFrontage = test_LotFrontage_main.drop(columns='LotFrontage')[lotfrontage_predictors]

Before training the model, we make sure that all the features are int or float. But we have categorical features in our dataset. For "LotConfig, Street" I used one-hot encoding because those two columns are nominal categories; for "LotShape, LandSlope, HouseStyle" I used the ordinal encoding technique.

In [None]:
# LotConfig and Street are nominal categories, so they are one-hot encoded.
# `axis` is passed by keyword: the positional form `pd.concat(objs, 1)` is rejected by pandas >= 2.0.
train_LotFrontage_xtrain_Nomina_Encoding = pd.get_dummies(train_LotFrontage_xtrain[['LotConfig', 'Street']])
train_LotFrontage_xtrain = pd.concat(
    [train_LotFrontage_xtrain, train_LotFrontage_xtrain_Nomina_Encoding], axis=1
)
train_LotFrontage_xtrain = train_LotFrontage_xtrain.drop(['LotConfig', 'Street'], axis=1)
#--------------------
# LotShape, LandSlope and HouseStyle are treated as ordinal categories and mapped to integers.
train_LotFrontage_xtrain['LotShape'] = train_LotFrontage_xtrain.LotShape.map({'IR3':0, 'IR2':1, 'IR1': 2, 'Reg': 3})
train_LotFrontage_xtrain['LandSlope'] = train_LotFrontage_xtrain.LandSlope.map({'Sev':0, 'Mod':1, 'Gtl': 2})
train_LotFrontage_xtrain['HouseStyle'] = train_LotFrontage_xtrain.HouseStyle.map({'1Story':0, '1.5Fin':1, '1.5Unf': 2, '2Story':3, '2.5Fin':4, '2.5Unf': 5, 'SFoyer':6, 'SLvl':7})
# Drop the LotConfig_FR3 dummy column; the other LotConfig_* columns are kept.
train_LotFrontage_xtrain = train_LotFrontage_xtrain.drop('LotConfig_FR3', axis = 1)

In [None]:
# LotConfig and Street are nominal categories, so they are one-hot encoded.
# `axis` is passed by keyword: the positional form `pd.concat(objs, 1)` is rejected by pandas >= 2.0.
test_LotFrontage_Nomina_Encoding = pd.get_dummies(test_LotFrontage[['LotConfig', 'Street']])
test_LotFrontage = pd.concat([test_LotFrontage, test_LotFrontage_Nomina_Encoding], axis=1)
test_LotFrontage = test_LotFrontage.drop(['LotConfig', 'Street'], axis = 1)
#--------------------
# LotShape, LandSlope and HouseStyle are treated as ordinal categories and mapped to integers.
test_LotFrontage['LotShape'] = test_LotFrontage.LotShape.map({'IR3':0, 'IR2':1, 'IR1': 2, 'Reg': 3})
test_LotFrontage['LandSlope'] = test_LotFrontage.LandSlope.map({'Sev':0, 'Mod':1, 'Gtl': 2})
test_LotFrontage['HouseStyle'] = test_LotFrontage.HouseStyle.map({'1Story':0, '1.5Fin':1, '1.5Unf': 2, '2Story':3, '2.5Fin':4, '2.5Unf': 5, 'SFoyer':6, 'SLvl':7})
#--------------------
# The dummy columns depend on which categories occur in each subset. Align the rows to be
# predicted with the columns (and column order) of the training frame: dummy levels that do
# not occur here are added as 0 and columns the training frame does not have are removed.
test_LotFrontage = test_LotFrontage.reindex(columns=train_LotFrontage_xtrain.columns, fill_value=0)

In [None]:
# Replace the missing LotFrontage values with the predictions of a Random Forest that is
# fitted on the rows where LotFrontage is known.
# random_state pins the forest so the imputed values are the same on every run.
reg_rf = RandomForestRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=1,
                               max_features='sqrt', max_depth=25, random_state=42)
reg_rf.fit(train_LotFrontage_xtrain, train_LotFrontage_ytrain)
y_pred = reg_rf.predict(test_LotFrontage)
# test_LotFrontage_main is a row selection of train_file; assign() builds a new frame instead
# of writing into that selection, which avoids pandas' SettingWithCopyWarning.
test_LotFrontage_main = test_LotFrontage_main.assign(LotFrontage=y_pred)
# score() of a regressor is the R^2 coefficient, measured here on the rows used for fitting.
print("Accuracy on Traing set: ",reg_rf.score(train_LotFrontage_xtrain,train_LotFrontage_ytrain))

In [None]:

# Put the imputed rows and the originally complete rows back together in the original row order.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
train_file = pd.concat([test_LotFrontage_main, train_LotFrontage_main]).sort_index()
# Columns that still contain missing values
print(train_file[[i for i in train_file.columns if train_file[i].isnull().sum()>0]].isnull().sum())
train_file.head(3)

## Feature Selection-Information gain - mutual information In Regression Problem Statements

- Feature Selection-Information gain - mutual information In Regression Problem Statements Mutual Information

- Estimate mutual information for a continuous target variable.

- Mutual information (MI) between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.

- The function relies on nonparametric methods based on entropy estimation from k-nearest neighbors distances

- Mutual information is calculated between two variables and measures the reduction in uncertainty for one variable given a known value of the other variable.


In [None]:
# Columns kept for the analytical dataset (numeric and ordinal-encodable features plus the target)
model_columns = [
    'LotFrontage', 'LotArea', 'Alley', 'LotShape', 'Utilities', 'LandSlope', 'HouseStyle',
    'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', 'CentralAir', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'YrSold',
    'SalePrice',
]
train_file = train_file[model_columns]

In [None]:
# Binary flags: any alley access -> 1, central air 'Y' -> 1
train_file['Alley'] = np.where(train_file['Alley'] == 'No Alley Access', 0, 1)
train_file['CentralAir'] = np.where(train_file['CentralAir'] == 'Y', 1, 0)

# Rating scales shared by several columns
quality_scale = {'Po':0, 'Fa':1, 'TA': 2, 'Gd': 3,'Ex': 4}
basement_quality_scale = {'Po':1, 'Fa':2, 'TA': 3, 'Gd': 4,'Ex': 5, 'No Basement': 0}
basement_finish_scale = {'Unf':1, 'LwQ':2, 'Rec': 3, 'BLQ': 4,'ALQ': 5,'GLQ': 6, 'No Basement': 0}
garage_quality_scale = {'Po':1, 'Fa':2, 'TA': 3, 'Gd': 4,'Ex': 5, 'No Garage': 0}

# Column -> category-to-integer mapping for every ordinal feature
ordinal_encodings = {
    'LotShape': {'IR3':0, 'IR2':1, 'IR1': 2, 'Reg': 3},
    'Utilities': {'ELO':0, 'NoSeWa':1, 'NoSewr': 2, 'AllPub': 3},
    'LandSlope': {'Sev':0, 'Mod':1, 'Gtl': 2},
    'HouseStyle': {'1Story':0, '1.5Fin':1, '1.5Unf': 2, '2Story':3, '2.5Fin':4, '2.5Unf': 5, 'SFoyer':6, 'SLvl':7},
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'BsmtQual': basement_quality_scale,
    'BsmtCond': basement_quality_scale,
    'BsmtExposure': {'No Basement':0, 'No':1, 'Mn': 2, 'Av': 3,'Gd': 4},
    'BsmtFinType1': basement_finish_scale,
    'BsmtFinType2': basement_finish_scale,
    'HeatingQC': quality_scale,
    'KitchenQual': quality_scale,
    'GarageQual': garage_quality_scale,
    'GarageCond': garage_quality_scale,
    'PavedDrive': {'N':0, 'P':1, 'Y': 2},
}
for column_name, encoding in ordinal_encodings.items():
    train_file[column_name] = train_file[column_name].map(encoding)

# Turn the calendar years into ages relative to the year of sale, then rename accordingly
for year_column in ('YearBuilt', 'YearRemodAdd'):
    train_file[year_column] = train_file['YrSold'] - train_file[year_column]

train_file = train_file.rename(columns={"YearBuilt": "BuiltYearsBack", "YearRemodAdd": "RemodYearsBack"})
print(train_file.shape)
train_file.head(3)

In [None]:
# Separate the target from the features and hold out 20% of the rows for evaluation.
# The split is done before feature selection so the selection only sees training rows.
y = train_file['SalePrice']
X = train_file.drop(columns='SalePrice')
X_train, X_test, Y_train, Y_test = tts(X, y, test_size=0.20, random_state=42)

The features are scaled with a feature scaling technique to reduce bias, to bring the data into a common range and to speed up the calculation while training the model. After applying the Standard Scaler, most values lie roughly between -3 and 3.


In [None]:
# Learn the per-column mean and standard deviation from the training rows and standardise them.
# Wrapping the result in a new DataFrame restores the column names (the row index starts at 0 again).
Standardscaler = StandardScaler()
X_train_col = X_train.columns
Standardscaler.fit(X_train)
X_train_ADS = pd.DataFrame(Standardscaler.transform(X_train), columns=X_train_col)
X_train_ADS.head(2)

In [None]:
# Estimate the mutual information between every scaled feature and SalePrice.
# The estimator is randomised, so random_state is fixed: otherwise the selected column set
# (Req_Cols) can change between runs and later cells that rely on it become unstable.
mutual_info = mutual_info_regression(X_train_ADS.fillna(0), Y_train, random_state=42)
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train_ADS.columns
# sort_values() returns a new Series; print it so the ranking is actually shown.
print(mutual_info.sort_values(ascending=False))
#--------------------
# Keep the columns whose estimated mutual information with the target is above 0.1
# (Req_Cols stays in the original column order).
Req_Cols = list(mutual_info[mutual_info>0.1].index)
Req_Cols

In [None]:
# Training ADS: the scaled training features restricted to the selected columns
Train_ADS = X_train_ADS.loc[:, Req_Cols]
Train_ADS.head(3)

In [None]:
# Scale the hold-out set with statistics learned from the TRAINING rows only.
# Fitting a separate scaler on X_test would standardise it with its own mean and variance,
# so the hold-out features would not be on the scale the models are trained on, and
# information from the hold-out rows would leak into the preprocessing.
Standardscaler = StandardScaler().fit(X_train)
X_test_col = X_test.columns
X_test_ADS = pd.DataFrame(Standardscaler.transform(X_test), columns=X_test_col)
#--------------------
# Testing ADS: the scaled hold-out features restricted to the selected columns
Test_ADS = X_test_ADS[Req_Cols]
print(Test_ADS.shape)
Test_ADS.head(2)


## Linear regression

In [None]:
# Baseline: ordinary least-squares regression on the selected, scaled features.
# score() of a regressor is R^2, so the "Testing set" line and the "R2 score" line agree.
linear_reg = LinearRegression().fit(Train_ADS, Y_train)
y_pred = linear_reg.predict(Test_ADS)
score_1 = r2_score(Y_test, y_pred)
print("Accuracy on Traing set: ", linear_reg.score(Train_ADS, Y_train))
print("Accuracy on Testing set: ", linear_reg.score(Test_ADS, Y_test))
print("R2 score", score_1)

With linear regression we didn't obtain good accuracy. So, let's try another algorithm: KNeighbors Regressor

## KNeighbors Regressor 

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression with k = 2 as a first attempt
neigh = KNeighborsRegressor(n_neighbors=2).fit(Train_ADS, Y_train)
y_pred = neigh.predict(Test_ADS)
score_1 = r2_score(Y_test, y_pred)
print("Accuracy on Traing set: ", neigh.score(Train_ADS, Y_train))
print("Accuracy on Testing set: ", neigh.score(Test_ADS, Y_test))
print("R2 score", score_1)

Compared with the linear regressor, the accuracy of KNN with K=2 is better. So I performed hyperparameter tuning to find the optimal K value

In [None]:
# Fit one KNN model per K in 1..39 and record its R^2 on the hold-out set.
# NOTE(review): K is chosen on the same hold-out set that is used for reporting.
k_values = range(1, 40)
Score = []
for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k).fit(Train_ADS, Y_train)
    pred_i = knn.predict(Test_ADS)
    score_1 = r2_score(Y_test, pred_i)
    Score.append(score_1)

# Plot the hold-out R^2 against K
plt.figure(figsize=(10, 6))
plt.plot(k_values, Score, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=1)
plt.title('Accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
best_score = max(Score)
# Score[0] belongs to K = 1, hence the +1
print("Max Accuracy error:-", best_score, "at K =", Score.index(best_score) + 1)

After hyperparameter tuning we obtain good accuracy. But let's try another algorithm: RandomForest Regressor

## RandomForest Regressor 

In [None]:
# Random Forest with default hyper-parameters
reg_rf = RandomForestRegressor()
reg_rf.fit(Train_ADS, Y_train)
y_pred = reg_rf.predict(Test_ADS)
score_1 = r2_score(Y_test, y_pred)
train_r2 = reg_rf.score(Train_ADS, Y_train)
test_r2 = reg_rf.score(Test_ADS, Y_test)
print("Accuracy on Traing set: ", train_r2)
print("Accuracy on Testing set: ", test_r2)
print("R2 score", score_1)

With the RandomForest Regressor, accuracy is comparatively better than with the two models above. To improve the accuracy further, hyperparameter tuning is performed

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized search over Random Forest hyper-parameters
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 40)]
# Number of features to consider at every split.
# 1.0 (all features) is what 'auto' meant for regressors; the 'auto' alias itself is
# rejected by scikit-learn >= 1.3, so the numeric form is used.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 40, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20,25,30,35,40,100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# 10 sampled combinations, each evaluated with 5-fold cross-validation on the training ADS
rf_random = RandomizedSearchCV(estimator = reg_rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

rf_random.fit(Train_ADS, Y_train)

In [None]:
# Best hyper-parameter combination found by the randomized search `rf_random`
rf_random.best_params_

In [None]:
# Random Forest refitted with hand-entered hyper-parameters.
# NOTE(review): presumably copied from rf_random.best_params_ of an earlier run - confirm.
tuned_rf_params = {
    'n_estimators': 1220,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 33,
}
reg_rf = RandomForestRegressor(**tuned_rf_params)
reg_rf.fit(Train_ADS, Y_train)
y_pred = reg_rf.predict(Test_ADS)
score_1 = r2_score(Y_test, y_pred)
print("Accuracy on Traing set: ", reg_rf.score(Train_ADS, Y_train))
print("Accuracy on Testing set: ", reg_rf.score(Test_ADS, Y_test))
print("R2 score", score_1)

After hyperparameter tuning we obtain good accuracy. But let's try another algorithm: XGB Regressor

## XGBoost Regressor 

In [None]:
#!pip install xgboost
import xgboost as XGB

# XGBoost regressor with default hyper-parameters
xgb_model = XGB.XGBRegressor()
xgb_model.fit(Train_ADS, Y_train)
y_pred = xgb_model.predict(Test_ADS)
score_1 = r2_score(Y_test, y_pred)
train_r2 = xgb_model.score(Train_ADS, Y_train)
test_r2 = xgb_model.score(Test_ADS, Y_test)
print("Accuracy on Traing set: ", train_r2)
print("Accuracy on Testing set: ", test_r2)
print("R2 score", score_1)

In [None]:
# Candidate values for the randomized search over XGBoost hyper-parameters
learning_rate = [0.01, 0.1]
max_depth = np.linspace(5, 40, num=6).astype(int).tolist()
min_child_weight = np.linspace(1, 20, num=6).astype(int).tolist()
subsample = [0.5, 0.7]
colsample_bytree = [0.5, 0.7]
objective = ['reg:squarederror']
n_estimators = np.linspace(start=100, stop=2000, num=40).astype(int).tolist()

random_grid = dict(
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    objective=objective,
    n_estimators=n_estimators,
)

# 10 sampled combinations, each evaluated with 5-fold cross-validation on the training ADS.
# The search object keeps the name rf_random although it now wraps the XGBoost model.
rf_random = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

rf_random.fit(Train_ADS, Y_train)

In [None]:
# Best hyper-parameter combination found by the randomized search `rf_random`
rf_random.best_params_

In [None]:
# XGBoost refitted with hand-entered hyper-parameters.
# NOTE(review): presumably copied from rf_random.best_params_ of an earlier run - confirm.
tuned_xgb_params = {
    'subsample': 0.5,
    'objective': 'reg:squarederror',
    'n_estimators': 1561,
    'min_child_weight': 16,
    'max_depth': 12,
    'learning_rate': 0.01,
    'colsample_bytree': 0.5,
}
xgb_model = XGB.XGBRegressor(**tuned_xgb_params)
xgb_model.fit(Train_ADS, Y_train)
y_pred = xgb_model.predict(Test_ADS)
score_1 = r2_score(Y_test, y_pred)
print("Accuracy on Traing set: ", xgb_model.score(Train_ADS, Y_train))
print("Accuracy on Testing set: ", xgb_model.score(Test_ADS, Y_test))
print("R2 score", score_1)

## Feature Engineering & Feature Selection For Test Dataset

In [None]:
# Count the missing values per column of the test file and keep only the columns that have any
test_file.isnull().sum().loc[lambda null_counts: null_counts > 0]

In [None]:
# Age features, built the same way as for the training ADS
test_file['BuiltYearsBack'] = test_file['YrSold'] - test_file['YearBuilt']
test_file['RemodYearsBack'] = test_file['YrSold'] - test_file['YearRemodAdd']

# Fill missing GarageArea with the mean of the same HouseStyle BEFORE the frame is split:
# the frames derived below are separate selections of test_file, so a fill applied after
# the split would never reach them.
test_file['GarageArea'] = test_file['GarageArea'].fillna(test_file.groupby('HouseStyle')['GarageArea'].transform('mean'))

# Rows with a known LotFrontage train the imputation model; the others are predicted.
# copy() makes both frames independent of test_file so they can be modified safely later.
lotfrontage_known = test_file['LotFrontage'].notnull()
train_LotFrontage_main = test_file[lotfrontage_known].copy()
test_LotFrontage_main = test_file[~lotfrontage_known].copy()

lotfrontage_predictors = ['LotArea', 'Street', 'LotShape', 'LandSlope', 'LotConfig', 'HouseStyle', 'GarageArea']
train_LotFrontage_xtrain = train_LotFrontage_main[lotfrontage_predictors]
train_LotFrontage_ytrain = train_LotFrontage_main['LotFrontage']
test_LotFrontage = test_LotFrontage_main[lotfrontage_predictors]

# LotConfig and Street are nominal categories, so they are one-hot encoded.
# `axis` is passed by keyword: the positional form `pd.concat(objs, 1)` is rejected by pandas >= 2.0.
train_LotFrontage_xtrain_Nomina_Encoding = pd.get_dummies(train_LotFrontage_xtrain[['LotConfig', 'Street']])
train_LotFrontage_xtrain = pd.concat([train_LotFrontage_xtrain, train_LotFrontage_xtrain_Nomina_Encoding], axis=1)
train_LotFrontage_xtrain = train_LotFrontage_xtrain.drop(['LotConfig', 'Street'], axis = 1)

test_LotFrontage_Nomina_Encoding = pd.get_dummies(test_LotFrontage[['LotConfig', 'Street']])
test_LotFrontage = pd.concat([test_LotFrontage, test_LotFrontage_Nomina_Encoding], axis=1)
test_LotFrontage = test_LotFrontage.drop(['LotConfig', 'Street'], axis = 1)

# LotShape, LandSlope and HouseStyle are treated as ordinal categories and mapped to integers.
lotfrontage_ordinal_maps = {
    'LotShape': {'IR3':0, 'IR2':1, 'IR1': 2, 'Reg': 3},
    'LandSlope': {'Sev':0, 'Mod':1, 'Gtl': 2},
    'HouseStyle': {'1Story':0, '1.5Fin':1, '1.5Unf': 2, '2Story':3, '2.5Fin':4, '2.5Unf': 5, 'SFoyer':6, 'SLvl':7},
}
for feature_frame in (train_LotFrontage_xtrain, test_LotFrontage):
    for column_name, encoding in lotfrontage_ordinal_maps.items():
        feature_frame[column_name] = feature_frame[column_name].map(encoding)

# Drop the LotConfig_FR3 dummy from the training frame (ignored if that level does not occur),
# then give the rows to be predicted exactly the training columns: dummy levels that do not
# occur there are added as 0 and columns the training frame does not have are removed.
train_LotFrontage_xtrain = train_LotFrontage_xtrain.drop(columns='LotConfig_FR3', errors='ignore')
test_LotFrontage = test_LotFrontage.reindex(columns=train_LotFrontage_xtrain.columns, fill_value=0)



In [None]:
# Replace the missing LotFrontage values of the test file with the predictions of a Random
# Forest fitted on the test-file rows where LotFrontage is known.
# random_state pins the forest so the imputed values are the same on every run.
reg_rf = RandomForestRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=1,
                               max_features='sqrt', max_depth=25, random_state=42)
reg_rf.fit(train_LotFrontage_xtrain.fillna(0), train_LotFrontage_ytrain)
# Remaining NaNs are filled with 0 for prediction as well, matching what is done for fitting.
y_pred = reg_rf.predict(test_LotFrontage.fillna(0))
# assign() builds a new frame instead of writing into a row selection of test_file.
test_LotFrontage_main = test_LotFrontage_main.assign(LotFrontage=y_pred)
print("Accuracy on Traing set: ",reg_rf.score(train_LotFrontage_xtrain.fillna(0),train_LotFrontage_ytrain))
#--------------------
# Put imputed and complete rows back together in the original row order
# (pd.concat is used because DataFrame.append was removed in pandas 2.0), keep the selected columns.
Test_ADS = pd.concat([test_LotFrontage_main, train_LotFrontage_main]).sort_index()
Test_ADS = Test_ADS[Req_Cols].copy()

# Encode every categorical column that was selected, with the same mappings that were applied
# to the training frame. Only columns present in Req_Cols are touched, so the cell keeps working
# when the selected feature set changes.
quality_scale = {'Po':0, 'Fa':1, 'TA': 2, 'Gd': 3,'Ex': 4}
basement_quality_scale = {'Po':1, 'Fa':2, 'TA': 3, 'Gd': 4,'Ex': 5, 'No Basement': 0}
basement_finish_scale = {'Unf':1, 'LwQ':2, 'Rec': 3, 'BLQ': 4,'ALQ': 5,'GLQ': 6, 'No Basement': 0}
garage_quality_scale = {'Po':1, 'Fa':2, 'TA': 3, 'Gd': 4,'Ex': 5, 'No Garage': 0}
# column -> (label that replaces a missing value, or None; category-to-integer mapping)
test_ordinal_encodings = {
    'LotShape': (None, {'IR3':0, 'IR2':1, 'IR1': 2, 'Reg': 3}),
    'Utilities': (None, {'ELO':0, 'NoSeWa':1, 'NoSewr': 2, 'AllPub': 3}),
    'LandSlope': (None, {'Sev':0, 'Mod':1, 'Gtl': 2}),
    'HouseStyle': (None, {'1Story':0, '1.5Fin':1, '1.5Unf': 2, '2Story':3, '2.5Fin':4, '2.5Unf': 5, 'SFoyer':6, 'SLvl':7}),
    'ExterQual': (None, quality_scale),
    'ExterCond': (None, quality_scale),
    'BsmtQual': ("No Basement", basement_quality_scale),
    'BsmtCond': ("No Basement", basement_quality_scale),
    'BsmtExposure': ("No Basement", {'No Basement':0, 'No':1, 'Mn': 2, 'Av': 3,'Gd': 4}),
    'BsmtFinType1': ("No Basement", basement_finish_scale),
    'BsmtFinType2': ("No Basement", basement_finish_scale),
    'HeatingQC': (None, quality_scale),
    'KitchenQual': (None, quality_scale),
    'GarageQual': ("No Garage", garage_quality_scale),
    'GarageCond': ("No Garage", garage_quality_scale),
    'PavedDrive': (None, {'N':0, 'P':1, 'Y': 2}),
}
for column_name, (absent_label, encoding) in test_ordinal_encodings.items():
    if column_name not in Test_ADS.columns:
        continue
    column_values = Test_ADS[column_name]
    if absent_label is not None:
        column_values = column_values.fillna(absent_label)
    Test_ADS[column_name] = column_values.map(encoding)
# Binary flags: a missing Alley means no alley access (0); CentralAir 'Y' -> 1.
if 'Alley' in Test_ADS.columns:
    Test_ADS['Alley'] = Test_ADS['Alley'].notnull().astype(int)
if 'CentralAir' in Test_ADS.columns:
    Test_ADS['CentralAir'] = (Test_ADS['CentralAir'] == 'Y').astype(int)
Test_ADS.head(4)

In [None]:
# Standardise the submission features with statistics learned from the TRAINING features.
# Fitting the scaler on the test file itself would put its columns on a different scale
# than the one the final model is trained on. StandardScaler works column by column, so
# fitting on X[Req_Cols] gives the same per-column statistics as fitting on all of X.
Test_col = Test_ADS.columns
Standardscaler = StandardScaler().fit(X[Req_Cols])
Test_ADS = pd.DataFrame(Standardscaler.transform(Test_ADS),columns = Test_col )
# Shape of the frame that was just built
print(Test_ADS.shape)
Test_ADS.head(2)

## Sales price prediction for test dataset

In [None]:
# Refit the scaler on the complete training features (X) so the final model can be
# trained on all labelled rows instead of only the 80% split.
Standardscaler = StandardScaler()
X_test_col = X.columns  # despite its name, this holds the training feature columns
scaled_full_train = Standardscaler.fit_transform(X)
Train_ADS = pd.DataFrame(scaled_full_train, columns=X_test_col)

# Training ADS restricted to the selected columns
Train_ADS = Train_ADS[Req_Cols]
print(Train_ADS.shape)
Train_ADS.head(2)
# The target for the final fit is the complete SalePrice column
Y_train = y

In [None]:
# Final model: XGBoost with the hand-entered hyper-parameters, fitted on all training rows
final_xgb_params = {
    'subsample': 0.5,
    'n_estimators': 1561,
    'min_child_weight': 16,
    'max_depth': 12,
    'learning_rate': 0.01,
    'colsample_bytree': 0.5,
}
xgb_model = XGB.XGBRegressor(**final_xgb_params)
xgb_model.fit(Train_ADS, Y_train)
# Predictions for the rows of the test file
y_pred = xgb_model.predict(Test_ADS)

print("Accuracy on Traing set: ", xgb_model.score(Train_ADS, Y_train))


In [None]:
# Attach the predictions to the test file and keep the two columns of the submission table
test_file['Pred Price'] = y_pred

Sub_file = test_file.loc[:, ['Id', 'Pred Price']]
Sub_file.head(10)
#Sub_file.to_csv('/submission.csv',index=False)

I've tried three algorithms (Linear Regression, KNN, Random Forest) and hyperparameter tuning for the XGB regressor. If you like the notebook, please upvote and add your comments to this notebook. Happy coding!