In [None]:
#Utility packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

#Preprocessing related Imports
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelBinarizer, OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin

#Regressors
from sklearn.model_selection import cross_val_score, GridSearchCV,StratifiedShuffleSplit,RandomizedSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor,HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

#Metrics
from sklearn.metrics import mean_squared_error


In [None]:
# Load the competition CSVs; the ADDRESS column is dropped immediately after
# loading, so no later cell sees it.
train = pd.read_csv(
    "/kaggle/input/house-price-prediction-challenge/train.csv"
).drop(columns=["ADDRESS"])
test = pd.read_csv(
    "/kaggle/input/house-price-prediction-challenge/test.csv"
).drop(columns=["ADDRESS"])

# Quick View of top 5 rows

In [None]:
# head() with no argument shows the first five rows — a quick sanity check of the load.
train.head()

# Finding type and missing data

In [None]:
# info() prints each column's dtype and non-null count; a count below the
# total number of rows indicates missing values in that column.
train.info()

# Deeper analysis of the nature of the data

In [None]:
# describe() summarises the numeric columns (count, mean, std, min, quartiles, max).
train.describe()

# Finding out the frequency distribution of each Attribute

In [None]:
# One 50-bin histogram per numeric column, drawn on a single 15x20 figure.
train.hist(figsize=(15, 20), bins=50)

# # Pearson correlation of all independent variables with the dependent variable, i.e. "TARGET(PRICE_IN_LACS)"

In [None]:
# Correlate only the numeric/boolean columns. The frame still carries the
# columns that are one-hot encoded later (POSTED_BY, BHK_OR_RK — presumably
# strings), and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 instead of silently skipping them as older releases did.
numeric_train = train.select_dtypes(include=["number", "bool"])
numeric_train.corr()["TARGET(PRICE_IN_LACS)"]

# Train-Test Split using Stratified Sampling technique to remove the sampling bias.

In [None]:
# Bucket the bedroom count into strata: ceil(BHK_NO. / 1.5), with every bucket
# of 5 or more collapsed into bucket 5.
bhk_bucket = np.ceil(train["BHK_NO."] / 1.5)
# Assign the capped result back explicitly. A chained ``inplace=True`` call on
# ``train["bhk_cat"]`` works on a temporary object under pandas copy-on-write
# and would leave the frame unchanged.
train["bhk_cat"] = bhk_bucket.where(bhk_bucket < 5, 5)

split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
# Stratification is driven by ``y`` (bhk_cat) alone: StratifiedShuffleSplit
# ignores its ``groups`` argument, so nothing is passed there.
# With n_splits=1 the generator yields exactly one (train, test) pair of
# positional row indices, hence ``next`` and ``iloc``.
train1, test1 = next(split.split(train, train["bhk_cat"]))
strat_train = train.iloc[train1]
strat_test = train.iloc[test1]

In [None]:
# Share of each BHK_OR_RK value in the two splits, printed side by side for comparison.
for heading, subset in (("TRAIN RATIOS \n", strat_train), ("TEST RATIOS \n", strat_test)):
    column = subset["BHK_OR_RK"]
    print(heading, column.value_counts() / len(column))

In [None]:
# Share of each bhk_cat stratum in the two splits; the stratified split should
# make these proportions nearly identical.
for heading, subset in (("TRAIN RATIOS \n", strat_train), ("TEST RATIOS \n", strat_test)):
    strata = subset["bhk_cat"]
    print(heading, strata.value_counts() / len(strata))

In [None]:
# bhk_cat only served as the stratification key; remove it from both splits.
# The frames are reassigned rather than modified in place because they are
# selections taken from ``train``, and no loop variable is used so the
# built-in ``set`` is not shadowed for the rest of the notebook.
# errors="ignore" keeps the cell safe to run a second time.
strat_train = strat_train.drop(columns=["bhk_cat"], errors="ignore")
strat_test = strat_test.drop(columns=["bhk_cat"], errors="ignore")

In [None]:
# Independent (deep) copy of the training split for exploratory plotting.
copied = strat_train.copy(deep=True)

# Scatter plot of latitude and longitude to understand the geographical attributes' relationship with the target variable

In [None]:
# Longitude/latitude scatter, coloured by target price with the "jet" colormap;
# alpha=0.4 lets dense areas show up as darker clusters.
copied.plot.scatter(
    x='LONGITUDE',
    y='LATITUDE',
    alpha=0.4,
    c="TARGET(PRICE_IN_LACS)",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)

# Some other scatter plots for deeper analysis

In [None]:
# Pairwise scatter plots of the columns of the training copy on a 20x20 figure.
scatter_matrix(copied,figsize=(20,20))

In [None]:
# Area vs. price for rows with SQUARE_FT below 2,000,000: marker size follows
# BHK_NO., colour follows RESALE.
below_area_cap = copied[copied["SQUARE_FT"] < 2000000]
below_area_cap.plot.scatter(
    x="SQUARE_FT",
    y="TARGET(PRICE_IN_LACS)",
    s="BHK_NO.",
    label="BHK_NO.",
    c="RESALE",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

# Preparing the data for the ML model

In [None]:
# Separate the regression target from the predictors of the training split.
label_data = strat_train["TARGET(PRICE_IN_LACS)"]
attribute_data = strat_train.drop(columns=["TARGET(PRICE_IN_LACS)"])

# Creating pipelines to preprocess the numerical and categorical attributes separately

In [None]:
# Numeric inputs; these are standardised by num_pipeline.
num_attribs=['BHK_NO.','SQUARE_FT','LONGITUDE','LATITUDE']
# Flag-like columns routed through CustomLabelBinarizer
# (presumably already 0/1 encoded — TODO confirm against the data).
cat_attribs=[ 'UNDER_CONSTRUCTION', 'RERA','READY_TO_MOVE', 'RESALE']
# Columns that are one-hot encoded by string_cat_pipeline.
string_cat_attribs=['POSTED_BY','BHK_OR_RK']
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a fixed list of columns from a DataFrame.

    Parameters
    ----------
    attribute_names : list of str
        Column labels to select, in the order they should appear in the output.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for chaining."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` as a NumPy array."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    """Pipeline-compatible wrapper around ``LabelBinarizer``.

    ``LabelBinarizer`` is designed for targets and its ``fit``/``fit_transform``
    accept a single array, whereas pipeline steps are invoked with ``(X, y)``.
    This wrapper adapts the signature.

    The encoder is fitted once in :meth:`fit` and reused in :meth:`transform`,
    so hold-out or test data is encoded with the labels learned from the
    training data rather than being refitted on itself.

    Parameters
    ----------
    sparse_output : bool, default=False
        Forwarded to ``LabelBinarizer``; when true the transformed output is a
        SciPy sparse matrix.
    """

    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """Fit the underlying ``LabelBinarizer`` on ``X`` and return ``self``.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        # Trailing underscore marks a fitted attribute (scikit-learn convention),
        # so it is not treated as a constructor parameter by get_params/clone.
        self.encoder_ = LabelBinarizer(sparse_output=self.sparse_output).fit(X)
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` with the encoder learned in :meth:`fit`.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # transform() called without a prior fit(): keep the previous
            # stateless behaviour instead of failing for such callers.
            return LabelBinarizer(sparse_output=self.sparse_output).fit_transform(X)
        return encoder.transform(X)

# Numeric branch: select the numeric columns and standardise them.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('std', StandardScaler()),
])

# Flag branch: select the flag columns and pass them through the binarizer
# wrapper, producing sparse output.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('Binarizer', CustomLabelBinarizer(sparse_output=True)),
])

# String-category branch: one-hot encode POSTED_BY / BHK_OR_RK.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# an all-zero row instead of raising, so transforming the hold-out split or
# the competition test file cannot fail on a rare, previously unseen value.
string_cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(string_cat_attribs)),
    ('Binarizer', OneHotEncoder(handle_unknown="ignore")),
])

# Run the three branches on the same input frame and stack their outputs
# column-wise, in this order.
finalPipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
    ("string_cat_pipeline", string_cat_pipeline),
])

In [None]:
# Fit the preprocessing union on the training features and transform them in
# one step. The result is a sparse matrix here (later cells call .toarray() on
# it before handing it to estimators that are given dense input).
attribute_data_corrected=finalPipeline.fit_transform(attribute_data)

# Functions for analysing the performance

In [None]:
def kfoldOut(model,inp,out,k=10):
    """Return the mean cross-validated RMSE of ``model``.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    inp : array-like
        Feature matrix.
    out : array-like
        Target values.
    k : int, default=10
        Passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    float
        Mean over the folds of the square root of each fold's MSE.
    """
    # The scorer returns *negated* MSE per fold, so flip the sign before the root.
    neg_mse_per_fold = cross_val_score(
        model, inp, out, cv=k, scoring="neg_mean_squared_error"
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()
    

def MSE(model,inp,out):
    """Return the root-mean-squared error of ``model`` on ``(inp, out)``.

    Despite the name, the value returned is the square root of the mean
    squared error (RMSE), not the MSE itself.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict``.
    inp : array-like
        Feature matrix passed to ``model.predict``.
    out : array-like
        True target values.

    Returns
    -------
    float
        ``sqrt(mean_squared_error(out, model.predict(inp)))``.
    """
    predictions = model.predict(inp)
    squared_error = mean_squared_error(out, predictions)
    return np.sqrt(squared_error)

## LINEAR REGRESSION MODEL (ElasticNet) with GRIDSEARCH for HYPERPARAMETER OPTIMIZATION

In [None]:
# Exhaustive 5-fold search over ElasticNet regularisation strength, iteration
# budget and tolerance; the refitted best estimator is kept as ``reg``.
param_grid = [
    {
        'alpha': [0.01, 0.02],
        'l1_ratio': [0.5],
        'max_iter': [100, 200, 400, 500, 800],
        'tol': [1e-4, 1e-5],
    },
]
reg = ElasticNet(random_state=42, precompute=True)
grid_search = GridSearchCV(
    reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=10,
    n_jobs=-1,
)
grid_search.fit(attribute_data_corrected, label_data)
reg = grid_search.best_estimator_

## DECISION TREE MODEL with GRIDSEARCH for HYPERPARAMETER OPTIMIZATION

In [None]:
# Exhaustive 5-fold search over tree depth and minimum split size; the
# refitted best tree is kept as ``DTR``.
# NOTE(review): "mae" is the criterion name used by older scikit-learn
# releases; newer ones call it "absolute_error" — check the installed version.
param_grid = [
    {
        'max_features': [8],
        'max_depth': [25, 30, 35],
        'min_samples_split': [3, 4],
        'criterion': ["mae"],
        'splitter': ["random"],
    },
]
DTR = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(
    DTR,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=10,
    n_jobs=-1,
)
grid_search.fit(attribute_data_corrected, label_data)
DTR = grid_search.best_estimator_

In [None]:
# Display the selected decision tree and its hyper-parameters.
DTR

## RANDOM FOREST REGRESSOR with GRIDSEARCH for HYPERPARAMETER OPTIMIZATION 

In [None]:
# Exhaustive 5-fold search over the number of trees (other settings fixed);
# the refitted best forest is kept as ``forest_reg``.
param_grid = [
    {
        'n_estimators': [65, 60, 75],
        'max_features': [8],
        'max_depth': [30],
        'min_samples_split': [4],
    },
]

forest_reg = RandomForestRegressor(n_jobs=-1, random_state=42)

grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=10,
    n_jobs=-1,
)
grid_search.fit(attribute_data_corrected, label_data)
forest_reg = grid_search.best_estimator_

# Adaptive Boosted Regression with RandomizedSearchCV

In [None]:
# Candidate weak learners: one tree per (max_depth, min_samples_split) pair,
# depth varying slowest.
candidate_base_trees = [
    DecisionTreeRegressor(
        max_depth=depth,
        max_features='auto',
        random_state=42,
        min_samples_split=min_split,
    )
    for depth in [10, 13]
    for min_split in [3, 4]
]
param_grid = [
    {
        'n_estimators': [15, 16, 18, 20],
        'loss': ['exponential'],
        'learning_rate': [0.05],
        'base_estimator': candidate_base_trees,
    },
]
ADA_REG = AdaBoostRegressor(random_state=1)

# The grid holds 16 combinations, far fewer than n_iter=900, so the randomized
# search ends up evaluating every combination.
ADA = RandomizedSearchCV(
    ADA_REG,
    param_grid,
    cv=5,
    verbose=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=1,
    n_iter=900,
)
ADA.fit(attribute_data_corrected, label_data)
ADA_REG = ADA.best_estimator_

# Gradient Boosted Regression with GridSearchCV

In [None]:
# Exhaustive 10-fold search for the gradient-boosting regressor; the refitted
# best estimator is kept as ``GradBoosted_REG``.
# NOTE(review): loss='ls', max_features='auto' and presort=True are spellings
# from older scikit-learn releases — confirm they match the installed version.
param_grid = [
    {
        "loss": ['ls'],
        "learning_rate": [0.2, 0.3],
        'n_iter_no_change': [40, 50],
        'criterion': ['friedman_mse'],
        "n_estimators": [280, 290],
        "max_features": ['auto'],
        "max_depth": [2, 3, 4],
        'validation_fraction': [0.1],
        'min_samples_split': [4],
        'tol': [0.1],
    },
]
GradBoosted_REG = GradientBoostingRegressor(random_state=1, presort=True)
Grad = GridSearchCV(
    GradBoosted_REG,
    param_grid,
    cv=10,
    verbose=10,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
Grad.fit(attribute_data_corrected, label_data)
GradBoosted_REG = Grad.best_estimator_

# Histogram Gradient Boosted Regression with RandomizedSearchCV

In [None]:
# Search space for the histogram-based gradient-boosting regressor.
param_grid = [
    {
        'max_iter': [100],
        'loss': ['least_squares'],
        'learning_rate': [0.3, 0.5, 0.7],
        # Boolean True, not the string 'True': the string only behaved like
        # True because any non-empty string is truthy, and scikit-learn
        # releases that validate parameter types reject it.
        'early_stopping': [True],
        'l2_regularization': [0.8, 0.9],
        'validation_fraction': [0.05],
        'n_iter_no_change': [200],
        'tol': [1e-5, 1e-6],
    },
]
HistGradBoost_REG = HistGradientBoostingRegressor(
    random_state=1, scoring='neg_mean_squared_error'
)

# The grid holds 12 combinations, fewer than n_iter=900, so every combination
# is evaluated.
HGBR = RandomizedSearchCV(
    HistGradBoost_REG,
    param_grid,
    cv=5,
    verbose=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=1,
    n_iter=900,
)
# This estimator is trained on a dense array, hence .toarray() on the sparse
# pipeline output.
HGBR.fit(attribute_data_corrected.toarray(), label_data)
HistGradBoost_REG = HGBR.best_estimator_

In [None]:
# Display the selected histogram gradient-boosting model and its hyper-parameters.
HistGradBoost_REG

# Extreme Gradient Boosted Regression with RandomizedSearchCV

In [None]:
# Search space for XGBoost (243 combinations, fewer than n_iter=900, so every
# combination is evaluated); the refitted best model is kept as ``XGB_REG``.
param_grid = [
    {
        'n_estimators': [270, 280, 300],
        'max_depth': [5],
        'reg_lambda': [0.9, 1, 1.1],
        'learning_rate': [0.01, 0.05, 0.1],
        'gamma': [0.9, 1, 1.1],
        'reg_alpha': [0.9, 1, 1.1],
        'booster': ['dart'],
    },
]
XGB_REG = XGBRegressor(random_state=1, objective='reg:squarederror')

XGBR = RandomizedSearchCV(
    XGB_REG,
    param_grid,
    cv=5,
    verbose=5,
    scoring='neg_mean_squared_error',
    random_state=1,
    n_iter=900,
    n_jobs=4,
)
# Trained on a dense copy of the sparse pipeline output.
XGBR.fit(attribute_data_corrected.toarray(), label_data)
XGB_REG = XGBR.best_estimator_

# Preparing the test dataset

In [None]:
# Separate the regression target from the predictors of the hold-out split.
test_label_data = strat_test["TARGET(PRICE_IN_LACS)"]
test_attribute_data = strat_test.drop(columns=["TARGET(PRICE_IN_LACS)"])

In [None]:
# transform() only — the pipeline was fitted on the training split, so the
# hold-out rows reuse the scaler statistics and one-hot categories learned there.
test_attribute_data_corrected=finalPipeline.transform(test_attribute_data)

# Comparing the Performance of each model and Selecting the best performer on the test data set

In [None]:
# MSE() returns the square root of the mean squared error, so the figures
# printed here are RMSE values and are labelled as such.
# The histogram and XGBoost models were trained on dense arrays, so they are
# scored on a dense copy of the hold-out features.
dense_holdout_features = test_attribute_data_corrected.toarray()
for model_label, model, features in (
    ("Linear Regression", reg, test_attribute_data_corrected),
    ("Decision Tree", DTR, test_attribute_data_corrected),
    ("Random Forest Regressor", forest_reg, test_attribute_data_corrected),
    ("Adaptive Boosted Regressor", ADA_REG, test_attribute_data_corrected),
    ("Gradient Boosted Regressor", GradBoosted_REG, test_attribute_data_corrected),
    ("Histogram Gradient Boosted Regressor", HistGradBoost_REG, dense_holdout_features),
    ("Extreme Gradient Boosted Regressor", XGB_REG, dense_holdout_features),
):
    print(model_label + " RMSE: ", MSE(model, features, test_label_data))

In [None]:
# Pair every fitted model with the hold-out feature matrix it is scored on
# (dense for the histogram and XGBoost models, the pipeline output otherwise).
holdout_dense = test_attribute_data_corrected.toarray()
scoring_inputs = [
    (reg, test_attribute_data_corrected),
    (DTR, test_attribute_data_corrected),
    (forest_reg, test_attribute_data_corrected),
    (ADA_REG, test_attribute_data_corrected),
    (GradBoosted_REG, test_attribute_data_corrected),
    (HistGradBoost_REG, holdout_dense),
    (XGB_REG, holdout_dense),
]
model_list = [model for model, _ in scoring_inputs]
mse_list = [
    MSE(model, features, test_label_data) for model, features in scoring_inputs
]
# Keep the model with the smallest hold-out error.
final = model_list[np.argmin(mse_list)]
print("BEST MODEL IS: ", final)

# PREDICTION ON THE FINAL TEST DATA SET WITH THE BEST-PERFORMING MODEL

In [None]:
# Run the competition test frame through the already-fitted preprocessing
# pipeline (transform only, no refit).
FINAL_test=finalPipeline.transform(test)

In [None]:
# Predict on a dense copy of the features. ``final`` can be the histogram
# gradient-boosting or XGBoost model, both of which were trained and scored on
# ``.toarray()`` output, while the pipeline returns a sparse matrix. Every
# other candidate model accepts dense input as well.
submission_features = FINAL_test.toarray() if hasattr(FINAL_test, "toarray") else FINAL_test
pd.DataFrame(final.predict(submission_features)).to_csv('submission.csv', index=False)

In [None]:
# Attach the predictions of the selected model (``final``) — the same model
# that produced submission.csv — rather than always using the decision tree.
# A dense feature matrix is used because ``final`` may be one of the models
# that were trained on ``.toarray()`` output.
prediction_features = FINAL_test.toarray() if hasattr(FINAL_test, "toarray") else FINAL_test
test["TARGET(PRICE_IN_LACS)"] = final.predict(prediction_features)

In [None]:
# Display the test frame, which now includes the predicted
# TARGET(PRICE_IN_LACS) column added in the previous cell.
test