In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the training data inside the Kaggle input mount.
TRAIN_CSV = '/kaggle/input/house-price-prediction-challenge/train.csv'

df = pd.read_csv(TRAIN_CSV)
# Preview the first ten rows.
df.head(10)

In [None]:
# Show the two BHK-related columns side by side to compare them.
df[['BHK_NO.', 'BHK_OR_RK']]

In [None]:
# Frequency of each distinct value in BHK_OR_RK.
df['BHK_OR_RK'].value_counts()

In [None]:
# Summary statistics of the frame as reported by DataFrame.describe().
df.describe()

In [None]:
# Distinct values present in BHK_NO. (in order of first appearance).
df['BHK_NO.'].unique()

In [None]:
# Inferred dtype of every column, to see which ones are non-numeric.
df.dtypes

# Exploratory Data Analysis

In [None]:
# Correlation heatmap of the numeric columns.
# At this point df still contains non-numeric columns (they are dropped or one-hot
# encoded only later in the notebook). pandas >= 2.0 raises on DataFrame.corr() when
# such columns are present, whereas older releases silently skipped them. Selecting
# the numeric columns explicitly gives the same matrix on every pandas version.
numeric_corr = df.select_dtypes(include = 'number').corr()

plt.figure(figsize = (15, 6))
sns.heatmap(data = numeric_corr, annot = True, cmap = 'RdYlGn')

In [None]:
# Number of listings per BHK_NO. value.
# Both axes must come from the SAME Series: unique() returns values in order of first
# appearance while value_counts() returns counts sorted by frequency, so pairing the
# two positionally attaches counts to the wrong BHK values.
bhk_counts = df["BHK_NO."].value_counts().sort_index()

fig = px.bar(x=bhk_counts.index, y=bhk_counts.values, labels={"x": "BHK_NO.", "y": "count"})
fig.show()

In [None]:
# Bar plot of TARGET(PRICE_IN_LACS) grouped by RERA, drawn on an explicit 15x6 Axes.
rera_fig, rera_ax = plt.subplots(figsize = (15, 6))
sns.barplot(x = 'RERA', y = 'TARGET(PRICE_IN_LACS)', data = df, ax = rera_ax)

In [None]:
# Bar plot of TARGET(PRICE_IN_LACS) grouped by POSTED_BY, drawn on an explicit 15x6 Axes.
posted_fig, posted_ax = plt.subplots(figsize = (15, 6))
sns.barplot(x = 'POSTED_BY', y = 'TARGET(PRICE_IN_LACS)', data = df, ax = posted_ax)

In [None]:
# Bar plot of TARGET(PRICE_IN_LACS) grouped by UNDER_CONSTRUCTION, drawn on an explicit 15x6 Axes.
construction_fig, construction_ax = plt.subplots(figsize = (15, 6))
sns.barplot(x = 'UNDER_CONSTRUCTION', y = 'TARGET(PRICE_IN_LACS)', data = df, ax = construction_ax)

In [None]:
# Bar plot of TARGET(PRICE_IN_LACS) grouped by BHK_NO., drawn on an explicit 15x6 Axes.
bhk_fig, bhk_ax = plt.subplots(figsize = (15, 6))
sns.barplot(x = 'BHK_NO.', y = 'TARGET(PRICE_IN_LACS)', data = df, ax = bhk_ax)

---------------------------------------------------------------------------------------------------------------------

# One-Hot Encoding

In [None]:
# Remove the columns that are not used as model features.
unused_columns = ['BHK_OR_RK', 'ADDRESS', 'LATITUDE', 'LONGITUDE']
df = df.drop(columns = unused_columns)
df.head()

In [None]:
# One-hot encode the frame with pandas. A later cell references the resulting
# 'POSTED_BY_Builder' indicator column, so POSTED_BY is among the encoded columns.
df = pd.get_dummies(df)

In [None]:
# List the column names after encoding.
df.columns

In [None]:
# Preview the encoded frame.
df.head()

In [None]:
# Drop one indicator column of the POSTED_BY encoding so the remaining
# indicators are not perfectly collinear.
df = df.drop(columns = 'POSTED_BY_Builder')
df.head()

# Feature Scaling

In [None]:
# Separate the prediction target from the feature columns.
target_column = 'TARGET(PRICE_IN_LACS)'
y = df[target_column]
X = df.drop(target_column, axis = 1)

In [None]:
# Inspect the feature frame (every column of df except the target).
X

In [None]:
# Inspect the target Series, TARGET(PRICE_IN_LACS).
y

In [None]:
#from sklearn.compose import ColumnTransformer
#from sklearn.preprocessing import StandardScaler
#ct = ColumnTransformer(transformers=[('SQUARE_FT', StandardScaler(), [3])], remainder='passthrough')
#X = pd.DataFrame(ct.fit_transform(X))

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column; X is replaced by the transformed output.
# NOTE(review): the scaler is fit on every row, before any train/test split, so the
# statistics of rows later held out for testing are baked into this X.
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
# Inspect the scaled features returned by fit_transform.
X

In [None]:
# (rows, features) of the scaled feature matrix.
X.shape

In [None]:
# Number of target values; should match the row count of X.
y.shape

# Splitting the dataset into Training and Testing Set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the UNSCALED features first, then fit the scaler on the training rows only.
# The notebook's earlier scaling cell fits StandardScaler on every row, which lets the
# mean/variance of the held-out rows leak into the training features. Re-deriving the
# features from df here avoids that. The rows selected for each split depend only on
# the row count and random_state, so they are the same rows as before.
features_unscaled = df.drop(columns = ['TARGET(PRICE_IN_LACS)'])
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    features_unscaled, y, test_size = 0.2, random_state = 0)

# Training statistics are reused for the test rows, never the other way round.
split_scaler = StandardScaler().fit(X_train_unscaled)
X_train = split_scaler.transform(X_train_unscaled)
X_test = split_scaler.transform(X_test_unscaled)

In [None]:
# Inspect the training feature matrix.
X_train

# Model Building

## Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least squares on all features, fit on the training split.
# `regressor` is reused (overwritten) by each later model section.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the linear model fitted above.
y_pred = regressor.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Score the linear-regression predictions against the held-out targets.
Mean_Square_Error = mean_squared_error(y_test, y_pred)
Root_Mean_Square_Error = np.sqrt(Mean_Square_Error)
Mean_Absolute_Error = mean_absolute_error(y_test, y_pred)
R2_Score = r2_score(y_test, y_pred)

# Start the comparison table with one row for this model; later sections add theirs.
results = pd.DataFrame({
    'Model': ['Multiple Linear Regression'],
    'R2 Score': [R2_Score],
    'Mean Absolute Error': [Mean_Absolute_Error],
    'Mean Square Error': [Mean_Square_Error],
    'Root Mean Square Error': [Root_Mean_Square_Error],
})

In [None]:
# Comparison table so far (linear regression only).
results

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 100 trees; random_state is fixed so the fit is repeatable.
# Overwrites `regressor` from the previous section.
regressor = RandomForestRegressor(n_estimators = 100, random_state = 1)
regressor.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the random forest fitted above.
y_pred = regressor.predict(X_test)

In [None]:
# Score the random-forest predictions against the held-out targets.
R2_Score = r2_score(y_test, y_pred)
Mean_Absolute_Error = mean_absolute_error(y_test, y_pred)
Mean_Square_Error = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE already computed above.
Root_Mean_Square_Error = np.sqrt(Mean_Square_Error)

model_results = pd.DataFrame([['Random Forest', R2_Score, Mean_Absolute_Error, Mean_Square_Error, Root_Mean_Square_Error]],
                      columns = ['Model', 'R2 Score', 'Mean Absolute Error', 'Mean Square Error', 'Root Mean Square Error'])

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so that re-running the cell
# replaces the entry instead of duplicating it.
results = pd.concat([results[results['Model'] != 'Random Forest'], model_results], ignore_index = True)

In [None]:
# Comparison table after adding the random forest.
results

## XGBoost

In [None]:
from xgboost import XGBRegressor
# Gradient-boosted trees with library-default hyper-parameters; only the seed is fixed.
# Overwrites `regressor` from the previous section.
regressor = XGBRegressor(random_state = 2)
regressor.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the default XGBoost model fitted above.
y_pred = regressor.predict(X_test)

In [None]:
# Score the default-XGBoost predictions against the held-out targets.
R2_Score = r2_score(y_test, y_pred)
Mean_Absolute_Error = mean_absolute_error(y_test, y_pred)
Mean_Square_Error = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE already computed above.
Root_Mean_Square_Error = np.sqrt(Mean_Square_Error)

model_results = pd.DataFrame([['XGB Regressor', R2_Score, Mean_Absolute_Error, Mean_Square_Error, Root_Mean_Square_Error]],
                      columns = ['Model', 'R2 Score', 'Mean Absolute Error', 'Mean Square Error', 'Root Mean Square Error'])

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so that re-running the cell
# replaces the entry instead of duplicating it.
results = pd.concat([results[results['Model'] != 'XGB Regressor'], model_results], ignore_index = True)

In [None]:
# Comparison table after adding the default XGBoost model.
results

# Hyper-Parameter Tuning Using RandomizedSearchCV

In [None]:
# Candidate values sampled by the randomized hyper-parameter search, one entry
# per XGBRegressor argument.
parameters = dict(
    learning_rate = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth = [3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight = [1, 3, 5, 7],
    gamma = [0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree = [0.3, 0.4, 0.5, 0.7],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over `parameters`: 5 sampled candidates, each scored by R^2
# with 10-fold cross-validation, using all available cores.
# - The base estimator is constructed here rather than taken from the shared
#   `regressor` variable, so the search does not depend on which model that
#   variable happens to hold when the cell is run.
# - random_state fixes which candidates are sampled; without it every run
#   evaluates a different set and reports different "best" parameters.
random_search = RandomizedSearchCV(estimator = XGBRegressor(random_state = 2), param_distributions = parameters,
                                   n_iter = 5, scoring = 'r2', n_jobs = -1, cv = 10, verbose = 3,
                                   random_state = 0)

In [None]:
import time

# Run the search on the training split and report the wall-clock duration.
search_started = time.time()
random_search.fit(X_train, y_train)
search_seconds = time.time() - search_started
print(f"Took {search_seconds:0.2f} Seconds")

In [None]:
# Inspect the estimator selected by the search.
random_search.best_estimator_

In [None]:
# The sampled parameter combination with the best cross-validated score.
random_search.best_params_

In [None]:
# Build the tuned model directly from the search result instead of a hand-copied
# estimator repr. The pasted repr pinned the outcome of one particular past run and
# froze every library default of that xgboost release (e.g. gpu_id, tree_method,
# base_score), so it could disagree with the search above and with newer xgboost
# versions. Only the seed and the searched parameters are set here.
regressor = XGBRegressor(random_state = 2, **random_search.best_params_)

In [None]:
# Fit the tuned XGBRegressor configured in the previous cell on the training split.
regressor.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the tuned XGBoost model.
y_pred = regressor.predict(X_test)

In [None]:
# Score the tuned-XGBoost predictions against the held-out targets.
R2_Score = r2_score(y_test, y_pred)
Mean_Absolute_Error = mean_absolute_error(y_test, y_pred)
Mean_Square_Error = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE already computed above.
Root_Mean_Square_Error = np.sqrt(Mean_Square_Error)

model_results = pd.DataFrame([['XGB Regressor(Hyper-Parameter Tuned)', R2_Score, Mean_Absolute_Error, Mean_Square_Error, Root_Mean_Square_Error]],
                      columns = ['Model', 'R2 Score', 'Mean Absolute Error', 'Mean Square Error', 'Root Mean Square Error'])

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so that re-running the cell
# replaces the entry instead of duplicating it.
results = pd.concat([results[results['Model'] != 'XGB Regressor(Hyper-Parameter Tuned)'], model_results], ignore_index = True)

In [None]:
# Final comparison table across all four models.
results

# Hence, we will go with the normal XGBoost algorithm.