In [None]:
#main libraries
import os
import re
import pickle
import numpy as np
import pandas as pd

#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly 
import plotly.graph_objs as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.express as px
from plotly.offline import iplot, init_notebook_mode
import cufflinks as cf
import plotly.figure_factory as ff 
from plotly.offline import iplot
from plotly import tools

#importing machine learning libraries
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, KFold, cross_val_score
from sklearn.preprocessing  import StandardScaler, LabelEncoder, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import (BaggingRegressor, AdaBoostRegressor,GradientBoostingRegressor, 
                              RandomForestRegressor,  GradientBoostingRegressor)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, found_files in os.walk('/kaggle/input'):
    for found_file in found_files:
        print(os.path.join(root, found_file))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the house-price CSV from the Kaggle input directory into `full_df`.
full_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv'
)

In [None]:
# Show the freshly loaded frame as the cell output.
full_df

In [None]:
# Discard the columns that are not used as model features.
unused_columns = ['availability', 'balcony', 'area_type', 'society']
full_df = full_df.drop(unused_columns, axis=1)
full_df

In [None]:
# Number of missing values in each column.
full_df.isna().sum()

In [None]:
# Fill the missing values column by column in a single, non-chained call.
# `full_df[col].fillna(..., inplace=True)` operates on a temporary Series:
# recent pandas emits a FutureWarning for it and, with Copy-on-Write enabled,
# the parent frame is left untouched, so the NaNs would survive into the
# `size` parsing step below.
full_df = full_df.fillna({'size': '0 BHK', 'bath': 0, 'location': 'NA'})
full_df.isnull().sum()

In [None]:
# Keep only the leading integer of each `size` string (the text before the
# first space) and store it back as an integer column.
leading_token = full_df['size'].str.split(' ').str[0]
full_df['size'] = leading_token.astype(int)
full_df

In [None]:
def convert_sqft_to_num(x):
    """Convert a raw `total_sqft` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1200"), a range written as "low - high"
        (converted to the midpoint of the two bounds), or anything else.

    Returns
    -------
    float or None
        The parsed value, or None when the entry cannot be interpreted as a
        number or a two-sided numeric range (e.g. "34.46Sq. Meter").
    """
    if isinstance(x, str):
        bounds = x.split('-')
        if len(bounds) == 2:
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. "a-b" or "1e-5"); fall through and
                # try the whole string as a single number instead of raising.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        # Only conversion failures are treated as "unparseable"; other
        # exceptions are no longer swallowed by a bare `except`.
        return None

In [None]:
# Replace the raw `total_sqft` entries with their numeric conversion;
# entries the converter cannot parse come back as missing values.
full_df['total_sqft'] = full_df['total_sqft'].map(convert_sqft_to_num)
full_df['total_sqft']

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) of the target.
full_df.loc[:, 'price'].describe()

In [None]:
#histogram
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and a
# density-normalised y axis is the documented replacement for its default look.
sns.histplot(full_df['price'], kde=True, stat='density');

In [None]:
# Shape of the price distribution: asymmetry (skew) and tail weight (kurtosis).
price_skew, price_kurt = full_df['price'].skew(), full_df['price'].kurt()
print("Skewness: %f" % price_skew)
print("Kurtosis: %f" % price_kurt)

In [None]:
# Price plotted against the parsed `size` value.
var = 'size'
data = full_df[['price', var]]
data.plot.scatter(x=var, y='price');

In [None]:
# Price plotted against the numeric `total_sqft` value.
var = 'total_sqft'
data = full_df[['price', var]]
data.plot.scatter(x=var, y='price');

In [None]:
# Price plotted against the `bath` value.
var = 'bath'
data = full_df[['price', var]]
data.plot.scatter(x=var, y='price');

In [None]:
# Price plotted against `location` (the `balcony` column was dropped earlier).
# NOTE(review): `location` is filled with the string 'NA' above, so this is a
# scatter over a text column - confirm the plot is readable and supported by
# the pandas version in use.
var = 'location'
data = full_df[['price', var]]
data.plot.scatter(x=var, y='price');

In [None]:
#correlation matrix
# `location` is a text column; since pandas 2.0 `DataFrame.corr()` no longer
# drops non-numeric columns silently and raises instead, so restrict the
# computation to numeric columns explicitly (parameter available from 1.5).
corrmat = full_df.corr(numeric_only=True)
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
#Using z_scores to remove outliers

# Numeric feature columns that are screened for outliers.
cols = ['size', 'total_sqft', 'bath']


def z_score(full_df):
    """Return the column-wise z-scores of a numeric DataFrame.

    Each column is centred on its mean and divided by its population standard
    deviation (``ddof=0``). Output columns are named ``<original>_zscore``.

    Unlike the previous version, the input frame is not modified: its column
    labels used to be overwritten in place before the computation.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Frame holding only numeric columns.

    Returns
    -------
    pandas.DataFrame
        Same index as the input, one ``*_zscore`` column per input column.
    """
    standardized = (full_df - full_df.mean()) / full_df.std(ddof=0)
    return standardized.add_suffix("_zscore")

In [None]:
# Z-scores of the screened columns, with the row index copied into an `ID`
# column so the scores can be joined back onto `full_df`.
z_scores = z_score(full_df[cols])
z_scores = z_scores.assign(ID=z_scores.index)
z_scores

In [None]:
# Attach the z-score columns to the listings via the shared `ID` key.
full_df['ID'] = full_df.index
# Name the join key explicitly: without `on`, merge joins on every column the
# two frames have in common, which silently changes meaning if another shared
# column ever appears. `validate` asserts that each ID matches exactly one row.
full_df = pd.merge(full_df, z_scores, on='ID', validate='one_to_one')
full_df

In [None]:
# Keep only rows whose three z-scores all lie within +/-3, then remove the
# helper columns and renumber the rows from zero. Rows with a missing z-score
# fail the comparison and are dropped as well.
zscore_columns = ['size_zscore', 'total_sqft_zscore', 'bath_zscore']
within_three_sigma = (full_df[zscore_columns].abs() <= 3).all(axis=1)
full_df = (
    full_df.loc[within_three_sigma]
    .drop(columns=zscore_columns + ['ID'])
    .reset_index(drop=True)
)
full_df

In [None]:
# Remove `price` from the feature frame and keep log(1 + price) as the target;
# the log transform tempers the skewness and the effect of extreme prices.
Y_train = np.log1p(full_df.pop('price'))

In [None]:
# One-hot encode the categorical columns; numeric columns pass through as-is.
full_df = pd.get_dummies(data=full_df)
full_df

In [None]:
# Replace any remaining missing values with 0 and expose the result as the
# feature matrix `train_set` (same object as `full_df`).
full_df = full_df.fillna(0)
train_set = full_df
train_set

In [None]:
# Candidate regressors, keyed by the display name used in the reports below.
# The Random Forest is now seeded so the model comparison gives the same
# numbers on every Restart & Run All; all other settings are unchanged.
base_models = {
    # Regularised linear models, fitted on robustly scaled features.
    "Elastic Net": make_pipeline(
        RobustScaler(),
        ElasticNet(alpha=0.0005, l1_ratio=0.9),
    ),
    "Kernel Ridge": KernelRidge(),
    "Lasso": make_pipeline(
        RobustScaler(),
        Lasso(alpha=0.0005, random_state=1),
    ),
    # Plain least-squares baseline.
    "Linear Regression": LinearRegression(),
    # Tree ensemble; the seed fixes bootstrap and feature sampling.
    "Random Forest": RandomForestRegressor(n_estimators=300, random_state=42),
    # Support vector regression with default settings.
    "SVM": SVR(),
    # Gradient-boosted trees (XGBoost) with default settings.
    "XGBoost": XGBRegressor(),
    # scikit-learn gradient boosting with Huber loss on standardised features.
    "Gradient Boosting": make_pipeline(
        StandardScaler(),
        GradientBoostingRegressor(
            n_estimators=3000,
            learning_rate=0.005,
            max_depth=4,
            max_features='sqrt',
            min_samples_leaf=15,
            min_samples_split=10,
            loss='huber',
            random_state=5,
        ),
    ),
}

In [None]:
# Fit every candidate model on an 80/20 hold-out split and record its metrics.
# Layout: models_data[metric]['Training' | 'Testing'][model name] -> value.
# Two bugs are fixed relative to the previous version:
#   * the test-set adjusted R^2 used the training sample count;
#   * the test-set "MAE" was computed with mean_squared_error.
models_data = {'R^2':{'Training':{},'Testing':{}},
               'Adjusted R^2':{'Training':{},'Testing':{}},
               'MAE':{'Training':{},'Testing':{}},
               'MSE':{'Training':{},'Testing':{}},
               'RMSE':{'Training':{},'Testing':{}}}

X_train, X_test, y_train, y_test = train_test_split(train_set, Y_train, test_size=0.2, random_state=42)
p = train_set.shape[1]        # number of predictor columns
train_n = X_train.shape[0]    # rows in the training split
test_n = X_test.shape[0]      # rows in the test split


def _adjusted_r2(r2, n_samples, n_features):
    """Adjusted R^2 for `n_samples` rows and `n_features` predictors.

    Returns NaN when n_samples - n_features - 1 is not positive, because the
    statistic is undefined there.
    """
    dof = n_samples - n_features - 1
    if dof <= 0:
        return float('nan')
    return 1 - (1 - r2) * (n_samples - 1) / dof


for name in base_models:
    # Fit the model, then predict on both splits.
    model = base_models[name].fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # R-squared (the regressors' default `score`).
    r2_train, r2_test = model.score(X_train, y_train), model.score(X_test, y_test)

    # Adjusted R-squared, each split with its own sample count.
    adj_train = _adjusted_r2(r2_train, train_n, p)
    adj_test = _adjusted_r2(r2_test, test_n, p)

    # Mean absolute error on both splits.
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_test = mean_absolute_error(y_test, y_pred_test)

    # Mean squared error and its square root.
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    rmse_train, rmse_test = np.sqrt(mse_train), np.sqrt(mse_test)

    # Store every metric under the model's name.
    split_scores = {'R^2': (r2_train, r2_test),
                    'Adjusted R^2': (adj_train, adj_test),
                    'MAE': (mae_train, mae_test),
                    'MSE': (mse_train, mse_test),
                    'RMSE': (rmse_train, rmse_test)}
    for metric, (train_value, test_value) in split_scores.items():
        models_data[metric]['Training'][name] = train_value
        models_data[metric]['Testing'][name] = test_value

    # Two-column text report; the padding lines up the test column.
    print('\n========================={}========================='.format(name))
    print('**********Training**********************Testing********')
    print('R^2    : ',r2_train,' '*(25-len(str(r2_train))),r2_test)
    print('Adj R^2: ',adj_train,' '*(25-len(str(adj_train))),adj_test)
    print('MAE    : ',mae_train,' '*(25-len(str(mae_train))),mae_test)
    print('MSE    : ',mse_train,' '*(25-len(str(mse_train))),mse_test)
    print('RMSE   : ',rmse_train,' '*(25-len(str(rmse_train))),rmse_test)

In [None]:
def _ranked_metric(metric, ascending):
    """Return one metric as a models x {Training, Testing} frame sorted by test value."""
    return pd.DataFrame(models_data[metric]).sort_values(by='Testing', ascending=ascending)


# Goodness-of-fit scores: best (highest) test value first.
R_2 = _ranked_metric('R^2', ascending=False)
Adjusted_R_2 = _ranked_metric('Adjusted R^2', ascending=False)
# Error metrics: best (lowest) test value first.
MAE = _ranked_metric('MAE', ascending=True)
MSE = _ranked_metric('MSE', ascending=True)
RMSE = _ranked_metric('RMSE', ascending=True)

In [None]:
# R-squared per model (models already ordered by test score), y axis fixed
# to [-0.4, 1].
fig1 = px.line(
    data_frame=R_2.reset_index(),
    x='index',
    y=['Training', 'Testing'],
    title='R-Squared for training and testing',
).update_yaxes(range=[-0.4, 1])
fig1

In [None]:
# Adjusted R-squared per model (models already ordered by test score), y axis
# fixed to [-0.4, 1].
fig2 = px.line(
    data_frame=Adjusted_R_2.reset_index(),
    x='index',
    y=['Training', 'Testing'],
    title='Adjusted R-Squared for training and testing',
).update_yaxes(range=[-0.4, 1])
fig2

In [None]:
# Mean absolute error per model (models already ordered by test error).
fig3 = px.line(data_frame=MAE.reset_index(),
        x='index',y=['Training','Testing'],
        title='Mean absolute error for training and testing')
# Errors are non-negative and have no upper bound, so the fixed [-0.4, 1]
# range copied from the R^2 plots would clip any model with an error above 1.
# Anchor the axis at zero and let plotly choose the upper limit.
fig3.update_yaxes(rangemode='tozero')

In [None]:
# Mean squared error per model (models already ordered by test error).
fig4 = px.line(data_frame=MSE.reset_index(),
        x='index',y=['Training','Testing'],
        title='Mean square error for training and testing')
# Errors are non-negative and have no upper bound, so the fixed [-0.4, 1]
# range copied from the R^2 plots would clip any model with an error above 1.
# Anchor the axis at zero and let plotly choose the upper limit.
fig4.update_yaxes(rangemode='tozero')

In [None]:
# Root mean squared error per model (models already ordered by test error).
fig5 = px.line(data_frame=RMSE.reset_index(),
        x='index',y=['Training','Testing'],
        title='Root mean square error for training and testing')
# Errors are non-negative and have no upper bound, so the fixed [-0.4, 1]
# range copied from the R^2 plots would clip any model with an error above 1.
# Anchor the axis at zero and let plotly choose the upper limit.
fig5.update_yaxes(rangemode='tozero')

In [None]:
# Final model: an XGBoost regressor with default hyperparameters.
# NOTE(review): the choice of XGBoost rests on the test-set comparison in the
# evaluation loop; re-check it whenever that comparison is re-run.
predictor = XGBRegressor()

In [None]:
# Train the final model on the complete feature matrix and log-price target.
predictor.fit(X=train_set, y=Y_train)

In [None]:
# Report how well the final model reproduces the data it was fitted on.
# These are training-set figures only, not an estimate of generalisation.

# R^2 on the training data, expressed as a percentage.
predictor_score = round(predictor.score(train_set, Y_train)*100, 3)
predictions = predictor.predict(train_set)
# mean_squared_error already returns a scalar, so the former `.mean()` call
# was redundant. The x100 scaling of the stored value is kept unchanged.
predictor_rmse = round(np.sqrt(mean_squared_error(Y_train, predictions))*100, 3)
print(' _'*15)
# The model is XGBoost (not a stacked ensemble) scored on the training set.
print('\nXGBoost results on the training set : \n')
print(f'Score : {predictor_score}%')
# Y_train holds log1p(price), so the RMSE is in log units, not a percentage.
print(f'RMSE  : {predictor_rmse / 100:.5f} (log1p price units)')
print(' _'*15)

In [None]:
# Side-by-side view of the log-price targets and the model's fitted values.
df1 = Y_train.to_frame('Actual').assign(Predicted=predictions)
df1