## Problem Statement: I have to present a working model for the House Price Predictions for Delhi Localities to the Stakeholders

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pylab as plt
import plotly.express as px 
import plotly.graph_objects as go
%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
df =  pd.read_csv('delhi_house_price.csv' )

In [4]:
df.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0


## Data Exploring and Cleaning

In [5]:
df.shape

(1259, 11)

In [6]:
# checking Null Values
df.isnull().sum()

Area             0
BHK              0
Bathroom         2
Furnishing       5
Locality         0
Parking         33
Price            0
Status           0
Transaction      0
Type             5
Per_Sqft       241
dtype: int64

In [7]:
#creating  price_per_sqft columns to understand the data
df['price_per_sqft'] = df['Price']/df['Area']
df['price_per_sqft'] = df['price_per_sqft'].astype(int)

In [8]:
df.head(3)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft,price_per_sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,,8125
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0,6666
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0,16315


In [9]:
df.describe()

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Per_Sqft,price_per_sqft
count,1259.0,1259.0,1257.0,1226.0,1259.0,1018.0,1259.0
mean,1466.452724,2.796664,2.556086,1.935563,21306700.0,15690.136542,14581.774424
std,1568.05504,0.954425,1.04222,6.279212,25601150.0,21134.738568,20420.174601
min,28.0,1.0,1.0,1.0,1000000.0,1259.0,204.0
25%,800.0,2.0,2.0,1.0,5700000.0,6364.0,6757.0
50%,1200.0,3.0,2.0,1.0,14200000.0,11291.5,11666.0
75%,1700.0,3.0,3.0,2.0,25500000.0,18000.0,17549.0
max,24300.0,10.0,7.0,114.0,240000000.0,183333.0,538461.0


#### As we know we can fill the null values by CENTERAL TENDENCIES but in pur use case we cannot do this: according to business domain understanding

In [10]:
# Droping null values (as we cant even out average values due to not fitting correctly)
df = df[df['Furnishing'].notna()]
df = df[df['Bathroom'].notna()]
df = df[df['Parking'].notna()]

# Price per_sqft varies according to location and various term so we can substitue it by mean or median
df = df[df['Per_Sqft'].notna()]

In [11]:
df.isnull().sum()

Area              0
BHK               0
Bathroom          0
Furnishing        0
Locality          0
Parking           0
Price             0
Status            0
Transaction       0
Type              0
Per_Sqft          0
price_per_sqft    0
dtype: int64

In [12]:
# new dataframe without any null values
df1 = df

In [13]:
df1.shape

(1005, 12)

In [14]:
# Removing columns per_sqft and Shifting Price columns to last
df2 = df1.drop(columns=['Per_Sqft','price_per_sqft'])
col9 = df2.pop('Price')
df2.insert(9,'Price', col9)
df2.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,Ready_to_move,New_Property,Apartment,5000000
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,Ready_to_move,Resale,Apartment,15500000
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,Resale,Builder_Floor,4200000
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,Ready_to_move,New_Property,Builder_Floor,6200000
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,New_Property,Builder_Floor,15500000


## DATA VISULISATION: EDA


In [15]:
df2.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,Ready_to_move,New_Property,Apartment,5000000
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,Ready_to_move,Resale,Apartment,15500000
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,Resale,Builder_Floor,4200000
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,Ready_to_move,New_Property,Builder_Floor,6200000
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,New_Property,Builder_Floor,15500000


In [16]:
# Area of a property VS Price distribution
fig = px.histogram(df1,x='Area',y='Price',
                    title='Area vs Price',
                    opacity=0.8,
                    color_discrete_sequence=['indianred'],
                    labels={'Area':'Area in Sqft', 'Price':'Price'})
fig.update_layout(barmode='group', bargap=0.30,bargroupgap=0.0,height=420,width=1080)
fig.show()


In [17]:
# Type vs Price
fig = px.histogram(df2,x='Type',
                    title='Count of House Types',
                    opacity=0.8,
                    color = 'Type',
                    #color_discrete_sequence=['indianred'],
                    labels={'Type':'Type of House', 'Price':'Price'}
)
fig.update_layout(barmode='group', bargap=0.30,bargroupgap=0.0,height=520,width=600)
fig.show()

In [18]:
# Furnishing types vs count
fig = px.histogram(df2,x='Furnishing')
fig.update_layout(barmode='group', bargap=0.20,bargroupgap=0.0,height=420,width=600)
fig.show()

In [19]:
# Furnshing types vs Price
fig = px.scatter(df,x='Furnishing',y='Price')
fig.update_layout(height=420,width=500)
fig.show()

In [20]:
# Status of any property vs count

fig = px.histogram(df2,x='Status')
fig.update_layout(height=420,width=600)
fig.show()

In [21]:
# Status of any property vs PRice
fig = px.scatter(df2,x='Status',y='Price')
fig.update_layout(height=420,width=400)
fig.show()

In [22]:
# Which types of houses are present for TRANSACTIONS
fig = px.histogram(df2,x='Transaction',color='Transaction')
fig.update_layout(height=420,width=400)
fig.show()

In [23]:
# Transaction vs Price
fig = px.scatter(df2,x='Transaction',y='Price')
fig.update_layout(height=420,width=400)
fig.show()

In [24]:
# Parkings Count: we can see that this is not balanced
fig = px.histogram(df1,x='Parking',color='Parking')
fig.update_layout(height=420,width=1080)
fig.show()

In [25]:
# Parkings VS Price 
fig = px.scatter(df1,x='Parking',y='Price',
                color='Parking')
fig.update_layout(height=420,width=1080)
fig.show()

## As we saw in many case that 
a. Our data is not distributed uniformally.<br>
b. Outliers are present. <br><br>
So we need to make is normally distributed for better model accuracy we can use : <br>
**we try to remove from 1st Business understanding**<br>
1. IQR <br>
2. Z-score

In [26]:
df2.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,Ready_to_move,New_Property,Apartment,5000000
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,Ready_to_move,Resale,Apartment,15500000
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,Resale,Builder_Floor,4200000
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,Ready_to_move,New_Property,Builder_Floor,6200000
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,New_Property,Builder_Floor,15500000


In [27]:
df2.dtypes

Area           float64
BHK              int64
Bathroom       float64
Furnishing      object
Locality        object
Parking        float64
Status          object
Transaction     object
Type            object
Price            int64
dtype: object

In [28]:
# Parkings VS Price 
fig = px.scatter(df2,x='Locality',y='Price')
fig.update_layout(height=420,width=1080)
fig.show()

In [29]:
df2.Locality.nunique()

304

## Outliers Detection

According to BUsiness domain understanding

In [30]:
## 1. According to BUsiness domain i am trying to find how many Localities are there where 2bhk costs more than 3bhk ny Area
def plot_scatter(df2,Locality):
    bhk2 = df2[(df2.Locality == Locality) &(df2.BHK ==2)]
    bhk3 = df2[(df2.Locality == Locality) &(df2.BHK ==3)]
    fig = go.Figure() 
    fig.add_trace(go.Scatter(x=bhk2.Area,y=bhk2.Price,
                    mode='markers',
                    name='2 BHK'))
    fig.add_trace(go.Scatter(x=bhk3.Area,y=bhk3.Price,
                    mode='markers',
                    name='3 BHK'))
    fig.update_layout(title='2bhj vs 3bhk', xaxis_title='Area in Sqft', yaxis_title='Price',height=420,width=680)
    fig.show()

plot_scatter(df2,'Alaknanda')

In [31]:
# 2.Dropping Data where Number of bathrooms are more than BHK from 2
df2.drop(df2[df2.BHK+2 < df2.Bathroom].index, inplace= True)

In [32]:
# 3. Droping all the data points where Area is Less than 300m2 
df2.drop(df2[df2['Area'] < 300].index,inplace=True)

In [33]:
# 4. As we Know minimum Area required for a flat of 2bhk is more than 400 sqrt_meter
df2.drop(df2[(df2['Area'] < 400) & (df2['BHK'] >=2)].index, inplace=True)

In [34]:
df2.shape

(969, 10)

In [35]:
# Checking normal outliers in our data points by BOXPLOT

In [36]:
fig = px.box(df2,y='Price')
fig.show()

In [37]:
fig = px.box(df2,y='Area')
fig.show()

In [38]:
fig = px.scatter(df2,x='Area',y='Price',
                    title='Area vs Price',
                    opacity=0.8,
                    color_discrete_sequence=['indianred'],
                    labels={'Area':'Area in Sqft', 'Price':'Price'})
fig.update_layout(barmode='group', bargap=0.30,bargroupgap=0.0,height=420,width=1080)
fig.show()

### According to business domain i got clearance that these are not outliers

## Feature Engineering

In [39]:
df_final = df2

In [40]:
df_final.head(5)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,Ready_to_move,New_Property,Apartment,5000000
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,Ready_to_move,Resale,Apartment,15500000
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,Resale,Builder_Floor,4200000
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,Ready_to_move,New_Property,Builder_Floor,6200000
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,New_Property,Builder_Floor,15500000


In [41]:
# Checking which of the Localities doesnt occured more than 2 times

df_loc = df_final.Locality.value_counts()
loc_lessthan2 = df_loc[df_loc==1]

In [42]:
# Changing the localities name as Other For who are less occuring

df_final.Locality = df_final.Locality.apply(lambda x: 'other' if x in loc_lessthan2 else x)

In [43]:
df_final.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,Ready_to_move,New_Property,Apartment,5000000
2,950.0,2,2.0,Furnished,other,1.0,Ready_to_move,Resale,Apartment,15500000
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,Resale,Builder_Floor,4200000
4,650.0,2,2.0,Semi-Furnished,other,1.0,Ready_to_move,New_Property,Builder_Floor,6200000
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,New_Property,Builder_Floor,15500000


### Encoding the categorical columns because they may have some correlation with Price

In [44]:
df_final.dtypes

Area           float64
BHK              int64
Bathroom       float64
Furnishing      object
Locality        object
Parking        float64
Status          object
Transaction     object
Type            object
Price            int64
dtype: object

In [45]:
col =[]
for column in df_final.columns: 
    if df_final[column].dtype == 'object':
        col.append(column)
        

In [46]:
col

['Furnishing', 'Locality', 'Status', 'Transaction', 'Type']

In [48]:
for cols in df_final[['Furnishing', 'Locality', 'Status', 'Transaction', 'Type']]:
    encho = pd.get_dummies(df_final[cols])
    df_final = pd.concat([df_final,encho],axis='columns')

In [51]:
df_final_en = df_final

In [52]:
df_final_en.drop(columns=['Furnishing','Locality','Status','Transaction','Type'],inplace=True)

In [53]:
df_final_en.head()

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Furnished,Semi-Furnished,Unfurnished,"APL Builder Floor, Greater Kailash 1","Abul Fazal Enclave Part 1, Okhla",...,"Virat Residency, Dwarka Mor","Yamuna Apartment, Alaknanda","Yamuna Vihar, Shahdara",other,Almost_ready,Ready_to_move,New_Property,Resale,Apartment,Builder_Floor
1,750.0,2,2.0,1.0,5000000,0,1,0,0,0,...,0,0,0,0,0,1,1,0,1,0
2,950.0,2,2.0,1.0,15500000,1,0,0,0,0,...,0,0,0,1,0,1,0,1,1,0
3,600.0,2,2.0,1.0,4200000,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
4,650.0,2,2.0,1.0,6200000,0,1,0,0,0,...,0,0,0,1,0,1,1,0,0,1
5,1300.0,4,3.0,1.0,15500000,0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,1


In [54]:
df_final_en.corr()

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Furnished,Semi-Furnished,Unfurnished,"APL Builder Floor, Greater Kailash 1","Abul Fazal Enclave Part 1, Okhla",...,"Virat Residency, Dwarka Mor","Yamuna Apartment, Alaknanda","Yamuna Vihar, Shahdara",other,Almost_ready,Ready_to_move,New_Property,Resale,Apartment,Builder_Floor
Area,1.000000,0.467155,0.521879,0.019436,0.557266,-0.053864,0.074275,-0.039808,0.030613,-0.017056,...,-0.017655,-0.006463,-0.041035,-0.054333,0.171653,-0.171653,0.123594,-0.123594,-0.102466,0.102466
BHK,0.467155,1.000000,0.816768,-0.055509,0.607505,0.046844,0.089935,-0.133195,0.057133,-0.014766,...,0.011274,-0.038732,0.032923,-0.051883,0.123871,-0.123871,0.166840,-0.166840,-0.129672,0.129672
Bathroom,0.521879,0.816768,1.000000,-0.003248,0.727241,-0.038220,0.109420,-0.089839,0.016960,-0.046412,...,-0.030987,-0.025288,-0.072347,-0.084376,0.137773,-0.137773,0.199662,-0.199662,-0.132910,0.132910
Parking,0.019436,-0.055509,-0.003248,1.000000,0.056035,-0.032120,-0.077468,0.108489,-0.010355,-0.010355,...,-0.012688,-0.010355,-0.027772,-0.008427,-0.014637,0.014637,-0.041758,0.041758,0.049894,-0.049894
Price,0.557266,0.607505,0.727241,0.056035,1.000000,-0.061194,0.064496,-0.023629,0.061371,-0.028556,...,-0.036362,-0.012609,-0.034625,-0.065501,0.192979,-0.192979,0.217395,-0.217395,-0.111469,0.111469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ready_to_move,-0.171653,-0.123871,-0.137773,0.014637,-0.192979,-0.013002,0.068115,-0.064109,0.012094,0.012094,...,0.014820,0.012094,0.043276,0.092867,-1.000000,1.000000,-0.250430,0.250430,-0.085220,0.085220
New_Property,0.123594,0.166840,0.199662,-0.041758,0.217395,-0.126483,0.166249,-0.084612,0.055891,0.009443,...,0.068488,-0.037005,-0.079231,-0.134606,0.250430,-0.250430,1.000000,-1.000000,-0.193560,0.193560
Resale,-0.123594,-0.166840,-0.199662,0.041758,-0.217395,0.126483,-0.166249,0.084612,-0.055891,-0.009443,...,-0.068488,0.037005,0.079231,0.134606,-0.250430,0.250430,-1.000000,1.000000,0.193560,-0.193560
Apartment,-0.102466,-0.129672,-0.132910,0.049894,-0.111469,0.031492,-0.105104,0.090254,-0.041132,-0.041132,...,0.061616,0.050283,-0.094847,0.117494,0.085220,-0.085220,-0.193560,0.193560,1.000000,-1.000000


In [55]:
df_final_en.shape

(969, 157)

## Feature Selection and Scaling
 
 (Scaling requied for models which involve Distance,Gradient  eg)


In [56]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [57]:
x = df_final_en.drop(columns=['Price'])
y = df_final_en['Price']

In [58]:
scale = StandardScaler()
x = scale.fit_transform(x)

In [59]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)

In [60]:
print(x_train.shape)
print(y_train.shape)

(726, 156)
(726,)


# Model Training

As i know that my dataset has many outlier so I will try to select such model which are less sensitive towards outliers.

In [61]:
from sklearn.model_selection import cross_val_score

In [62]:
from sklearn.tree  import DecisionTreeRegressor
decision=DecisionTreeRegressor()
#start = time()
dtr = decision.fit(x_train, y_train)
#end=time()
#train_time_dec=end-start
print('The training Data accuracy:',dtr.score(x_train,y_train))
print('The test Data accuracy:',dtr.score(x_test,y_test))

The training Data accuracy: 0.9996438391423799
The test Data accuracy: 0.6233423725857472


The above model is overfitting so i will use ensemble models

### 1. Random FOrest Regressor (base model)

In [63]:
from sklearn.ensemble import RandomForestRegressor


In [64]:
rand = RandomForestRegressor()
rand.fit(x_train,y_train)

RandomForestRegressor()

In [65]:
print('The training Data accuracy:',rand.score(x_train,y_train))
print('The test Data accuracy:',rand.score(x_test,y_test))

The training Data accuracy: 0.9649636168976166
The test Data accuracy: 0.8285415476922479


In [66]:
# StratifiedKFold (best)
from sklearn.model_selection import StratifiedKFold
skfold=StratifiedKFold(n_splits=5)
rand1 = RandomForestRegressor()
scores=cross_val_score(rand1,x,y,cv=skfold)
print(scores)

[0.80044442 0.8063876  0.88771923 0.8099328  0.71551016]


## 2. XGBoost (base model boosting)

In [67]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [91]:
xg = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)


In [92]:
xg.fit(x_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.100000001, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [93]:
print('The training Data accuracy:',xg.score(x_train,y_train))
print('The test Data accuracy:',xg.score(x_test,y_test))

The training Data accuracy: 0.9994779616620486
The test Data accuracy: 0.8872360235840229


### Hyperparameter Tunning (XGBOOST Regressor)

In [96]:
from sklearn.model_selection import GridSearchCV
def hyperParameterTuning(x_train, y_train):
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500],
        'objective': ['reg:squarederror']
    }

    xgb_model = XGBRegressor()

    gsearch = GridSearchCV(estimator = xgb_model,
                           param_grid = param_tuning,                        
                           #scoring = 'neg_mean_absolute_error', #MAE
                           #scoring = 'neg_mean_squared_error',  #MSE
                           cv = 5,
                           n_jobs = -1,
                           verbose = 1)

    gsearch.fit(x_train,y_train)

    return gsearch.best_params_

In [97]:
hyperParameterTuning(x_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 10.2min finished


{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 1,
 'n_estimators': 500,
 'objective': 'reg:squarederror',
 'subsample': 0.7}

In [98]:
xgb_model = XGBRegressor(
        objective = 'reg:squarederror',
        colsample_bytree = 0.5,
        learning_rate = 0.1,
        max_depth = 3,
        min_child_weight = 1,
        n_estimators = 500,
        subsample = 0.7)

In [99]:
xgb_model.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [100]:
print('The training Data accuracy:',xgb_model.score(x_train,y_train))
print('The test Data accuracy:',xgb_model.score(x_test,y_test))

The training Data accuracy: 0.9730599333983871
The test Data accuracy: 0.8783015285330915


### Cross Validations kfold / stratifiedKFold

 As i am stating my model accuracy to the stakeholders then i can not just say a accuracy then i used cross-Validations method to get range of accuracy

# KFold method
from sklearn.model_selection import KFold
xg=XGBRegressor()
kfold_validation=KFold(10)


#from sklearn.model_selection import cross_val_score
results=cross_val_score(xg,x,y,cv=kfold_validation)
print(results)
print(np.mean(results))

In [72]:
# StratifiedKFold (best)
from sklearn.model_selection import StratifiedKFold
skfold=StratifiedKFold(n_splits=5)
xg1=XGBRegressor()
scores=cross_val_score(xg1,x,y,cv=skfold)
print(scores)

[0.79461074 0.86147479 0.93244435 0.74651577 0.78138733]


In [76]:
from sklearn.model_selection import ShuffleSplit
xg2=XGBRegressor()
ssplit=ShuffleSplit(n_splits=10,test_size=0.30)
results=cross_val_score(xg2,x,y,cv=ssplit)
results

array([0.81706452, 0.79180822, 0.79077121, 0.81941426, 0.8622063 ,
       0.86851217, 0.8254654 , 0.75792921, 0.89543342, 0.72002339])

### 3. Tried Ridge and lasso (l1 and l2 regularisation)

In [77]:
from sklearn.linear_model import Ridge
rd = Ridge()
rd.fit(x_train,y_train)

Ridge()

In [78]:
print('The training Data accuracy:',rd.score(x_train,y_train))
print('The test Data accuracy:',rd.score(x_test,y_test))

The training Data accuracy: 0.8154257682204588
The test Data accuracy: 0.7751895744863864


In [79]:
from sklearn.linear_model import Lasso
ls = Lasso()
ls.fit(x_train,y_train)

Lasso()

In [80]:
print('The training Data accuracy:',ls.score(x_train,y_train))
print('The test Data accuracy:',ls.score(x_test,y_test))

The training Data accuracy: 0.815427029517917
The test Data accuracy: 0.7753097665687607


### 4. LightGBM

In [81]:
import lightgbm as lgb

In [380]:
#df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [82]:
clf = lgb.LGBMRegressor()
clf.fit(x_train, y_train)

LGBMRegressor()

In [83]:
print('The training Data accuracy:',clf.score(x_train,y_train))
print('The test Data accuracy:',clf.score(x_test,y_test))

The training Data accuracy: 0.8410087282818203
The test Data accuracy: 0.7596629465824446


In [84]:
# StratifiedKFold (best)
from sklearn.model_selection import StratifiedKFold
skfold=StratifiedKFold(n_splits=5)
clf1 = lgb.LGBMRegressor()
scores=cross_val_score(clf1,x,y,cv=skfold)
print(scores)

[0.75960267 0.76737394 0.77709935 0.74966052 0.59413762]


### 5. Adaboost


In [85]:
from sklearn.ensemble import AdaBoostRegressor

In [86]:
#n_estimators: It controls the number of weak learners.
#learning_rate:Controls the contribution of weak learners in the final combination. There is a trade-off between learning_rate and n_estimators.
#base_estimators: It helps to specify different ML algorithm. By default sklearn uses decision tree
adaboost_regressor = AdaBoostRegressor(n_estimators=1500, learning_rate = 0.001, loss='exponential')
ada_model = adaboost_regressor.fit(x_train, y_train)
prediction_test_ada = ada_model.predict(x_test)


print('The training Data accuracy:',ada_model.score(x_train,y_train))
print('The test Data accuracy:',ada_model.score(x_test,y_test))

The training Data accuracy: 0.8116543176559504
The test Data accuracy: 0.7700300579146491


### 6. GradientBoostingRegressor

In [87]:
from sklearn.ensemble import GradientBoostingRegressor

###start = time()
est=GradientBoostingRegressor(n_estimators=400, max_depth=5, loss='ls',min_samples_split=2,learning_rate=0.1).fit(x_train, y_train)
#end=time()
#train_time_g=end-start
gradient=est.score(x_test,y_test)

In [88]:
print('The training Data accuracy:',est.score(x_train,y_train))
print('The test Data accuracy:',est.score(x_test,y_test))

The training Data accuracy: 0.9907997716824845
The test Data accuracy: 0.8517535766973363


In [89]:
# StratifiedKFold (best)
from sklearn.model_selection import StratifiedKFold
skfold=StratifiedKFold(n_splits=5)
est1=GradientBoostingRegressor()
scores=cross_val_score(est1,x,y,cv=skfold)
print(scores)

[0.83727051 0.80731342 0.90953988 0.81991652 0.78058482]


In [104]:
# Comparing Models on the basis of Model's Accuracy Score of different models
models_cross = pd.DataFrame({
    'Model': ['Gradient Boosting','AdaBoost','lightGBM','lasso Regression','Ridge Regression','XGB','Random Forest','Decision Tree'],
    'Score': [est.score(x_test,y_test),ada_model.score(x_test,y_test),clf.score(x_test,y_test),ls.score(x_test,y_test),rd.score(x_test,y_test)
    ,xg.score(x_test,y_test),rand.score(x_test,y_test),dtr.score(x_test,y_test)]})

models_cross.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
5,XGB,0.887236
0,Gradient Boosting,0.851754
6,Random Forest,0.828542
3,lasso Regression,0.77531
4,Ridge Regression,0.77519
1,AdaBoost,0.77003
2,lightGBM,0.759663
7,Decision Tree,0.623342


## According to all ensemble models **XGBOOST** is best fitting model for our required Use Case

### For Further MOdel Productions and predictions i will use *XGBOOST* 

### My Results:

I will Present my findings to the stakeholders and i will say that I got a working model for this use case having accuracy: <br> 
**Model accuracy ranging from 93% to 85%**

Performance Matrix for Regression models: 
MSE,RMSE,R2