# The dataset used is from Kaggle dataset. https://www.kaggle.com/datasets/amitabhajoy/bengaluru-house-price-data

# Goal: To build a machine learning app to help predict house prices in a particular part of a city based on key features

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Import Libraries</span>

In [4]:
import pandas as pd
import numpy as np

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Load Data</span>

In [6]:
Housingdata = pd.read_csv(r"./Bengaluru_House_Data.csv")

In [7]:
Housingdata.shape

(13320, 9)

In [8]:
Housingdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [9]:
for column in Housingdata.columns:
    print(Housingdata[column].value_counts())
    print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
********************
size
2 BHK    

In [10]:
Housingdata.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Exploratory Data Analysis</span>

In [12]:
Housingdata.drop(columns=['area_type','availability','society','balcony'], inplace=True)

In [13]:
Housingdata.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [14]:
Housingdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [15]:
Housingdata['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Data Cleaning</span>

In [17]:
Housingdata['location'].fillna('Sarjapur  Road', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Housingdata['location'].fillna('Sarjapur  Road', inplace=True)


In [18]:
Housingdata['size'].value_counts()

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

In [19]:
Housingdata['size'].fillna('2 BHK', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Housingdata['size'].fillna('2 BHK', inplace=True)


In [20]:
Housingdata['bath'] = Housingdata['bath'].fillna(Housingdata['bath'].median())

In [21]:
Housingdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [22]:
Housingdata['bhk'] = Housingdata['size'].str.split().str.get(0).astype(int)

In [23]:
Housingdata[Housingdata.bhk >20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [24]:
Housingdata['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [25]:
def convertRange(x):
    temp = x.split('-')
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1]))/2
    try:
        return float(x)
    except:
        return None

In [26]:
Housingdata['total_sqft'] = Housingdata['total_sqft'].apply(convertRange)

In [27]:
Housingdata.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [28]:
Housingdata['price_per_sqft'] = Housingdata['price']*100000/Housingdata['total_sqft']

In [29]:
Housingdata.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [30]:
Housingdata['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    400
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [31]:
Housingdata['location'] = Housingdata['location'].apply(lambda x : x.strip())
location_count = Housingdata['location'].value_counts()

In [32]:
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

location
Dairy Circle                      10
Nagappa Reddy Layout              10
Basapura                          10
1st Block Koramangala             10
Sector 1 HSR Layout               10
                                  ..
Bapuji Layout                      1
1st Stage Radha Krishna Layout     1
BEML Layout 5th stage              1
singapura paradise                 1
Abshot Layout                      1
Name: count, Length: 1053, dtype: int64

In [33]:
Housingdata['location'] = Housingdata['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [34]:
Housingdata['location'].value_counts()

location
other                 2885
Whitefield             541
Sarjapur  Road         400
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

In [35]:
Housingdata.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [36]:
(Housingdata['total_sqft']/Housingdata['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [37]:
Housingdata = Housingdata[((Housingdata['total_sqft']/Housingdata['bhk']) >=300)]
Housingdata.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [38]:
Housingdata.shape

(12530, 7)

In [39]:
Housingdata.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [40]:
Housingdata.bhk.unique()

array([ 2,  4,  3,  1,  8,  6,  5,  7, 11,  9, 10, 16, 13])

In [41]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        gen_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_output = pd.concat([df_output,gen_df], ignore_index=True)
    return df_output

Housingdata = remove_outliers_sqft(Housingdata)
Housingdata.describe()


Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [42]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')
Housingdata = remove_bhk_outliers(Housingdata)
Housingdata.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,7360.0,7360.0,7360.0,7360.0,7360.0
mean,1496.928526,2.448098,99.094892,2.500543,6127.742397
std,865.839979,1.011563,93.343057,0.929357,2409.144904
min,300.0,1.0,10.0,1.0,1300.0
25%,1095.75,2.0,50.0,2.0,4600.0
50%,1260.0,2.0,73.1,2.0,5680.396754
75%,1680.0,3.0,113.0,3.0,6896.551724
max,30000.0,16.0,2200.0,16.0,24509.803922


In [43]:
# Housingdata[Housingdata.bath>Housingdata.bhk+2]

# Again the business manager has a conversation with you (i.e. a data scientist) that if you have 4 bedroom home and even if you have bathroom in all 4 rooms plus one guest bathroom, you will have total bath = total bed + 1 max. Anything above that is an outlier or a data error and can be removed

Housingdata = Housingdata[Housingdata.bath<Housingdata.bhk+2]
Housingdata.shape

(7281, 7)

In [44]:
Housingdata.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,7281.0,7281.0,7281.0,7281.0,7281.0
mean,1474.107622,2.40901,96.797474,2.484411,6100.781165
std,821.275387,0.934824,88.031001,0.912104,2381.406175
min,300.0,1.0,10.0,1.0,1300.0
25%,1092.0,2.0,50.0,2.0,4591.304348
50%,1255.0,2.0,72.2,2.0,5666.003976
75%,1655.0,3.0,110.0,3.0,6861.642295
max,30000.0,16.0,2200.0,16.0,24509.803922


In [45]:
Housingdata['bhk'].value_counts()

bhk
2     3641
3     2471
1      526
4      501
5       71
6       38
8       14
7        9
9        5
11       2
16       1
10       1
13       1
Name: count, dtype: int64

In [46]:
Housingdata = Housingdata[Housingdata.bhk<=6]
Housingdata.shape

(7248, 7)

In [47]:
Housingdata.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,7248.0,7248.0,7248.0,7248.0,7248.0
mean,1461.513051,2.384106,95.905409,2.456954,6099.478941
std,788.846032,0.846477,83.74913,0.808279,2372.924376
min,300.0,1.0,10.0,1.0,1300.0
25%,1090.0,2.0,50.0,2.0,4593.188866
50%,1255.0,2.0,72.0,2.0,5666.666667
75%,1650.0,3.0,110.0,3.0,6861.842597
max,30000.0,7.0,2100.0,6.0,24509.803922


In [48]:
# from matplotlib import pyplot as plt
# %matplotlib inline
# import matplotlib 
# matplotlib.rcParams["figure.figsize"] = (20,10)
# matplotlib.rcParams["figure.figsize"] = (20,10)
# plt.hist(Housingdata.price_per_sqft,rwidth=0.8)
# plt.xlabel("Price Per Square Feet")
# plt.ylabel("Count")

In [49]:
Housingdata = Housingdata[Housingdata.price_per_sqft<=10000]
Housingdata.shape

(6763, 7)

In [50]:
Housingdata = Housingdata.drop(['size','price_per_sqft'],axis='columns')
Housingdata.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk
8,1st Phase JP Nagar,1875.0,3.0,167.0,3
9,1st Phase JP Nagar,1500.0,5.0,85.0,5
13,1st Phase JP Nagar,1394.0,2.0,100.0,2


In [51]:
Housingdata.to_csv("cleaned_data.csv")

In [52]:
Housingdata.dtypes

location       object
total_sqft    float64
bath          float64
price         float64
bhk             int32
dtype: object

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Feature Engineering</span>

In [54]:
# One hot encoding

dummies = pd.get_dummies(Housingdata.location,dtype=int)
dummies.head(3)

Unnamed: 0,1st Phase JP Nagar,2nd Phase Judicial Layout,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,Abbigere,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
8,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
Housingdata = pd.concat([Housingdata,dummies.drop('other',axis='columns')],axis='columns')
Housingdata.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,1st Phase JP Nagar,2nd Phase Judicial Layout,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
8,1st Phase JP Nagar,1875.0,3.0,167.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1st Phase JP Nagar,1500.0,5.0,85.0,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,1st Phase JP Nagar,1394.0,2.0,100.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1st Phase JP Nagar,1077.0,2.0,93.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,1st Phase JP Nagar,840.0,2.0,50.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
Housingdata = Housingdata.drop('location',axis='columns')
Housingdata.head(2)

Unnamed: 0,total_sqft,bath,price,bhk,1st Phase JP Nagar,2nd Phase Judicial Layout,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
8,1875.0,3.0,167.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1500.0,5.0,85.0,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
Housingdata.shape

(6763, 240)

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Data Splitting</span>

In [59]:
X = Housingdata.drop(['price'],axis='columns')
X.head(3)

Unnamed: 0,total_sqft,bath,bhk,1st Phase JP Nagar,2nd Phase Judicial Layout,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
8,1875.0,3.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1500.0,5.0,5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,1394.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
X.shape

(6763, 239)

In [61]:
y = Housingdata.price
y.head(3)

8     167.0
9      85.0
13    100.0
Name: price, dtype: float64

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Model Training</span>

In [63]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.9043485217565308

In [64]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.90551159, 0.88620978, 0.8911428 , 0.90383014, 0.94639205])

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Hyperparameter Tuning</span>

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler

def find_best_model_using_gridsearchcv(X, y):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }
    
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X_scaled, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

# Usage
find_best_model_using_gridsearchcv(X, y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.904207,{'fit_intercept': True}
1,lasso,0.869578,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.787632,"{'criterion': 'squared_error', 'splitter': 'be..."


In [66]:
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_clf.predict([x])[0]

In [67]:
predict_price('1st Phase JP Nagar',1000, 2, 2)



81.75940906364136

In [68]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

## Considering performance, Linear Regression Model will be used for developing the app for predicting house Prices

In [69]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))

In [70]:
X.to_excel("data.xlsx")

<span style="font-size: 24px; color: blue; font-style: italic; font-weight: bold;">Performance Metrics</span>

In [71]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# After training the model and making predictions
y_pred = lr_clf.predict(X_test)

# Calculate regression metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared Score: {r2}")

Mean Squared Error: 261.0331904219601
Mean Absolute Error: 11.181707045970143
R-squared Score: 0.9043485217565308


## R-squared (R²) represents the proportion of variance in the dependent variable that's predictable from the independent variable(s).
## Your R² of 0.9043 is very good, suggesting that about 90.43% of the variance in house prices can be explained by your model.

## The MSE measures the average squared difference between predicted and actual values.
## In this case, the MSE is relatively low, considering we're dealing with house prices (which can have large values).


