In [1]:
import pandas as pd
import numpy as np
data=pd.read_csv('Bengaluru_House_Data.csv')

In [2]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
data.shape

(13320, 9)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
# calculate values in each column
for column in data.columns:
    print (data[column].value_counts())
    print ("*"*20)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
********************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Nov               1
17-Jan               1
14-Jul               1
16-Jan               1
15-Aug               1
Name: availability, Length: 81, dtype: int64
********************
Whitefield                   540
Sarjapur  Road               399
Electronic City              302
Kanakpura Road               273
Thanisandra                  234
                            ... 
manyata                        1
Handenahalli                   1
Electronic city phase 1,       1
 ittamadu                      1
Aavalahalli                    1
Name: location, Length: 1305, dtype: int64
********************
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom  

In [6]:
#count null values in each column
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [7]:
#drop un wanted feature
data.drop(columns=['area_type', 'availability', 'society', 'balcony'], inplace=True)
data.describe()


Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [8]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [9]:
#start filling null values of remaining columns
data['location'].value_counts()

Whitefield                   540
Sarjapur  Road               399
Electronic City              302
Kanakpura Road               273
Thanisandra                  234
                            ... 
manyata                        1
Handenahalli                   1
Electronic city phase 1,       1
 ittamadu                      1
Aavalahalli                    1
Name: location, Length: 1305, dtype: int64

In [10]:
#location has only one null value fo we fill it with the sarjapur
data['location']=data['location'].fillna("Sarjapur Road")

In [11]:
data['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 Bedroom       2
10 BHK           2
11 BHK           2
14 BHK           1
19 BHK           1
18 Bedroom       1
43 Bedroom       1
16 BHK           1
27 BHK           1
13 BHK           1
12 Bedroom       1
Name: size, dtype: int64

In [12]:
# 2BHK has highest frequency
data['size']= data['size'].fillna('2 BHK')

In [13]:
data['bath'] = data['bath'].fillna(data['bath'].median())
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [14]:
# size-2 BHK or 2 Bedroom
#data[size].str.split()==['2','BHK']
#str.get(0)----string
#astype(int)---str to int 

# bhk will be a new column 
data['bhk']=data['size'].str.split().str.get(0).astype(int)


In [15]:
#remove outlier
data[data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [16]:
# now square fit
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [17]:
#some entries present in range
#so we split them by '-' 
#then we find mean of integer values
def convertRange(x):

    temp = x.split('-')
    if len(temp) == 2:
        return (float(temp[0])+ float(temp[1]))/2
    try:
        #in case of Nan values it will through exception
        return float(x)
    except:
        return None

In [18]:
data[ 'total_sqft']=data['total_sqft'].apply(convertRange)

In [19]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [20]:
# now make new column
#price per sq fit
data['price_per_sqft'] = data['price'] *100000/data['total_sqft']
data['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [21]:
data.describe()


Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [22]:
# location
data['location'].value_counts()

Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
Masjid e Alkareem                    1
3rd Stage Raja Rajeshwari Nagar      1
Sundar Ram Shetty Nagar              1
Mukkutam Nagar                       1
1 Ramamurthy Nagar                   1
Name: location, Length: 1306, dtype: int64

In [23]:
# We will replace all the locations having value count less than 10.

#remove space
data['location'] = data['location'].apply(lambda x: x.strip())

In [24]:
location_count= data['location'].value_counts()
location_count

Whitefield              541
Sarjapur  Road          399
Electronic City         304
Kanakpura Road          273
Thanisandra             237
                       ... 
Thirumalashettyhally      1
Aavalahalli               1
Kanakadasa Layout         1
1 Ramamurthy Nagar        1
Rukmaiah Layout           1
Name: location, Length: 1295, dtype: int64

In [25]:
#count locations having value count less than 10
location_count_less_10 = location_count [location_count<=10]
location_count_less_10


Nagappa Reddy Layout    10
Sadashiva Nagar         10
BTM 1st Stage           10
Nagadevanahalli         10
Sector 1 HSR Layout     10
                        ..
Thirumalashettyhally     1
Aavalahalli              1
Kanakadasa Layout        1
1 Ramamurthy Nagar       1
Rukmaiah Layout          1
Name: location, Length: 1054, dtype: int64

In [26]:
# replace with 'other'
data['location']=data['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)
data['location'].value_counts()


other                        2886
Whitefield                    541
Sarjapur  Road                399
Electronic City               304
Kanakpura Road                273
                             ... 
Thyagaraja Nagar               11
Marsur                         11
Narayanapura                   11
Banjara Layout                 11
2nd Phase Judicial Layout      11
Name: location, Length: 242, dtype: int64

In [27]:
# now we calculate per bhk square fit
(data[ 'total_sqft']/data['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [28]:
#0.250000----this will we outlier
#so we set criteria that 
# if per bhk square fit area  < 300--drop them 
data = data[((data['total_sqft']/data['bhk']) >= 300)]
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [29]:
#(13320, 9)---previous
data.shape

(12530, 7)

In [30]:
data.price_per_sqft.describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [31]:
#max      176470.588235---unit outlier in price-per-sq-fit
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    # group key-location, value : subdata
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std (subdf.price_per_sqft)
        gen_df = subdf[(subdf.price_per_sqft> (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_output = pd.concat([df_output, gen_df],ignore_index =True)

    return df_output

In [32]:
data = remove_outliers_sqft (data)
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [33]:
# remove bhk outlier
def bhk_outlier_remover(df):
    e=np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats={}
        for bhk,bhk_df in location_df.groupby('bhk'):
            bhk_stats={
                'mean':np.mean(bhk_df.price_per_sqft),
                'std':np.std(bhk_df.price_per_sqft),
                'count':bhk_df.shape[0]
            }
        for bhk , bhk_df in location_df.groupby('bhk'):
            stats=bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                e=np.append(e,bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(e,axis='index')


In [34]:
data=bhk_outlier_remover(data)

In [35]:
data.shape


(10301, 7)

In [36]:
data

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.00,4,15017.543860
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.00,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.00,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.00,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.00,2,11983.805668
...,...,...,...,...,...,...,...
10296,other,2 BHK,1353.0,2.0,110.00,2,8130.081301
10297,other,1 Bedroom,812.0,1.0,26.00,1,3201.970443
10298,other,3 BHK,1440.0,2.0,63.93,3,4439.583333
10299,other,2 BHK,1075.0,2.0,48.00,2,4465.116279


In [37]:
#price per sq fit only used for removing outlier now we remove it
data.drop(columns=['size','price_per_sqft'],inplace=True)

In [38]:
#now we save this clean data
data.to_csv('Cleaned_data.csv')

In [39]:
#x-independent,y-dependent
x=data.drop(columns=['price']) 
y=data['price']

In [40]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer 
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score



In [41]:
X_train, X_test,y_train,y_test= train_test_split(x, y, test_size=0.2, random_state=0)
print (X_train.shape) 
print(X_test.shape)

(8240, 4)
(2061, 4)


In [42]:
#Applying linear Regression
# column transformation
column_trans=make_column_transformer ((OneHotEncoder (sparse=False), ['location']),remainder='passthrough')
                                      

In [43]:
#scaling the things
scaler=StandardScaler()

In [44]:
#linear regression
Ir=LinearRegression(normalize=True)

In [45]:
pipe = make_pipeline (column_trans, scaler, Ir)

In [46]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [47]:
y_pred_1r = pipe.predict(X_test)
r2_score(y_test, y_pred_1r)

0.8284802484854614

In [48]:
#applying Lasso
lasso= Lasso()
pipe=make_pipeline (column_trans, scaler, lasso)

In [49]:
pipe.fit(X_train, y_train)
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test, y_pred_lasso)

0.821316543187225

In [50]:
#applying Ridge
ridge = Ridge()

In [51]:
pipe = make_pipeline (column_trans, scaler, ridge)
pipe.fit (X_train,y_train)
y_pred_ridge = pipe.predict(X_test)

In [52]:
r2_score(y_test, y_pred_ridge)

0.8284595107915464

In [53]:
print("Linear Reggression ", r2_score(y_test, y_pred_1r))
print("Lasso: ", r2_score(y_test, y_pred_lasso))
print("Ridge: ", r2_score(y_test, y_pred_ridge))

Linear Reggression  0.8284802484854614
Lasso:  0.821316543187225
Ridge:  0.8284595107915464


In [54]:
import pickle

In [55]:
pickle.dump(pipe, open('RidgeModel.pkl', 'wb'))

In [56]:
def predict_(location,bhk,bath,sqft):
    input=pd.DataFrame([[location,sqft,bath,bhk]],columns=['location','total_sqft','bath','bhk'])
    op=pipe.predict(input)[0]
    return str(op)

In [57]:
predict_("5th Phase JP Nagar",4,3,2000)

'107.8307531634678'

In [58]:
from tkinter import *
import pandas as pd
from tkinter import messagebox
file=pd.read_csv("Cleaned_data.csv")


In [59]:
root = Tk()
root.geometry("500x500")
root.title('House Price Predictor')
label_0 =Label(root,text="Require Details", width=20,font=("bold",20))
label_0.place(x=90,y=60)

In [63]:
from tkinter import *
import pandas as pd
from tkinter import messagebox
file=pd.read_csv("Cleaned_data.csv")
root = Tk()
root.geometry("500x500")
root.wm_iconbitmap("my.ico")
root.title('House Price Predictor')
label_0 =Label(root,text="Require Details", width=20,font=("bold",20))
label_0.place(x=90,y=60)

#----------Area------------------
#file['total_sqft'] = file['total_sqft'].unique()
lis=list(set(file['total_sqft']))
label_1 =Label(root,text="Area(sq ft)", width=20,font=("bold",10))
label_1.place(x=80,y=130)
per_Sqft=StringVar()
entry_1=Entry(root,textvariable=per_Sqft)
entry_1.place(x=240,y=130)


#---------Bathroom---------------
label_2 =Label(root,text="No. of Bathrooms", width=20,font=("bold",10))
label_2.place(x=68,y=180)
lis=range(int(list(set(file['bath']))[1]),int(list(set(file['bath']))[-1])+1)
bathroom=StringVar()
droplist2=OptionMenu(root,bathroom, *lis)
droplist2.config(width=15)
bathroom.set('No. of Bathroom')
droplist2.place(x=240,y=180)

#------------BHK----------------------------
label_3 =Label(root,text="BHK", width=20,font=("bold",10))
label_3.place(x=70,y=230)
lis=list(set(file['bhk']))
bhk=StringVar()
droplist3=OptionMenu(root,bhk, *lis)
droplist3.config(width=15)
bhk.set('Select BHK')
droplist3.place(x=240,y=230)

#----------------Location-----------------
label_4=Label(root,text="Location",width=20,font=("bold",10))
label_4.place(x=70,y=280)
list_of_location=list(set(file['location']))
location=StringVar()
droplist4=OptionMenu(root,location, *list_of_location)
droplist4.config(width=15)
location.set('Select Location')
droplist4.place(x=240,y=280)

def predict():
    Area=per_Sqft.get()
    Bathroom=bathroom.get()
    BHK=bhk.get()
    Location=location.get()
    l=[]
    if Area=='' or Bathroom=='No. of Bathroom' or BHK=='Select BHK' or Location=='Select Location':
        if Area=='':
            l.append('Area')
        if Bathroom=='No. of Bathroom':
            l.append(Bathroom)
        if BHK=='Select BHK':
            l.append(BHK)
        if Location=='Select Location':
            l.append("Location")
        s=",".join(l)
        messagebox.showwarning("Warning",f"Select Atleat one value for {s}")
        
    else:
        if Area.isnumeric():
            entry_1.destroy()
            droplist2.destroy()
            droplist3.destroy()
            droplist4.destroy()

            root.title('House Price Predictor')
            label_0 =Label(root,text="Predicted Price of\nGiven Details", width=20,font=("bold",20))
            label_0.place(x=90,y=40)
        
            label_1 =Label(root,text="Area(sq ft) : ", width=20,font=("bold",13))
            label_1.place(x=80,y=130)
            label1 =Label(root,text=Area, width=20,font=("bold",13))
            label1.place(x=240,y=130)
        
            label_2 =Label(root,text="No. of Bathrooms : ", width=20,font=("bold",13))
            label_2.place(x=80,y=180)
            label2 =Label(root,text=Bathroom, width=20,font=("bold",13))
            label2.place(x=240,y=180)
        
            label_3 =Label(root,text="BHK : ", width=20,font=("bold",13))
            label_3.place(x=80,y=230)
            label3 =Label(root,text=BHK, width=20,font=("bold",13))
            label3.place(x=240,y=230)

            label_4=Label(root,text="Location : ",width=20,font=("bold",13))
            label_4.place(x=80,y=280)
            label4 =Label(root,text=Location, width=20,font=("bold",13))
            label4.place(x=240,y=280)

            label_5=Label(root,text="Predicted Price : ",width=20,fg='red',font=("bold",13))
            label_5.place(x=90,y=330)
            input=pd.DataFrame([[Location,int(Area),int(Bathroom),int(BHK)]],columns=['location','total_sqft','bath','bhk'])
            op=pipe.predict(input)[0]
            ans = "{:.2f}".format(op)
            label5 =Label(root,text=ans, width=20,fg='red',font=("bold",13))
            label5.place(x=240,y=330)
        else:
            messagebox.showwarning("Warning","Area field accept only numeric value")

            
#-----------------Predict Button--------------------------

Button(root, text='Predicted Price' , width=20,bg="black",fg='white',command=predict).place(x=90,y=380)
Button(root, text='Quit' , width=20,bg="black",fg='white',command=root.destroy).place(x=240,y=380)
root.mainloop()