In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn import metrics


In [79]:
# Load the Mumbai house-price listings from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/mumbai-house-prices/Mumbai House Prices.csv'
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,Apartment,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


In [80]:
# Data description: print (row count, column count), then show summary
# statistics for the numeric columns.
print((len(df.index), len(df.columns)))
df.describe()

(76038, 9)


Unnamed: 0,bhk,area,price
count,76038.0,76038.0,76038.0
mean,2.015111,1024.53685,29.38227
std,0.922754,670.276165,32.90345
min,1.0,127.0,1.0
25%,1.0,640.0,1.75
50%,2.0,872.0,5.5
75%,3.0,1179.0,59.0
max,10.0,16000.0,99.99


In [81]:
# How many listings fall under each property type.
df['type'].value_counts()

type
Apartment            74854
Studio Apartment       882
Villa                  226
Independent House       73
Penthouse                3
Name: count, dtype: int64

In [82]:
# How many listings fall under each region.
df['region'].value_counts()

region
Thane West         14868
Mira Road East      9902
Dombivali           3041
Kandivali East      2568
Kharghar            2362
                   ...  
Police Colony          1
GTB Nagar              1
Bandra                 1
Sector 14 Vashi        1
Goregaon               1
Name: count, Length: 228, dtype: int64

In [83]:
# Data preprocessing: count the missing values in every column.
df.isna().sum()

bhk           0
type          0
locality      0
area          0
price         0
price_unit    0
region        0
status        0
age           0
dtype: int64

In [84]:
# Discard the columns that are not used for prediction; `df` itself is left
# untouched and the reduced frame is bound to `df1`.
df1 = df.drop(['locality', 'age', 'status'], axis='columns')
df1.head(n=5)

Unnamed: 0,bhk,type,area,price,price_unit,region
0,3,Apartment,685,2.5,Cr,Andheri West
1,2,Apartment,640,52.51,L,Naigaon East
2,2,Apartment,610,1.73,Cr,Borivali West
3,2,Apartment,876,59.98,L,Panvel
4,2,Apartment,659,94.11,L,Mira Road East


In [85]:
#converting all prices to lakhs
# `price` is quoted either in crores ('Cr') or in lakhs ('L'); crore prices
# are multiplied by 100 so that every row ends up expressed in lakhs.
x = df1.shape[0]  # row count; kept as a global because later cells may still read `x`
unit_to_lakhs = {'Cr': 100, 'L': 1}
lakh_multiplier = df1['price_unit'].map(unit_to_lakhs)

# Fail loudly on an unexpected unit. The previous per-row loop silently skipped
# such rows and only failed afterwards with an opaque length-mismatch error.
if lakh_multiplier.isna().any():
    bad_units = sorted(
        df1.loc[lakh_multiplier.isna(), 'price_unit'].astype(str).unique()
    )
    raise ValueError(f'Unrecognised price_unit value(s): {bad_units}')

# Vectorised conversion: append 'New Price' (in lakhs) and drop the raw
# `price` column, giving the same column order as before.
df1 = (
    df1
    .assign(**{'New Price': df1['price'] * lakh_multiplier})
    .drop(columns='price')
)
df1.head()

Unnamed: 0,bhk,type,area,price_unit,region,New Price
0,3,Apartment,685,Cr,Andheri West,250.0
1,2,Apartment,640,L,Naigaon East,52.51
2,2,Apartment,610,Cr,Borivali West,173.0
3,2,Apartment,876,L,Panvel,59.98
4,2,Apartment,659,L,Mira Road East,94.11


In [87]:
# Derive a price-per-square-foot column: 'New Price' divided by `area`.
df1['Price per Sq Ft'] = df1['New Price'].div(df1['area'])
# Summary statistics of the derived column.
df1['Price per Sq Ft'].describe()

count    76038.000000
mean         0.151188
std          0.093329
min          0.006468
25%          0.085714
50%          0.130000
75%          0.194961
max          1.823077
Name: Price per Sq Ft, dtype: float64

In [88]:
# Preview the frame with the derived columns in place.
df1.head(n=5)

Unnamed: 0,bhk,type,area,price_unit,region,New Price,Price per Sq Ft
0,3,Apartment,685,Cr,Andheri West,250.0,0.364964
1,2,Apartment,640,L,Naigaon East,52.51,0.082047
2,2,Apartment,610,Cr,Borivali West,173.0,0.283607
3,2,Apartment,876,L,Panvel,59.98,0.06847
4,2,Apartment,659,L,Mira Road East,94.11,0.142807


In [92]:
# Encode the property type as an integer code.
d = {
    'Apartment': 0,
    'Studio Apartment': 1,
    'Villa': 2,
    'Independent House': 3,
    'Penthouse': 4,
}
# Swap each type name in the `type` column for its code.
df1['type'] = df1['type'].replace(d)
df1['type'].value_counts()

type
0    74854
1      882
2      226
3       73
4        3
Name: count, dtype: int64

In [95]:
# Encode each region name as an integer with scikit-learn's LabelEncoder and
# store the codes in a new 'Region Number' column.
label = LabelEncoder().fit(df1['region'])
labels = label.transform(df1['region'])
df1['Region Number'] = labels
df1.head(n=5)

Unnamed: 0,bhk,type,area,price_unit,region,New Price,Price per Sq Ft,Region Number
0,3,0,685,Cr,Andheri West,250.0,0.364964,8
1,2,0,640,L,Naigaon East,52.51,0.082047,124
2,2,0,610,Cr,Borivali West,173.0,0.283607,31
3,2,0,876,L,Panvel,59.98,0.06847,140
4,2,0,659,L,Mira Road East,94.11,0.142807,117


In [97]:
# Feature matrix: everything except the raw unit, the region name and the target.
# NOTE(review): 'Price per Sq Ft' is computed as New Price / area and remains in
# X alongside `area`, so the target can be reconstructed from the features
# (target leakage). Removing it would also require changing the feature tuple
# that is fed to the model at prediction time.
X = df1.drop(['price_unit', 'region', 'New Price'], axis='columns')
# Target: the price in lakhs.
y = df1.loc[:, 'New Price']
df1.head(n=5)

Unnamed: 0,bhk,type,area,price_unit,region,New Price,Price per Sq Ft,Region Number
0,3,0,685,Cr,Andheri West,250.0,0.364964,8
1,2,0,640,L,Naigaon East,52.51,0.082047,124
2,2,0,610,Cr,Borivali West,173.0,0.283607,31
3,2,0,876,L,Panvel,59.98,0.06847,140
4,2,0,659,L,Mira Road East,94.11,0.142807,117


In [98]:
# Split the data: 20% of the rows are held out as a test set, with a fixed
# random_state so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [99]:
# Machine learning model: an XGBoost regressor with default hyper-parameters,
# fitted on the training split.
from xgboost import XGBRegressor

model = XGBRegressor().fit(x_train, y_train)
# NOTE: this score is computed on the training data itself, not on the
# held-out test split.
model.score(X=x_train, y=y_train)

0.9997469182916016

In [100]:
# Predict on the held-out test split and print the true prices followed by
# the predicted prices for a visual comparison.
pred = model.predict(x_test)
print(y_test, pred, sep='\n')

44537     75.00
54707    162.00
33616    236.00
10735    800.00
75123     88.00
          ...  
4102      61.00
35796     82.50
27138     17.75
15674    107.00
26342    230.00
Name: New Price, Length: 15208, dtype: float64
[ 77.49196  163.22496  234.40395  ...  16.896292 106.1398   227.19217 ]


In [101]:
#Predictive Model
# Read the feature values from the user. They are assembled below in the
# order bhk, type, area, price per sq ft, region number.
bhk=int(input('Enter number of bedrooms='))
house_type=int(input('Enter type of house (0 for Apartment,1 for Studio Apartment,2 for Villa,3 for Independent House,4 for Penthouse)='))
area=int(input('Enter Area Required='))
# The 'Price per Sq Ft' column in df1 is New Price (lakhs) / area, so this
# value should be entered in the same unit.
ppsqft=float(input('Enter Price per Square Feet='))
reg=input('Enter Region As Stated in the catalouge=')

# Translate the region name into its numeric code by looking it up in df1.
# An unknown region now raises immediately. Previously `b` was left unassigned
# in that case: a NameError on a fresh kernel, or a silently reused stale
# value from an earlier run of this cell.
region_codes = df1.loc[df1['region'] == reg, 'Region Number']
if region_codes.empty:
    raise ValueError(
        f'Unknown region {reg!r}; enter the name exactly as it appears in the region column.'
    )
b = region_codes.iloc[0]

# Assemble a single-row 2-D array, the shape `model.predict` expects.
t=(bhk,house_type,area,ppsqft,b)
nparr=np.asarray(t)
nparr2=nparr.reshape(1,-1)
prediction=model.predict(nparr2)
print(prediction)

Enter number of bedrooms= 2
Enter type of house (0 for Apartment,1 for Studio Apartment,2 for Villa,3 for Independent House,4 for Penthouse)= 0
Enter Area Required= 876
Enter Price per Square Feet= 0.068470
Enter Region As Stated in the catalouge= Panvel


[62.08596]


In [102]:
# Persist the fitted model to disk as a pickle file.
import pickle

with open('real_estate_prediction', mode='wb') as model_file:
    model_file.write(pickle.dumps(model))

In [105]:
# Reload the pickled model and repeat the single-row prediction as a
# round-trip check. Only unpickle files from a trusted source: loading a
# pickle can execute arbitrary code.
with open('real_estate_prediction', mode='rb') as model_file:
    mp = pickle.loads(model_file.read())
mp.predict(nparr2)

array([62.08596], dtype=float32)