# Real Estate Price Prediction

##### Dataset_link: https://www.kaggle.com/amitabhajoy/bengaluru-house-price-data

### Import required libraries

In [214]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
# matplotlib.rcParams["figure.figsize"] = (20,10)

### load real estate dataset

In [215]:
df = pd.read_csv('bengaluru_house_prices.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [216]:
df.shape

(13320, 9)

## We will take the following features for our model
 - size
 - total_sqft
 - bath
 - balcony

In [217]:
df = df.drop(['area_type','availability','location','society'],axis=1)
df.head()

Unnamed: 0,size,total_sqft,bath,balcony,price
0,2 BHK,1056,2.0,1.0,39.07
1,4 Bedroom,2600,5.0,3.0,120.0
2,3 BHK,1440,2.0,3.0,62.0
3,3 BHK,1521,3.0,1.0,95.0
4,2 BHK,1200,2.0,1.0,51.0


### Data Cleaning

In [218]:
df.isnull().sum()

size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [219]:
df = df.dropna()
df.isnull().sum()

size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [220]:
df.shape

(12711, 5)

#### Engineering the size feature

In [221]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [222]:
def get_bhk(size):
    return int(size.split(' ')[0])

In [223]:
get_bhk('9 bedroom')

9

In [224]:
df['bhk']  = df['size'].apply(get_bhk)
df.head()

Unnamed: 0,size,total_sqft,bath,balcony,price,bhk
0,2 BHK,1056,2.0,1.0,39.07,2
1,4 Bedroom,2600,5.0,3.0,120.0,4
2,3 BHK,1440,2.0,3.0,62.0,3
3,3 BHK,1521,3.0,1.0,95.0,3
4,2 BHK,1200,2.0,1.0,51.0,2


In [225]:
df.drop(['size'],axis=1,inplace=True)
df.head(2)

Unnamed: 0,total_sqft,bath,balcony,price,bhk
0,1056,2.0,1.0,39.07,2
1,2600,5.0,3.0,120.0,4


### Engineering the total_sqft feature

In [226]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [227]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12711 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_sqft  12711 non-null  object 
 1   bath        12711 non-null  float64
 2   balcony     12711 non-null  float64
 3   price       12711 non-null  float64
 4   bhk         12711 non-null  int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 595.8+ KB


In [228]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [229]:
df[~df['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,total_sqft,bath,balcony,price,bhk
30,2100 - 2850,4.0,0.0,186.0,4
122,3067 - 8156,4.0,0.0,477.0,4
137,1042 - 1105,2.0,0.0,54.005,2
165,1145 - 1340,2.0,0.0,43.49,2
188,1015 - 1540,2.0,0.0,56.8,2
410,34.46Sq. Meter,1.0,0.0,18.5,1
549,1195 - 1440,2.0,0.0,63.77,2
661,1120 - 1145,2.0,0.0,48.13,2
672,3090 - 5002,4.0,0.0,445.0,4
772,1160 - 1195,2.0,0.0,59.935,2


In [230]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None   

In [231]:
convert_sqft_to_num('2166')

2166.0

In [232]:
convert_sqft_to_num('2216 - 2850')

2533.0

In [233]:
convert_sqft_to_num('34.46Sq. Meter')

In [234]:
sum([int(x) for x in list(df['total_sqft']=='34.46Sq. Meter')])

1

In [235]:
df1 = df.copy()
df1.total_sqft = df1.total_sqft.apply(convert_sqft_to_num)
df1 = df1[df1.total_sqft.notnull()]
df1.head(2)

Unnamed: 0,total_sqft,bath,balcony,price,bhk
0,1056.0,2.0,1.0,39.07,2
1,2600.0,5.0,3.0,120.0,4


In [236]:
df1.to_csv("data.csv",index=False)

### Removing Outliers


In [237]:
df1.head(1)

Unnamed: 0,total_sqft,bath,balcony,price,bhk
0,1056.0,2.0,1.0,39.07,2


In [238]:
df1.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk
count,12669.0,12669.0,12669.0,12669.0,12669.0
mean,1511.842126,2.616308,1.585682,105.951073,2.736443
std,1162.051672,1.223838,0.816734,131.808053,1.202598
min,5.0,1.0,0.0,8.0,1.0
25%,1100.0,2.0,1.0,49.02,2.0
50%,1260.0,2.0,2.0,70.0,3.0
75%,1640.0,3.0,2.0,115.0,3.0
max,52272.0,40.0,3.0,2912.0,43.0


In [239]:
#removing outliers using IQR

df1 = df1[df1['total_sqft']<df1['total_sqft'].quantile(.99)]

In [240]:
df1 = df1[df1['bath']<df1['bath'].quantile(.99)]

In [241]:
df1 = df1[df1['price']<df1['price'].quantile(.99)]

In [242]:
df1 = df1[df1['bhk']<df1['bhk'].quantile(.99)]

In [243]:
df1.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk
count,12053.0,12053.0,12053.0,12053.0,12053.0
mean,1416.580896,2.462292,1.572969,91.028511,2.574131
std,602.801017,0.885875,0.803095,72.022717,0.811417
min,11.0,1.0,0.0,8.0,1.0
25%,1093.0,2.0,1.0,48.0,2.0
50%,1252.0,2.0,2.0,67.9,2.0
75%,1600.0,3.0,2.0,105.0,3.0
max,4723.0,6.0,3.0,500.0,5.0


## Predicting price using Machine Learning

In [244]:
df1.shape

(12053, 5)

In [245]:
df1.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk
0,1056.0,2.0,1.0,39.07,2
1,2600.0,5.0,3.0,120.0,4
2,1440.0,2.0,3.0,62.0,3
3,1521.0,3.0,1.0,95.0,3
4,1200.0,2.0,1.0,51.0,2


In [246]:
X = df1.drop(['price'],axis=1)
X.head(3)

Unnamed: 0,total_sqft,bath,balcony,bhk
0,1056.0,2.0,1.0,2
1,2600.0,5.0,3.0,4
2,1440.0,2.0,3.0,3


In [247]:
X.shape

(12053, 4)

In [248]:
y = df1.price
y.head(3)

0     39.07
1    120.00
2     62.00
Name: price, dtype: float64

In [249]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=10)

In [250]:
X_test,X_val,y_test,y_val = train_test_split(X_test,y_test,test_size=0.2,random_state=20)
X_test.shape,X_val.shape
    

((2892, 4), (724, 4))

In [251]:
X_train.shape,X_test.shape

((8437, 4), (2892, 4))

### Baseline Model

In [252]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

reg = GradientBoostingRegressor()
reg.fit(X_train,y_train)

preds = reg.predict(X_val)
r2_score(y_val,preds)

0.6089687647398904

### K Fold cross validation

In [253]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(reg, X_train, y_train, cv=cv)

array([0.64938375, 0.68333476, 0.6914194 , 0.66273364, 0.68248593])

In [254]:
#Testing the model on test_data

reg.fit(X_test,y_test)
y_pred = reg.predict(X_test)
print(f"The r2 score of the model is {r2_score(y_test,y_pred)}")

The r2 score of the model is 0.7257362273181565


### Exporting final model to a pickle file

In [255]:
import pickle
pickle.dump(reg,open('model.pkl','wb'))

-----------