<h1 style = 'color: #CF53CC' align = 'center'> Real Estate Price Prediction (Case study: New Delhi) </h1>

<h3 style = 'color: #27B5D9' align = 'center'> Dependencies </h3>

In [1]:
import json, pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

<h3 style = 'color: #27B5D9' align = 'center'> Data Exploration </h3>

In [2]:
delhi_data = pd.read_csv('../Data/newdelhi_dataset.csv')
delhi_data.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0


In [3]:
delhi_data.shape

(1259, 11)

In [4]:
delhi_data.columns

Index(['Area', 'BHK', 'Bathroom', 'Furnishing', 'Locality', 'Parking', 'Price',
       'Status', 'Transaction', 'Type', 'Per_Sqft'],
      dtype='object')

In [5]:
delhi_data.describe()

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Per_Sqft
count,1259.0,1259.0,1257.0,1226.0,1259.0,1018.0
mean,1466.452724,2.796664,2.556086,1.935563,21306700.0,15690.136542
std,1568.05504,0.954425,1.04222,6.279212,25601150.0,21134.738568
min,28.0,1.0,1.0,1.0,1000000.0,1259.0
25%,800.0,2.0,2.0,1.0,5700000.0,6364.0
50%,1200.0,3.0,2.0,1.0,14200000.0,11291.5
75%,1700.0,3.0,3.0,2.0,25500000.0,18000.0
max,24300.0,10.0,7.0,114.0,240000000.0,183333.0


In [6]:
delhi_data.dtypes

Area           float64
BHK              int64
Bathroom       float64
Furnishing      object
Locality        object
Parking        float64
Price            int64
Status          object
Transaction     object
Type            object
Per_Sqft       float64
dtype: object

In [7]:
col_with_missing_val = delhi_data.isnull().any()
col_with_missing_val

Area           False
BHK            False
Bathroom        True
Furnishing      True
Locality       False
Parking         True
Price          False
Status         False
Transaction    False
Type            True
Per_Sqft        True
dtype: bool

<h3 style = 'color: #27B5D9' align = 'center'> Preparing Data for Training </h3>

<h5 style = 'color: #565B5E' align = 'center'> Drop Unnecessary Columns </h5>

In [8]:
dataframe = delhi_data.drop(['Parking', 'Status', 'Type'], axis = 1)
dataframe.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Price,Transaction,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,6500000,New_Property,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",5000000,New_Property,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",15500000,Resale,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,4200000,Resale,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,6200000,New_Property,6667.0


<h5 style = 'color: #565B5E' align = 'center'> Count Missing Values per Column </h5>

In [9]:
count_missing_vals_by_col = dataframe.isnull().sum()
print(count_missing_vals_by_col[count_missing_vals_by_col > 0])

Bathroom        2
Furnishing      5
Per_Sqft      241
dtype: int64


<h5 style = 'color: #565B5E' align = 'center'> Convert Indian Rupee to American Dollar (1INR = 0.01$) </h5>

In [10]:
dataframe['Price'] = dataframe['Price'] / 100
dataframe.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Price,Transaction,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,65000.0,New_Property,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",50000.0,New_Property,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",155000.0,Resale,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,42000.0,Resale,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,62000.0,New_Property,6667.0


<h5 style = 'color: #565B5E' align = 'center'> Edit Per_Sqft Column </h5>

In [11]:
dataframe['Per_Sqft'] = dataframe['Price'] / dataframe['Area']
dataframe['Per_Sqft']

0        81.250000
1        66.666667
2       163.157895
3        70.000000
4        95.384615
           ...    
1254    133.559981
1255    119.047619
1256    200.000000
1257    116.161616
1258     16.742081
Name: Per_Sqft, Length: 1259, dtype: float64

In [12]:
dataframe.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Price,Transaction,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,65000.0,New_Property,81.25
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",50000.0,New_Property,66.666667
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",155000.0,Resale,163.157895
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,42000.0,Resale,70.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,62000.0,New_Property,95.384615


<h5 style = 'color: #565B5E' align = 'center'> Apply the Demensionality Reduction to Reduce the Locality Categories </h5>

In [13]:
locality_stats = dataframe['Locality'].value_counts(ascending = False)
print(len(locality_stats))
print(len(locality_stats[locality_stats > 10]))
print(len(locality_stats[locality_stats <= 10])) # Listed as "other" locality

365
27
338


In [14]:
locality_less_than_10 = locality_stats[locality_stats <= 10]
print(locality_less_than_10)                             

Mahavir Enclave                                   10
Rohini Sector 24                                  10
Vasant Kunj                                       10
The Leela Sky Villas, Patel Nagar                 10
Narmada Apartment, Alaknanda                      10
                                                  ..
Gold Croft Apartment, Aashirwaad Chowk, Dwarka     1
Rohitash Nagar East, Shahdara                      1
Bhorgarh, Narela                                   1
New Delhi Apartment, Vasundhara Enclave            1
Kiran Garden, Uttam Nagar                          1
Name: Locality, Length: 338, dtype: int64


In [15]:
len(dataframe.Locality.unique())

365

In [16]:
dataframe['Locality'] = dataframe.Locality.apply(lambda x: 'other' if x in locality_less_than_10 else x)
len(dataframe.Locality.unique())

28

In [17]:
dataframe[dataframe['Locality'] == 'other']

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Price,Transaction,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,other,65000.0,New_Property,81.250000
2,950.0,2,2.0,Furnished,other,155000.0,Resale,163.157895
3,600.0,2,2.0,Semi-Furnished,other,42000.0,Resale,70.000000
4,650.0,2,2.0,Semi-Furnished,other,62000.0,New_Property,95.384615
5,1300.0,4,3.0,Semi-Furnished,other,155000.0,New_Property,119.230769
...,...,...,...,...,...,...,...,...
1238,1200.0,3,3.0,Semi-Furnished,other,222000.0,New_Property,185.000000
1241,2430.0,4,4.0,Furnished,other,300000.0,Resale,123.456790
1244,1350.0,3,2.0,Semi-Furnished,other,175000.0,Resale,129.629630
1252,1800.0,3,3.0,Semi-Furnished,other,260000.0,Resale,144.444444


In [18]:
dataframe['Locality'].unique()

array(['other', 'J R Designers Floors, Rohini Sector 24',
       'Lajpat Nagar 2', 'Lajpat Nagar 3', 'The Amaryllis, Karol Bagh',
       'New Friends Colony', 'Kailash Colony, Greater Kailash',
       'Yamuna Vihar, Shahdara', 'Laxmi Nagar', 'Patel Nagar West',
       'Sukhdev Vihar, Okhla', 'Saket', 'Safdarjung Enclave',
       'Common Wealth Games Village, Commonwealth Games Village 2010',
       'Alaknanda', 'DDA Flats Sarita Vihar, Sarita Vihar, Mathura Road',
       'New Manglapuri, Sultanpur', 'Chhattarpur', 'Mehrauli',
       'Mahavir Enclave Part 1', 'Malviya Nagar',
       'Dilshad Colony, Dilshad Garden', 'Vasundhara Enclave',
       'DLF Capital Greens, New Moti Nagar, Kirti Nagar',
       'New Moti Nagar, Kirti Nagar', 'Sheikh Sarai Phase 1', 'Hauz Khas',
       'Chittaranjan Park'], dtype=object)

In [19]:
loc_values = dataframe['Locality'].unique()

localities = {
    'localities_data': [loc for loc in loc_values]
}

with open('..\data\localities.json', 'w') as file:
    file.write(json.dumps(localities))

In [21]:
with open("..\data\localities.json", "r") as file:
        localities_data = json.load(file)['localities_data']
localities_data

['other',
 'J R Designers Floors, Rohini Sector 24',
 'Lajpat Nagar 2',
 'Lajpat Nagar 3',
 'The Amaryllis, Karol Bagh',
 'New Friends Colony',
 'Kailash Colony, Greater Kailash',
 'Yamuna Vihar, Shahdara',
 'Laxmi Nagar',
 'Patel Nagar West',
 'Sukhdev Vihar, Okhla',
 'Saket',
 'Safdarjung Enclave',
 'Common Wealth Games Village, Commonwealth Games Village 2010',
 'Alaknanda',
 'DDA Flats Sarita Vihar, Sarita Vihar, Mathura Road',
 'New Manglapuri, Sultanpur',
 'Chhattarpur',
 'Mehrauli',
 'Mahavir Enclave Part 1',
 'Malviya Nagar',
 'Dilshad Colony, Dilshad Garden',
 'Vasundhara Enclave',
 'DLF Capital Greens, New Moti Nagar, Kirti Nagar',
 'New Moti Nagar, Kirti Nagar',
 'Sheikh Sarai Phase 1',
 'Hauz Khas',
 'Chittaranjan Park']

<h5 style = 'color: #565B5E' align = 'center'> Fill Missing Values in (Furnishing, Bathroom) Columns </h5>

In [22]:
dataframe.Furnishing = dataframe['Furnishing'].fillna(method = 'bfill')
dataframe['Furnishing'].isnull().any()

False

In [23]:
dataframe.Bathroom = dataframe['Bathroom'].fillna(method = 'bfill')
dataframe['Bathroom'].isnull().any()

False

In [24]:
dataframe.isnull().any()

Area           False
BHK            False
Bathroom       False
Furnishing     False
Locality       False
Price          False
Transaction    False
Per_Sqft       False
dtype: bool

<h5 style = 'color: #565B5E' align = 'center'> Encode Categorical Columns </h5>

In [25]:
furnishing = LabelEncoder()
transaction = LabelEncoder()
locality = LabelEncoder()

dataframe['Furnishing'] = furnishing.fit_transform(dataframe['Furnishing'])
dataframe['Transaction'] = transaction.fit_transform(dataframe['Transaction'])
dataframe['Locality'] = locality.fit_transform(dataframe['Locality'])

dataframe.head()

# Furnishing categories: Furnished: 0 | Semi-Furnished: 1 | Unfurnished: 2
# Transaction categories: New property: 0 | Resale: 1
# Locality categories: "other" category: 27 

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Price,Transaction,Per_Sqft
0,800.0,3,2.0,1,27,65000.0,0,81.25
1,750.0,2,2.0,1,8,50000.0,0,66.666667
2,950.0,2,2.0,0,27,155000.0,1,163.157895
3,600.0,2,2.0,1,27,42000.0,1,70.0
4,650.0,2,2.0,1,27,62000.0,0,95.384615


In [26]:
dataframe['Locality'].value_counts(ascending = False)

27    737
11     34
10     33
8      31
9      30
26     29
12     25
15     24
2      24
21     21
0      20
4      19
20     18
24     17
1      17
22     17
6      16
3      15
13     15
19     15
5      14
18     14
16     14
14     13
7      12
23     12
25     12
17     11
Name: Locality, dtype: int64

<h5 style = 'color: #565B5E' align = 'center'> Edit Columns' names </h5>

In [27]:
df = dataframe.rename(columns = {'Area': 'Area (sf)', 'BHK': 'Bedroom', 'Price': 'Price ($)', 'Per_Sqft': 'Per Sf ($)'})
df.head()

Unnamed: 0,Area (sf),Bedroom,Bathroom,Furnishing,Locality,Price ($),Transaction,Per Sf ($)
0,800.0,3,2.0,1,27,65000.0,0,81.25
1,750.0,2,2.0,1,8,50000.0,0,66.666667
2,950.0,2,2.0,0,27,155000.0,1,163.157895
3,600.0,2,2.0,1,27,42000.0,1,70.0
4,650.0,2,2.0,1,27,62000.0,0,95.384615


<h5 style = 'color: #565B5E' align = 'center'> Detect Outliers </h5>

In [28]:
min_thresold, max_thresold = df['Price ($)'].quantile([0.001, 0.999])
min_thresold, max_thresold

(10000.0, 2348400.0000000377)

<h5 style = 'color: #565B5E' align = 'center'> Take a Look at Detected Outliers </h5>

In [29]:
df_outliers = df[(df['Price ($)'] < min_thresold) | (df['Price ($)'] > max_thresold)]
df_outliers

Unnamed: 0,Area (sf),Bedroom,Bathroom,Furnishing,Locality,Price ($),Transaction,Per Sf ($)
57,8000.0,4,5.0,1,27,2400000.0,0,300.0
109,8000.0,4,5.0,1,27,2400000.0,0,300.0


<h5 style = 'color: #565B5E' align = 'center'> Remove Detected Outliers </h5>

In [30]:
df = df[(df['Price ($)'] > min_thresold) & (df['Price ($)'] < max_thresold)]
df

Unnamed: 0,Area (sf),Bedroom,Bathroom,Furnishing,Locality,Price ($),Transaction,Per Sf ($)
0,800.0,3,2.0,1,27,65000.0,0,81.250000
1,750.0,2,2.0,1,8,50000.0,0,66.666667
2,950.0,2,2.0,0,27,155000.0,1,163.157895
3,600.0,2,2.0,1,27,42000.0,1,70.000000
4,650.0,2,2.0,1,27,62000.0,0,95.384615
...,...,...,...,...,...,...,...,...
1254,4118.0,4,5.0,2,2,550000.0,0,133.559981
1255,1050.0,3,2.0,1,2,125000.0,1,119.047619
1256,875.0,3,3.0,1,2,175000.0,0,200.000000
1257,990.0,2,2.0,2,27,115000.0,1,116.161616


<h5 style = 'color: #565B5E' align = 'center'> Split up into training and testing sets </h5>

In [31]:
X = df.drop(['Price ($)', 'Per Sf ($)'], axis = 1)
y = df['Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(len(X_train))
print(len(X_test))

1003
251


In [32]:
X

Unnamed: 0,Area (sf),Bedroom,Bathroom,Furnishing,Locality,Transaction
0,800.0,3,2.0,1,27,0
1,750.0,2,2.0,1,8,0
2,950.0,2,2.0,0,27,1
3,600.0,2,2.0,1,27,1
4,650.0,2,2.0,1,27,0
...,...,...,...,...,...,...
1254,4118.0,4,5.0,2,2,0
1255,1050.0,3,2.0,1,2,1
1256,875.0,3,3.0,1,2,0
1257,990.0,2,2.0,2,27,1


In [33]:
y

0        65000.0
1        50000.0
2       155000.0
3        42000.0
4        62000.0
          ...   
1254    550000.0
1255    125000.0
1256    175000.0
1257    115000.0
1258    185000.0
Name: Price ($), Length: 1254, dtype: float64

<h5 style = 'color: #565B5E' align = 'center'> Train and Measure Model Accuracy </h5>

In [34]:
def train_model(trained_model):
    
    model = trained_model
    model.fit(X_train, y_train)
    
    y_predicted = model.predict(X_test)  
    
    print(mean_absolute_error(y_test, y_predicted))
    
print('MAE using Random Forest Regressor ($): ')
train_model(RandomForestRegressor(random_state = 42, n_estimators = 300))

print('MAE using Linear Regression ($): ')
train_model(LinearRegression())

print('MAE using Decision Tree Regressor ($): ')
train_model(DecisionTreeRegressor(random_state = 42, criterion = 'mae', max_depth = 20, max_leaf_nodes = 50))

MAE using Random Forest Regressor ($): 
66206.13051230766
MAE using Linear Regression ($): 
106120.12387605029
MAE using Decision Tree Regressor ($): 
75339.44223107569


<h5 style = 'color: #565B5E' align = 'center'> Measure the Model Performance using Cross Validation </h5>

In [35]:
def get_kfold_cross(model, data, target):
    
    folds = StratifiedKFold(n_splits = 5)
    
    scores = cross_val_score(model, X, y)
    
    return scores.mean()

print(get_kfold_cross(RandomForestRegressor(n_estimators = 300, random_state = 42), X, y))
print(get_kfold_cross(LinearRegression(), X, y))
print(get_kfold_cross(DecisionTreeRegressor(), X, y))  

0.7044135492038781
0.4842574159191638
0.49636661732724663


<h5 style = 'color: #565B5E' align = 'center'>  Test the Trained Model </h5>

In [36]:
model = RandomForestRegressor(n_estimators = 300, random_state = 42)
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=300, random_state=42)

In [37]:
model.predict([[375, 3, 1, 2, 7, 0]])

array([82639.11111111])

In [38]:
model.predict([[750, 3, 1, 2, 7, 0]])

array([70141.94444444])

In [39]:
model.predict([[300, 3, 1, 2, 9, 0]])

array([83753.77777778])

<h5 style = 'color: #565B5E' align = 'center'> Save the Trained Model to a Pickle File </h5>

In [40]:
with open('../models/model_0.h5', 'wb') as file: 
    pickle.dump(model, file)

In [41]:
with open('../models/model_0.h5', 'rb') as file:
    trained_model = pickle.load(file)

In [42]:
trained_model.predict([[800, 3, 2, 1, 27, 0]])

array([54818.94444444])

In [43]:
trained_model.predict([[300, 3, 1, 2, 9, 0]])

array([83753.77777778])

In [44]:
trained_model.predict([[300, 4, 1, 2, 9, 1]])

array([97166.11111111])