<a href="https://colab.research.google.com/github/shahroz-dev/Machine-Learning-Based-Real-Estate-Price-Prediction/blob/main/Machine_Learning_Based_Real_Estate_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries

In [127]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
from sklearn.impute import SimpleImputer

### Extract Data

In [128]:
!wget https://github.com/shahroz-dev/Machine-Learning-Based-Real-Estate-Price-Prediction/raw/main/Data.zip
zip_ref = zipfile.ZipFile("Data.zip")
zip_ref.extractall()
zip_ref.close()

--2022-07-29 19:27:57--  https://github.com/shahroz-dev/Machine-Learning-Based-Real-Estate-Price-Prediction/raw/main/Data.zip
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/shahroz-dev/Machine-Learning-Based-Real-Estate-Price-Prediction/main/Data.zip [following]
--2022-07-29 19:27:57--  https://raw.githubusercontent.com/shahroz-dev/Machine-Learning-Based-Real-Estate-Price-Prediction/main/Data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 190803 (186K) [application/zip]
Saving to: ‘Data.zip’


2022-07-29 19:27:57 (11.5 MB/s) - ‘Data.zip’ saved [190803/190803]



### Data Read and Exploration

In [129]:
df1 = pd.read_csv("Bengaluru_House_Data.csv")

In [130]:
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [131]:
# Look for types of areas
df1.groupby('area_type').agg({'area_type':'count'})

Unnamed: 0_level_0,area_type
area_type,Unnamed: 1_level_1
Built-up Area,2418
Carpet Area,87
Plot Area,2025
Super built-up Area,8790


In [132]:
# It is observed that some features didn't affect our house prices which needs to be removed from our dataset
df2 = df1.drop(['area_type', 'availability', 'society', 'balcony'], axis='columns')
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [133]:
# Check for missing values
df2.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [134]:
# Select numerical and object columns list for filling NaNs
numerical_columns = df2.select_dtypes(include=['float64']).columns.tolist()
object_columns = df2.select_dtypes(include=['object']).columns.tolist()
numerical_columns, object_columns

(['bath', 'price'], ['location', 'size', 'total_sqft'])

In [143]:
# Select suitable imputer for filling NaNs for both numerical and object columns
numerical_imputer = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
object_imputer = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')

df2[numerical_columns] = numerical_imputer.fit_transform(df2[numerical_columns])
df2[object_columns] = object_imputer.fit_transform(df2[object_columns])

In [146]:
# Again check for missing values
df2.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
bhk           0
dtype: int64

In [147]:
# the size column needs to be transform numerically into a new feature 'bhk'
df2['bhk'] = df2['size'].apply(lambda x: int(x.split(' ')[0]))
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [149]:
# Let's Explore total_sqft data
df2['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [151]:
def is_float(x):
  try:
    float(x)
  except:
    return False
  return True

In [153]:
df2[~df2['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
56,Devanahalli,4 Bedroom,3010 - 3410,2.0,192.0,4
81,Hennur Road,4 Bedroom,2957 - 3450,2.0,224.5,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
224,Devanahalli,3 BHK,1520 - 1740,2.0,74.82,3
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2


**Note:** There are some ranges and string form data in total_sqft which needs to be transformed

In [154]:
def convert_sqft_to_num(x):
  token = x.split('-')
  if (token == 2):
    return (float(token[0]) + float(token[1]))/2
  try:
    return float(x)
  except:
    return None

In [156]:
df3 = df2.copy()
df3['total_sqft'] = df3['total_sqft'].apply(convert_sqft_to_num)
df3['total_sqft'].unique()

array([1056., 2600., 1440., ..., 2758.,  774., 4689.])