# Project Name: Real Estate Price Prediction Project

## Exploratory Data Analysis

### Common Library Call

In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

### Dataset Call

In [52]:
# to call the dat-set
data=pd.read_csv("Bengaluru_House_Data.csv")
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


### Data Shape

In [53]:
# to show the numbers of rows and columns of the data-set
print(f"Rows: {data.shape[0]}\nColumns: {data.shape[1]}")

Rows: 13320
Columns: 9


In [54]:
# here we are going to drop some columns (namely : "area_type","availability","society","balcony")
df1=data.drop(["area_type","availability","society"],axis=True)
df1.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


### Data Cleaning

In [55]:
# checking for the number of Nan values inside the columns
df1.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [56]:
# the feature bath and balcony has missing values among the numerical features. we replace it by the corresponding median
df2=df1.copy()
df2["bath"]=np.where(df2["bath"].isnull()==True,df2["bath"].median(),df2["bath"])
df2["balcony"]=np.where(df2["balcony"].isnull()==True,df2["balcony"].median(),df2["balcony"])
df2.isnull().sum()

location       1
size          16
total_sqft     0
bath           0
balcony        0
price          0
dtype: int64

In [57]:
df2.dropna(inplace=True)
df2.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [58]:
df2["size"].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [59]:
# to extract the number of bhk from the string format 
df3=df2.copy()
df3["bhk"]=df3["size"].apply(lambda x:int(x.split(" ")[0]))
df3.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [60]:
# delete the columns "size"
df4=df3.drop("size",axis=1)
df4.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600,5.0,3.0,120.0,4
2,Uttarahalli,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,1521,3.0,1.0,95.0,3
4,Kothanur,1200,2.0,1.0,51.0,2


In [64]:
# getting idea of the data set under "total_sqft"
print(list(df4["total_sqft"].unique()))

['1056', '2600', '1440', '1521', '1200', '1170', '2732', '3300', '1310', '1020', '1800', '2785', '1000', '1100', '2250', '1175', '1180', '1540', '2770', '600', '1755', '2800', '1767', '510', '1250', '660', '1610', '1151', '1025', '2100 - 2850', '1075', '1760', '1693', '1925', '700', '1070', '1724', '1290', '1143', '1296', '1254', '1330.74', '970', '1459', '800', '869', '1270', '1670', '2010', '1185', '1600', '3010 - 3410', '1500', '1407', '840', '4395', '845', '5700', '1160', '3000', '1140', '1220', '1350', '1005', '500', '1358', '1569', '1240', '2089', '1206', '1150', '2511', '460', '4400', '1660', '2957 - 3450', '1326', '1325', '1499', '1665', '708', '1060', '710', '1450', '2894', '1330', '2502', '650', '2400', '1007', '966', '1630', '1640', '782', '1260', '1413', '1116', '1530', '3700', '2497', '1436', '276', '1427', '2061', '3067 - 8156', '2650', '1282', '1050', '945', '950', '1870', '880', '1535', '1360', '1042 - 1105', '1280', '5000', '3050', '1563.05', '1167', '4000', '1828', '8

In [68]:
# defining a function to replace odd values in "total_sqft"
def convert_total_sqft(x):
    token=x.split("-")
    if len(token)==2:
        return 0.5*(float(token[0])+float(token[1]))
    try:
        return float(x)
    except:
        return None

# replacing odd values in "total_sqft"
df5=df4.copy()
df5["total_sqft"]=df4["total_sqft"].apply(convert_total_sqft)
df5.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
2,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Kothanur,1200.0,2.0,1.0,51.0,2


In [69]:
# further checking for Nan values since the user defined functon may have generated some
df5.isnull().sum()

location       0
total_sqft    46
bath           0
balcony        0
price          0
bhk            0
dtype: int64

In [70]:
# removing further null values
df6=df5.dropna()
df6.isnull().sum()

location      0
total_sqft    0
bath          0
balcony       0
price         0
bhk           0
dtype: int64

In [72]:
# saving data to pass to feature engineering
df6.to_csv("data_for_feat_engg.csv",index=False)