In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [13]:
df1 = pd.read_csv("./data/Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


<h1 style="color:grey;font-size:40px;">Drop unwanted columns</h1>  

In [19]:
df2 = df1.drop(["area_type", "availability", "society"], axis="columns")
df2

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.00
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00
4,Kothanur,2 BHK,1200,2.0,1.0,51.00
...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,0.0,231.00
13316,Richards Town,4 BHK,3600,5.0,,400.00
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00
13318,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00


<h1 style="color:grey;font-size:40px;">Check and Remove NANs</h1>

In [21]:
df2.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [24]:
df3 = df2.dropna()
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [26]:
df3.shape

(12710, 6)

<h1 style="color:grey;font-size:40px;">Feature Engineering</h1>
<b>creating a new feature named bhk because the original size colums has different types of data (bedrooms and bhk)</b>

In [117]:
df3["bhk"] = df3.loc[:, "size"].apply(lambda x: int(x.split(" ")[0]))
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3["bhk"] = df3.loc[:, "size"].apply(lambda x: int(x.split(" ")[0]))


Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


<b>Trying to see if the total_sqft data has only float values or not</b>

In [118]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [119]:
df3[~df3["total_sqft"].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,0.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,0.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,0.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,0.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,0.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,0.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,0.0,63.77,2
661,Yelahanka,2 BHK,1120 - 1145,2.0,0.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,0.0,445.0,4
772,Banashankari Stage VI,2 BHK,1160 - 1195,2.0,0.0,59.935,2


<b> As we can see there are many values in the total_sqft that are just ranges and also there are values of different units. We will get the average of the ranges and drop values that have different units</b>

In [120]:
def avg(x):
    lst = x.split("-")
    if len(lst) == 2:
        return float(lst[0])+float(lst[1])/2
    else:
        try:
            return float(x)
        except:
            return None

In [121]:
df4 = df3.copy()
df4["total_sqft"] = df4["total_sqft"].apply(avg)
df4.isnull().sum()

location       0
size           0
total_sqft    42
bath           0
balcony        0
price          0
bhk            0
dtype: int64

<b> Now we have the average of the total_sqft data that were ranges and we have dropped the sqft rows that had different units but doing that has created 42 nan rows which we need to remove too</b>

In [122]:
df5 = df4.dropna()
df5.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
bhk           0
dtype: int64

<b> Here we will create another feature named price per square feet which will be essential to check data anomalies </b>

In [123]:
df6 = df5.copy()
df6["price_per_sqft"] = df6["price"]*100000/df6["total_sqft"]
df6

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715.0,3.0,3.0,112.00,3,6530.612245
13315,Whitefield,5 Bedroom,3453.0,4.0,0.0,231.00,5,6689.834926
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,1.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,1.0,488.00,4,10407.336319


In [124]:
df6["price_per_sqft"].describe()

count    1.266800e+04
mean     6.858379e+03
std      2.263535e+04
min      2.678298e+02
25%      4.204933e+03
50%      5.350249e+03
75%      7.142857e+03
max      2.300000e+06
Name: price_per_sqft, dtype: float64

In [125]:
len(df6["location"].unique())

1259

<h1 style="color:grey;font-size:40px;">Dimension Reduction</h1>
<b> Currently in our data set we have lots of places but many of these places probably just has 1-2 samples, using such places as extra columns will give us too many features so we need to reduce it</b>

In [126]:
location_stats = df6["location"].value_counts(ascending=False)
location_stats

Whitefield                                        513
Sarjapur  Road                                    372
Electronic City                                   300
Kanakpura Road                                    259
Thanisandra                                       230
                                                 ... 
rr nagar                                            1
Ananthanagar Phase 1,Electronic City , phase 2      1
akshaya nagar t c palya                             1
Kengeri Satellite Town KHB Apartment                1
Geetanjali Layout                                   1
Name: location, Length: 1259, dtype: int64

<b> We make an other column comprising of locations that has less than 10 samples</b>

In [127]:
location_stats.values.sum()

12668

In [128]:
len(df6["location"].unique())

1259

In [97]:
len(location_stats[location_stats<=10])

1024

In [129]:
loc_less_than_ten = location_stats[location_stats<=10]
df7 = df6.copy()
df7["location"] = df7["location"].apply(lambda x: "other" if x in loc_less_than_ten else x)
len(df7["location"].unique())

236

In [130]:
df7.head(300)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...,...
310,Panathur,2 BHK,1438.0,2.0,3.0,100.00,2,6954.102921
311,other,3 BHK,1560.0,3.0,3.0,115.00,3,7371.794872
312,Yelahanka,2 BHK,1350.0,2.0,1.0,55.55,2,4114.814815
313,Kanakpura Road,3 BHK,1550.0,3.0,3.0,67.00,3,4322.580645


<h1 style="color:Grey; font-size:40px">Outlier removal</h1>
<b> here we will try to remove data that feels wrong, first thing we will check is the size of each room and see if they are normal or not. Usually one bedroom should be around 300 sqft</b>

In [134]:
df7[df7["total_sqft"]/df7["bhk"] < 300]

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
58,Murugeshpalya,6 Bedroom,1407.0,4.0,1.0,150.0,6,10660.980810
68,other,8 Bedroom,1350.0,7.0,0.0,85.0,8,6296.296296
70,other,3 Bedroom,500.0,3.0,2.0,100.0,3,20000.000000
78,Kaval Byrasandra,2 BHK,460.0,1.0,0.0,22.0,2,4782.608696
89,Rajaji Nagar,6 Bedroom,710.0,6.0,3.0,160.0,6,22535.211268
...,...,...,...,...,...,...,...,...
13219,Laggere,7 Bedroom,1590.0,9.0,3.0,132.0,7,8301.886792
13221,other,9 Bedroom,1178.0,9.0,1.0,75.0,9,6366.723260
13281,Margondanahalli,5 Bedroom,1375.0,5.0,1.0,125.0,5,9090.909091
13303,Vidyaranyapura,5 Bedroom,774.0,5.0,3.0,70.0,5,9043.927649


<b> as we can see there are 655 samples that have room sizes lower than 300sqft, we can remove them </b>

In [136]:
df7.shape

(12668, 8)

In [140]:
df8 = df7[~(df7["total_sqft"]/df7["bhk"] < 300)]
df8

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715.0,3.0,3.0,112.00,3,6530.612245
13315,Whitefield,5 Bedroom,3453.0,4.0,0.0,231.00,5,6689.834926
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,1.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,1.0,488.00,4,10407.336319
