In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20, 10)
# The line above sets the default size of every matplotlib figure to 20 x 10 inches
# rcParams can also set other defaults such as font sizes, line widths, and more

In [2]:
file = pd.read_csv("Datasets/Bengaluru_House_Data.csv")

In [3]:
file.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
# Let's drop some columns that I think are not that useful

In [5]:
# Discard the four columns that are not used in the rest of the analysis
file = file.drop(columns=['area_type', 'availability', 'society', 'balcony'])

In [6]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [7]:
file.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [8]:
file.shape

(13320, 5)

In [9]:
# The dataset has far more records (13320) than rows that contain null values
# Hence we can drop the rows which contain null values
# Or we can replace the null values with the median value instead

In [10]:
bath_median = file['bath'].median()

In [11]:
bath_median

2.0

In [12]:
file['bath'] = file.bath.fillna(bath_median)

In [13]:
file.isna().sum()

location       1
size          16
total_sqft     0
bath           0
price          0
dtype: int64

In [14]:
file = file.dropna()

In [15]:
file.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [16]:
# Now there are no null values left in the dataset

In [17]:
file['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [18]:
# Notice that the size feature is expressed in different ways
# Some sizes are given in the form of BHK while others are given in the form of Bedroom
# We will clean this up using the function given below

In [20]:
def total_rooms(size):
    """Return the number of rooms encoded at the start of a size label.

    Parameters
    ----------
    size : str
        A label whose first token is the room count, e.g. '2 BHK',
        '4 Bedroom' or '1 RK'.

    Returns
    -------
    int
        The leading integer of the label.

    Raises
    ------
    ValueError
        If the label is empty/blank or its first token is not an integer.
    """
    # split() with no argument ignores leading/trailing whitespace and treats
    # any run of whitespace as one separator, so ' 2 BHK' and '2  BHK' parse
    # the same way as '2 BHK'.
    tokens = size.split()
    if not tokens:
        raise ValueError(f"no room count found in size label: {size!r}")
    return int(tokens[0])

In [21]:
total_rooms('5 Bedroom')

5

In [22]:
# Now we get the number of rooms in the form of a numeric value

In [23]:
file['size'] = file['size'].apply(total_rooms)

In [24]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056,2.0,39.07
1,Chikka Tirupathi,4,2600,5.0,120.0
2,Uttarahalli,3,1440,2.0,62.0
3,Lingadheeranahalli,3,1521,3.0,95.0
4,Kothanur,2,1200,2.0,51.0


In [25]:
file['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [26]:
# It seems that there are different types of values in the total_sqft feature of the dataset
# Hence we have to check whether each value is a plain number string or a range of values

In [27]:
def is_float(sqft):
    """Report whether a value can be converted to a float.

    Parameters
    ----------
    sqft : object
        Typically a string from the total_sqft column, e.g. '1056' or
        '2100 - 2850'.

    Returns
    -------
    bool
        True if ``float(sqft)`` succeeds, False otherwise.
    """
    try:
        float(sqft)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError):
        return False
    return True

In [28]:
file[~file['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price
30,Yelahanka,4,2100 - 2850,4.0,186.0
56,Devanahalli,4,3010 - 3410,2.0,192.0
81,Hennur Road,4,2957 - 3450,2.0,224.5
122,Hebbal,4,3067 - 8156,4.0,477.0
137,8th Phase JP Nagar,2,1042 - 1105,2.0,54.005
165,Sarjapur,2,1145 - 1340,2.0,43.49
188,KR Puram,2,1015 - 1540,2.0,56.8
224,Devanahalli,3,1520 - 1740,2.0,74.82
410,Kengeri,1,34.46Sq. Meter,1.0,18.5
549,Hennur Road,2,1195 - 1440,2.0,63.77


In [29]:
# Now we have data in the form of ranges as well as data in the form of Sq. Meter, and there may be
# other total_sqft formats in the dataset that we have not seen yet

In [30]:
# We will only consider data in the range form and take the average of its two bounds

In [45]:
def get_avg(sqft):
    """Convert a total_sqft entry to a single float.

    Parameters
    ----------
    sqft : object
        Typically a string: either a plain number ('1056') or a range
        written as 'low - high' ('1042 - 1105').

    Returns
    -------
    float or None
        The numeric value for a plain number, the midpoint for a range,
        or None when the entry cannot be parsed (for example
        '34.46Sq. Meter').
    """
    # Try the plain-number case first so that values such as '-5' or '1e-5'
    # are not mistaken for a range because they contain a '-'.
    try:
        return float(sqft)
    except (TypeError, ValueError):
        pass

    # Only strings can hold a 'low - high' range.
    if not isinstance(sqft, str):
        return None

    bounds = sqft.split('-')
    if len(bounds) != 2:
        return None
    try:
        return (float(bounds[0]) + float(bounds[1])) / 2
    except ValueError:
        # A range whose bounds are not numeric is treated like any other
        # unparseable entry instead of raising.
        return None

In [46]:
get_avg('34.46Sq. Meter	')

In [47]:
get_avg('1042 - 1105')

1073.5

In [48]:
file['total_sqft'] = file.total_sqft.apply(get_avg)

In [49]:
# It looks like it will give us the average value of the sqft value of the house

In [50]:
file.isnull().sum()

location       0
size           0
total_sqft    46
bath           0
price          0
dtype: int64

In [51]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056.0,2.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,120.0
2,Uttarahalli,3,1440.0,2.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,95.0
4,Kothanur,2,1200.0,2.0,51.0


In [61]:
# Now let's calculate the price per sqft so that it will be easier to predict the price, since it is a per-unit value

In [64]:
# Price per square foot: `price` is scaled by 100000 before dividing by the area.
# NOTE(review): presumably `price` is quoted in lakhs (1 lakh = 100000 rupees) — confirm against the dataset description.
# Rows whose total_sqft is missing get a missing price_per_sqft as well.
file['price_per_sqft'] = (file['price'] * 100000) / file['total_sqft']

In [65]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price,price_per_sqft
0,Electronic City Phase II,2,1056.0,2.0,39.07,3699.810606
1,Chikka Tirupathi,4,2600.0,5.0,120.0,4615.384615
2,Uttarahalli,3,1440.0,2.0,62.0,4305.555556
3,Lingadheeranahalli,3,1521.0,3.0,95.0,6245.890861
4,Kothanur,2,1200.0,2.0,51.0,4250.0


In [67]:
file.groupby('location').describe()

Unnamed: 0_level_0,size,size,size,size,size,size,size,size,total_sqft,total_sqft,...,price,price,price_per_sqft,price_per_sqft,price_per_sqft,price_per_sqft,price_per_sqft,price_per_sqft,price_per_sqft,price_per_sqft
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
location,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Anekal,1.0,1.000000,,1.0,1.00,1.0,1.00,1.0,1.0,351.000000,...,16.00,16.00,1.0,4558.404558,,4558.404558,4558.404558,4558.404558,4558.404558,4558.404558
Banaswadi,1.0,1.000000,,1.0,1.00,1.0,1.00,1.0,1.0,527.000000,...,35.00,35.00,1.0,6641.366224,,6641.366224,6641.366224,6641.366224,6641.366224,6641.366224
Basavangudi,1.0,1.000000,,1.0,1.00,1.0,1.00,1.0,1.0,670.000000,...,50.00,50.00,1.0,7462.686567,,7462.686567,7462.686567,7462.686567,7462.686567,7462.686567
Bhoganhalli,1.0,1.000000,,1.0,1.00,1.0,1.00,1.0,1.0,296.000000,...,22.89,22.89,1.0,7733.108108,,7733.108108,7733.108108,7733.108108,7733.108108,7733.108108
Devarabeesana Halli,6.0,2.666667,0.516398,2.0,2.25,3.0,3.00,3.0,6.0,1539.666667,...,150.00,160.00,6.0,7900.452568,1637.393519,5401.234568,6901.298701,8542.857143,8871.326042,9580.838323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
t.c palya,1.0,6.000000,,6.0,6.00,6.0,6.00,6.0,1.0,1350.000000,...,160.00,160.00,1.0,11851.851852,,11851.851852,11851.851852,11851.851852,11851.851852,11851.851852
tc.palya,4.0,2.250000,0.500000,2.0,2.00,2.0,2.25,3.0,4.0,1075.000000,...,66.00,78.00,4.0,5651.101350,288.920138,5454.545455,5488.636364,5535.714286,5698.179272,6078.431373
vinayakanagar,1.0,7.000000,,7.0,7.00,7.0,7.00,7.0,1.0,1200.000000,...,200.00,200.00,1.0,16666.666667,,16666.666667,16666.666667,16666.666667,16666.666667,16666.666667
"white field,kadugodi",1.0,6.000000,,6.0,6.00,6.0,6.00,6.0,1.0,2100.000000,...,275.00,275.00,1.0,13095.238095,,13095.238095,13095.238095,13095.238095,13095.238095,13095.238095


In [68]:
# It seems that there are many different locations in Bangalore city for which house prices are given
# Let's find out how many there are

In [70]:
# Count the distinct locations before cleaning the location values
len(file.location.unique())

1304

In [72]:
# So there are a total of 1304 locations for which we might have to predict the price
# But many of them might appear only a few times, so we can group those into an 'others' category
# Before that we will clean the data by stripping whitespace from the values

In [73]:
# Strip leading/trailing whitespace with the vectorized string accessor, and
# assign through bracket indexing rather than attribute access.
file['location'] = file['location'].str.strip()

In [74]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price,price_per_sqft
0,Electronic City Phase II,2,1056.0,2.0,39.07,3699.810606
1,Chikka Tirupathi,4,2600.0,5.0,120.0,4615.384615
2,Uttarahalli,3,1440.0,2.0,62.0,4305.555556
3,Lingadheeranahalli,3,1521.0,3.0,95.0,6245.890861
4,Kothanur,2,1200.0,2.0,51.0,4250.0


In [79]:
location_count = file.location.value_counts(ascending=False)

In [80]:
location_count

Whitefield                        540
Sarjapur  Road                    397
Electronic City                   304
Kanakpura Road                    273
Thanisandra                       237
                                 ... 
Vasantapura main road               1
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
Abshot Layout                       1
Name: location, Length: 1293, dtype: int64

In [81]:
# So the counts range from a highest of 540 down to a lowest of 1, so we can take the locations that occur
# fewer than some threshold number of times and put them in the 'others' category

In [86]:
len(file.location.unique())

1293

In [88]:
# Making 1293 columns will be difficult, hence we will reduce the number of columns

In [89]:
# Locations with at most 10 listings. Note that the `<=` comparison also keeps
# locations that occur exactly 10 times, despite the "less than ten" name.
count_less_than_ten = location_count[location_count<=10]

In [90]:
# Collapse every rare location into a single 'others' label.
# `count_less_than_ten` is indexed by location name, so membership is tested
# against its index explicitly (``x in series`` checks the index, not the
# values), in one vectorized pass instead of a per-row lambda.
file['location'] = file['location'].where(
    ~file['location'].isin(count_less_than_ten.index), 'others'
)

In [93]:
len(file.location.unique())

242

In [94]:
# Now the count is 242, so we will have far fewer columns than before

In [95]:
file.head(10)

Unnamed: 0,location,size,total_sqft,bath,price,price_per_sqft
0,Electronic City Phase II,2,1056.0,2.0,39.07,3699.810606
1,Chikka Tirupathi,4,2600.0,5.0,120.0,4615.384615
2,Uttarahalli,3,1440.0,2.0,62.0,4305.555556
3,Lingadheeranahalli,3,1521.0,3.0,95.0,6245.890861
4,Kothanur,2,1200.0,2.0,51.0,4250.0
5,Whitefield,2,1170.0,2.0,38.0,3247.863248
6,Old Airport Road,4,2732.0,4.0,204.0,7467.057101
7,Rajaji Nagar,4,3300.0,4.0,600.0,18181.818182
8,Marathahalli,3,1310.0,3.0,63.25,4828.244275
9,others,6,1020.0,6.0,370.0,36274.509804
