In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20, 10)
# Above line of code will implement every figure implemented using matplotlib of size 20 X 10
# There are differnt parameters such as figure size, font sizes, line widths, and more that can be implemented

In [100]:
file = pd.read_csv("Datasets/Bengaluru_House_Data.csv")

In [101]:
file.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [102]:
# Lets drop some columns which I think are not that useful

In [103]:
file = file.drop(['area_type', 'availability', 'society', 'balcony'], axis='columns')

In [104]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [105]:
file.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [106]:
file.shape

(13320, 5)

In [107]:
# There are lot more number of records than that of the rows present in the dataset
# Hence we can drop the rows which contains null values
# Or we can replace the null values with median values too

In [108]:
bath_median = file['bath'].median()

In [109]:
bath_median

2.0

In [110]:
file['bath'] = file.bath.fillna(bath_median)

In [111]:
file.isna().sum()

location       1
size          16
total_sqft     0
bath           0
price          0
dtype: int64

In [112]:
file = file.dropna()

In [113]:
file.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [114]:
# Now all the null values are droped from the dataset

In [115]:
file['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [116]:
# Have a look we have got size factor in different paramters 
# Some rooms size is given in the form of BHK while some in the form of Bedrooms
# We will clear this using the function given below

In [117]:
def total_rooms(size):
    return (size.split(' ')[0])

In [118]:
total_rooms('5 Bedroom')

'5'

In [119]:
# Look we have got the size of number of the rooms in the form of numeric values

In [120]:
file['size'] = file['size'].apply(lambda x: int(x.split(' ')[0]))

In [121]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056,2.0,39.07
1,Chikka Tirupathi,4,2600,5.0,120.0
2,Uttarahalli,3,1440,2.0,62.0
3,Lingadheeranahalli,3,1521,3.0,95.0
4,Kothanur,2,1200,2.0,51.0


In [122]:
file['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [123]:
# It seems that there are differnt types of values when we consider the total_sqft feature of the dataset
# Hence we have to consider if they are numbers string or they are values of certain range

In [124]:
def is_float(sqft):
    try:
        float(sqft)
    except:
        return False
    return True

In [125]:
file[~file['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price
30,Yelahanka,4,2100 - 2850,4.0,186.0
56,Devanahalli,4,3010 - 3410,2.0,192.0
81,Hennur Road,4,2957 - 3450,2.0,224.5
122,Hebbal,4,3067 - 8156,4.0,477.0
137,8th Phase JP Nagar,2,1042 - 1105,2.0,54.005
165,Sarjapur,2,1145 - 1340,2.0,43.49
188,KR Puram,2,1015 - 1540,2.0,56.8
224,Devanahalli,3,1520 - 1740,2.0,74.82
410,Kengeri,1,34.46Sq. Meter,1.0,18.5
549,Hennur Road,2,1195 - 1440,2.0,63.77


In [126]:
# Now we have data in the form of range as well as data in the form of Sq. Meter and no one knows if there is total_sqft in 
# any other data value form

In [127]:
# We will only conider data in the range form and then apply avg function over data

In [128]:
def get_avg(sqft):
    numbers = sqft.split('-')
    if len(numbers) == 2:
        return (int(numbers[0]) + int(numbers[1])) / 2;
    try:
        return float(sqft)
    except:
        return None

In [129]:
get_avg('34.46Sq. Meter	')

In [130]:
get_avg('1042 - 1105')

1073.5

In [131]:
# It looks like it will give us the average value of the sqft value of the house

In [132]:
file.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [133]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056,2.0,39.07
1,Chikka Tirupathi,4,2600,5.0,120.0
2,Uttarahalli,3,1440,2.0,62.0
3,Lingadheeranahalli,3,1521,3.0,95.0
4,Kothanur,2,1200,2.0,51.0


In [134]:
# Now lets calculate the price per sqft so that it will be easy to predict the price and it will be in unitary form

In [135]:
file_new = file.copy()

In [136]:
file_new['Price_per_sqft'] = file_new['price'] * 100000 / file_new['total_sqft']

TypeError: unsupported operand type(s) for /: 'float' and 'str'