In [None]:
#In this jupyter notebook, we are going to preprocess the Bengaluru housing data and apply machine learning model

In [4]:
import pandas as pd

# read the dataset
Version1 = pd.read_csv("BHD.csv")

Version1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [8]:
# We will go throw the data and try to understand dependencies of each feature with the target data

Version1.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [11]:
# after looking at the dataset, we can analyze that few datapoints wont play key role in identifing the price.
# So we will remove such varaibles

Version2 = Version1.drop(['area_type','availability','society','balcony'] , axis = "columns")

Version2.head()

# This is purly based on my assumptions, there can be senarios where price might vary with above fields

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [13]:
Version2.shape

(13320, 5)

In [16]:
# we will check the null values in the dataset

Version2.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [18]:
# There are many ways to handle the null values, solution differs from situation to situation
# Since we are huge data and very less null values, we can drop those rows
# In other cases we can handle them by filling with mean values are coming values 

Version3 = Version2.dropna()

Version3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [19]:
Version3.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [23]:
Version3['size'].unique

<bound method Series.unique of 0            2 BHK
1        4 Bedroom
2            3 BHK
3            3 BHK
4            2 BHK
           ...    
13315    5 Bedroom
13316        4 BHK
13317        2 BHK
13318        4 BHK
13319        1 BHK
Name: size, Length: 13246, dtype: object>

In [30]:
# Since the bedroom size in the data is not consistent, we can do feature addition

Version3['bhk'] = Version3['size'].apply(lambda x : int(x.split(' ')[0]))
Version3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Version3['bhk'] = Version3['size'].apply(lambda x : int(x.split(' ')[0]))


Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [31]:
Version3.bhk.unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [32]:
# if we look at the number of bed rooms, there are few houses with 43 bedrooms, looking strange is't it.
# lets have a look at that property 

Version3[Version3.bhk == 43]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [37]:
# if we look at the total_sqft values in the table, there are few which has range
# lets find them and put the value in that place

def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

Version3[~Version3['total_sqft'].apply(is_float)]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.000,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.000,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.490,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.800,2
...,...,...,...,...,...,...
12975,Whitefield,2 BHK,850 - 1060,2.0,38.190,2
12990,Talaghattapura,3 BHK,1804 - 2273,3.0,122.000,3
13059,Harlur,2 BHK,1200 - 1470,2.0,72.760,2
13265,Hoodi,2 BHK,1133 - 1384,2.0,59.135,2


In [53]:
def convert_rangeTo_float(x):
    token = x.split("-")
    if len(token) == 2:
        return ( (float(token[0]) + float(token[1])) / 2)
    try:
        return float(x)
    except:
        return None

# just to make it clear, we will dump the dataset to new Version
Version4 = Version3.copy()
Version4['total_sqft'] = Version4['total_sqft'].apply(convert_rangeTo_float)

# There is porbability of getting NaN values while converting the range, So removing them here
Version4 = Version4[Version4.total_sqft.notnull()]
Version4.head(2)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4


In [60]:
# to cross check whether the data got any null values after updation 

Version4[~Version4['total_sqft'].notnull()]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,sqft_price


In [55]:

# In identifying the house price, cost of sqft is important.
# so we will do Feature Engineering by deducing new fields from existing fields

Version4['sqft_price'] = (Version4['price'] * 100000) / Version4['total_sqft']
Version4.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,sqft_price
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [56]:
Version4['sqft_price'].describe()

count    1.320000e+04
mean     7.920759e+03
std      1.067272e+05
min      2.678298e+02
25%      4.267701e+03
50%      5.438331e+03
75%      7.317073e+03
max      1.200000e+07
Name: sqft_price, dtype: float64

In [64]:
# thumb rule while applying any ML model is : 
    # ML models does not work on string or alpha numerics, So convert all the values into numerics
    
# Ideally way to convert the strings to numberics is to use 'getDummies'.

len(Version4.location.unique())

1298

In [65]:
# If we see the number of unique locations in the bengulure, It is close to 1200

# If we perform OneHotEncoding on this, It will create 1298 new columns for every row, which might impact the model
# Lets use the dimensionality reduction concept here to reduce the categorical values (locations)

In [70]:
# lets trim the location names before checking the no. of houses in each location

Version4['location'] = Version4['location'].apply(lambda x : x.strip())
Version4.location.value_counts(ascending = False)

Whitefield                   533
Sarjapur  Road               392
Electronic City              304
Kanakpura Road               264
Thanisandra                  235
                            ... 
Yemlur, Old Airport Road,      1
Udayagiri                      1
Manonarayanapalya              1
Tilak Nagar                    1
CQAL LAYOUT C BLOCK            1
Name: location, Length: 1287, dtype: int64

In [74]:
Version4.location.value_counts() > 10

Whitefield                    True
Sarjapur  Road                True
Electronic City               True
Kanakpura Road                True
Thanisandra                   True
                             ...  
Yemlur, Old Airport Road,    False
Udayagiri                    False
Manonarayanapalya            False
Tilak Nagar                  False
CQAL LAYOUT C BLOCK          False
Name: location, Length: 1287, dtype: bool

In [75]:
# Any location having less than 10 data points should be tagged as "other" location. 
# This way number of categories can be reduced by huge amount. 
# Later on when we do one hot encoding, it will help us with having fewer dummy columns


location_stats = Version4.location.value_counts(ascending = False)
len(location_stats[location_stats > 10])

240

In [76]:
# storing all the locations whos no. of houses are kess than 10

location_stats_less_than_10 = location_stats[location_stats <= 10]
location_stats_less_than_10

Nagappa Reddy Layout         10
Sector 1 HSR Layout          10
Dairy Circle                 10
Thyagaraja Nagar             10
Ganga Nagar                  10
                             ..
Yemlur, Old Airport Road,     1
Udayagiri                     1
Manonarayanapalya             1
Tilak Nagar                   1
CQAL LAYOUT C BLOCK           1
Name: location, Length: 1047, dtype: int64

In [77]:
len(location_stats_less_than_10)

1047

In [78]:
len(Version4.location.unique())

1287

In [79]:
# for all those 1047 locations replacing those locations with "others"

Version4['location'] = Version4['location'].apply( lambda x : "other" if x in location_stats_less_than_10 else x)
len(Version4.location.unique())

241

In [81]:
# Removing the outlines 

# Generally square ft per bedroom is 300 (i.e. 2 bhk apartment is minimum 600 sqft.
# If you have for example 400 sqft apartment with 2 bhk than that seems suspicious and can be removed as an outlier.
# We will remove such outliers by keeping our minimum thresold per bhk to be 300 sqft

Version4[Version4.total_sqft/Version4.bhk<300].head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,sqft_price
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6,10660.98081
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8,6296.296296
70,other,3 Bedroom,500.0,3.0,100.0,3,20000.0


In [82]:
Version4.shape

(13200, 7)

In [83]:
# Check above data points. We have 6 bhk apartment with 1020 sqft. Another one is 8 bhk and total sqft is 600. 
# These are clear data errors that can be removed safely

Version5 = Version4[~(Version4.total_sqft/Version4.bhk<300)]
Version5.shape

(12456, 7)

In [86]:
import numpy as np

def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.sqft_price)
        st = np.std(subdf.sqft_price)
        reduced_df = subdf[(subdf.sqft_price>(m-st)) & (subdf.sqft_price<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
Version6 = remove_pps_outliers(Version5)
Version6.shape

(10242, 7)

In [87]:

Version10 = Version6.copy()

In [88]:
Version10 = Version10.drop(['size','sqft_price'],axis='columns')
Version10.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3


In [89]:
dummies = pd.get_dummies(Version10.location)
dummies.head(3)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:

Version11 = pd.concat([Version10,dummies.drop('other',axis='columns')],axis='columns')
Version11.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,2850.0,4.0,428.0,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,194.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,235.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1200.0,2.0,130.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,1235.0,2.0,148.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:

Version12 = Version11.drop('location',axis='columns')
Version12.head(2)

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
Version12.shape

(10242, 244)

In [93]:
X = Version12.drop(['price'],axis='columns')
X.head(3)

Unnamed: 0,total_sqft,bath,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
y = Version12.price
y.head(3)

0    428.0
1    194.0
2    235.0
Name: price, dtype: float64

In [95]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [96]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.8154286792294445