In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found beneath the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

Before digging in, let's state our objective: to predict house prices in Bengaluru.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
import seaborn as sns
#matplotlib.rcParams["figure.figsize"] = (20,10)

In [None]:
df = pd.read_csv("../input/bengaluru-house-price-data/Bengaluru_House_Data.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.dtypes

One important thing to notice here is that the "total_sqft" feature is of object type. That means we have to clean this feature before it can be used as a number.

In [None]:
df.groupby('area_type')['area_type'].agg('count')

In [None]:
# For simplicity, drop three features that appear less important for predicting price.
df1 = df.drop(columns=['area_type', 'availability', 'society'])

In [None]:
# Let's check the missing value
df1.isnull().sum()

The remaining missing values are few enough to ignore, so we can simply drop those rows.

In [None]:
df2 = df1.dropna()
df2.isnull().sum()

In [None]:
#Now have a look of the features value
df2['size'].unique()

Here we see the strings 'BHK' and 'Bedroom'. Both appear to describe bedrooms, so this feature tells us how many bedrooms each property has. Let's therefore split each value and keep only the leading integer.

In [None]:
# Bedroom count = the integer before the first space of 'size' (values look like
# '<n> BHK' / '<n> Bedroom').
# assign() returns a new frame, so nothing is written into the result of
# dropna(); setting a column on that result is what raises pandas'
# SettingWithCopyWarning.
df2 = df2.assign(bhk=df2['size'].str.split(' ').str[0].astype(int))

In [None]:
df2.head()

In [None]:
#Let's drop the 'size' column. 
df2 = df2.drop(['size'], axis = 1)

In [None]:
#Let's check the total_sqft column.
df2['total_sqft'].unique()

We expect these values to be integers or floats, but some entries are ranges. Before dealing with them, let's see what kinds of values are present in this column.

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, otherwise False.

    Only the errors that ``float()`` itself raises for unconvertible input are
    treated as "not a float". Anything else (for example KeyboardInterrupt)
    propagates instead of being swallowed by a bare ``except``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
df2[~df2['total_sqft'].apply(is_float)].head(20)

Here we also see some alphanumeric values. So now I am going to write a function that takes the mean when it sees a range and ignores the alphanumeric values (returning None for them).

In [None]:
def convert_sqt_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ('1200'), a range written as 'low - high'
        ('2100 - 2850'), or anything else.

    Returns
    -------
    float or None
        The numeric value, the midpoint of the range, or None when the entry
        cannot be interpreted (e.g. '1000Sq. Meter').
    """
    # Try the plain-number case first so that values containing a dash that
    # are still valid floats ('-5', '1e-5') are not mistaken for a range.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    # Only strings can hold a 'low - high' range.
    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Exactly one dash, but the two sides are not both numbers.
            return None
    return None
    

In [None]:
convert_sqt_to_num('1000Sq. Meter')

In [None]:
df3 = df2.copy()
df3['total_sqft'] = df3['total_sqft'].apply(convert_sqt_to_num)
df3.head()


In [None]:
df3.loc[30]

In [None]:
#Now Let's check the location columns.
df3['location'].unique()

In [None]:
len(df3['location'].unique())

Here we see 1265 unique values. Location is an important feature for predicting price, so we can't drop it; we have to handle it instead. One-hot encoding it directly would create a very large number of columns (the curse of dimensionality), so we need some form of dimensionality reduction. First, let's see how many data points each unique location has. Locations with few data points will then be grouped into a single 'other' category.

In [None]:
# Strip leading/trailing whitespace so the same location is not counted under
# several differently-padded spellings.
df3['location'] = df3['location'].str.strip()
# Number of rows per location, most frequent first.
location_stats = df3.groupby('location')['location'].agg('count').sort_values(ascending = False)
location_stats

Here we see that many locations contain only one data point.

In [None]:
len(location_stats[location_stats<= 10])

In [None]:
location_stats_less_then_10 = location_stats[location_stats<= 10]
location_stats_less_then_10

Here we see that 1017 locations have 10 or fewer data points (the code filters with `<= 10`), so let's create a new category 'other' to hold all of them.

In [None]:
# Collapse every low-frequency location (the index labels of
# location_stats_less_then_10) into one shared 'other' bucket, then report how
# many distinct locations remain.
is_rare_location = df3['location'].isin(location_stats_less_then_10.index)
df3['location'] = df3['location'].mask(is_rare_location, 'other')
df3['location'].unique().size

In [None]:
df3.head(20)

# Outlier detection

In [None]:
# total_sqft is NaN wherever convert_sqt_to_num could not parse the raw value.
# Series.sum() skips NaN, so the bedroom total must be taken over the same rows;
# otherwise bedrooms from rows without a usable area deflate the ratio y / x.
rows_with_area = df3[df3.total_sqft.notna()]
x = rows_with_area.bhk.sum()
y = rows_with_area.total_sqft.sum()

In [None]:
y/x

From the above calculation we see that the average area per bedroom is around 550 square feet.

Let's assume a threshold of 100 sqft per bedroom. Any row with less than 100 sqft per bedroom will be considered an outlier, and we are going to drop those rows.

In [None]:
df3[df3.total_sqft/df3.bhk < 100].head()

In [None]:
# Drop rows offering less than 100 sqft per bedroom. The mask is negated rather
# than written as ">= 100" so that rows whose ratio is NaN are kept, not dropped.
sqft_per_bedroom = df3.total_sqft / df3.bhk
df4 = df3.loc[~sqft_per_bedroom.lt(100)]

In [None]:
df4.shape

In [None]:
# Price per square foot = price * 100000 / total_sqft.
# NOTE(review): the factor presumably converts a price quoted in lakhs
# (1 lakh = 100,000) into rupees — confirm against the dataset description.
# assign() builds a new frame instead of writing a column into df4, which is a
# filtered slice of df3 and would otherwise raise SettingWithCopyWarning.
df4 = df4.assign(Price_per_sqft=df4.price*100000 / df4.total_sqft)

In [None]:
df4.head()

In [None]:
df4.Price_per_sqft.describe()

Here we see some extremely high and low values. Using the mean and standard deviation, we are going to keep only those
values that lie within 1 standard deviation of the mean for their location.

In [None]:
 def remove_pps_outliers(df):
     """Drop rows whose price per sqft is an outlier within their location.

     For every ``location`` group, only rows whose ``Price_per_sqft`` lies
     within one (population) standard deviation of the group mean are kept.
     The band is inclusive on both sides, so a group whose prices are all
     identical (standard deviation 0) is kept rather than emptied.

     Parameters
     ----------
     df : pandas.DataFrame
         Must contain the columns ``location`` and ``Price_per_sqft``.

     Returns
     -------
     pandas.DataFrame
         The surviving rows, ordered by location group, with a fresh
         0..n-1 index. Rows with a NaN ``Price_per_sqft`` are never kept.
     """
     kept = []
     for _, subdf in df.groupby('location'):
         pps = subdf.Price_per_sqft
         m = pps.mean()
         std = pps.std(ddof=0)  # population std, same as np.std on a Series
         kept.append(subdf[pps.between(m - std, m + std)])

     # pd.concat raises on an empty list, so handle a frame with no groups.
     if not kept:
         return df.iloc[0:0].reset_index(drop=True)

     # Concatenate once instead of growing a frame inside the loop.
     return pd.concat(kept, ignore_index=True)
        
df5 =  remove_pps_outliers(df4)
df5.shape

In [None]:
#Let's remove outlier from the 'bath' column.
df5.bath.unique()

In [None]:
df5[df5.bath>10]

In [None]:
# Histogram of bathroom counts, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df5.bath, rwidth = .8)
ax.set_xlabel('Number of bathrooms')
ax.set_ylabel('Count')

In [None]:
df5[df5.bath>df5.bhk + 2]

In [None]:
# Keep only rows with fewer than bhk + 2 bathrooms.
# NOTE(review): rows where bath == bhk + 2 are dropped here as well, although the
# outlier preview uses the strict test bath > bhk + 2 — confirm the intended cutoff.
has_plausible_baths = df5.bath < df5.bhk + 2
df6 = df5.loc[has_plausible_baths]
df6.shape

In [None]:
dummies = pd.get_dummies(df6.location)

In [None]:
df6 = pd.concat([df6,dummies],axis = 'columns')

In [None]:
df6 = df6.drop(['Price_per_sqft','location'], axis = 1)

In [None]:
X = df6.drop(['price'], axis=1)
y = df6.price

In [None]:
X.head()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
cls =  LinearRegression()
cls.fit(X_train, y_train)
cls.score(X_test,y_test)