In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.graph_objects as go 
import plotly.express as px 
%matplotlib inline

In [2]:
# Importing the House price dataset
df = pd.read_csv('./input/Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
print('the shape of dataset is :',df.shape)

the shape of dataset is : (13320, 9)


## Data Cleaning

In [4]:
#Checking the null values
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [5]:
# Checking the dataset info
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [6]:
# Observing the Bath and Balcony column
y0 = df.bath
y1 = df.balcony
fig = go.Figure()
fig.add_trace(go.Box(y=y0,name='Bath',quartilemethod="linear"))
fig.add_trace(go.Box(y=y1,name='Balcony',quartilemethod="linear"))
fig.update_layout(title='Bath & Balcony', yaxis_title='Count',height=420,width=1080)
fig.show()

In [7]:
# As we can not fill the null data in Society by any means so saying it: No data (for nan value)
# As we say that maximum count of Bath and Balcony lies between its LQ - UQ: We can fill the bath,balcony by its mean
# For size we can not
df.bath = df.bath.fillna(int(df.bath.mean()))
df.balcony = df.balcony.fillna(int(df.balcony.mean()))
df.society = df.society.fillna("No data")
df = df[df['location'].notna()]
df = df[df['size'].notna()]
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,No data,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,No data,1200,2.0,1.0,51.0


In [8]:
# Again checking the null values
df.isnull().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [9]:
# Removing un-necessary columns
df.drop(columns=['area_type','availability','society'],inplace=True)


### Cleaning columns : 'bath' and 'balcony'

In [10]:
# Size column having mixed value like '2 bhk or 2 bedroom' so i am generalising it
# Renaming column size to BHK
# Converting price into lakhs
df['size'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
df.rename(columns={'size':'bhk'},inplace=True)
df['price'] = df['price']*100000
df.head()


Unnamed: 0,location,bhk,total_sqft,bath,balcony,price
0,Electronic City Phase II,2,1056,2.0,1.0,3907000.0
1,Chikka Tirupathi,4,2600,5.0,3.0,12000000.0
2,Uttarahalli,3,1440,2.0,3.0,6200000.0
3,Lingadheeranahalli,3,1521,3.0,1.0,9500000.0
4,Kothanur,2,1200,2.0,1.0,5100000.0


### Cleaning Column: total_sqft

In [11]:
#Checking total_sqft column entires
df.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

We can see that total_sqft column have some entires as (1113 - 1505) so now we have to check more carefully

In [12]:
# Making a function to have all different types of entires
def check_float(x):
    try:
        float(x)
    except:
        return False
    return True
df[~df['total_sqft'].apply(check_float)].head(10)

Unnamed: 0,location,bhk,total_sqft,bath,balcony,price
30,Yelahanka,4,2100 - 2850,4.0,0.0,18600000.0
56,Devanahalli,4,3010 - 3410,2.0,1.0,19200000.0
81,Hennur Road,4,2957 - 3450,2.0,1.0,22450000.0
122,Hebbal,4,3067 - 8156,4.0,0.0,47700000.0
137,8th Phase JP Nagar,2,1042 - 1105,2.0,0.0,5400500.0
165,Sarjapur,2,1145 - 1340,2.0,0.0,4349000.0
188,KR Puram,2,1015 - 1540,2.0,0.0,5680000.0
224,Devanahalli,3,1520 - 1740,2.0,1.0,7482000.0
410,Kengeri,1,34.46Sq. Meter,1.0,0.0,1850000.0
549,Hennur Road,2,1195 - 1440,2.0,0.0,6377000.0


In [13]:
# Now converting this type of entries ('1125 - 4112') into INT and storing their average
def dash_to_int(x):
    dash = x.split('-')
    if len(dash)==2:
        return (float(dash[0]) + float(dash[1]))/2
    try:
        return float(x)
    except:
        return None
df['total_sqft'] = df['total_sqft'].apply(dash_to_int)
df.rename(columns={'total_sqft':'total_area'},inplace=True)
df.head()

Unnamed: 0,location,bhk,total_area,bath,balcony,price
0,Electronic City Phase II,2,1056.0,2.0,1.0,3907000.0
1,Chikka Tirupathi,4,2600.0,5.0,3.0,12000000.0
2,Uttarahalli,3,1440.0,2.0,3.0,6200000.0
3,Lingadheeranahalli,3,1521.0,3.0,1.0,9500000.0
4,Kothanur,2,1200.0,2.0,1.0,5100000.0


In [14]:
# Cross checking total_area entries types
df[df['total_area'].apply(check_float)]

Unnamed: 0,location,bhk,total_area,bath,balcony,price
0,Electronic City Phase II,2,1056.0,2.0,1.0,3907000.0
1,Chikka Tirupathi,4,2600.0,5.0,3.0,12000000.0
2,Uttarahalli,3,1440.0,2.0,3.0,6200000.0
3,Lingadheeranahalli,3,1521.0,3.0,1.0,9500000.0
4,Kothanur,2,1200.0,2.0,1.0,5100000.0
...,...,...,...,...,...,...
13315,Whitefield,5,3453.0,4.0,0.0,23100000.0
13316,Richards Town,4,3600.0,5.0,1.0,40000000.0
13317,Raja Rajeshwari Nagar,2,1141.0,2.0,1.0,6000000.0
13318,Padmanabhanagar,4,4689.0,4.0,1.0,48800000.0


### Cleaning column: 'location'

In [15]:
df.head()

Unnamed: 0,location,bhk,total_area,bath,balcony,price
0,Electronic City Phase II,2,1056.0,2.0,1.0,3907000.0
1,Chikka Tirupathi,4,2600.0,5.0,3.0,12000000.0
2,Uttarahalli,3,1440.0,2.0,3.0,6200000.0
3,Lingadheeranahalli,3,1521.0,3.0,1.0,9500000.0
4,Kothanur,2,1200.0,2.0,1.0,5100000.0


In [16]:
# Applying strip function to stripping the location names
df['location'] = df['location'].apply(lambda x: x.strip())

In [17]:
# As we have a lot of locations name so i am reducing the name types by assigning: location occuring for less than 10times as OTHER
loc = df.location.value_counts()
loc[loc<=10].count()
location_lessthan_10 = loc[loc<=10]
df.location = df.location.apply(lambda x: 'other' if x in location_lessthan_10 else x)
df.head()



Unnamed: 0,location,bhk,total_area,bath,balcony,price
0,Electronic City Phase II,2,1056.0,2.0,1.0,3907000.0
1,Chikka Tirupathi,4,2600.0,5.0,3.0,12000000.0
2,Uttarahalli,3,1440.0,2.0,3.0,6200000.0
3,Lingadheeranahalli,3,1521.0,3.0,1.0,9500000.0
4,Kothanur,2,1200.0,2.0,1.0,5100000.0


In [18]:
# cross checking
df.location.value_counts()

other                        2882
Whitefield                    540
Sarjapur  Road                397
Electronic City               304
Kanakpura Road                273
                             ... 
Pattandur Agrahara             11
2nd Phase Judicial Layout      11
Nehru Nagar                    11
Narayanapura                   11
HAL 2nd Stage                  11
Name: location, Length: 242, dtype: int64

#### ADDING a column named 'price_per_sqft' for understanding the data more clearly

In [19]:
# Making column 'price_per_sqft'
df['price_per_sqft'] = df['price']/df['total_area']

In [25]:
df.price_per_sqft.describe()

count    11475.000000
mean      6306.189469
std       3637.242546
min        267.829813
25%       4285.714286
50%       5313.531353
75%       6851.877201
max      53150.000000
Name: price_per_sqft, dtype: float64

In [21]:
df.shape

(13303, 7)

## Outliers removal by the knowledge of bussiness domain

#### Dataset must be Distributed uniformally
1. 1bhk house must be more than 300 sqft
2. Mostly price of 3 or 4 bhk house > price of 1bhk house
3. Number of bathrooms is less than count of BHK or (bhk+2) of house
 

In [22]:
df.describe()

Unnamed: 0,bhk,total_area,bath,balcony,price,price_per_sqft
count,13303.0,13257.0,13303.0,13303.0,13303.0,13257.0
mean,2.803728,1558.809175,2.689619,1.558295,11258400.0,7912.825
std,1.295022,1238.478935,1.339393,0.807914,14899380.0,106497.6
min,1.0,1.0,1.0,0.0,800000.0,267.8298
25%,2.0,1100.0,2.0,1.0,5000000.0,4271.186
50%,3.0,1275.0,2.0,2.0,7200000.0,5438.596
75%,3.0,1680.0,3.0,2.0,12000000.0,7313.318
max,43.0,52272.0,40.0,3.0,360000000.0,12000000.0


### Normalising the data set by taking 'price_per_sqft' as a feature

In [27]:
## Removing general outliers and making dataset distributed normally
def normally_dist(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        red_df = subdf[(subdf.price_per_sqft>(m-st)) &(subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,red_df],ignore_index = True)
    return df_out
df = normally_dist(df)
df.shape


(8369, 7)

In [28]:
df.describe()

Unnamed: 0,bhk,total_area,bath,balcony,price,price_per_sqft
count,8369.0,8369.0,8369.0,8369.0,8369.0,8369.0
mean,2.638667,1489.853828,2.520134,1.571275,9023334.0,5751.929985
std,1.047813,902.712226,1.10128,0.794019,8086565.0,2307.51713
min,1.0,300.0,1.0,0.0,1000000.0,2439.02439
25%,2.0,1100.0,2.0,1.0,5000000.0,4316.455696
50%,2.0,1270.0,2.0,2.0,6800000.0,5205.431755
75%,3.0,1626.0,3.0,2.0,9926000.0,6422.764228
max,27.0,30400.0,27.0,3.0,210000000.0,26000.0


In [63]:
# Checking total_area data points
fig = px.box(df,y='total_area',
            title='Total Area')
fig.update_layout(yaxis_title ='Area')
fig.show()

In [30]:
# 1bhk house must be more than 300 sqft
df = df[~(df.total_area/df.bhk < 300)]
df.shape

(8142, 7)

In [32]:
## Viz where it is happening that 2bhk price > 3bhk

def plot_scatter(df,location):
    bhk2 = df[(df.location == location) &(df.bhk ==2)]
    bhk3 = df[(df.location == location) &(df.bhk ==3)]
    fig = go.Figure() 
    fig.add_trace(go.Scatter(x=bhk2.total_area,y=bhk2.price,
                    mode='markers',
                    name='2 BHK'))
    fig.add_trace(go.Scatter(x=bhk3.total_area,y=bhk3.price,
                    mode='markers',
                    name='3 BHK'))
    fig.update_layout(title='2BHK vs 3BHK Price', xaxis_title='Area in Sqft', yaxis_title='Price',height=620,width=680)
    fig.show()

plot_scatter(df,'Rajaji Nagar')



In [33]:
# new feature inspection = Bathrooms (i went to my bussines manager)
df.bath.unique()

array([ 3.,  2.,  4.,  5.,  8.,  1.,  6.,  7.,  9., 12., 16., 13.])

In [34]:
y0 = df.bath
y1 = df.bhk
fig = go.Figure()
fig.add_trace(go.Box(y=y0,name='Bath',quartilemethod="linear"))
fig.add_trace(go.Box(y=y1,name='BHK',quartilemethod="linear"))
fig.update_layout(title='Bath & BHK', yaxis_title='Count',height=420,width=1080)
fig.show()

In [35]:
## As per BM i know that their can not be bathrooms more than Count of BHK +2 (may be outlier)

In [36]:
df[df['bath']> df['bhk']+2]

Unnamed: 0,location,bhk,total_area,bath,balcony,price,price_per_sqft
1231,Chikkabanavar,4,2460.0,7.0,2.0,8000000.0,3252.03252
3760,Nagasandra,4,7000.0,8.0,1.0,45000000.0,6428.571429
4905,Thanisandra,3,1806.0,6.0,2.0,11600000.0,6423.03433
6423,other,6,11338.0,9.0,1.0,100000000.0,8819.897689


In [37]:
# Removing bath outliers
df.drop(df[df['bath']> df['bhk']+2].index, inplace = True)

In [38]:
y0 = df.total_area
y1 = df.bhk
fig = go.Figure()
fig.add_trace(go.Box(y=y0,name='total_area',quartilemethod="linear"))
fig.add_trace(go.Box(y=y1,name='BHK',quartilemethod="linear"))
fig.update_layout(title='Bath & BHK', yaxis_title='Count',height=420,width=1080)
fig.show()

In [41]:
df.to_csv('Final_cleaned_data_4.csv')

## AS WE CLEANED OUR DATA NOW I AM MOVING TOWARDS MODEL TRAINING

In [42]:
dff = pd.read_csv('Final_cleaned_data_4.csv',index_col=0)

In [44]:
dff.shape

(8138, 7)

In [45]:
# one hot enchoding
dummies_dff =  pd.get_dummies(dff.location)
dff1 = pd.concat([dff,dummies_dff],axis='columns')

In [46]:
#dummmy variable trap avoided and string categorical data column removed

dff = dff1.drop(columns=['Vittasandra','location'])

In [47]:
dff.head(3)

Unnamed: 0,bhk,total_area,bath,balcony,price,price_per_sqft,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,3,1630.0,3.0,2.0,19400000.0,11901.840491,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1875.0,2.0,3.0,23500000.0,12533.333333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1200.0,2.0,0.0,13000000.0,10833.333333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
dff.shape

(8138, 247)

In [49]:
df.corr()

Unnamed: 0,bhk,total_area,bath,balcony,price,price_per_sqft
bhk,1.0,0.634363,0.853028,0.192395,0.531016,0.236103
total_area,0.634363,1.0,0.649762,0.14434,0.847678,0.277765
bath,0.853028,0.649762,1.0,0.18731,0.580616,0.292715
balcony,0.192395,0.14434,0.18731,1.0,0.117207,0.033033
price,0.531016,0.847678,0.580616,0.117207,1.0,0.680268
price_per_sqft,0.236103,0.277765,0.292715,0.033033,0.680268,1.0


### MOdel training

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

model = LinearRegression()

In [87]:
%%time
x= dff.drop(columns=['price','price_per_sqft'])
y = dff1['price']

Wall time: 9.81 ms


In [88]:
y.shape

(8138,)

In [89]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.35,random_state=10)

In [90]:
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.8733565105010173

### Predictions

### As we used hot encoding for th elocation so we need a script to take input as location and get desired predictions

Input Parameters: Bhk;total_sqft;bath;location

In [91]:
x.columns

Index(['bhk', 'total_area', 'bath', 'balcony', '1st Block Jayanagar',
       '1st Phase JP Nagar', '2nd Phase Judicial Layout',
       '2nd Stage Nagarbhavi', '5th Block Hbr Layout', '5th Phase JP Nagar',
       ...
       'Vijayanagar', 'Vishveshwarya Layout', 'Vishwapriya Layout',
       'Whitefield', 'Yelachenahalli', 'Yelahanka', 'Yelahanka New Town',
       'Yelenahalli', 'Yeshwanthpur', 'other'],
      dtype='object', length=245)

In [92]:
# Script for prediction
def prediction(location,sqft,bhk,bath):
    location_index = np.where(x.columns == location)[0][0]

    X = np.zeros(len(x.columns))
    X[0]=bhk
    X[1]=sqft
    X[2]=bath  
    #X[3]=price_per_sqft   
 # X[3]= pricepersqft'( if we want to add price_p_sqft as a feature)       
    if location_index>=0:
        X[location_index]=1
    return model.predict([X])[0]


In [93]:
#Give input in order of location,sqft,bhk,bath

Price_prediction = prediction('1st Phase JP Nagar',1400,2,2)

print("The predicted price is",Price_prediction ,"lakh")

The predicted price is 11864502.724442942 lakh


### Now i am Going to check other Regression models and find out which one is the best

In [116]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(x)

In [117]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
#import xgboost as xgb 
from sklearn.tree import DecisionTreeRegressor
#from sklearn.ensemble import RandomForestRegressor

def find_best_model_using_gridsearchcv(x , y):
    algos = {
        'LinearRegression' : {
            'model' : LinearRegression(),
            'params' : {
                'normalize' : [True , False],
                'fit_intercept': [True , False],
                 'copy_X' : [True , False]
            }
        },
        'lasso' : {
            'model' : Lasso(),
            'params' : {
                'alpha' : [1, 10, 50, 200, 500],
                'selection' : ['random' , 'cyclic']
            } 
        },
         'Ridge' : {
            'model' : Ridge(), 
            'params' : {
                'alpha' : [1, 10, 50, 200, 500],
                'fit_intercept' : [True , False],
                'normalize' : [True , False],
            }
        },
        'descision_tree' : {
            'model' : DecisionTreeRegressor(),
            'params' :{
                'criterion' : ['mse' , 'friedman_mse'],
                'splitter' : ['best' , 'random']
            }
        }
    }
    
    scores = []
    cv = ShuffleSplit(n_splits = 5 , test_size = 0.2 , random_state = 0)
    for algo_name , config in algos.items():
        gs = GridSearchCV(config['model'] , config['params'] , cv = cv , return_train_score = False)
        gs.fit(x,y)
        scores.append({
            'model' : algo_name , 
            'best_score' : gs.best_score_,
            'best_params' : gs.best_params_
        })
 
    return pd.DataFrame(scores , columns = ['model' , 'best_score' , 'best_params'])  

In [122]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [123]:
find_best_model_using_gridsearchcv(x,y) 

Unnamed: 0,model,best_score,best_params
0,LinearRegression,0.863804,"{'copy_X': True, 'fit_intercept': True, 'norma..."
1,lasso,0.863804,"{'alpha': 1, 'selection': 'random'}"
2,Ridge,0.861887,"{'alpha': 1, 'fit_intercept': False, 'normaliz..."
3,descision_tree,0.791387,"{'criterion': 'mse', 'splitter': 'best'}"


### Thats why we used Liner Reg model

{Project by : Kumar Shivam}