# Project to predict Price of Houses in Bengaluru using Regression

In [None]:
import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
bhd = pd.read_csv("/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv")

In [None]:
bhd.head()

In [None]:
bhd.columns

In [None]:
bhd.shape

There are 9 columns and 13,320 entries in the dataset.

In [None]:
bhd.dtypes

In [None]:
bhd.isna().sum()

It seems we do have null values in the dataset and will have to treat them appropriately.

Renaming the price column to <code>price_lakhs</code> removes the ambiguity about its unit.

In [None]:
bhd = bhd.rename(columns = {'price':'price_lakhs'})

In [None]:
bhd.describe(include='all')

From this preliminary analysis we get to know that:-

a). On average, every house has at least one balcony and a couple of bathrooms, with the price frequently ranging somewhere between 70 and 112 lakhs.

b). Most of the houses available are 2 BHK, the most frequent society is GrrvaGr, and the most frequent availability is 'Ready To Move'.

## Now, we'll explore data and do data cleaning side by side.

Lets explore <code>area_type</code> column

In [None]:
# Bar chart of how many listings fall under each area type.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='area_type', data=bhd, ax=ax)
plt.show()

**Super built-up** Area is most common type followed by **Built-up** and **Plot** Area.

In [None]:
bhd.availability.value_counts()

Nearly three quarters of the houses are available as ready to move.

Since we don't know when the data was gathered, the availability dates cannot be interpreted reliably. Instead, we'll classify each house as "ready to move" or not and rename the column to <code>avail_now</code>.

In [None]:
def check_avail(l):
    """Return 1 when the availability label is exactly 'Ready To Move', otherwise 0."""
    return 1 if l == 'Ready To Move' else 0

In [None]:
bhd['availability'] = bhd['availability'].apply(check_avail)

In [None]:
bhd.rename(columns= {'availability':'avail_now'}, inplace = True)

Lets just plot and have a look.

In [None]:
# Count of listings that are ready to move (1) versus not (0).
fig, ax = plt.subplots(figsize=(10, 3))
sns.countplot(x='avail_now', data=bhd, ax=ax)
plt.show()

In [None]:
bhd.location.value_counts()

In [None]:
# Horizontal bar chart of the ten locations with the most listings.
top_locations = bhd['location'].value_counts().head(10)
top_locations.plot.barh(figsize=(14, 6))
plt.show()

So from the visualization we learn that **Whitefield** has the highest number of apartments available, followed by **Sarjapur Road** and **Electronic City**.

In [None]:
bhd['size'].value_counts()

In [None]:
# Reduce labels such as "2 BHK" or "4 Bedroom" to their leading token.
# Missing sizes stringify to 'nan' and are replaced by 3 before the int cast.
bhd['size'] = (
    bhd['size']
    .map(lambda label: str(label).split(' ')[0])
    .map(lambda token: 3 if token == 'nan' else token)
    .map(int)
)

In [None]:
bhd['size'].value_counts().plot(kind= 'bar',figsize=(10,4))
plt.show()

In [None]:
bhd.rename(columns= {'size':'size_bhk'}, inplace = True)

In [None]:
bhd['society'].value_counts()

In [None]:
bhd['society'].isna().sum()

Since nearly half of the values in this column are missing, we'll drop the column.

In [None]:
bhd.drop(['society'],axis=1,inplace=True)

In [None]:
bhd.total_sqft.value_counts()

In [None]:
def oton(k):
    """Convert a raw ``total_sqft`` entry to a single area value.

    Parameters
    ----------
    k : str or other
        A plain number ("1200") or a range written as "low - high"
        ("1000 - 2000"). Values that are not strings (for example entries
        already converted by an earlier run of this cell) are returned
        unchanged, which makes the conversion safe to re-run.

    Returns
    -------
    float
        The number itself, or the midpoint of the range.
    list of str
        The '-'-separated pieces when any piece is not numeric
        (e.g. "34.46Sq. Meter"). Downstream cells treat every non-float
        value as "could not be parsed".
    """
    if not isinstance(k, str):
        return k

    num = k.split('-')
    total = 0.0
    for part in num:
        try:
            # float() tolerates the spaces around the dash in "1000 - 2000".
            total += float(part)
        except ValueError:
            return num
    # Average only after every piece has been added. Returning from inside
    # the loop would divide just the first bound by the number of pieces.
    return total / len(num)

In [None]:
bhd['total_sqft']=bhd['total_sqft'].apply(oton)

In [None]:
# assigning sqft value to corresponding house sizes in bhk
# `size` lists the distinct BHK counts (most frequent first) and `sqft[i]`
# holds the mean parsed area of the listings whose size is `size[i]`.
size = list(bhd['size_bhk'].value_counts().index)

# Mean over every parsed area, used as a fallback for a BHK count that has no
# parseable area of its own (the per-size division would otherwise be 0/0).
overall_total = 0.0
overall_count = 0
for area in bhd['total_sqft']:
    if type(area) == float:
        overall_total += area
        overall_count += 1
overall_mean = overall_total / overall_count if overall_count else float('nan')

sqft = []
for bhk in size:
    # Accumulate under a name other than `sum`, so the builtin stays usable
    # in the rest of the notebook.
    area_total = 0.0
    area_count = 0
    for area in bhd.loc[bhd['size_bhk'] == bhk, 'total_sqft']:
        # Only Python floats are successfully parsed areas; anything else is
        # an unparsed leftover and is skipped.
        if type(area) == float:
            area_total += area
            area_count += 1
    sqft.append(area_total / area_count if area_count else overall_mean)

In [None]:
def fitk(v):
    """Return a usable area for one (size_bhk, total_sqft) pair.

    Parameters
    ----------
    v : pandas.Series or sequence
        Two values in order: the BHK count, then the area entry. A DataFrame
        row passed by ``apply(axis=1)`` is read by position with ``.iloc``,
        because integer keys on a label-indexed Series are deprecated as
        positional lookups. Plain sequences are indexed directly.

    Returns
    -------
    The area entry itself when it is a Python float; otherwise the mean area
    stored in the notebook-level ``sqft`` list for that BHK count (looked up
    through the parallel ``size`` list).
    """
    if hasattr(v, 'iloc'):
        c1, c2 = v.iloc[0], v.iloc[1]
    else:
        c1, c2 = v[0], v[1]

    # Exact type check on purpose: unparsed entries are non-float objects.
    if type(c2) != float:
        return sqft[size.index(c1)]
    return c2

In [None]:
bhd['total_sqft'] = bhd[['size_bhk','total_sqft']].apply(fitk,axis=1)

In [None]:
# Histogram of the area values after conversion. sns.distplot is deprecated,
# so the plain count histogram is drawn with sns.histplot instead.
plt.figure(figsize=(14,5))
sns.histplot(bhd['total_sqft'], kde=False)
plt.show()

In [None]:
# Remove listings whose area is below 300 sqft. The matching rows are selected
# by index label and dropped from bhd in place.
bhd.drop(bhd[bhd['total_sqft'] < 300].index,inplace=True)

In [None]:
bhd.total_sqft.std()

In [None]:
bhd.total_sqft.mean()

In [None]:
# Remove listings whose area exceeds 10000 sqft (upper outlier cut-off),
# again dropping the matching rows from bhd in place.
bhd.drop(bhd[bhd['total_sqft'] > 10000].index,inplace=True)

In [None]:
# How many listings have each bathroom count.
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(data=bhd, x='bath', ax=ax)
plt.show()

Majority of the houses are with two bathrooms.

In [None]:
bhd.bath.isna().sum()

In [None]:
bhd.balcony.isna().sum()

In [None]:
bhd.dropna(axis=0,how='any',inplace=True)

In [None]:
# converting to int datatype 
# Bathroom and balcony counts are whole numbers; cast both columns to int.
for count_col in ('bath', 'balcony'):
    bhd[count_col] = bhd[count_col].map(int)

In [None]:
# Visual null check: any remaining missing cell would show up as a contrasting mark.
null_mask = bhd.isna()
sns.heatmap(data=null_mask, cmap='cividis', cbar=False, yticklabels=False)
plt.show()

So our data is free from any null values.

In [None]:
bhd.duplicated().sum()

In [None]:
bhd.drop_duplicates(inplace=True)

Removing duplicate entries from the dataset.

In [None]:
bhd.info()

In [None]:
bhd.head()

## EDA

In [None]:
# Listings per area type after cleaning.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=bhd, x='area_type', ax=ax)
plt.show()

In [None]:
# Listings per area type, split by immediate availability.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=bhd, x='area_type', hue='avail_now', ax=ax)
plt.show()

So most of the houses that are available immediately are of the Super built-up area type.

In [None]:
# Listings per BHK count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=bhd, x='size_bhk', ax=ax)
plt.show()

A large chunk of houses available are either 2 or 3 BHK.

In [None]:
# Area distribution after outlier removal. sns.distplot is deprecated, so the
# plain count histogram is drawn with sns.histplot instead.
plt.figure(figsize=(10,6))
sns.histplot(x=bhd['total_sqft'], kde=False)
plt.show()

Most of the houses have area in the range of 1000 to 5000 sqft.

In [None]:
# Price histogram built from the distinct prices, each weighted by how many
# listings carry that price.
y = bhd['price_lakhs'].value_counts()
fig, ax = plt.subplots(figsize=(16, 5))
ax.hist(x=y.index, weights=y, bins=60)
plt.show()

Majority of the houses available are below 200 lakhs

Houses in the price range of 50-100 lakhs are the most numerous, followed by houses priced below 50 lakhs and houses priced between 100 and 150 lakhs.

In [None]:
# Correlate the numeric columns only. bhd still holds string columns here
# (area_type, location), and DataFrame.corr() raises on those in pandas >= 2.0.
sns.heatmap(bhd.select_dtypes(include='number').corr())
plt.show()

In [None]:
sns.pairplot(data=bhd)

In [None]:
bhd.drop(['location'],axis=1,inplace=True)

Adding one more column: the price per square foot (<code>ppsft</code>).

In [None]:
# Price per square foot: the price in lakhs scaled by 100000 (1 lakh = 100,000)
# and divided by the area.
# NOTE(review): this column is computed directly from the target price_lakhs,
# so it carries the answer with it if it is ever used as a model input.
bhd['ppsft'] = (bhd['price_lakhs']*100000)/bhd['total_sqft']

In [None]:
# Correlation table over the numeric columns only. area_type is still a string
# column, and DataFrame.corr() raises on it in pandas >= 2.0.
bhd.select_dtypes(include='number').corr()

In [None]:
bhd.describe()

In [None]:
# Remove listings with more than 10 bathrooms (outlier cut-off), dropping the
# matching rows from bhd in place.
bhd.drop(bhd[bhd['bath'] > 10].index,inplace=True)

In [None]:
sns.boxplot(data=bhd)

Making the dataset ready for the prediction via Regression.

In [None]:
bhdn = pd.get_dummies(bhd,drop_first=True)

In [None]:
# Target: the listing price in lakhs, kept as a one-column DataFrame.
y = bhdn[['price_lakhs']]

# Features: every column except the target and 'ppsft'. ppsft is
# price_lakhs * 100000 / total_sqft, i.e. it is computed from the target, so
# keeping it would leak the price into the model and inflate every score.
X = bhdn.drop(['price_lakhs', 'ppsft'],axis=1)

In [None]:
bhdn.head()

In [None]:
X.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
Lr = LinearRegression()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=15)

In [None]:
Lr.fit(X_train,y_train)

In [None]:
Lr.score(X_train,y_train)

In [None]:
Lr.score(X_test,y_test)

In [None]:
y_hat = Lr.predict(X_test)

**Checking the r-squared value**

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(y_test,y_hat)

In [None]:
# Six random 80/20 shuffles; one score per split is collected in `ar`.
cv = ShuffleSplit(test_size=0.2, n_splits=6, random_state=10)
ar = cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=cv)

In [None]:
ar

In [None]:
print(f"This model can explain nearly {round(ar.mean()*100,2)}% of the results.")