# Data Science Regression Project : Bengaluru House Price Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

Loading Bengaluru House Price Dataset into Dataframe

In [None]:
df=pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')

# Short Analysis

In [None]:
df.shape

In [None]:
df.columns.values

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

# Data Wrangling

Converting Price(in lakhs) to normal form

In [None]:
df['price']=df['price']*100000
df.head()

Finding unique values and count for each parameter

In [None]:
# Print the frequency table of every column, separated by a horizontal rule.
separator = '__________________________________________________________________________________________\n'
for column_name in df.columns:
    print(column_name)
    print(df[column_name].value_counts())
    print(separator)

Size Data Featuring

In [None]:
df['size'].value_counts()

In [None]:
df['size'].describe()

In [None]:
# Replace missing 'size' values with the sentinel string '0' so that the
# later "take the leading integer" step yields 0 bedrooms for them.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify `df` when pandas copy-on-write is active.
df['size'] = df['size'].fillna('0')
df[df['size']=='0']

In [None]:
df['size'].unique()

Removing suffixes such as 'BHK', 'Bedroom', 'RK', etc. from size values

In [None]:
def _leading_int(label):
    """Return the first whitespace-separated token of `label` as an int."""
    return int(label.split()[0])

# '2 BHK' -> 2, '4 Bedroom' -> 4, '0' -> 0
df['size'] = df['size'].map(_leading_int)
df.head()

Bath and Balcony Data Featuring

In [None]:
df.bath.describe()

In [None]:
df[df['bath'].isnull()].head()

In [None]:
df.balcony.describe()

In [None]:
df[df['balcony'].isnull()].head()

Filling Null Values of bath and balcony 

In [None]:
# Fill missing 'bath' / 'balcony' values according to the row's own size:
#   size == 0 -> 0
#   size >= 5 -> 3
#   otherwise -> integer part of the column mean
# The fallback is built as a Series aligned on df.index so that fillna picks
# the value that belongs to each row. (A whole-column fillna inside a loop
# over rows would fill every gap on the first iteration, using only the
# first row's size.)
_sizes = df['size']
for _column in ('bath', 'balcony'):
    _mean = df[_column].mean()
    # An empty / all-NaN column has no mean; fall back to 0 in that case.
    _typical = int(_mean) if pd.notna(_mean) else 0
    _fallback = pd.Series(
        np.select([_sizes == 0, _sizes >= 5], [0, 3], default=_typical),
        index=df.index,
    )
    df[_column] = df[_column].fillna(_fallback)

In [None]:
df.head()

total_sqft data Featuring

Finding total_sqft values which are in string format

In [None]:
def is_float(x):
    """Return True if `x` can be converted with float(), otherwise False.

    Only conversion failures are treated as "not a float"; other exceptions
    (e.g. KeyboardInterrupt) are no longer swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
df[~df['total_sqft'].apply(is_float)].head(10)

Above total_sqft data shows values are in form of range or in different units  
Converting values which are in range by simply taking the mean of the min and max of the range.  
eg. for 2100-2850 the mean will be (2100+2850)/2=2475  
And for values with different units, converting them to square foot by unit conversion.

In [None]:
def total_sqft_modification(my_list):
    """Convert one split `total_sqft` entry to a float number of square feet.

    Parameters
    ----------
    my_list : list of str
        The original cell value split on '-'. A single element is either a
        plain number ('1200') or a number followed by a unit
        ('34.46Sq. Meter'). Two or more elements are treated as a range
        ('2100 ', ' 2850') and only the first two are used.

    Returns
    -------
    float
        The area in square feet; for a range, the mean of its two ends.

    Raises
    ------
    ValueError
        If a single-element value is neither a number nor a number followed
        by one of the supported units.
    """
    # Square feet per one unit. 'Cents' is 1/100 of an acre, so it is kept
    # consistent with the 'Acres' factor.
    unit_to_sqft = {
        'Sq. Meter': 10.7639,
        'Sq. Yards': 9.0,
        'Perch': 272.25,
        'Acres': 43560.0,
        'Cents': 435.6,
        'Guntha': 1089.0,
        'Grounds': 2400.0,
    }

    # Range such as '2100 - 2850': average the two ends.
    if len(my_list) != 1:
        return (float(my_list[0]) + float(my_list[1])) / 2.0

    raw_value = my_list[0]
    try:
        return float(raw_value)
    except ValueError:
        pass  # not a plain number; try "<number><unit>" below

    # Leading number (optionally with a decimal part), then the unit name;
    # whitespace around either part is ignored.
    parsed = re.match(r'\s*(\d*\.?\d+)\s*(.+?)\s*$', raw_value)
    if parsed is None or parsed.group(2) not in unit_to_sqft:
        raise ValueError('Unrecognised total_sqft value: %r' % (raw_value,))

    return float(parsed.group(1)) * unit_to_sqft[parsed.group(2)]

In [None]:
df['total_sqft'] = df.total_sqft.str.split('-').apply(total_sqft_modification)

In [None]:
df.total_sqft.describe()

In [None]:
df.head()

Availability Data Featuring

Converting availability values with month year to only year  
eg. 19-Dec to 2019

In [None]:
# Keep only the text before the first '-' (e.g. '19-Dec' -> '19');
# values without a '-' are left unchanged.
df['availability'] = df['availability'].map(lambda label: str(label.partition('-')[0]))

In [None]:
df.head()

In [None]:
# Expand the two-character tokens left by the previous step to four
# characters by prefixing '20' (e.g. '19' -> '2019'). The two status labels
# are kept as they are.
_status_labels = ('Immediate Possession', 'Ready To Move')
_year_map = {
    token: '20' + token
    for token in df['availability'].unique()
    if token not in _status_labels
}
# Fail loudly (ValueError) if a token is not numeric, before touching df.
for _expanded in _year_map.values():
    int(_expanded)
# Single pass over the column instead of one full-column replace per token.
df['availability'] = df['availability'].map(lambda label: _year_map.get(label, label))

In [None]:
df.head()

In [None]:
df.availability.value_counts()

In [None]:
df[df['location'].isnull()]

In [None]:
# Fill missing locations with the catch-all label 'Others'.
# Assign back rather than using fillna(inplace=True) on the column
# selection, which is deprecated and ineffective under pandas copy-on-write.
df['location'] = df['location'].fillna('Others')

Removing Society from Dataframe as most of the data is missing

In [None]:
df.drop('society',axis=1,inplace=True)
df.head()

Adding New Feature of Price per Square Feet

In [None]:
df['price_per_sqft']=df['price']/df['total_sqft']
df.head()

Removing Apartments having total_sqft less than 300

In [None]:
df[df['total_sqft']<300]

In [None]:
df.drop(df[df['total_sqft']<300].index,inplace=True)

Removing Apartments having price per sqft rate higher than 40000

In [None]:
df[df['price_per_sqft']>=40000].head()

In [None]:
df.drop(df[df['price_per_sqft']>=40000].index,inplace=True)

Removing Apartments having price per square foot rate less than 1500

In [None]:
df[df.price_per_sqft<=1500].head()

In [None]:
df.drop(df[df['price_per_sqft']<=1500].index,inplace=True)

Removing Apartments according to size and total square foot  
eg. It's generally not possible for a 1 bhk apartment to have an area of 10763.9 sqft, so such listings are excluded

In [None]:
df.shape

In [None]:
df[(df['size']==1) & (df['total_sqft']/df['size']>=1800)]

In [None]:
df.drop(df[(df['size']==1) & (df['total_sqft']/df['size']>=1800)].index,inplace=True)

In [None]:
df[(df['size']>1) & (df['size']<4) & (df['total_sqft']/df['size']>=3000)]

In [None]:
df.drop(df[(df['size']>1) & (df['size']<4) & (df['total_sqft']/df['size']>=3000)].index,inplace=True)

In [None]:
df[(df['size']>3) & (df['total_sqft']/df['size']>=4000)]

In [None]:
df.drop(df[(df['size']>3) & (df['total_sqft']/df['size']>=4000)].index,inplace=True)

In [None]:
df[df['total_sqft']/df['size']<300].head()

In [None]:
# Drop every row with less than 300 sqft per bedroom. No .head() here:
# that would restrict the drop to the first five offending rows only.
df.drop(df[df['total_sqft']/df['size']<300].index,inplace=True)

Removing Data having number of bathrooms much greater than size(bhk)  
eg. An apartment of 3 bhk or less usually doesn't have 5 baths (the maximum is typically 1 common bath plus 1 attached to each of the 3 bedrooms)  
But apartments with size greater than 4 may have baths equal to size+2

In [None]:
# Preview the rows removed by the next cell: strictly more than size+2 baths.
df[df['bath']>df['size']+2].head()

In [None]:
df.drop(df[df['bath']>df['size']+2].index,inplace=True)

In [None]:
df[(df['size']<=5) & (df['bath']>=df['size']+2)].head()

In [None]:
df.drop(df[(df['size']<=5) & (df['bath']>=df['size']+2)].index,inplace=True)

The data shown below is invalid because  
1] 18 bhk houses usually don't exist  
2] And if they exist their total_sqft is not as low as shown below

In [None]:
df[df['size']>=17]

In [None]:
df.drop(df[df['size']>=17].index,inplace=True)

Converting Location values having total count less than or equal to 10 to 'Others'  
This also helps while building the model, as there will be fewer columns in the dummy encoding

In [None]:
locations=dict(df['location'].value_counts())
locations

In [None]:
# Keep only the locations that occur at most 10 times.
locations = {name: count for name, count in locations.items() if not count > 10}
locations

In [None]:
# Collapse the rare locations collected in `locations` into 'Others'.
# Assign back instead of replace(inplace=True) on an attribute selection,
# which is deprecated and ineffective under pandas copy-on-write.
df['location'] = df['location'].replace(list(locations.keys()), 'Others')

In [None]:
df.location.value_counts()

In [None]:
df.shape

Removing outliers with the help of 'price per sqft', taking the mean and std per location


In [None]:
def remove_pps_outliers(df):
    """Keep, per location, only rows whose price_per_sqft is near the mean.

    For every 'location' group the mean ``m`` and standard deviation ``st``
    of 'price_per_sqft' are computed, and rows with
    ``m - st < price_per_sqft <= m + st`` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'location' and 'price_per_sqft' columns.

    Returns
    -------
    pandas.DataFrame
        The kept rows of all groups, concatenated in group order with a
        fresh 0..n-1 index. An input without groups yields an empty frame
        that still has the input's columns.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = np.mean(pps)
        st = np.std(pps)
        kept_groups.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept_groups:
        # pd.concat rejects an empty list; keep the schema instead.
        return df.iloc[0:0].reset_index(drop=True)

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

df = remove_pps_outliers(df)
df.shape

Graph of 2 BHK and 3 BHK apartments at Yelahanka(location) with Price Vs Total Sqft

In [None]:
def plot_scatter_chart(df1,location):
    """Scatter price against total_sqft for 2 BHK and 3 BHK rows of one location.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Must contain 'location', 'size', 'total_sqft' and 'price' columns.
    location : str
        Value of the 'location' column to plot; also used as the title.

    Note: sets the global default figure size to (15, 10).
    """
    in_location = df1['location']==location
    bhk2 = df1[in_location & (df1['size']==2)]
    bhk3 = df1[in_location & (df1['size']==3)]
    plt.rcParams['figure.figsize'] = (15,10)
    plt.scatter(bhk2['total_sqft'],bhk2['price'],color='orange',label='2 BHK', s=50)
    plt.scatter(bhk3['total_sqft'],bhk3['price'],marker='*', color='blue',label='3 BHK', s=50)
    plt.xlabel("Total Square Feet Area")
    # 'price' was multiplied by 100000 earlier in the notebook, so the axis
    # is in rupees, not lakhs.
    plt.ylabel("Price (Indian Rupees)")
    plt.title(location)
    plt.legend()
plot_scatter_chart(df,"Yelahanka")

From the above graph it is observed that some 2 BHK apartments have a higher price than 3 BHK apartments with approximately the same total sqft 

So removing apartments of size 's' whose price_per_sqft is less than the mean price_per_sqft of size 's-1' apartments in the same location

In [None]:
def remove_bhk_outliers(df):
    """Drop rows priced per sqft below the mean of the next-smaller size.

    Within each 'location', a row of size ``s`` is dropped when its
    'price_per_sqft' is below the mean 'price_per_sqft' of the size ``s-1``
    rows of that location, provided that group has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'location', 'size' and 'price_per_sqft' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` without the excluded rows; the index is not reset.
    """
    # Plain list of index labels: keeps the labels' own type and avoids
    # re-allocating an array on every append.
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        pps_by_size = {
            bhk: bhk_df.price_per_sqft
            for bhk, bhk_df in location_df.groupby('size')
        }
        for bhk, pps in pps_by_size.items():
            smaller_pps = pps_by_size.get(bhk - 1)
            # Compare only against a reasonably populated smaller size.
            if smaller_pps is not None and len(smaller_pps) > 5:
                exclude_labels.extend(pps[pps < np.mean(smaller_pps)].index)
    return df.drop(index=exclude_labels)
df = remove_bhk_outliers(df)

In [None]:
df.shape

The below graph shows the expected results at Yelahanka

In [None]:
# plot_scatter_chart is already defined earlier in the notebook with the
# same body; re-plot with the filtered data instead of redefining it.
plot_scatter_chart(df,"Yelahanka")

# Data Visualization

In [None]:
df.describe()

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,6)

Creating a function for plotting scatter of various parameters against price

In [None]:
def scatter(parameter):
    """Scatter the notebook-level df's `parameter` column against its 'price' column.

    The x axis is labelled with `parameter`, and the y ticks are set to
    np.arange(10000000, 250000000, 10000000).
    """
    x_values = df[parameter]
    prices = df['price']
    price_ticks = np.arange(10000000,250000000,10000000)

    plt.scatter(x_values, prices)
    plt.title((parameter+' available in  range'))
    plt.xlabel(parameter)
    plt.ylabel('price range')
    plt.yticks(price_ticks)

Creating a function for plotting histogram of unique values of various parameters

In [None]:
def bar(parameter):
    """Draw a bar chart of how often each value of df[parameter] occurs.

    Reads the notebook-level `df`; bar positions are the distinct values and
    bar heights are their counts.
    """
    counts = dict(df[parameter].value_counts())
    categories, heights = counts.keys(), counts.values()
    plt.bar(categories, heights)
    plt.title(parameter+' Vs total count')
    plt.xlabel(parameter)
    plt.ylabel('Count')

Most of the apartments in Bengaluru are of size 2-3 bhk

In [None]:
plt.xticks(np.arange(1,17,1))
bar('size')

In [None]:
plt.xticks(np.arange(1, 17, 1))
scatter('size')

Most of the Apartments are in range of 5000-10000 price per sqft

In [None]:
plt.hist(df['price_per_sqft'],rwidth=0.9,align='left',bins=8)
plt.xlabel('Per Square Feet')
plt.ylabel('Count')
plt.xticks(np.arange(0,30000,2500))
plt.title('Price per square foot vs total count in that price range')

In [None]:
plt.xticks(np.arange(1,18,1))
plt.yticks(np.arange(0,40000,2500))
plt.scatter(df['size'],df['price_per_sqft'])
plt.xlabel('size(in bhk)')
plt.ylabel('Price Per Square Feet')
plt.title('size(in bhk) availability vs price per square feet')

Most of the data of area type was given in Super built-up Area

In [None]:
bar('area_type')

Most of the apartments are available for Ready to move while a thousand are ready till 2018

In [None]:
bar('availability')

In [None]:
scatter('availability')

Most of the apartments have 1-2 balconies

In [None]:
bar('balcony')

Most of the apartments have 2 baths

In [None]:
plt.xticks(np.arange(1,11,1))
bar('bath')

Looking for the correlation of a few parameters with price

In [None]:
import seaborn as sns
num_vars = ["bath", "balcony",'total_sqft','size','price']
sns.heatmap(df[num_vars].corr(),cmap="coolwarm", annot=True)


# Preprocessing Data for building ML Model

Removing the 'balcony' column, as its correlation with price is very low  
Also removing price_per_sqft, as it was created only for feature engineering  
And area_type, because it also doesn't affect the price

In [None]:
df.drop(['price_per_sqft','area_type','balcony'],axis=1,inplace=True)

In [None]:
df.head()

Using one-hot encoding (pandas `get_dummies`) to create dummy columns for availability and location

In [None]:
availability_dummy=pd.get_dummies(df.availability)
availability_dummy.head(3)

In [None]:
location_dummy=pd.get_dummies(df.location)
location_dummy.head(3)

# Creating Final DataFrame for Model Deployment

In [None]:
df_final=pd.concat([df,availability_dummy,location_dummy.drop('Others',axis=1)],axis=1)
df_final.drop(['availability','location'],axis=1,inplace=True)
df_final.head()

In [None]:
X=df_final.drop(['price'],axis=1)
y=df_final['price']

In [None]:
X.head(3)

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and every score computed from it, is the same on each run of the notebook.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
print('Shape of X_train : ',X_train.shape)
print('Shape of y_train : ',y_train.shape)
print('Shape of X_test : ',X_test.shape)
print('Shape of y_test : ',y_test.shape)

# Trying Different Regression Algorithms and  selecting one with highest score

In [None]:
from sklearn.linear_model import LinearRegression, Lasso,Ridge,BayesianRidge,OrthogonalMatchingPursuit,LassoLars
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.cross_decomposition import PLSRegression
from xgboost import XGBRegressor
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV

Finding best model using GridSearchCV

In [None]:
# Candidate regressors for model selection.
# Maps a display name to:
#   'model'  - an estimator instance
#   'params' - the hyper-parameter grid searched for that estimator
#              (an empty dict means only the default configuration is tried)
# NOTE(review): several grid values below ('normalize', criterion 'mse',
# loss 'ls'/'lad', max_features 'auto') look tied to older scikit-learn
# releases and may be rejected by newer ones - confirm against the
# installed version before re-running.
model_parameters={
    
    'Linear Regression':{
        'model' :             LinearRegression(),
        'params':{
            # NOTE(review): 'normalize' may not exist in newer scikit-learn - confirm
            'normalize':     [True, False]
        }
    },
    
    'Decision Regression':{
        'model' :             DecisionTreeRegressor(),
        'params':{
            'criterion':     ['mse', 'friedman_mse'],
            'splitter':      ['best','random'],
            'max_features':  [None,'auto', 'sqrt', 'log2']
        }
    },
    
    'Random Forest Regression':{
        'model' :             RandomForestRegressor(),
        'params':{
            'max_features':  [None,'auto','sqrt','log2']
        }
    },
    
    'Gradient Boosting Regressor':{
        'model' :             GradientBoostingRegressor(),
        'params':{
            'loss':          ['ls', 'lad', 'huber', 'quantile'],
            'criterion':     ['friedman_mse', 'mse'],
            'max_features':  [None,'auto', 'sqrt', 'log2']
        }
    },
    
    'Lasso Regressor':{
        'model' :             Lasso(),
        'params':{
            'selection':     ['random', 'cyclic']
        }
    },
    
    'Ridge Regressor':{
        'model' :            Ridge(),
        'params':{
            'solver':        ['auto','lsqr','saga'],
            'alpha' :        [0,0.2,0.4,0.6,0.8,1]
        }
    },
    
    'Extra Tree Regressor':{
        'model' :             ExtraTreeRegressor(),
        'params':{
            'max_features':  ['sqrt','log2']
        }
    },
    
    # The next four estimators are evaluated with their defaults only.
    'Bayesian Ridge Regressor':{
        'model' :             BayesianRidge(),
        'params':{}
    },
    
    'PLS Regression':{
        'model' :             PLSRegression(),
        'params':{}
    },
    
    'OrthogonalMatchingPursuit Regressor':{
        'model' :             OrthogonalMatchingPursuit(),
        'params':{}
    },
    
    'LassoLars Regressor':{
        'model' :             LassoLars(),
        'params':{}
    },
    
    'XGB Regressor':{
        'model' :              XGBRegressor(),
        'params':{
            'n_estimators':[10,50,100,500,1000]
        }
    }
}

In [None]:
def best_regression_parameters_model(X,y):
    """Grid-search every entry of `model_parameters` and tabulate the winners.

    Each estimator is searched over its own parameter grid with a 5-split
    ShuffleSplit (20% test size, random_state=0).

    Parameters
    ----------
    X : feature matrix passed to GridSearchCV.fit
    y : target values passed to GridSearchCV.fit

    Returns
    -------
    pandas.DataFrame
        One row per estimator with the columns 'Model', 'Best Parameters'
        and 'Best Score'.
    """
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    summary_rows = []
    for label, spec in model_parameters.items():
        search = GridSearchCV(
            spec['model'],
            spec['params'],
            return_train_score=False,
            cv=splitter,
        )
        search.fit(X,y)
        row = {
            'Model': label,
            'Best Parameters': search.best_params_,
            'Best Score': search.best_score_,
        }
        summary_rows.append(row)
    return pd.DataFrame(summary_rows,columns=['Model','Best Parameters','Best Score'])
models=best_regression_parameters_model(X,y)

In [None]:
models['Best Score']=models['Best Score']*100
models

In [None]:
models[models['Best Score']>83]

From the above table it is observed that the best score is achieved by the XGBoost algorithm  
So XGBoost is used for price prediction

In [None]:
for a in [10,50,100,500,1000]:
    xgb_reg = XGBRegressor(n_estimators=a)
    xgb_reg.fit(X_train,y_train)
    print(a," : ",xgb_reg.score(X_test,y_test)*100)

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(XGBRegressor(n_estimators=500),X_train,y_train,cv=10)

# Building the Model

In [None]:
model= XGBRegressor(n_estimators=500)
model.fit(X_train,y_train)
model.score(X_test,y_test)*100

In [None]:
model.predict(X_test)#predicting test data