# Data Science Regression Project: Predicting home prices in Bangalore

#### The dataset is downloaded from here: https://www.kaggle.com/datasets/shantanudhakadd/house-prediction-dataset?datasetId=1965889

In [409]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams["figure.figsize"]=(20,10)

In [410]:
df=pd.read_csv("../input/house-prediction-dataset/Bengaluru_House_Data.csv")
df.head()

In [411]:
df.shape

In [412]:
df.info()

# Let us first define what information the columns contain:
* area_type: Specifies whether the given area is super built-up, built-up, or plot area
* availability: When the house will be available to move into
* location: Colony ( location) name
* size: Size of the house
* society: Society name
* total_sqft: Total Square feet area of the house
* bath: Number of Bathrooms
* balcony: Number of balconies
* price: Final Price of the house

# Data Cleaning

In [413]:
# Dropping the columns which are not important features

df2=df.drop(['area_type','society','balcony','availability'],axis=1)
df2.head()

In [414]:
df2.describe()

In [415]:
# Counting the number of missing values in the dataset

df2.isnull().sum()


In [416]:
# Handling location column

df2['location'].value_counts()

### Whitefield and Sarjapur Road are the most frequent values in the location column, so we will fill the null values in the location column with the most frequent one, Whitefield.

In [417]:
# Filling the NaN  values in location with the most frequent value.

df2['location']=df2['location'].fillna('Whitefield')

In [418]:
# Handling size column

df2['size'].value_counts()

In [419]:
# Filling the missing value in size column with the most frequent value

df2['size']=df2['size'].fillna('2 BHK')

In [420]:
# Filling the missing value in bath column with the median of the bath column

df2['bath']=df2['bath'].fillna(df2['bath'].median())

In [421]:
df2.info()

### Hence, we have successfully handled all the missing values.

In [422]:
df2['size'].unique()

In [423]:
# Creating a new column named bhk in which only the numerical value of bhk is kept.

df2['bhk']=df2['size'].apply(lambda x: int(x.split(' ')[0]))

In [424]:
df2.head()

In [425]:
df2['bhk'].unique()

In [426]:
# Viewing the rows having more than 20 bhk

df2[df2.bhk>20]

### These are outliers in the bhk column which need to be handled

In [427]:
df2['total_sqft'].unique()

In [428]:
# Handling the values in total_sqft which is present as range

def convertRange(x):
    """Convert a ``total_sqft`` entry to a single float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"2100"``), a range written as
        ``"<low> - <high>"`` (``"1000 - 1200"``), or anything else.

    Returns
    -------
    float or None
        The parsed number, the midpoint of the range, or ``None`` when the
        value cannot be interpreted as either (e.g. ``"34.46Sq. Meter"``).
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. unit-suffixed bounds or a leading
                # minus sign); fall through and try parsing it as one number.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        # Only conversion failures mean "unparseable"; other errors propagate.
        return None
    
    
df3=df2.copy()
df3['total_sqft']=df3['total_sqft'].apply(convertRange)
df3.head()

In [429]:
df3['total_sqft'].unique()

In [430]:
df3.isnull().sum()

In [431]:
df3=df3.dropna()

In [432]:
# Calculating price per square feet for each row to help detect outliers

df4=df3.copy()
df4['price_per_sqft']=df3['price']*100000/df3['total_sqft']
df4.head()

In [433]:
location_data=df4["location"].value_counts()
location_data

In [434]:
# Removing the spaces in values if there is any.

df4['location']=df4['location'].apply(lambda x: x.strip())
location_data=df4["location"].value_counts()
location_data

In [435]:
# Collecting all the locations whose count is 10 or fewer

location_count_lessthan10=location_data[location_data<=10]
location_count_lessthan10

In [436]:
def handle_location_column(value):
    """Return 'others' for a location listed in ``location_count_lessthan10``.

    The membership test is made against the notebook-level
    ``location_count_lessthan10`` object; any value not found there is
    returned unchanged.
    """
    return 'others' if value in location_count_lessthan10 else value
    
df4['location']=df4['location'].apply(handle_location_column)
df4['location'].value_counts()

In [437]:
df4.head()

# Outlier detection and removal

In [438]:
df4.describe()

In [439]:
# Finding the outliers and anomalies for total_sqft column

(df4['total_sqft']/df4['bhk']).describe()

In [440]:
 # Removing all those rows which has total_sqft/bhk <300

df5=df4.copy()
df5=df5[((df5['total_sqft']/df5['bhk']) >=300)]
df5.describe()

In [441]:
df5.isnull().sum()

In [442]:
df5.shape

In [443]:
# Finding outliers for price_per_sqft column

df5.price_per_sqft.describe()

In [444]:
# Keeping only the rows whose price_per_sqft lies within 3 standard deviations of the mean for their location

def remove_pps_outliers(df):
    """Drop rows whose ``price_per_sqft`` is far from their location's mean.

    For every ``location`` group the mean ``m`` and population standard
    deviation ``st`` (``ddof=0``) of ``price_per_sqft`` are computed, and only
    rows with ``m - 3*st <= price_per_sqft <= m + 3*st`` are kept.

    The lower bound is inclusive so that a group with zero spread (a single
    listing, or identical prices) is kept instead of being dropped entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The retained rows, concatenated in group iteration order with a fresh
        ``0..n-1`` index. An empty frame with the input's columns is returned
        when there is nothing to keep.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = pps.mean()
        st = pps.std(ddof=0)
        kept_groups.append(subdf[(pps >= (m - 3 * st)) & (pps <= (m + 3 * st))])
    if not kept_groups:
        # pd.concat raises on an empty list; keep the schema of the input.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)
df6 = remove_pps_outliers(df5)
df6.describe()

In [445]:
df6.head()

In [446]:
# Removing outliers present in bhk column using price_per_sqft column

def bhk_outlier_removal(df):
    """Remove listings priced below the mean of the next-smaller bhk size.

    Within each ``location``, a row with ``bhk = k`` is dropped when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk = k - 1`` rows in the same location, provided that smaller group
    has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the excluded rows; the original index labels
        of the remaining rows are preserved.
    """
    # A plain list keeps the index labels in their original type; accumulating
    # them in a float numpy array would coerce integer labels to float.
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        # Group once and reuse the result for both the stats and the filter.
        bhk_groups = dict(list(location_df.groupby('bhk')))
        bhk_stats = {
            bhk: {'mean': bhk_df.price_per_sqft.mean(), 'count': bhk_df.shape[0]}
            for bhk, bhk_df in bhk_groups.items()
        }
        for bhk, bhk_df in bhk_groups.items():
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                below_smaller_mean = bhk_df[bhk_df.price_per_sqft < stats['mean']]
                exclude_indices.extend(below_smaller_mean.index.tolist())
    return df.drop(index=exclude_indices)


df7=bhk_outlier_removal(df6)


In [447]:
df7.shape

In [448]:
# Outlier Removal Using Bathrooms Feature

df7.bath.unique()

In [449]:

 df7[df7.bath>10]

### It is unusual to have 2 more bathrooms than number of bedrooms in a home

In [450]:
df7[df7.bath>df7.bhk+2]

In [451]:
# Only selecting those values in bath column whose value is less than value of corresponding bhk +2

df8 = df7[df7.bath<df7.bhk+2]
df8.shape

In [452]:
# Dropping the unnecessary columns

df9 = df8.drop(['size','price_per_sqft'],axis=1)


# Cleaned Data

In [453]:
df9.isnull().sum()

# Building the Model

In [454]:
X= df9.drop(['price'],axis=1)
y= df9.price

In [455]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score


In [456]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=10)

# Applying Linear Regression

In [457]:
# One-hot encode `location` and pass the remaining columns through unchanged.
# - sparse_threshold=0 makes the ColumnTransformer always return a dense array,
#   so the encoder's version-specific `sparse` / `sparse_output` argument is
#   not needed.
# - handle_unknown='ignore' encodes a location that was not seen during fit as
#   all zeros instead of raising at predict time.
column_trans=make_column_transformer((OneHotEncoder(handle_unknown='ignore'),['location']),
                                    remainder='passthrough',
                                    sparse_threshold=0)

In [458]:
scaler=StandardScaler()

In [459]:
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2. The
# features are already standardised by the StandardScaler step of the
# pipeline, so a plain LinearRegression is used here.
lr=LinearRegression()

In [460]:
pipe=make_pipeline(column_trans,scaler,lr)

In [461]:
pipe.fit(X_train,y_train)

In [462]:
y_pred_lr=pipe.predict(X_test)

In [463]:
r2_score(y_test,y_pred_lr)

# Applying Lasso

In [464]:
lasso=Lasso()

In [465]:
pipe=make_pipeline(column_trans,scaler,lasso)

In [466]:
pipe.fit(X_train,y_train)

In [467]:
y_pred_lasso=pipe.predict(X_test)
r2_score(y_test,y_pred_lasso)

# Applying Ridge

In [473]:
ridge=Ridge()

In [474]:
pipe=make_pipeline(column_trans,scaler,ridge)

In [475]:
pipe.fit(X_train,y_train)

In [477]:
y_pred_ridge=pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)

In [480]:
print("No Regularization: ",r2_score(y_test,y_pred_lr))
print("Lasso: ",r2_score(y_test,y_pred_lasso))
print("Ridge: ",r2_score(y_test,y_pred_ridge))

### The R2 scores of Linear Regression and Ridge are approximately the same, so we will keep the Ridge model (it is the pipeline exported below)

# Export the tested model to a pickle file

In [482]:
import pickle

# `pipe` was last rebound to the Ridge pipeline, so that is the model saved here.
# The context manager guarantees the file is flushed and closed after dumping.
with open('RidgeModel.pk1','wb') as model_file:
    pickle.dump(pipe,model_file)