In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.graph_objects as go 
import plotly.express as px 
%matplotlib inline

In [None]:
# Importing the House price dataset
df = pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')
df.head()

In [None]:
print('the shape of dataset is :',df.shape)

## Data Cleaning

In [None]:
#Checking the null values
df.isnull().sum()

In [None]:
# Checking the dataset info
df.describe()

In [None]:
# Observing the Bath and Balcony column
y0 = df.bath
y1 = df.balcony
fig = go.Figure()
fig.add_trace(go.Box(y=y0,name='Bath',quartilemethod="linear"))
fig.add_trace(go.Box(y=y1,name='Balcony',quartilemethod="linear"))
fig.update_layout(title='Bath & Balcony', yaxis_title='Count',height=420,width=1080)
fig.show()

In [None]:
# 'society' cannot be inferred from the other columns, so missing values get the placeholder "No data".
# 'bath' and 'balcony' are filled with the integer part of their column mean.
# 'location' and 'size' cannot be imputed, so rows missing either of them are dropped.
df = (
    df.fillna({
        'bath': int(df['bath'].mean()),
        'balcony': int(df['balcony'].mean()),
        'society': "No data",
    })
    .dropna(subset=['location', 'size'])
)
df.head()

In [None]:
# Again checking the null values
df.isnull().sum()

In [None]:
# Removing un-necessary columns
df.drop(columns=['area_type','availability','society'],inplace=True)


### Cleaning columns: 'size' and 'price'

In [None]:
# 'size' mixes spellings such as '2 BHK' and '2 Bedroom'; only the leading number is kept.
# The column is then renamed to 'bhk'.
# 'price' is multiplied by 100000 (presumably converting the raw lakh figures to rupees — confirm the raw unit).
df['size'] = df['size'].str.split(' ').str[0].astype('int64')
df = df.rename(columns={'size': 'bhk'})
df['price'] = df['price'] * 100000
df.head()


### Cleaning Column: total_sqft

In [None]:
#Checking total_sqft column entires
df.total_sqft.unique()

We can see that the total_sqft column has some entries given as ranges (e.g. 1113 - 1505), so we need to inspect the non-numeric entries more carefully.

In [None]:
# Making a function to have all different types of entires
def check_float(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Only the errors a failed conversion can raise are caught, so that
    unrelated exceptions (e.g. KeyboardInterrupt) are not silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
df[~df['total_sqft'].apply(check_float)].head(10)

In [None]:
# Now converting this type of entries ('1125 - 4112') into INT and storing their average
def dash_to_int(x):
    """Convert one ``total_sqft`` entry to a float.

    * A string range such as ``'1125 - 4112'`` becomes the average of its two bounds.
    * Anything else is passed to ``float``.
    * Entries that cannot be converted (e.g. ``'34.46Sq. Meter'`` or ``None``) give ``None``.

    Despite the name, the result is a float, not an int.
    """
    if isinstance(x, str):
        bounds = x.split('-')
        if len(bounds) == 2:
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. '-5' or '1e-05'); try a plain conversion below.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None
df['total_sqft'] = df['total_sqft'].apply(dash_to_int)
df.rename(columns={'total_sqft':'total_area'},inplace=True)
df.head()

In [None]:
# Cross-check: after the conversion every entry is a float or NaN, so a float() test
# would match every row. Show the rows whose total_area could not be parsed instead.
df[df['total_area'].isna()]

### Cleaning column: 'location'

In [None]:
df.head()

In [None]:
# Applying strip function to stripping the location names
df['location'] = df['location'].apply(lambda x: x.strip())

In [None]:
# There are many distinct location names, so every location that occurs
# 10 times or fewer is relabelled as 'other'.
loc = df['location'].value_counts()
location_lessthan_10 = loc[loc.le(10)]
df['location'] = df['location'].where(~df['location'].isin(location_lessthan_10.index), 'other')
df.head()



In [None]:
# cross checking
df.location.value_counts()

#### ADDING a column named 'price_per_sqft' for understanding the data more clearly

In [None]:
# Making column 'price_per_sqft'
df['price_per_sqft'] = df['price']/df['total_area']

In [None]:
df.price_per_sqft.describe()

In [None]:
df.shape

## Outlier removal using business-domain knowledge

#### The dataset should be consistent with these domain rules
1. A house should have at least 300 sqft of total area per bedroom (BHK)
2. A 3 or 4 BHK house is usually priced higher than a 1 BHK house
3. The number of bathrooms should not exceed the BHK count + 2
 

In [None]:
df.describe()

### Removing 'price_per_sqft' outliers per location (keeping values within one standard deviation of the location mean)

In [None]:
## Removing general outliers and making dataset distributed normally
def normally_dist(df):
    """Drop ``price_per_sqft`` outliers separately for each location.

    For every ``location`` group, only rows whose ``price_per_sqft`` lies in
    ``(mean - std, mean + std]`` are kept, where ``std`` is the population
    standard deviation (ddof=0) of that group. Rows with a missing
    ``price_per_sqft`` never satisfy the comparison and are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of all groups with a fresh 0..n-1 index.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        m = pps.mean()
        st = pps.std(ddof=0)  # same as np.std on the column
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])
    if not kept:
        # pd.concat refuses an empty list; return an empty frame with the input's columns.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
df = normally_dist(df)
df.shape


In [None]:
df.describe()

In [None]:
# Checking total_area data points
fig = px.box(df,y='total_area',
            title='Total Area')
fig.update_layout(yaxis_title ='Area')
fig.show()

In [None]:
# 1bhk house must be more than 300 sqft
df = df[~(df.total_area/df.bhk < 300)]
df.shape

In [None]:
## Viz where it is happening that 2bhk price > 3bhk

def plot_scatter(df,location):
    """Scatter price against total area for the 2 BHK and 3 BHK rows of one location.

    One marker trace is drawn per BHK value, and the figure is shown immediately.
    """
    in_location = df[df.location == location]
    fig = go.Figure()
    for rooms in (2, 3):
        subset = in_location[in_location.bhk == rooms]
        fig.add_trace(go.Scatter(
            x=subset.total_area,
            y=subset.price,
            mode='markers',
            name='{} BHK'.format(rooms),
        ))
    fig.update_layout(
        title='2BHK vs 3BHK Price',
        xaxis_title='Area in Sqft',
        yaxis_title='Price',
        height=620,
        width=680,
    )
    fig.show()

plot_scatter(df,'Rajaji Nagar')



In [None]:
# new feature inspection = Bathrooms (i went to my bussines manager)
df.bath.unique()

In [None]:
y0 = df.bath
y1 = df.bhk
fig = go.Figure()
fig.add_trace(go.Box(y=y0,name='Bath',quartilemethod="linear"))
fig.add_trace(go.Box(y=y1,name='BHK',quartilemethod="linear"))
fig.update_layout(title='Bath & BHK', yaxis_title='Count',height=420,width=1080)
fig.show()

In [None]:
## Per the business manager, a house should not have more bathrooms than its BHK count + 2; rows beyond that are treated as outliers

In [None]:
df[df['bath']> df['bhk']+2]

In [None]:
# Removing bath outliers
df.drop(df[df['bath']> df['bhk']+2].index, inplace = True)

In [None]:
# Box plots of total_area and bhk side by side.
y0 = df.total_area
y1 = df.bhk
fig = go.Figure()
fig.add_trace(go.Box(y=y0,name='total_area',quartilemethod="linear"))
fig.add_trace(go.Box(y=y1,name='BHK',quartilemethod="linear"))
# The title and axis label describe this plot, not the earlier Bath & BHK plot.
fig.update_layout(title='Total Area & BHK', yaxis_title='Value',height=420,width=1080)
fig.show()

In [None]:
df.to_csv('Final_cleaned_data_4.csv')

## AS WE CLEANED OUR DATA NOW I AM MOVING TOWARDS MODEL TRAINING

In [None]:
dff = pd.read_csv('./Final_cleaned_data_4.csv',index_col=0)

In [None]:
dff.shape

In [None]:
# one hot enchoding
dummies_dff =  pd.get_dummies(dff.location)
dff1 = pd.concat([dff,dummies_dff],axis='columns')

In [None]:
# Avoid the dummy-variable trap by dropping one dummy column ('Vittasandra').
# That location becomes the baseline: it is represented by a row in which
# every remaining location dummy is 0.
# The original string column 'location' is dropped as well.
# NOTE(review): drop() raises KeyError if 'Vittasandra' is not among the
# columns — confirm that location always survives the cleaning steps.

dff = dff1.drop(columns=['Vittasandra','location'])

In [None]:
dff.head(3)

In [None]:
dff.shape

In [None]:
# df still contains the string column 'location'; restrict to numeric columns
# so the correlation matrix can be computed on any pandas version.
df.select_dtypes(include='number').corr()

### Model training

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

model = LinearRegression()

In [None]:
%%time
x= dff.drop(columns=['price','price_per_sqft'])
y = dff1['price']

In [None]:
y.shape

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.35,random_state=10)

In [None]:
model.fit(x_train,y_train)
model.score(x_test,y_test)

In [None]:
# LinearRegression.score returns the coefficient of determination (R^2), not a classification accuracy.
print('Model R^2 score:',model.score(x_test,y_test)*100,'%')

### Predictions

### Because the location is one-hot encoded, we need a helper that takes the location name as input and builds the feature row for the prediction

Input Parameters: Bhk;total_sqft;bath;location

In [None]:
x.columns

In [None]:
# Script for prediction
def prediction(location,sqft,bhk,bath):
    """Predict the price of one house with the fitted module-level ``model``.

    Reads the module-level names ``x`` (training features), ``model`` and
    ``dummies_dff`` (the full set of location dummies).

    Parameters
    ----------
    location : str
        Location name exactly as it appears in the one-hot encoded columns.
    sqft : float
        Total area, stored in the ``total_area`` feature.
    bhk : int
        Number of bedrooms.
    bath : int
        Number of bathrooms.

    Returns
    -------
    float
        The model output for this house, in the same unit as the training ``price``.

    Raises
    ------
    ValueError
        If ``location`` is not one of the encoded locations.
    """
    if location not in dummies_dff.columns:
        raise ValueError("Unknown location: {!r}".format(location))

    # A single all-zero row carrying the training column names, so features are
    # set by name instead of by position and the model receives named columns.
    row = pd.DataFrame(np.zeros((1, len(x.columns))), columns=x.columns)
    row.loc[0, ['bhk', 'total_area', 'bath']] = [bhk, sqft, bath]
    # Every other feature (e.g. 'balcony') stays 0, as in the original script.

    # The baseline location dropped from the features has no dummy column:
    # it is represented by all location dummies being 0.
    if location in x.columns:
        row.loc[0, location] = 1
    return model.predict(row)[0]


In [None]:
#Give input in order of location,sqft,bhk,bath

Price_prediction = prediction('1st Phase JP Nagar',1400,2,2)

# 'price' was multiplied by 100000 during cleaning, so the model output is on
# that scale; divide by 100000 so the printed number matches the "lakh" label.
print("The predicted price is",Price_prediction/100000 ,"lakh")

### Now I am going to check other regression models and find out which one is the best

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature columns of x with StandardScaler.
# NOTE(review): the scaler is fitted on all rows of x (before any train/test split),
# and the resulting X is not passed to the model search below, which is called
# with the unscaled x — confirm whether X was meant to be used.
scaler = StandardScaler()
X = scaler.fit_transform(x)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
#import xgboost as xgb 
from sklearn.tree import DecisionTreeRegressor
#from sklearn.ensemble import RandomForestRegressor

def find_best_model_using_gridsearchcv(x , y):
    """Grid-search several regressors and report the best score of each.

    Every candidate model is tuned with ``GridSearchCV`` over a 5-split
    ``ShuffleSplit`` (20 % test size, ``random_state=0``).

    Parameters
    ----------
    x : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target values.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``best_score`` and
        ``best_params``.

    Notes
    -----
    Hyper-parameters that the installed scikit-learn estimator does not expose
    (for example ``normalize``, removed in scikit-learn 1.2) are left out of the
    grid instead of making the search fail. The tree criterion uses the name
    ``'squared_error'`` (available since scikit-learn 1.0) in place of ``'mse'``,
    which was removed in 1.2.
    """
    algos = {
        'LinearRegression' : {
            'model' : LinearRegression(),
            'params' : {
                'normalize' : [True , False],
                'fit_intercept': [True , False],
                'copy_X' : [True , False]
            }
        },
        'lasso' : {
            'model' : Lasso(),
            'params' : {
                'alpha' : [1, 10, 50, 200, 500],
                'selection' : ['random' , 'cyclic']
            }
        },
        'Ridge' : {
            'model' : Ridge(),
            'params' : {
                'alpha' : [1, 10, 50, 200, 500],
                'fit_intercept' : [True , False],
                'normalize' : [True , False],
            }
        },
        'descision_tree' : {
            'model' : DecisionTreeRegressor(),
            'params' :{
                'criterion' : ['squared_error' , 'friedman_mse'],
                'splitter' : ['best' , 'random']
            }
        }
    }

    scores = []
    cv = ShuffleSplit(n_splits = 5 , test_size = 0.2 , random_state = 0)
    for algo_name , config in algos.items():
        estimator = config['model']
        # Keep only the hyper-parameters this estimator actually accepts;
        # an unknown name would make GridSearchCV raise before fitting.
        supported = estimator.get_params()
        param_grid = {
            name: values
            for name, values in config['params'].items()
            if name in supported
        }
        gs = GridSearchCV(estimator , param_grid , cv = cv , return_train_score = False)
        gs.fit(x,y)
        scores.append({
            'model' : algo_name ,
            'best_score' : gs.best_score_,
            'best_params' : gs.best_params_
        })

    return pd.DataFrame(scores , columns = ['model' , 'best_score' , 'best_params'])

In [None]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

find_best_model_using_gridsearchcv(x,y) 

### That's why we used the Linear Regression model

{Project by : Kumar Shivam}