In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Bangalore House Price Prediction**

**The main goal of this project is to predict the price of a house in Bangalore from its features.**

##  **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

## **Load Dataset**

In [None]:
df1 = pd.read_csv("../input/bengaluru-house-price-data/Bengaluru_House_Data.csv")
df1.head()

In [None]:
df1.shape

## **Exploratory Data Analysis**

In [None]:
# get the information of data
df1.info()

In [None]:
df1.columns

In [None]:
df1['area_type'].unique()

In [None]:
df1['area_type'].value_counts()

In [None]:
import seaborn as sns
# Pass x and y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=df1['balcony'], y=df1['price'])

In [None]:
# Name the column via x= (seaborn >= 0.12 rejects positional data vectors);
# bars are counts per area_type, split by number of balconies.
sns.countplot(x='area_type', hue='balcony', data=df1)

In [None]:
sns.jointplot(x=df1['bath'], y=df1['price'], data=df1)

In [None]:
df1.describe()
# Only 3 columns are numerical: bath, balcony and price.
# The other 6 are treated as categorical/text: area_type, availability, location, size, society and total_sqft.
# Target feature: price.
# Price is in lakh (1 lakh = 100,000).
 
# Compare the 75% quantile with the max value: the gap between them is very large.

In [None]:
sns.pairplot(df1)

# bath and price have slightly linear correlation with some outliers

In [None]:
# value count of each feature
def value_count(df1):
  """Print the value counts of every column of ``df1``.

  Columns are processed in order; a dashed separator line is printed
  after each column's counts. Nothing is returned.
  """
  for column_name in df1.columns:
    counts = df1[column_name].value_counts()
    print(counts)
    print("--------------------------------")

In [None]:
value_count(df1)

In [None]:
# correlation heatmap
num_vars = ["bath", "balcony", "price"]
sns.heatmap(df1[num_vars].corr(),cmap="coolwarm", annot=True)
 
# correlation of bath is greater than a balcony with price

## **Data Cleaning: Handle NA values**

In [None]:
df1.isnull().sum()

In [None]:
df1.shape

In [None]:
df1.isnull().mean()*100 # % of missing values per column

# society has 41.3% missing values, so that column needs to be dropped

In [None]:
# visualize missing value using heatmap to get idea where is the value missing
 
plt.figure(figsize=(16,9))
sns.heatmap(df1.isnull())

In [None]:
del_col = ['area_type','availability','society','balcony']
df2 = df1.drop(del_col, axis=1)

In [None]:
# drop na value rows from df2
# because there is very less % value missing
df3 = df2.dropna()
df3.shape

In [None]:
df3.isnull().sum()

In [None]:
df3.head()

## **Feature Engineering**

## **Working on size feature**

In [None]:
df3['size'].unique()

In [None]:
# The first space-separated token of `size` is parsed as the integer room count.
# assign() returns a new frame, which avoids the SettingWithCopyWarning that item
# assignment on the result of dropna() can trigger.
df3 = df3.assign(bhk=df3['size'].apply(lambda x : int(x.split(' ')[0])))

In [None]:
df3.head()

In [None]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only conversion failures are caught, so KeyboardInterrupt and SystemExit
    are no longer swallowed as a bare ``except`` would.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
df3[~df3['total_sqft'].apply(is_float)].head(10)

In [None]:
# 'total_sqft' holds strings in several different formats:
# plain numbers (float or int), e.g. 1689.28, 817
# ranges, e.g. 540 - 740
# numbers followed by a unit, e.g. 142.84Sq. Meter, 117Sq. Yards, 1Grounds
 
# Strategy: convert plain numbers directly, replace a range by its midpoint, and map everything else to None

def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a float.

    * A plain number ("1689.28", 817) is returned as a float.
    * A range written as "low - high" is replaced by the midpoint of its bounds.
    * Anything else (e.g. "34.46Sq. Meter") yields None.

    Parameters
    ----------
    x : str or number
        The raw cell value.

    Returns
    -------
    float or None
    """
    # Try the plain-number case first so that values containing a '-' that are
    # still valid floats ("-5", "1e-3") are not mistaken for a range.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Looks like a range, but the bounds are not numeric.
            return None
    return None

In [None]:
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)

In [None]:
df4.head()

In [None]:
df4.isna().sum()

In [None]:
df5 = df4.copy()

In [None]:
df5['price_per_sqft'] = df5['price']* 100000 / df5['total_sqft']

In [None]:
df5.head()

In [None]:
df5.dtypes

## **Finding Outlier and Removing**

In [None]:
# function to create histogram, Q-Q plot and boxplot
 
# for Q-Q plots
import scipy.stats as stats

def diagnostic_plots(df, variable):
    """Show a histogram, a normal Q-Q plot and a boxplot of one column side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    variable : str
        Name of the column of ``df`` to plot.
    """
    # Drop missing values up front so NaNs do not enter the Q-Q computation.
    values = df[variable].dropna()

    # One row, three panels; explicit axes instead of the pyplot state machine.
    fig, (ax_hist, ax_qq, ax_box) = plt.subplots(1, 3, figsize=(16, 4))

    # histogram (histplot replaces the deprecated sns.distplot;
    # kde + density scale reproduce distplot's default look)
    sns.histplot(values, bins=30, kde=True, stat="density", ax=ax_hist)
    ax_hist.set_title('Histogram')

    # Q-Q plot against the normal distribution
    stats.probplot(values, dist="norm", plot=ax_qq)
    ax_qq.set_ylabel('Variable quantiles')

    # boxplot
    sns.boxplot(y=values, ax=ax_box)
    ax_box.set_title('Boxplot')

    plt.show()
    
num_var = ["bath","total_sqft","bhk","price"]
for var in num_var:
    print("******* {} *******".format(var))
    diagnostic_plots(df5, var)
 
  # The histogram, Q-Q plot and boxplot above reveal outliers in these features

## **Explore Location Feature**

In [None]:
# Strip surrounding whitespace so one place is not counted under two spellings.
df5['location'] = df5['location'].apply(lambda x : x.strip())

# Count listings per location on the stripped column. This must use df5:
# df4 still holds the unstripped names, so its counts would not match
# the values now stored in df5['location'].
loc_status = df5.groupby('location')['location'].agg('count')
loc_status.sort_values(ascending = False)

In [None]:
len(loc_status[loc_status <=10])

In [None]:
loc_status_less_10 = loc_status[loc_status <=10]

In [None]:
# Relabel every location that appears in loc_status_less_10 (i.e. in its index)
# as 'other'; all remaining locations keep their name.
df5['location'] = df5['location'].where(
    ~df5['location'].isin(loc_status_less_10.index),
    'other',
)
df5.head()

In [None]:
df5.shape

## **Outlier Remove**

#### **As general, total_sqft per bedroom can't be less than 300**

In [None]:
df5[df5['total_sqft']/ df5['bhk'] <300 ].head()  #remove these rows

In [None]:
df6 = df5[~(df5['total_sqft']/ df5['bhk'] <300) ]
print(df6.head())
df6.shape

#### **price_per_sqft**

In [None]:
df6['price_per_sqft'].describe()

In [None]:
# Remove outliers per location, using the mean and standard deviation of 'price_per_sqft'
def remove_pps_outliers(df):
  """Keep, per location, only rows whose price_per_sqft is within one std of the mean.

  For every ``location`` group the mean ``m`` and standard deviation ``st``
  (as computed by ``np.mean`` / ``np.std``) of ``price_per_sqft`` are taken,
  and rows with ``m - st < price_per_sqft <= m + st`` are kept.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``location`` and ``price_per_sqft`` columns.

  Returns
  -------
  pandas.DataFrame
      The surviving rows of all locations with a fresh 0..n-1 index.
      If no row survives, an empty frame with the columns of ``df``.
  """
  kept = []
  for _, subdf in df.groupby('location'):
    m = np.mean(subdf.price_per_sqft)
    st = np.std(subdf.price_per_sqft)
    # Lower bound is exclusive, upper bound inclusive.
    in_band = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
    if in_band.any():
      kept.append(subdf[in_band])
  if not kept:
    return df.iloc[0:0].reset_index(drop=True)
  # Concatenate once at the end instead of growing a frame inside the loop.
  return pd.concat(kept, ignore_index=True)

In [None]:
df7 = remove_pps_outliers(df6)
df7.shape

#### **plot a scatter plot for 2 and 3 bedroom**

In [None]:
def scatter_chart(df, location):
    """Scatter price per sqft against total area for the 2 and 3 BHK homes of one location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk``, ``total_sqft`` and ``price_per_sqft``.
    location : str
        Location to plot; also used as the chart title.
    """
    in_location = df['location'] == location
    bhk2 = df[in_location & (df['bhk'] == 2)]
    bhk3 = df[in_location & (df['bhk'] == 3)]

    # Size this figure only; overwriting matplotlib.rcParams here would silently
    # change the size of every figure drawn afterwards.
    plt.figure(figsize=(15, 10))

    plt.scatter(bhk2['total_sqft'], bhk2['price_per_sqft'], label='2 BHK', s=50)
    plt.scatter(bhk3['total_sqft'], bhk3['price_per_sqft'], marker='+',label= '3 BHK', s= 50, color='green')
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price per Sqft")
    plt.title(location)
    plt.legend()

In [None]:
scatter_chart(df7, 'Rajaji Nagar')

# In the scatter plot below, at the same location some 2 BHK houses are priced
# higher than 3 BHK houses, so those points are treated as outliers.

In [None]:
scatter_chart(df7, "Hebbal")

# In the scatter plot below, at the same location some 3 BHK houses are priced
# lower than 2 BHK houses, so those points are treated as outliers.

#### **Remove n-BHK homes whose price per sqft is below the mean of (n-1)-BHK homes in the same location**

In [None]:
def rm_bhk_outliers(df):
    """Drop homes that cost less per sqft than the average home with one bedroom fewer.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the ``bhk == n - 1``
    rows of that location, provided there are more than 5 such ``n - 1`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the flagged rows; the remaining index labels are unchanged.
    """
    # Plain list of index labels: keeps the labels' own dtype (np.append on an
    # empty float array would turn them into floats) and avoids re-copying.
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        # Group once and reuse for both the statistics and the filtering pass.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df['price_per_sqft']),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }

        for bhk, bhk_df in bhk_groups:
            # Statistics of the homes with one bedroom fewer, if there are any.
            smaller = bhk_stats.get(bhk - 1)
            if smaller and smaller['count'] > 5:
                too_cheap = bhk_df['price_per_sqft'] < smaller['mean']
                exclude_indices.extend(bhk_df.index[too_cheap])
    return df.drop(exclude_indices, axis='index')

In [None]:
df8 = rm_bhk_outliers(df7)
df8.shape

In [None]:
scatter_chart(df8, 'Rajaji Nagar')

In [None]:
scatter_chart(df8, "Hebbal")

In [None]:
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df8.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

### **Outlier Removal Using Bathrooms Feature**

In [None]:
df8.bath.unique()

In [None]:
plt.hist(df8.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

In [None]:
df8[df8.bath>10]

#It is unusual to have 2 more bathrooms than number of bedrooms in a home

In [None]:
df8[df8.bath>df8.bhk+2]

In [None]:
# Example: a 4 bedroom home with a bathroom in every bedroom plus one guest bathroom
# has bath = bhk + 1, which is taken as the maximum; keep only rows with bath < bhk + 2.

df9 = df8[df8.bath<df8.bhk+2]
df9.shape

In [None]:
df9.head(2)

In [None]:
df10 = df9.drop(['size','price_per_sqft'],axis='columns')
df10.head(3)

## **Use One Hot Encoding For Location**

A one hot encoding is a representation of categorical variables as binary vectors.

This first requires that the categorical values be mapped to integer values.
Then, each integer value is represented as a binary vector that is all zero values except the index of the integer, 
which is marked with a 1.

In [None]:
dummies = pd.get_dummies(df10.location)
dummies.head(3)

In [None]:
df11 = pd.concat([df10,dummies.drop('other',axis='columns')],axis='columns')
df11.head()

In [None]:
df12 = df11.drop('location',axis='columns')
df12.head(2)

### **Build a Model Now...**

In [None]:
df12.shape

In [None]:
X = df12.drop(['price'],axis='columns')
X.head(3)

In [None]:
X.shape

In [None]:
y = df12.price
y.head(3)

In [None]:
len(y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

### **Use K Fold cross validation to measure accuracy of our LinearRegression model**

divides all the samples in k groups of samples, called folds (if k=n, this is equivalent to the Leave One Out strategy), of equal sizes (if possible). The prediction function is learned using k-1 folds, and the fold left out is used for test.

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=6, test_size=0.2, random_state=10)

cross_val_score(LinearRegression(), X, y, cv=cv)

We can see that in all 6 iterations we get a score above 80%. This is pretty good, but we want to test a few other regression algorithms to see if we can get an even better score. We will use GridSearchCV for this purpose.

### **GridSearchCV**

GridSearchCV is a library function that is a member of sklearn’s model_selection package. It helps to loop through predefined hyperparameters and fit your estimator (model) on your training set. So, in the end, you can select the best parameters from the listed hyperparameters.

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search three regressors and report the best cross-validated score of each.

    LinearRegression, Lasso and DecisionTreeRegressor are each tuned with
    GridSearchCV over a small parameter grid, using 5 shuffled 80/20 splits
    (``random_state=10``).

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with columns ``model``, ``best_score`` and
        ``best_params``.
    """
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # `normalize` was removed from LinearRegression in scikit-learn 1.2
                # and would make the search fail; tune `fit_intercept` instead.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                # 'mse' was renamed to 'squared_error' (scikit-learn 1.0) and the
                # old spelling was removed in 1.2.
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Based on above results we can say that Linear Regression gives the best score. Hence we will use that

### **Test the model for few properties**

In [None]:
def predict_price(location,sqft,bath,bhk):
    """Predict the price of one home with the fitted ``lr_clf`` model.

    Parameters
    ----------
    location : str
        Name of a one-hot location column of ``X``. A location without its own
        column (including the dropped 'other' bucket) leaves all location
        columns at 0 instead of raising IndexError.
    sqft, bath, bhk : numbers
        Written to the first three feature positions, in that order.

    Returns
    -------
    The model's prediction for this single home.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # np.where yields an empty array for an unknown location; indexing it
    # with [0][0] used to raise IndexError.
    matches = np.where(X.columns==location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    # Predict on a frame carrying X's column names so the input matches the
    # feature names the model was fitted with.
    features = pd.DataFrame([x], columns=X.columns)
    return lr_clf.predict(features)[0]

In [None]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

In [None]:
predict_price('Indira Nagar', 1000, 2, 2 )

In [None]:
predict_price('1st Phase JP Nagar', 1000, 3, 3)