# Bengaluru Housing Price Prediction Model

#_Dataset: Bengaluru Realestate Pricing

#_Source: Kaggle

#_Reference: Codebasics (YT Playlist)

#_Contents of the notebook:
1. Data Exploration
  
  A. Finding Data Types
  
  B. Missing Value detection and treatment
  
  C. Redundancy Detection
  
  D. Descriptive Statistics 
  
  
2. Feature Engineering

  A. Dimensionality Reduction: Location (create others)
  
  B. Add: price/sqft 
  
  C. Redundancies Check 
  
  D. Outliers Treatment
  

3. Creating Dummies


4. Modeling

  A. Splitting
  
  B. Model Training
  
  C. Checking Accuracy 
  
  D. Predictions
  
  E. MAE Check

In [None]:
#Libraries required
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import sklearn
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import statistics
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize']=(10,10)

## Loading Dataset

In [None]:
# Read the raw dataset CSV into og_data and preview its first rows
og_data = pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')
og_data.head()

In [None]:
og_data.shape

## Exploring Data

1.Data Types
2.Missing Values
3.Redundancies
4.Descriptive Stats 


### 1. Finding Data Types

In [None]:
og_data.dtypes

In [None]:
og_data.head()

In [None]:
og_data.nunique()

### 2. Missing Values

In [None]:
og_data.isnull().sum()

In [None]:
# Percentage of missing values per column
og_data.isnull().sum().div(len(og_data)).mul(100)

#### Imputing mode for NA in Balcony

In [None]:
#Checking unique values in balcony
og_data.balcony.unique()

In [None]:
# Replace missing balcony counts with the most frequent observed value.
# Series.mode() ignores NaN, so the missing entries cannot influence the
# result; iloc[0] picks the smallest value if several are tied.
mode_bal = og_data['balcony'].mode().iloc[0]

# Assign the filled column back to the frame. Calling fillna(inplace=True)
# on a column selection is chained assignment and may only change a
# temporary copy, leaving og_data untouched.
og_data['balcony'] = og_data['balcony'].fillna(mode_bal)

In [None]:
og_data['balcony'].isnull().sum()

#### Dropping NA rows and cols

In [None]:
# Remove the 'society' column from the working frame
og_data = og_data.drop(columns=['society'])

In [None]:
#Dropping every row that still has a missing value in any column
#(the original note expects these to be in location, size and bath)
og_data = og_data.dropna()

In [None]:
og_data.isna().sum()

In [None]:
og_data.shape

### 3. Looking for Redundancies

In [None]:
og_data.nunique()

#### Checking col: size

In [None]:
og_data['size'].unique()

In [None]:
# Recode '1 RK' as '0 Bedrooms' so that the leading token of every size
# label can be parsed as a bedroom count.
to_replace = {'1 RK': '0 Bedrooms'}
# Restrict the replacement to the 'size' column: DataFrame.replace on the
# whole frame would also rewrite a matching value in any other column.
og_data['size'] = og_data['size'].replace(to_replace)
og_data['size'].unique()

In [None]:
# Different size labels can describe the same bedroom count, so keep only
# the number in front of the first space in a new numeric BHK column.
og_data['BHK'] = og_data['size'].map(lambda label: float(label.partition(' ')[0]))
og_data

#### Checking col: total_sqft

In [None]:
def is_float(x):
    """Return True when ``float(x)`` succeeds, otherwise False.

    Only the errors that ``float()`` raises for unconvertible input are
    treated as "not a float"; anything else (for example a keyboard
    interrupt) is allowed to propagate instead of being swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Show the rows whose total_sqft entry cannot be parsed as a float
x = og_data.loc[~og_data['total_sqft'].map(is_float)]
x['total_sqft'].unique()

In [None]:
# Redundant values in total_sqft: sq.mt, perch, range, Sq.Yards, Acres, Guntha, Grounds
# Values given in a different unit are ignored here (returned unchanged);
# range values are replaced with the average of the range.
def convert_range(x):
    """Convert a total_sqft entry to a float where possible.

    * ``'a - b'`` with two numeric parts returns the average of ``a`` and ``b``.
    * Anything ``float()`` accepts returns that float.
    * Everything else is returned unchanged so the caller can inspect or
      drop it later.

    Non-string input and two-part strings that are not numeric ranges
    (e.g. ``'-5'``) no longer raise; they fall through to the plain
    ``float()`` attempt.
    """
    if isinstance(x, str):
        bounds = x.split('-')
        if len(bounds) == 2:
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2
            except ValueError:
                # Not a numeric range after all; try it as a plain number.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return x

In [None]:
og_data['total_sqft1'] = og_data['total_sqft'].apply(convert_range)

# Check whether a known range string is still present after the conversion.
# `value in series` tests the index labels, not the values, so the
# comparison is made against the column's values with isin().
og_data['total_sqft1'].isin(['2100 - 2850']).any()

In [None]:
# Rows whose total_sqft1 is still not numeric after the range conversion
y = og_data.loc[~og_data['total_sqft1'].map(is_float)]

# Share of those rows in the data set, in percent
(len(y) / len(og_data)) * 100

In [None]:
# Index labels of the rows that could not be converted
y_indices = y.index

# Continue with a frame that no longer contains those rows
proc_data_1 = og_data.drop(index=y_indices)
proc_data_1['total_sqft1'].dtypes

In [None]:
# Convert total_sqft1 to a numeric dtype.
# The column is read from proc_data_1 itself; taking it from og_data only
# worked through index alignment and would silently depend on og_data
# staying in sync with this frame.
proc_data_1['total_sqft1'] = pd.to_numeric(proc_data_1['total_sqft1'], errors='coerce')
proc_data_1.info()

### 4. Descriptive Stats

In [None]:
proc_data_1.columns

In [None]:
proc_data_1.describe()

In [None]:
# Drop the raw 'size' and 'total_sqft' columns
proc_data_1 = proc_data_1.drop(['size', 'total_sqft'], axis='columns')

In [None]:
proc_data_1.columns

In [None]:
proc_data_1.hist()

### Feature Engineering

1.Dimensionality Reduction: Location (create others)
2.Add: price/sqft
3.Redundancies Check
4.Outliers Treatment

#### 1. Dimensionality Reduction
##### Location

In [None]:
# Strip leading and trailing whitespace from the location names
proc_data_1['location'] = proc_data_1['location'].map(lambda name: name.strip())

# Number of records per location
location_dist = proc_data_1.groupby('location')['location'].count()

# Locations that have 10 or fewer records
location_lessthan10 = location_dist.loc[location_dist.le(10)]
location_lessthan10

In [None]:
# Put every location with <=10 records into the 'others' category.
# The rare location names are the index labels of location_lessthan10.
proc_data_1['location'] = proc_data_1['location'].mask(
    proc_data_1['location'].isin(location_lessthan10.index), 'others'
)
proc_data_1.location.nunique()

#### 2. Adding price/sqft feature
##### price/sqft

In [None]:
# price is scaled by 100000 before dividing by the area
# (presumably price is quoted in lakhs - TODO confirm against the dataset)
proc_data_1['price/sqft'] = proc_data_1['price'].mul(100000).div(proc_data_1['total_sqft1'])
proc_data_1.info()

#### 3. Redundancies Check

##### BHK

In [None]:
proc_data_2 = proc_data_1.copy()
# Rule of thumb used here: about 300 sqft per bedroom.
# Select the records with less than 300 sqft per bedroom. An explicit copy
# is taken because the selection is modified afterwards; without it pandas
# treats the later write as an assignment into a slice of proc_data_2.
out_bhk = proc_data_2[proc_data_2['total_sqft1']/proc_data_2['BHK']<300].copy()
out_bhk

In [None]:
# Desired ratio: total_sqft1 / BHK = 300, therefore BHK = total_sqft1 / 300.
# assign() builds a new frame with the corrected BHK column instead of
# writing into out_bhk in place, which avoids a SettingWithCopyWarning
# when out_bhk is a slice of proc_data_2.
out_bhk = out_bhk.assign(BHK=(out_bhk['total_sqft1'] / 300).round())
out_bhk['BHK'].unique()

In [None]:
#Writing the corrected rows back into proc_data_2 (matched on index labels)
#and listing the BHK values present afterwards
proc_data_2.loc[out_bhk.index] = out_bhk
proc_data_2['BHK'].unique()

##### Bath

In [None]:
# Allowed maximum: BHK + 1 bathrooms (e.g. at most 3 for a 2 BHK).
# Select the records above that limit; copy explicitly because the
# selection is modified afterwards.
out_bath = proc_data_2[proc_data_2['bath']>proc_data_2['BHK']+1].copy()
out_bath

In [None]:
# Cap the bathroom count of the flagged rows at BHK + 1.
desired_val = (out_bath['BHK']+1)
# assign() returns a new frame, so nothing is written into a slice of
# proc_data_2 (no SettingWithCopyWarning).
out_bath = out_bath.assign(bath=desired_val)
out_bath['bath'].unique()

In [None]:
#Writing the corrected rows back into proc_data_2 (matched on index labels)
#and listing the bath values present afterwards
proc_data_2.loc[out_bath.index] = out_bath
proc_data_2['bath'].unique()

#### 4. Outlier Treatment

In [None]:
sns.boxplot(data=proc_data_2)

##### price/sqft

In [None]:
# Removing outliers using the standard deviation
def remove_pps_outliers(df):
    """Drop price/sqft outliers separately for every location.

    For each ``location`` group, only rows whose ``price/sqft`` lies in the
    interval ``(mean - std, mean + std]`` are kept, where ``std`` is the
    population standard deviation (``ddof=0``) of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'location'`` and ``'price/sqft'``.

    Returns
    -------
    pandas.DataFrame
        The retained rows of all groups, concatenated in group order with a
        fresh ``0..n-1`` index. An input without any group yields an empty
        frame that keeps the input's columns.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price/sqft']
        m = pps.mean()
        st = pps.std(ddof=0)
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])
    if not kept:
        # pd.concat rejects an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once at the end rather than growing a frame inside the
    # loop, which re-copies all previously collected rows on every pass.
    return pd.concat(kept, ignore_index=True)
# Apply the per-location filter to proc_data_2 and show the resulting shape
proc_data_3 = remove_pps_outliers(proc_data_2)
proc_data_3.shape

In [None]:
sns.boxplot(data=proc_data_3)

In [None]:
# Flag the rows whose price/sqft exceeds 10,000
# (only this upper bound is applied in this cell)
outliers = proc_data_3.loc[proc_data_3['price/sqft'].gt(10000)]

In [None]:
# Replace the flagged price/sqft values with the column mean.
mean_pps = proc_data_3['price/sqft'].mean()
# Use one .loc call on the frame itself. The chained form
# proc_data_3['price/sqft'].loc[...] = ... assigns into an intermediate
# Series and is not guaranteed to change proc_data_3.
proc_data_3.loc[outliers.index, 'price/sqft'] = mean_pps

In [None]:
sns.boxplot(data=proc_data_3['price/sqft'])

##### Bath and BHK

In [None]:
sns.boxplot(data=proc_data_3[['bath','BHK']])

In [None]:
proc_data_4 = proc_data_3.copy()
# Thresholds: bath and BHK at most 4, price/sqft between 1500 and 9500.
# Rows outside these limits are dropped one criterion at a time.
# Every mask is computed from proc_data_4 itself so that it always has the
# same index as the frame it filters; a mask taken from proc_data_3 no
# longer matches once rows have been dropped and has to be reindexed.
out_pps = proc_data_4[(proc_data_4['price/sqft']>9500) | (proc_data_4['price/sqft']<1500)]
proc_data_4 = proc_data_4.drop(out_pps.index)

out_bath = proc_data_4[proc_data_4['bath']>4]
proc_data_4 = proc_data_4.drop(out_bath.index)

out_bhk = proc_data_4[proc_data_4['BHK']>4]
proc_data_4 = proc_data_4.drop(out_bhk.index)

proc_data_4.info()

In [None]:
# Keep the eight analysis columns in a fixed order and plot their histograms
proc_data_5 = proc_data_4.loc[
    :,
    ['area_type', 'location', 'bath', 'balcony', 'price', 'BHK', 'total_sqft1', 'price/sqft'],
]
proc_data_5.hist()

In [None]:
proc_data_5.columns

In [None]:
# Final set to be used for modelling (price/sqft is left out)
proc_data = proc_data_5.loc[
    :,
    ['area_type', 'location', 'bath', 'balcony', 'price', 'BHK', 'total_sqft1'],
]
proc_data.info()

### Checking Correlations

In [None]:
# Correlation heatmap of the columns left after removing the two text columns
data = proc_data.drop(columns=['area_type', 'location'])
sns.heatmap(
    data.corr(),
    cmap="YlGnBu",
    annot=True,
    annot_kws={"size": 10},
)

### Creating Dummies

#### 1.Location

In [None]:
# One indicator column per distinct value of 'location'
dummies_loc = pd.get_dummies(proc_data['location'])
dummies_loc

In [None]:
# Append the location indicator columns to the right of the feature frame
num_data_1 = pd.concat([proc_data, dummies_loc], axis=1)
num_data_1

#### Area_type

In [None]:
# One indicator column per distinct value of 'area_type'
dummies_at = pd.get_dummies(proc_data['area_type'])
dummies_at

In [None]:
# Append the area_type indicator columns to the right as well
num_data_2 = pd.concat([num_data_1, dummies_at], axis=1)
num_data_2

In [None]:
# Removing unwanted columns: the text columns 'location' and 'area_type'
# (now represented by their indicator columns) and the 'others' indicator.
num_data = num_data_2.drop(['others','location','area_type'],axis=1)
# Convert only the boolean indicator columns to 0/1 integers. Casting the
# whole frame with astype(int) would also cut off any fractional part of
# price, total_sqft1 and the other numeric columns.
dummy_cols = num_data.select_dtypes(include='bool').columns
num_data[dummy_cols] = num_data[dummy_cols].astype(int)
num_data.info()

### Modeling

#### Defining X and Y variables for Model

In [None]:
# Target variable: price; every other column is used as a feature
X = num_data.drop(columns=['price'])
Y = num_data.loc[:, 'price']

#### Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# 70 % of the rows for training, 30 % for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=40,
)

#### Modeling
#### (Linear Regression)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split, then score it on the
# held-out test split
lr = LinearRegression()
lr = lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
lr.score(X_train,y_train)

#### Predictions

In [None]:
pred = lr.predict(X_test)

In [None]:
plt.scatter(y_test,pred)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean absolute error between the test targets and the predictions
mae = mean_absolute_error(y_test,pred)
mae

In [None]:
# Root mean squared error: square root of the mean squared error on the test split
rmse = np.sqrt(mean_squared_error(y_test,pred))
rmse