# Predicting House Prices in Bangalore

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] =(20,10)

# Data Load: Loading the data of home prices into a dataframe

In [None]:
df = pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')
df

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.groupby('area_type')['area_type'].agg('count')

In [None]:
df.area_type.unique()

# Handling of NA values

In [None]:
def change_to_numeric(x):
    """Encode an ``area_type`` label as a small integer code.

    Returns 0 for 'Super built-up  Area', 1 for 'Plot  Area', 2 for
    'Built-up  Area' and 3 for every other value.  Labels are compared
    exactly, including the double space inside each one.
    """
    known_labels = ('Super built-up  Area', 'Plot  Area', 'Built-up  Area')
    for code, label in enumerate(known_labels):
        if x == label:
            return code
    # Anything unrecognised shares the catch-all code.
    return 3

df['area_type_numeric'] = df['area_type'].apply(change_to_numeric)

In [None]:
df2 = df[['area_type_numeric', 'price']]
df2.corr()

In [None]:
df1 = df.drop('area_type', axis = 'columns')

In [None]:
df1.balcony = df1.balcony.fillna(df1.balcony.median())

In [None]:
df1.balcony.isnull().sum()

In [None]:
df_temp = df1[['balcony', 'price']]
df_temp.corr()

In [None]:
def ready(x):
    """Return 1 when ``x`` is exactly 'Ready To Move', otherwise 0."""
    return 1 if x == 'Ready To Move' else 0
df1['availability'] = df1['availability'].apply(ready)

In [None]:
df_temp = df1[['availability', 'price']]
df_temp.corr()

In [None]:
df2 = df1.drop(['society' , 'balcony', 'availability'] , axis = 'columns')

In [None]:
df2.head()

In [None]:
df2.isnull().sum()

In [None]:
df2.describe()

In [None]:
df2['size'].unique()

In [None]:
df2['size'].value_counts().head()

In [None]:
df2['size'] = df2['size'].fillna('2 BHK')

# Feature Engineering
**Adding new feature(integer) for bhk (Bedrooms Hall Kitchen)**

In [None]:
df2['BHK'] = df2['size'].apply(lambda x : int(x.split(' ')[0]))

In [None]:
df_temp = df2[['BHK', 'price']]
df_temp.corr()

In [None]:
df2['bath'] = df2['bath'].fillna(df2.bath.median())

In [None]:
df2.head()

In [None]:
df2['location'].value_counts()

In [None]:
df2['location'] = df2['location'].fillna('Whitefield')

In [None]:
df2.BHK.unique()

In [None]:
df2[df2.BHK>20]

In [None]:
df2.total_sqft.unique()

In [None]:
def is_float(x):
    """Report whether ``x`` can be converted with ``float()``.

    Returns True when ``float(x)`` succeeds and False when the conversion
    is rejected (wrong type, unparsable text, or a value too large).
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return False
    return True

In [None]:
df2[~df2['total_sqft'].apply(is_float)].head(10)

**The rows above show that total_sqft can be a range (e.g. 2100 - 2850). For such cases we take the average of the minimum and maximum values of the range. There are other cases, such as 34.46Sq. Meter, which could be converted to square feet with a unit conversion; to keep things simple, we are going to drop those corner cases.**

In [None]:
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    * A two-part range such as '2100 - 2850' becomes the average of its
      two bounds.
    * Anything else that ``float()`` accepts is returned as that float.
    * Values that cannot be parsed (e.g. '34.46Sq. Meter', None) yield None.

    Parameters
    ----------
    x : the raw cell value, normally a string.

    Returns
    -------
    float or None
    """
    if isinstance(x, str):
        bounds = x.split('-')
        if len(bounds) == 2:
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2
            except ValueError:
                # Not a numeric range after all (e.g. '1e-3' or '-5');
                # fall through and try it as a plain number.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
df3 = df2.copy()
df3['total_sqft'] = df3['total_sqft'].apply(convert_sqft_to_num)
df3.head()

In [None]:
# Bathroom counts are whole numbers; cast the whole column in one
# vectorised step instead of calling int() once per row.
df3['bath'] = df3['bath'].astype('int64')

In [None]:
df_temp = df3[['BHK', 'price']]
df_temp.corr()

In [None]:
df4 = df3.copy()
df4['price_per_sqft'] = df3['price']*100000/df3['total_sqft']
df4

In [None]:
df4['location'].agg('count')

In [None]:
len(df4.location)

In [None]:
df4.location = df4.location.apply(lambda x : x.strip())
location_stats = df4.groupby('location')['location'].agg('count')
location_stats

In [None]:
location_stats = df4.groupby('location')['location'].agg('count').sort_values(ascending = False)
location_stats

In [None]:
len(location_stats[location_stats<=10])

# Dimensionality Reduction
**Any location with 10 or fewer data points is tagged as the "other" location. This reduces the number of categories by a huge amount. Later on, when we do one-hot encoding, it will give us fewer dummy columns.**

In [None]:
location_stats_less_than_10 = location_stats[location_stats<=10]

In [None]:
# location_stats_less_than_10 is a Series indexed by location name, so the
# names to collapse are its index labels.  (`x in series` tests the index
# too, but only implicitly; spell it out and do the test vectorised.)
rare_location_mask = df4['location'].isin(location_stats_less_than_10.index)
df4.loc[rare_location_mask, 'location'] = 'other'

In [None]:
len(df4.location.unique())

In [None]:
# df4 still holds text columns (e.g. location, size); DataFrame.corr()
# raises on those in pandas >= 2.0, so correlate the numeric columns only.
sns.heatmap(df4.select_dtypes(include = 'number').corr(), annot = True)

In [None]:
df4.head(10)

# Outliers detection and removal

In [None]:
# Vertical box plot: the column goes on the y axis.  Name the variable by
# keyword, since seaborn >= 0.12 takes `data` as the only positional argument.
sns.boxplot(data = df4, y = 'total_sqft', orient = 'v')

In [None]:
# Vertical box plot: the column goes on the y axis.  Name the variable by
# keyword, since seaborn >= 0.12 takes `data` as the only positional argument.
sns.boxplot(data = df4, y = 'BHK', orient = 'v')

In [None]:
# Vertical box plot: the column goes on the y axis.  Name the variable by
# keyword, since seaborn >= 0.12 takes `data` as the only positional argument.
sns.boxplot(data = df4, y = 'bath', orient = 'v')

In [None]:
# Vertical box plot: the column goes on the y axis.  Name the variable by
# keyword, since seaborn >= 0.12 takes `data` as the only positional argument.
sns.boxplot(data = df4, y = 'price', orient = 'v')

In [None]:
max_threshold = df4[["bath","BHK","total_sqft", "price", "price_per_sqft"]].quantile(0.95)
max_threshold

In [None]:
# Upper cut-offs per column; rows above a limit are dropped one column at a
# time, in the order listed.
upper_limits = (
    ('BHK', 5),
    ('bath', 5),
    ('total_sqft', 3250.0),
    ('price', 324.0),
    ('price_per_sqft', 15293.303348),
)
for column_name, limit in upper_limits:
    rows_over_limit = df4[df4[column_name] > limit].index
    df4 = df4.drop(rows_over_limit)

In [None]:
# Vertical box plot: the column goes on the y axis.  Name the variable by
# keyword, since seaborn >= 0.12 takes `data` as the only positional argument.
sns.boxplot(data = df4, y = 'BHK', orient = 'v')

# Outlier Removal Using Real Estate Logic
**Normally there are at least 300 square feet per bedroom (i.e. a 2 BHK apartment is at least 600 sqft). If, for example, a 400 sqft apartment is listed as 2 BHK, then that seems suspicious and it can be removed as an outlier. We will remove such outliers by keeping our minimum threshold per BHK at 300 sqft.**

In [None]:
df4[df4.total_sqft/df4.BHK<300].head(10)

In [None]:
df5 = df4[~(df4.total_sqft/df4.BHK<300)]

In [None]:
df5['price_per_sqft'].describe()

# Outlier Removal Using Standard Deviation and Mean

In [None]:
def remove_pps_outliers(df):
    """Drop rows whose price_per_sqft is far from their location's mean.

    For every ``location`` group, only rows with ``price_per_sqft`` strictly
    between (mean - std) and (mean + std) of that group are kept.  ``np.std``
    is called with its default ``ddof=0``.  Because both comparisons are
    strict, a group whose std is 0 (for example a single-row group) keeps
    no rows.

    Parameters
    ----------
    df : DataFrame with at least 'location' and 'price_per_sqft' columns.

    Returns
    -------
    DataFrame of the surviving rows from all locations, with a fresh
    0..n-1 index.  An input with no groups yields an empty frame that keeps
    the input's columns.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        within_one_std = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft < (m + st))
        kept_groups.append(subdf[within_one_std])
    if not kept_groups:
        # pd.concat refuses an empty list; hand back an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

df6 = remove_pps_outliers(df5)
df6.shape

In [None]:
df5.shape

In [None]:
def plot_scatter_chart(df , location):
    """Scatter total_sqft against price for one location's 2 and 3 BHK rows.

    Sets the global matplotlib figure size to (15, 10), then draws the
    2 BHK rows in blue and the 3 BHK rows as green '+' markers through
    pyplot, and adds axis labels, a title (the location) and a legend.
    """
    matplotlib.rcParams['figure.figsize'] = (15 , 10)
    at_location = df.location == location
    series_styles = (
        (2, {'color': 'blue', 'label': '2 BHK', 's': 50}),
        (3, {'marker': '+', 'color': 'green', 'label': '3 BHK', 's': 50}),
    )
    for bedrooms, style in series_styles:
        subset = df[at_location & (df.BHK == bedrooms)]
        plt.scatter(subset.total_sqft, subset.price, **style)
    plt.xlabel('Total Square feet Area')
    plt.ylabel('Price(Lakhs Indian Rupees)')
    plt.title(location)
    plt.legend()

plot_scatter_chart(df6 , 'Hebbal')  

In [None]:
def remove_bhk_outliers(df):
    """Drop rows priced below the mean of the next-smaller BHK in the same location.

    Within each location, a row with N BHK is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the (N-1) BHK
    rows of that location, provided that the (N-1) BHK group has more than
    5 rows.

    Parameters
    ----------
    df : DataFrame with 'location', 'BHK' and 'price_per_sqft' columns.

    Returns
    -------
    A new DataFrame without the flagged rows; the original index labels of
    the remaining rows are preserved.
    """
    # Gather labels in a plain list: keeps their original dtype (np.append on
    # a float array would coerce integer labels to float) and avoids
    # re-allocating an array on every iteration.
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('BHK'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats is not None and stats['count'] > 5:
                underpriced = bhk_df[bhk_df.price_per_sqft < stats['mean']]
                exclude_indices.extend(underpriced.index)

    return df.drop(exclude_indices, axis='index')

df7 = remove_bhk_outliers(df6)
df7.shape

In [None]:
plot_scatter_chart(df7 , 'Hebbal')

In [None]:
plt.hist(df7.price_per_sqft , rwidth = 0.8)
plt.xlabel('Price per square feet')
plt.ylabel('Count')

In [None]:
df7.bath.unique()

In [None]:
plt.hist(df7.bath , rwidth = 0.8)
plt.xlabel('number of bathrooms')
plt.ylabel('Count')

In [None]:
df8 = df7[(df7.bath<df7.BHK+2)]
df8.shape

In [None]:
df9 = df8.drop(['size' , 'price_per_sqft', 'area_type_numeric'] , axis = 'columns')
df9.head()

# Using One Hot Encoding For Location

In [None]:
dummies = pd.get_dummies(df9.location)
dummies.head()

In [None]:
df10 = pd.concat([df9 , dummies.drop('other' , axis = 'columns')] , axis = 'columns')
df10.head()

In [None]:
df11 = df10.drop(['location'] , axis = 'columns')

# Model Building

In [None]:
x = df11.drop('price' , axis = 'columns')

In [None]:
y = df11.price

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(x)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
#import xgboost as xgb 
from sklearn.tree import DecisionTreeRegressor
#from sklearn.ensemble import RandomForestRegressor

def find_best_model_using_gridsearchcv(x , y):
    """Grid-search four regressors and tabulate the best result of each.

    LinearRegression, Lasso, Ridge and DecisionTreeRegressor are each tuned
    with GridSearchCV over their own parameter grid, using 5 shuffled
    80/20 splits (``random_state=0``).

    Parameters
    ----------
    x : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame with one row per candidate and the columns 'model',
    'best_score' (``GridSearchCV.best_score_``) and 'best_params'.

    Notes
    -----
    The grids avoid the ``normalize`` option of LinearRegression/Ridge, which
    was removed in scikit-learn 1.2, and use 'squared_error', the name that
    replaced the tree criterion 'mse' in scikit-learn 1.0.
    """
    algos = {
        'LinearRegression' : {
            'model' : LinearRegression(),
            'params' : {
                'fit_intercept': [True , False],
                'copy_X' : [True , False]
            }
        },
        'lasso' : {
            'model' : Lasso(),
            'params' : {
                'alpha' : [1, 10, 50, 200, 500],
                'selection' : ['random' , 'cyclic']
            }
        },
        'Ridge' : {
            'model' : Ridge(),
            'params' : {
                'alpha' : [1, 10, 50, 200, 500],
                'fit_intercept' : [True , False],
            }
        },
        'descision_tree' : {
            'model' : DecisionTreeRegressor(),
            'params' :{
                'criterion' : ['squared_error' , 'friedman_mse'],
                'splitter' : ['best' , 'random']
            }
        }
    }

    scores = []
    # The same shuffled splits are reused for every candidate so that the
    # scores are comparable.
    cv = ShuffleSplit(n_splits = 5 , test_size = 0.2 , random_state = 0)
    for algo_name , config in algos.items():
        gs = GridSearchCV(config['model'] , config['params'] , cv = cv , return_train_score = False)
        gs.fit(x , y)
        scores.append({
            'model' : algo_name ,
            'best_score' : gs.best_score_,
            'best_params' : gs.best_params_
        })

    return pd.DataFrame(scores , columns = ['model' , 'best_score' , 'best_params'])

In [None]:
find_best_model_using_gridsearchcv(X , y) 

In [None]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 10)

In [None]:
# Standardize the features: the scaling statistics are learned from the
# training split only and then applied unchanged to the test split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()  # rebinds `scaler` to a new, unfitted instance; not used below in this cell
s_x = StandardScaler()
s_y = StandardScaler()  # created but never fitted or used in this cell
X_train_scaled = s_x.fit_transform(X_train)
X_test_scaled = s_x.transform(X_test)

# Model Training 

In [None]:
from sklearn.linear_model import Ridge
# Fit Ridge on the standardized training split and report its score on the
# standardized test split.  `normalize` is no longer passed: the keyword was
# removed in scikit-learn 1.2, and leaving it out matches the previous
# `normalize=False`.
model = Ridge(alpha = 1, fit_intercept = True)
model.fit(X_train_scaled, y_train)
model.score(X_test_scaled , y_test)

# Evaluation

In [None]:
from sklearn.metrics import mean_squared_error,r2_score
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
# scikit-learn metrics take (y_true, y_pred).  MSE is symmetric in its
# arguments, but R^2 is not: swapping them measures the fit against the
# variance of the predictions instead of the variance of the true prices.
acc = mean_squared_error(y_test, y_pred)  # mean squared error, despite the name
rscore = r2_score(y_test, y_pred)

In [None]:
rscore

In [None]:
def rmse(y_pred, y_test):
    """Return the square root of ``mean_squared_error(y_pred, y_test)``."""
    squared_error = mean_squared_error(y_pred, y_test)
    return np.sqrt(squared_error)
rmse(y_pred,y_test)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Mean score of Ridge over 10 shuffled 80/20 splits.  `normalize` is no
# longer passed: the keyword was removed in scikit-learn 1.2, and leaving it
# out matches the previous `normalize=False`.
# NOTE(review): X was standardized on the full dataset before these splits
# are made, so each fold's held-out rows influenced the scaling - consider
# putting the scaler and the model in a Pipeline.
cv = ShuffleSplit(n_splits = 10 , test_size = 0.2 , random_state = 0)
cross_val_score(Ridge(alpha = 1, fit_intercept = True, tol = 0.0001) , X , y , cv=cv).mean()