# House Price Prediction using Hybrid ML Model (With Feature Selection)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data Preprocessing**

**Read the Dataset**

In [None]:
# Load the Hyderabad housing CSV from the read-only input directory into a DataFrame.
hyderabad_dataset = pd.read_csv('../input/housing-prices-in-metropolitan-areas-of-india/Hyderabad.csv')

In [None]:
#size of the dataset
# Printed explicitly: a notebook cell only echoes its last expression, so a
# bare `.shape` placed before the final line would be silently discarded.
print(hyderabad_dataset.shape)
hyderabad_dataset

**Handle Missing Values (rows with missing entries are dropped)**

In [None]:
#missing values are represented with '9' in this dataset
new_value = np.nan
# Only look for the sentinel outside the measured columns: in a price, an
# area or a bedroom count a literal 9 can be a genuine value and must not be
# turned into a missing entry (which would drop the whole row below).
# NOTE(review): assumes the sentinel is never used in these four columns -
# confirm against the dataset description.
measured_columns = ['Price', 'Area', 'No. of Bedrooms', 'Location']
sentinel_columns = [column for column in hyderabad_dataset.columns
                    if column not in measured_columns]
hyderabad_dataset[sentinel_columns] = hyderabad_dataset[sentinel_columns].replace(9, new_value)
# Rows with any missing entry are discarded (no imputation is performed).
hyderabad_dataset = hyderabad_dataset.dropna()
hyderabad_dataset.shape

**Find and Remove Duplicate rows**

In [None]:
# Report the number of fully duplicated rows. It has to be printed: a bare
# expression that is not the last line of a notebook cell is never displayed.
print('Duplicate rows:', hyderabad_dataset.duplicated().sum())
# Reassign instead of mutating in place, so the frame produced by the earlier
# dropna() is not modified behind pandas' back.
hyderabad_dataset = hyderabad_dataset.drop_duplicates()
#size after removing duplicates
hyderabad_dataset.shape

# **Data Analysis**

**Add a new column 'AreaRank' based on the Area size**

In [None]:
# Order the listings by area (smallest first) and renumber the index 0..n-1.
hyderabad_dataset = hyderabad_dataset.sort_values('Area').reset_index(drop=True)

# AreaRank is the dense rank of Area: the smallest area gets 1, equal areas
# share a rank, and each new distinct area increases the rank by exactly 1.
# This is computed in one vectorised call instead of a per-row Python loop.
# The result is cast back to Area's dtype so the column type matches what a
# copy of the Area column would have had.
area = hyderabad_dataset['Area']
hyderabad_dataset['AreaRank'] = area.rank(method='dense').astype(area.dtype)

**Dataset Details**

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.
hyderabad_dataset.info()

In [None]:
# Display the DataFrame, which now includes the AreaRank column.
hyderabad_dataset

**Outlier Detection and Elimination**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
hyderabad_dataset.describe()

In [None]:
# One box plot per column on a 2x2 grid (the fourth cell stays empty).
plt.figure(figsize=(12,12))
for position, column in enumerate(['Area', 'No. of Bedrooms', 'Price'], start=1):
    plt.subplot(2, 2, position)
    seaborn.boxplot(x=hyderabad_dataset[column])
    plt.title(f'Outliers In {column} In the Dataset')
# Render explicitly, like the other plotting cells, instead of relying on the
# notebook echoing the return value of the last plt.title() call.
plt.show()

In [None]:
#Quantile-based Flooring and Capping
# Values below the 10th percentile are raised to it (flooring) and values
# above the 90th percentile are lowered to it (capping). Clipping at the
# quartiles (25th/75th) instead would overwrite roughly half of every column
# rather than only the extreme tails.
LOWER_QUANTILE, UPPER_QUANTILE = 0.10, 0.90

for feature in ['No. of Bedrooms','Area','Price']:
    print('Initial Skew value: ', hyderabad_dataset[feature].skew())
    floor_value = hyderabad_dataset[feature].quantile(LOWER_QUANTILE)
    cap_value = hyderabad_dataset[feature].quantile(UPPER_QUANTILE)
    hyderabad_dataset[feature] = hyderabad_dataset[feature].clip(lower=floor_value, upper=cap_value)
    print('Final Skew value: ', hyderabad_dataset[feature].skew())
    print()

In [None]:
# Show the dataset's dimensions together with its column labels.
hyderabad_dataset.shape, hyderabad_dataset.columns

# **Feature Transformation and Engineering**

In [None]:
# Express prices in lakhs: one lakh is 100,000 of the original price unit.
UNITS_PER_LAKH = 100000
hyderabad_dataset['Price'] = hyderabad_dataset['Price'].div(UNITS_PER_LAKH)

**Feature Importance**

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Predictors for the importance estimate: everything except the target and
# 'Location' (presumably left out because it is still an un-encoded text
# column at this point - it is one-hot encoded later in the notebook).
features = hyderabad_dataset.drop(['Price', 'Location'], axis=1)
label = hyderabad_dataset['Price']
# From here on the working dataset holds predictors only; the target lives in `label`.
hyderabad_dataset.drop("Price", inplace=True, axis=1)
print(features.shape, label.shape)

plt.figure(figsize=(12,5))
# Seeded so the importance ranking is reproducible between runs, in line with
# the random_state=0 used for the other tree-based models in this notebook.
etr = ExtraTreesRegressor(random_state=0)
etr.fit(features, label)
feature_importance = pd.Series(etr.feature_importances_, index=features.columns)
# Horizontal bar chart of the 20 most important features.
feature_importance.nlargest(20).plot(kind='barh')
plt.show()


**Heatmap for understanding correlation**

In [None]:
plt.figure(figsize=(20,20))
# Correlate numeric columns only. The frame still contains the text column
# 'Location', and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions silently skipped such columns).
corr = hyderabad_dataset.select_dtypes(include='number').corr()
# Hide the upper triangle (diagonal included): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Reuse the matrix computed above instead of calling corr() a second time.
seaborn.heatmap(corr, mask=mask, cmap='coolwarm', vmax=.3, center=0,
            square=True, linewidths=.5,annot=True)
plt.show()

**Correlation Checking (Visually)**

In [None]:
import networkx as nx

# Build a weighted graph from the correlation matrix: one node per feature,
# edge weight = correlation between the two features.
indices = corr.index.values
# A plain ndarray plus from_numpy_array is used because from_numpy_matrix was
# removed in NetworkX 3.0 and np.matrix is discouraged by NumPy.
cor_matrix = corr.to_numpy()
G = nx.from_numpy_array(cor_matrix)
# Nodes are created as 0..n-1; rename them to the feature names.
G = nx.relabel_nodes(G, dict(enumerate(indices)))
print()

In [None]:
def corr_network(G, corr_direction, min_correlation):
    """Draw the correlations in ``G`` that pass a direction/strength filter.

    Parameters
    ----------
    G : networkx graph
        Graph whose edges carry a ``'weight'`` attribute. ``G`` itself is not
        modified; filtering is done on a copy.
    corr_direction : str
        ``"positive"`` keeps edges with ``weight >= 0`` and
        ``weight >= min_correlation``. Any other value keeps edges with
        ``weight < 0`` and ``weight <= min_correlation``.
    min_correlation : float
        Threshold applied as described above.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If no edge passes the filter
        a short message is printed and nothing is drawn.
    """
    keep_positive = corr_direction == "positive"

    H = G.copy()
    for s1, s2, data in G.edges(data=True):
        weight = data["weight"]
        if keep_positive:
            rejected = weight < 0 or weight < min_correlation
        else:
            rejected = weight >= 0 or weight > min_correlation
        if rejected:
            H.remove_edge(s1, s2)

    # NOTE(review): if G was built from a full correlation matrix, each node
    # presumably has a self-loop of weight 1 that survives the "positive"
    # filter and counts towards its degree - confirm whether that is wanted.
    edge_weights = nx.get_edge_attributes(H, 'weight')
    if not edge_weights:
        # zip(*...) below cannot be unpacked when nothing is left.
        print(f'No {corr_direction} correlations satisfy '
              f'min_correlation={min_correlation}; nothing to draw.')
        return

    edges, weights = zip(*edge_weights.items())
    # Map |weight| to a line width / colour value between 1 and 4.
    weights = tuple((1 + abs(x)) ** 2 for x in weights)

    # Node size grows with the cube of the node's degree in the filtered graph.
    d = dict(nx.degree(H))
    nodelist = list(d.keys())
    node_sizes = tuple(degree ** 3 for degree in d.values())

    positions = nx.circular_layout(H)

    plt.figure(figsize=(15,15))

    nx.draw_networkx_nodes(H, positions, node_color='#d100d1', nodelist=nodelist,
                           node_size=node_sizes, alpha=0.8)

    nx.draw_networkx_labels(H, positions, font_size=8)

    edge_colour = plt.cm.summer if keep_positive else plt.cm.autumn

    nx.draw_networkx_edges(H, positions, edgelist=edges, style='solid',
                           width=weights, edge_color=weights, edge_cmap=edge_colour,
                           edge_vmin=min(weights), edge_vmax=max(weights))
    plt.axis('off')
    plt.show()

In [None]:
# Draw the network of positive correlations of at least 0.5.
corr_network(G, corr_direction="positive",min_correlation = 0.5)

In [None]:
# Draw the network of negative correlations of -0.1 or lower.
corr_network(G, corr_direction="negative",min_correlation = -0.1)

**Convert categorical variable into dummy/indicator variables**

In [None]:
# Column labels before encoding.
print(hyderabad_dataset.columns.tolist())
# Replace every categorical (object/string) column with indicator columns.
hyderabad_dataset = pd.get_dummies(data=hyderabad_dataset)
# Dimensions and column labels after encoding.
(hyderabad_dataset.shape, hyderabad_dataset.columns)

**Remove Features that are unimportant or redundant**

In [None]:
# Columns to leave out of the model inputs.
features = ['MaintenanceStaff', 'CarParking', 'Intercom',
            'Gymnasium', 'JoggingTrack', 'RainWaterHarvesting', 
            'ShoppingMall', 'SportsFacility', 'ATM', 'ClubHouse', 'School', 
            '24X7Security', 'StaffQuarter', 'Cafeteria', 'MultipurposeRoom', 
            'Hospital', 'WashingMachine', 'Gasconnection', 'AC', 'Wifi', "Children'splayarea", 
            'BED', 'VaastuCompliant', 'Microwave', 'GolfCourse', 'TV', 'DiningTable', 
            'Sofa', 'Wardrobe', 'Refrigerator']

# Drop by column label. Passing the sub-DataFrame itself as the labels only
# worked because iterating a DataFrame happens to yield its column names.
selected_features = hyderabad_dataset.drop(columns=features)

# **Split the Dataset into Test and Train**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
split_parts = train_test_split(
    selected_features,
    label,
    test_size=0.3,
    random_state=40,
)
features_train, features_test, label_train, label_test = split_parts
# Shapes of the four parts, followed by the columns used as model inputs.
(
    features_train.shape,
    features_test.shape,
    label_train.shape,
    label_test.shape,
    selected_features.columns,
)

# **Prediction Model**

In [None]:
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# The four regressors compared in this notebook.
DTR = DecisionTreeRegressor(random_state=0)
KNR = KNeighborsRegressor()
XGBR = XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=80)
RFR = RandomForestRegressor(random_state=0, n_estimators=100)

# Fit each regressor on the training split, in the order DTR, KNR, XGBR, RFR.
for regressor in (DTR, KNR, XGBR, RFR):
    regressor.fit(features_train, label_train)

# Emit a blank line.
print()

**R2 score and Measurements of Error**

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score




def calc_r2(l, pred_values):
    """Print the R2 score of ``pred_values`` measured against the true values ``l``."""
    score = r2_score(l, pred_values)
    print('r2_score: ', score)
def calc_moe(l, pred_values):
    """Print error measurements of ``pred_values`` against the true values ``l``.

    Prints, one per line: root-mean-square error, mean absolute error, mean
    absolute percentage error and explained-variance score.

    Parameters
    ----------
    l : array-like
        True target values.
    pred_values : array-like
        Predicted target values, in the same order as ``l``.
    """
    # RMSE is taken as sqrt(MSE) rather than via ``squared=False``: that keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6, while this form
    # works on every version. (Identical for single-output targets.)
    rmse = np.sqrt(mean_squared_error(l, pred_values))
    print('root_mean_square_error: ', rmse)
    print('mean_absolute_error: ', mean_absolute_error(l, pred_values))
    print('mean_absolute_percentage_error: ', mean_absolute_percentage_error(l, pred_values))
    print('explained_variance_score: ', explained_variance_score(l, pred_values))
    
    
models =[DTR, KNR, XGBR, RFR]
models_name = ['DTR','KNR', 'XGBR ', 'RFR']

# Report R2 and the error measurements of every model on both splits.
# zip() pairs each model with its display name, so no manual index counter
# has to be kept in step with the loop.
for name, model in zip(models_name, models):
    print(name)
    print()
    for heading, split_features, split_label in (
        ('Training Data:', features_train, label_train),
        ('Testing Data:', features_test, label_test),
    ):
        print(heading)
        pred_values = model.predict(split_features)
        calc_r2(split_label, pred_values)
        calc_moe(split_label, pred_values)
        print()
    print()

# **Hybrid Model**

In [None]:
#Create new Test and Train datasets that contain prices predicted by the models used
from sklearn.model_selection import cross_val_predict

# Training meta-features must be OUT-OF-FOLD predictions. Predicting the
# training rows with models that were fitted on those same rows leaks the
# labels (a fully grown tree reproduces its training targets almost exactly),
# so the hybrid model would learn to copy a base model and its training score
# would say nothing about unseen data. cross_val_predict fits clones of each
# estimator on the other folds, leaving the already-fitted models untouched.
# KNR is not used as a base learner for the hybrid model.
label_pred_xgbr_train = cross_val_predict(XGBR, features_train, label_train, cv=5)
label_pred_rfr_train = cross_val_predict(RFR, features_train, label_train, cv=5)
label_pred_dtr_train = cross_val_predict(DTR, features_train, label_train, cv=5)

features_train_new = pd.DataFrame()
features_train_new['xgbr'] = label_pred_xgbr_train
features_train_new['rfr'] = label_pred_rfr_train
features_train_new['dtr'] = label_pred_dtr_train

# Test meta-features come from the base models fitted on the full training
# split; the test rows were never seen during fitting, so there is no leak.
label_pred_xgbr_test = XGBR.predict(features_test)
label_pred_rfr_test = RFR.predict(features_test)
label_pred_dtr_test = DTR.predict(features_test)

features_test_new = pd.DataFrame()
features_test_new['xgbr'] = label_pred_xgbr_test
features_test_new['rfr'] = label_pred_rfr_test
features_test_new['dtr'] = label_pred_dtr_test

features_train_new.shape, features_test_new.shape

In [None]:
# Meta-learner: an XGBoost regressor fitted on the base models' predicted prices.
hybrid_params = {'max_depth': 80, 'learning_rate': 0.01, 'n_estimators': 1000}
hybrid = XGBRegressor(**hybrid_params)
hybrid.fit(features_train_new, label_train)
print()

In [None]:
# Report R2 and the error measurements of the hybrid model on both splits,
# with two blank lines between the training and the testing report.
evaluation_sets = (
    ('Training Data:', features_train_new, label_train),
    ('Testing Data:', features_test_new, label_test),
)
for position, (heading, meta_features, true_prices) in enumerate(evaluation_sets):
    if position > 0:
        print()
        print()
    print(heading)
    pred_values = hybrid.predict(meta_features)
    calc_r2(true_prices, pred_values)
    calc_moe(true_prices, pred_values)