<a href="https://colab.research.google.com/github/samshanmukh/Predict_House_Price/blob/master/Predict_house_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [0]:
%%time
import re
import pickle
import numpy as np
import pandas as pd

import sklearn
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
import lightgbm

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

CPU times: user 473 ms, sys: 94 ms, total: 567 ms
Wall time: 1.07 s


# Getting datasets

In [0]:
train_data = pd.read_csv('https://raw.githubusercontent.com/samshanmukh/Predict-house-prices-in-Bangalore/master/Dataset/train.csv')
test_data = pd.read_csv('https://raw.githubusercontent.com/samshanmukh/Predict-house-prices-in-Bangalore/master/Dataset/test.csv')

In [0]:
train_data.head()
# train_data.shape

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [0]:
train_data.area_type.value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

In [0]:
# Convert categorical data into numeric in area_type column
replace_area_type = {'Super built-up  Area': 0, 'Built-up  Area': 1, 'Plot  Area': 2, 'Carpet  Area': 3}
train_data.area_type = train_data.area_type.map(replace_area_type)

In [0]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,1,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,0,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,0,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [0]:
# Convert the categorical values into 3 categories in the 'availability' column.
#   > Ready To Move
#   > Immediate Possession
#   > Others

def replace_availability(value):
  
  if value == 'Ready To Move':
    return 0
  elif value == 'Immediate Possession':
    return 1
  else:
    return 2

In [0]:
# Apply the above function to convert the values of column availibility
train_data['availability'] = train_data.availability.apply(replace_availability)

In [0]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,1,0,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,0,0,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [0]:
# Preprocess the column 'location'

# check which value is null in location column
train_data[~train_data.location.notnull()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
568,0,0,,3 BHK,Grare S,1600,3.0,2.0,86.0


In [0]:
# fill the null value 568th row or null value with 'Location not provided'
train_data['location'] = train_data.location.fillna('Location not Provided')

In [0]:
# check which value is null in location column again after preprocess
train_data[~train_data.location.notnull()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price


In [0]:
# Convert categorical into numerical in column 'location'
location_encoder = LabelEncoder()
location_encoder.fit(train_data['location'].append(test_data['location']))
train_data['location'] = location_encoder.transform(train_data['location'])


In [0]:
location_encoder.classes_

array([' Anekal', ' Banaswadi', ' Basavangudi', ..., 'whitefiled',
       'yelahanka, north', 'yettagodi Road'], dtype=object)

In [0]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,430,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2,0,325,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,1,0,1220,3 BHK,,1440,2.0,3.0,62.0
3,0,0,778,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,0,0,736,2 BHK,,1200,2.0,1.0,51.0


In [0]:
# Preprocess the column 'size'

# Convert all the categorical data into numerical using LabelEncoder
size_encoder = LabelEncoder()
size_encoder.fit(train_data['size'].astype('str').append(test_data['size'].astype('str')))
train_data['size'] = size_encoder.transform(train_data['size'].astype('str'))

In [0]:
size_encoder.classes_

array(['1 BHK', '1 Bedroom', '1 RK', '10 BHK', '10 Bedroom', '11 BHK',
       '11 Bedroom', '12 Bedroom', '13 BHK', '14 BHK', '16 BHK',
       '16 Bedroom', '18 Bedroom', '19 BHK', '2 BHK', '2 Bedroom',
       '27 BHK', '3 BHK', '3 Bedroom', '4 BHK', '4 Bedroom', '43 Bedroom',
       '5 BHK', '5 Bedroom', '6 BHK', '6 Bedroom', '7 BHK', '7 Bedroom',
       '8 BHK', '8 Bedroom', '9 BHK', '9 Bedroom', 'nan'], dtype=object)

In [0]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,430,14,Coomee,1056,2.0,1.0,39.07
1,2,0,325,20,Theanmp,2600,5.0,3.0,120.0
2,1,0,1220,17,,1440,2.0,3.0,62.0
3,0,0,778,17,Soiewre,1521,3.0,1.0,95.0
4,0,0,736,14,,1200,2.0,1.0,51.0


In [0]:
# Preprocess the column 'society'

# check which value is null in society column

train_data[~train_data.society.notnull()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
2,1,0,1220,17,,1440,2.0,3.0,62.00
4,0,0,736,14,,1200,2.0,1.0,51.00
8,0,0,821,17,,1310,3.0,1.0,63.25
9,2,0,445,25,,1020,6.0,,370.00
10,0,2,1297,17,,1800,2.0,2.0,70.00
13,1,0,477,14,,1100,2.0,2.0,40.00
19,0,0,810,14,,1100,2.0,2.0,48.00
20,1,0,706,0,,600,1.0,1.0,15.00
23,0,0,242,17,,1767,3.0,1.0,103.00
25,0,2,811,17,,1250,3.0,2.0,56.00


In [0]:
# Fill the null values with 'other'
train_data['society'] = train_data.society.fillna('other')

In [0]:
train_data[~train_data.society.notnull()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price


In [0]:
# convert categorical to numeric using LabelEncoder in column 'society'
society_encoder = LabelEncoder()
society_encoder.fit(train_data['society'].astype('str').append(test_data['society'].astype('str').fillna('Other')))
train_data['society'] = society_encoder.transform(train_data['society'])

In [0]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,430,14,487,1056,2.0,1.0,39.07
1,2,0,325,20,2568,2600,5.0,3.0,120.0
2,1,0,1220,17,2833,1440,2.0,3.0,62.0
3,0,0,778,17,2302,1521,3.0,1.0,95.0
4,0,0,736,14,2833,1200,2.0,1.0,51.0


In [0]:
# preprocess the column 'totla_sqft'
def preprocess_total_sqft(my_list):
    if len(my_list) == 1:
        
        try:
            return float(my_list[0])
        except:
            strings = ['Sq. Meter', 'Sq. Yards', 'Perch', 'Acres', 'Cents', 'Guntha', 'Grounds']
            split_list = re.split('(\d*.*\d)', my_list[0])[1:]
            area = float(split_list[0])
            type_of_area = split_list[1]
            
            if type_of_area == 'Sq. Meter':
                area_in_sqft = area * 10.7639
            elif type_of_area == 'Sq. Yards':
                area_in_sqft = area * 9.0
            elif type_of_area == 'Perch':
                area_in_sqft = area * 272.25
            elif type_of_area == 'Acres':
                area_in_sqft = area * 43560.0
            elif type_of_area == 'Cents':
                area_in_sqft = area * 435.61545
            elif type_of_area == 'Guntha':
                area_in_sqft = area * 1089.0
            elif type_of_area == 'Grounds':
                area_in_sqft = area * 2400.0
            return float(area_in_sqft)
        
    else:
        return (float(my_list[0]) + float(my_list[1]))/2.0

In [0]:
# As all the given values are not measured in 'square feet', we have to preprocess some of the values in the column.  
# Some of them are also measured in square meters, square yards, perch, acres, cents, guntha and grounds.
# So, convert all of them into square feet to make the data more consistent.

train_data['total_sqft'] = train_data.total_sqft.str.split('-').apply(preprocess_total_sqft)

In [0]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,430,14,487,1056.0,2.0,1.0,39.07
1,2,0,325,20,2568,2600.0,5.0,3.0,120.0
2,1,0,1220,17,2833,1440.0,2.0,3.0,62.0
3,0,0,778,17,2302,1521.0,3.0,1.0,95.0
4,0,0,736,14,2833,1200.0,2.0,1.0,51.0


In [0]:
# Preprocess the column 'bath'

In [0]:
# There are many missing values (73) in the column 'bath'.
# So, the missing values are filled by grouping the rows based on location and taking the mean of the column 'bath' in that location.
# Even after doing this, there is a missing value. 
# This is because there is a row in which the location is unique(occurred only one time) and the value is NaN. 
# So, it cannot fill as there are no other values for bathrooms to find mean. 
# In this case, the missing values are filled with the mean of the whole column.

In [0]:
train_data['bath'].isna().sum()

73

In [0]:
column_bath = train_data.groupby('location')['bath'].transform(lambda x: x.fillna(x.mean()))

In [0]:
column_bath[~column_bath.notnull()]

1775   NaN
Name: bath, dtype: float64

In [0]:
column_bath = column_bath.fillna(column_bath.mean())
column_bath.isna().sum()

0

In [0]:
train_data['bath'] = column_bath

In [0]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,430,14,487,1056.0,2.0,1.0,39.07
1,2,0,325,20,2568,2600.0,5.0,3.0,120.0
2,1,0,1220,17,2833,1440.0,2.0,3.0,62.0
3,0,0,778,17,2302,1521.0,3.0,1.0,95.0
4,0,0,736,14,2833,1200.0,2.0,1.0,51.0


In [0]:
# Preprocess the column 'balcony'

In [0]:
# There are many missing values (609) in the column 'bath'.
# So, the missing values are filled by grouping the rows based on location and taking the mean of the column 'balcony' in that location.
# Even after doing this, there are missing value in some rows. 
# This is because there are rows in which the location is unique(occurred only one time) and the value is NaN. 
# So, it cannot fill as there are no other values for balcony to find mean. 
# In this case, the missing values are filled with the mean of the whole column.

In [0]:
train_data.balcony.isna().sum()

609

In [0]:
train_data.balcony.value_counts()

2.0    5113
1.0    4897
3.0    1672
0.0    1029
Name: balcony, dtype: int64

In [0]:
column_balcony = train_data.groupby('location')['balcony'].transform(lambda x: x.fillna(x.mean()))
column_balcony = column_balcony.fillna(column_balcony.mean())


In [0]:
train_data['balcony'] = column_balcony

In [0]:
train_data.balcony.isna().sum()

0

In [0]:
train_data.head()


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,430,14,487,1056.0,2.0,1.0,39.07
1,2,0,325,20,2568,2600.0,5.0,3.0,120.0
2,1,0,1220,17,2833,1440.0,2.0,3.0,62.0
3,0,0,778,17,2302,1521.0,3.0,1.0,95.0
4,0,0,736,14,2833,1200.0,2.0,1.0,51.0


In [0]:
# Seprate data to the input and output data to train the model
columns = train_data.columns
X_train = train_data[columns[:-1]]
y_train = train_data[columns[-1]]

# Preprocess the test data

In [0]:
# We have to handle missing values in the test data as it has alot of missing values.
test_data.isna().sum()

area_type          0
availability       0
location           0
size               2
society          626
total_sqft         0
bath               7
balcony           69
price           1480
dtype: int64

In [0]:
test_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,0,95,5,405,1225.0,2.0,2.0,
1,2,0,30,20,306,2400.0,9.0,2.0,
2,2,2,185,10,442,1650.0,5.0,2.0,
3,0,0,269,7,590,1322.0,3.0,1.0,
4,0,0,274,5,26,1161.0,2.0,1.0,


In [0]:
test_data.loc[test_data.society == 'NaN']

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price


In [0]:
test_data.isna().sum()

area_type          0
availability       0
location           0
size               2
society          626
total_sqft         0
bath               7
balcony           69
price           1480
dtype: int64

In [0]:
test_data['area_type'] = test_data.area_type.map(replace_area_type)

In [0]:
test_data['availability'] = test_data.availability.apply(replace_availability)


In [0]:
train_data['location'] = train_data['location'].fillna('Location not provided')

location_encoder = LabelEncoder()
location_encoder.fit(test_data['location'])
test_data['location'] = location_encoder.transform(test_data['location'].astype('str'))

In [0]:
size_encoder = LabelEncoder()
size_encoder.fit(test_data['size'].astype('str'))

test_data['size'] = size_encoder.transform(test_data['size'].astype('str'))

In [0]:
test_data['society'] = test_data['society'].fillna('Other')

society_encoder = LabelEncoder()
society_encoder.fit(test_data['society'])
test_data['society'] = society_encoder.transform(test_data['society'])

In [0]:
test_data['total_sqft'] = test_data.total_sqft.str.split('-').apply(preprocess_total_sqft)

In [0]:
test_data['bath'] = test_data['bath'].fillna(train_data.bath.mean())

In [0]:
test_data['balcony'] = test_data['balcony'].fillna(train_data.balcony.mean())

In [0]:
y_test = test_data.price

X_test = test_data
X_test = X_test.drop(columns='price')


# Modeling

In [0]:
rfRegressor = RandomForestRegressor()
model = rfRegressor.fit(X_train, y_train)

In [0]:
# lreg = LinearRegression(normalize=True)
# model = lreg.fit(X_train, y_train)

In [0]:
# dtReg = DecisionTreeRegressor()
# model = dtReg.fit(X_train, y_train)

In [0]:
# from catboost import CatBoostRegressor
# model=CatBoostRegressor(iterations=10000, depth=5, learning_rate=0.03, loss_function='RMSE')
# model.fit(X_train, y_train)

In [0]:
y_pred = model.predict(X_test)

In [0]:
out_df = pd.DataFrame({'price': y_pred})

In [0]:
# out_df.to_excel('predictions.xlsx', index=False)
out_df.to_csv('predictions.csv', index=False)

In [0]:
prediction_file = pd.read_csv('predictions.csv')

In [0]:
prediction_file.head()

Unnamed: 0,price
0,62.5
1,342.5
2,155.7
3,83.422
4,53.956333


# Checking Mean Absolute Error

In [0]:
columns = train_data.columns
X = train_data[columns[:-1]]
y = train_data[columns[-1]]

# Split into validation and training data
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)

# Specify Model Decision Tree Regressor
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)
val_predictions = iowa_model.predict(test_X)
val_mae = mean_absolute_error(val_predictions, test_y)
print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae))

# Using best value for max_leaf_nodes
iowa_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
iowa_model.fit(train_X, train_y)
val_predictions = iowa_model.predict(test_X)
val_mae = mean_absolute_error(val_predictions, test_y)
print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

# Define the RF model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(test_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, test_y)
print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

# Define the xgboost model
from xgboost import XGBRegressor
xgb_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
xgb_model.fit(train_X, train_y, verbose=False)
xgb_predictions = xgb_model.predict(test_X)
# print("Mean Absolute Error : " + str(mean_absolute_error(xgb_predictions, test_y)))
xgb_val_mae = mean_absolute_error(xgb_predictions, test_y)
print("Validation MAE for XGBOOST Model: {:,.0f}".format(xgb_val_mae))




Validation MAE when not specifying max_leaf_nodes: 41
Validation MAE for best value of max_leaf_nodes: 40
Validation MAE for Random Forest Model: 35
Validation MAE for XGBOOST Model: 38
Instructions for updating:
Use tf.cast instead.


# Using Tensorflow

In [0]:
# Define the tensorflow model
from tensorflow import keras
from tensorflow.keras import layers
def build_model():
  model =keras.Sequential([
    layers.Dense(64, activation=tf.nn.relu, input_shape=[len(train_X.keys())]),
    layers.Dense(64, activation=tf.nn.relu),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mean_squared_error',
                optimizer=optimizer,
                metrics=['mean_absolute_error', 'mean_squared_error'])
  return model

model = build_model()

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 64)                576       
_________________________________________________________________
dense_22 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 65        
Total params: 4,801
Trainable params: 4,801
Non-trainable params: 0
_________________________________________________________________


In [0]:
example_batch = train_X[:10]
example_result = model.predict(example_batch)
example_result

NameError: ignored

In [0]:
# To be continued...