In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Suppress warnings.
# No category is given, so this filter silences every warning raised by the
# cells below (including library deprecation notices).

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the CSV inside the Kaggle input directory.
real_estate_file_path = '/kaggle/input/real-estate-price-prediction/Real estate.csv'
# Read the CSV into a DataFrame.
real_estate = pd.read_csv(real_estate_file_path)
# Preview the first rows of the data.
real_estate.head()

In [None]:
# Inspect the dimensions of the DataFrame as (rows, columns).

real_estate.shape

In [None]:
# Print the column names, non-null counts and dtypes.
real_estate.info()

In [None]:
# Count the missing values in each column.
real_estate.isna().sum()

There are no null values in the dataset.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
real_estate.describe()

In [None]:
# The running index 'No' and the 'X1 transaction date' column are not used as
# predictors, so both are removed before modelling.
# The result is rebound to the same name (instead of using inplace=True) and
# errors='ignore' is passed so that the cell is idempotent: re-running it no
# longer raises a KeyError once the columns are already gone.
real_estate = real_estate.drop(columns=['No', 'X1 transaction date'], errors='ignore')

In [None]:
# Check the dataset after dropping the 'No' and 'X1 transaction date' columns.
real_estate.head()

Data Visualization

In [None]:
#import libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Visualise every pair of numeric variables.
# sns.pairplot is a figure-level function that creates its own figure, so the
# former plt.figure(figsize=(6,12)) call never sized this plot and only left an
# extra empty figure behind; it has been removed. Use pairplot's height/aspect
# arguments if a different size is wanted.
sns.pairplot(real_estate)
plt.show()

In [None]:
from matplotlib import style

# Switch matplotlib to the dark theme; this applies to all later plots as well.
style.use("dark_background")

# Annotated heatmap of the pairwise correlations between the columns.
correlation_matrix = real_estate.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="winter")

In [None]:
# Joint KDE plot of house age against house price of unit area.
sns.jointplot(x="X2 house age",y="Y house price of unit area",data=real_estate,kind='kde')

In [None]:
# Scatter plot of latitude against house price of unit area.
sns.scatterplot(x="X5 latitude",y="Y house price of unit area",data=real_estate)

In [None]:
# Distribution of the target variable.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot — consider migrating once the seaborn version is confirmed.
sns.distplot(real_estate["Y house price of unit area"])

In [None]:
# Same pair plot again, now drawn after the "dark_background" style was enabled
# in an earlier cell.
sns.pairplot(real_estate)

In [None]:
# Check the remaining columns before splitting the data into train and test sets.
real_estate.columns

# Splitting the Data into Training and Testing Sets

In [None]:
# scikit-learn helpers: the splitter used here, plus the cross-validation tools
# used further down.
from sklearn.model_selection import KFold, cross_val_score, train_test_split

np.random.seed(0)

# 70 % of the rows go to training and 30 % to testing; random_state fixes the split.
df_train, df_test = train_test_split(
    real_estate,
    test_size=0.30,
    train_size=0.70,
    random_state=100,
)
for split_frame in (df_train, df_test):
    print(split_frame.head())


In [None]:
# Divide the training data into the target (y) and the predictors (X).
# The target is selected and dropped without mutating df_train (previously
# df_train.pop was used), so re-running this cell no longer raises a KeyError
# for the already-removed column.
y_train = df_train['Y house price of unit area']
X_train = df_train.drop(columns=['Y house price of unit area'])
print(y_train.head())
print(X_train.head())

# Data Modelling and Evaluation

In [None]:
#import Linear regression
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a scikit-learn linear regression on the training predictors and target
# (LinearRegression fits an intercept by default).
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
# Raw array of fitted coefficients, one per column of X_train.
lm.coef_

In [None]:
# Tabulate the fitted coefficient of each independent variable, indexed by
# feature name.
coeff = pd.DataFrame({'coefficient': lm.coef_}, index=X_train.columns)
coeff

In [None]:
import statsmodels.api as sm
# Copy of the training predictors with an explicit constant (intercept) column added.
X_train_new = sm.add_constant(X_train)

# NOTE(review): the model is fitted on X_train, not X_train_new, so lm_1 has no
# intercept term. Switching to X_train_new would also require adding the
# constant to every frame later passed to lm_1.predict — confirm which is intended.
lm_1 = sm.OLS(y_train, X_train).fit()
print(lm_1.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of the constant-augmented design
# matrix, rounded to two decimals.
X = X_train_new
design_values = X.values
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [
        variance_inflation_factor(design_values, column_position)
        for column_position in range(X.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
# Optional: rank the features by VIF.
#vif = vif.sort_values(by = "VIF", ascending = False)
vif

The p-values of all features are significant and their VIFs are below 5, so no feature needs to be removed.

# Residual Analysis of train data

In [None]:
# Fitted values of the statsmodels OLS model on the training predictors.
y_train_price = lm_1.predict(X_train)

In [None]:
from sklearn.metrics import r2_score
# R^2 of the fitted values against the actual training targets.
r2_score(y_true=y_train,y_pred=y_train_price)

In [None]:
# Plot a histogram of the training error terms (actual minus fitted values).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot — consider migrating once the seaborn version is confirmed.
fig = plt.figure()
sns.distplot((y_train-y_train_price), bins=20)
fig.suptitle('Error Terms',fontsize = 20)
plt.xlabel('Error',fontsize=17)

The error terms appear to be approximately normally distributed.

In [None]:
# Scatter plot of the training residuals against the fitted values.

residual = y_train-y_train_price

fig, ax = plt.subplots(figsize=(6,2.5))
_ = ax.scatter(residual, y_train_price)
# Label the axes so it is clear which one carries the residuals; the plot was
# previously unlabeled.
_ = ax.set(
    xlabel='Residual (actual - predicted)',
    ylabel='Predicted price of unit area',
    title='Residuals vs. predicted values (train)',
)

# Making Predictions with the Final Model

In [None]:
# Divide the test data into the target (y) and the predictors (X).
# The target is selected and dropped without mutating df_test (previously
# df_test.pop was used), so re-running this cell no longer raises a KeyError
# for the already-removed column.
y_test = df_test['Y house price of unit area']
X_test = df_test.drop(columns=['Y house price of unit area'])

In [None]:
# Predictions of the statsmodels OLS model on the held-out test predictors.
y_test_pred = lm_1.predict(X_test)

Now calculate the R² value on the test set.

In [None]:
# Import the metric (it was already imported in an earlier cell; repeated here
# so the cell reads on its own).
from sklearn.metrics import r2_score
# Evaluate R^2 of the test predictions against the actual test targets.
r2_score(y_true=y_test,y_pred=y_test_pred)

In [None]:
# Side-by-side table of the actual test targets and the OLS predictions, with
# the predictions rounded to whole numbers.
df = (
    pd.DataFrame({'Actual': y_test, 'Predictions': y_test_pred})
    .assign(Predictions=lambda frame: frame['Predictions'].round())
)
df.head()

In [None]:
# Actual vs. predicted prices with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# variables positionally, while the keyword form works on older releases too.
sns.regplot(x='Actual', y='Predictions', data=df)

# Evaluating Model performance

In [None]:
from sklearn import metrics

# Error metrics of the OLS test predictions. The MSE is computed once and
# reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_test_pred)
mse = metrics.mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)

# Print each metric as "<name> <value>".
for metric_name, metric_value in (('MAE', mae), ('MSE', mse), ('RMSE', rmse)):
    print(metric_name, metric_value)

# Using LightGBM for Predicting Prices

In [None]:
# Gradient-boosted regressor with 1000 estimators, fitted on the full training
# set; this fitted instance is the one used for predictions in the next cell.
from lightgbm import LGBMRegressor
model=LGBMRegressor(n_estimators=1000)
model.fit(X_train,y_train)
# 10-fold splitter (no shuffle argument is passed).
kfold=KFold(n_splits=10)
print(model)
# Cross-validated scores on the training data. No `scoring` is given, so the
# estimator's own score method is used (presumably R^2 for a regressor — TODO confirm).
res=cross_val_score(model,X_train,y_train,cv=kfold)
# Mean fold score expressed as a percentage.
print(res.mean()*100)

In [None]:
# Training-set predictions of the LightGBM model.
# NOTE(review): `yp` is not referenced in any of the cells visible here.
yp=model.predict(X_train)

import statsmodels.api as sm
# Weighted least squares fit on the training data.
# NOTE(review): no `weights` are passed and no constant column is added, so this
# looks equivalent to the earlier no-intercept OLS model (lm_1) — confirm that
# this is intended inside the LightGBM section.
model1= sm.WLS(y_train,X_train).fit()
model1.params

In [None]:
# Full statsmodels regression summary of the WLS fit.
model1.summary()

In [None]:
# Test-set predictions of the statsmodels WLS model.
# NOTE(review): this evaluates model1 (WLS), not the LightGBM `model` fitted
# above — confirm which model this section is meant to score.
yp1=model1.predict(X_test)

# Import the metric (already imported in earlier cells).
from sklearn.metrics import r2_score
# Evaluate R^2 of the WLS test predictions against the actual test targets.
r2_score(y_true=y_test,y_pred=yp1)

In [None]:
# Side-by-side table of the actual test targets and the WLS predictions, with
# the predictions rounded to whole numbers.
df = (
    pd.DataFrame({'Actual': y_test, 'Predictions': yp1})
    .assign(Predictions=lambda frame: frame['Predictions'].round())
)
df.head()

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

# Print the mean absolute error and then the mean squared error of the WLS
# test predictions, one value per line.
for error_metric in (mean_absolute_error, mean_squared_error):
    print(error_metric(y_test, yp1))