In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root_dir, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the real-estate CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='../input/real-estate-price-prediction/Real estate.csv')

In [None]:
# Preview the first five rows of the freshly loaded data
df.head(n=5)

In [None]:
# Dataset dimensions as a (rows, columns) tuple
len(df.index), len(df.columns)

In [None]:
# Descriptive statistics of the dataset's columns
summary_stats = df.describe()
summary_stats

In [None]:
# Column dtypes and non-null counts (reveals missing values per feature)
df.info(verbose=None)

# Exploratory Data Analysis

In [None]:
# Drop the 'No' column because it is not needed for the analysis
df.drop(columns=['No'], inplace=True)

In [None]:
# Preview the first five rows after removing the 'No' column
df.head(n=5)

In [None]:
# Give the columns short snake_case names (drops the X1..X6 / Y prefixes)
renamed_columns = {
    "X1 transaction date": "transaction_date",
    "X2 house age": "house_age",
    "X3 distance to the nearest MRT station": "distance_to_nearest_station",
    "X4 number of convenience stores": "number_of_convenience_stores",
    "X5 latitude": "latitude",
    "X6 longitude": "longitude",
    "Y house price of unit area": "house_price",
}
df.rename(columns=renamed_columns, inplace=True)

In [None]:
# Preview the first five rows with the new column names
df.head(n=5)

In [None]:
# Take only the year from the transaction date: render the value as text and
# split it once at the decimal point, so the part before the point lands in column 0
date_as_text = df['transaction_date'].astype(str)
new = date_as_text.str.split(pat=".", n=1, expand=True)

In [None]:
# Column 0 of the split holds the text before the decimal point, i.e. the year
df['transaction_year'] = new.loc[:, 0]

In [None]:
# Relation between transaction year and house price: bar chart of the median price per year
data = df.copy()
median_price_by_year = data.groupby('transaction_year')['house_price'].median()
median_price_by_year.plot(kind='bar')
plt.title('Transaction Year vs House Price')
plt.xlabel('Transaction Year')
plt.ylabel('House Price')
plt.show()

In [None]:
# The year now lives in transaction_year, so the original date column is dropped
df.drop(columns=['transaction_date'], inplace=True)

In [None]:
# Names of all int64/float64 columns, collected as a plain Python list
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
numeric_feature = numeric_columns.tolist()
numeric_feature

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot the distribution (histogram with a KDE overlay) of every numeric feature
for feature in numeric_feature:
    # `palette` only has an effect together with `hue`; without `hue` seaborn
    # ignores it (and newer releases emit a warning), so it is not passed here.
    sns.histplot(data=df, x=feature, kde=True)
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('frequency')
    plt.show()
    

**Most of the features do not follow a normal distribution.**

In [None]:
# Check for outliers: box plot of each numeric feature on a log scale
for feature in numeric_feature:
    # np.log is undefined for values <= 0 (it yields -inf or NaN), so features
    # containing any non-positive value are skipped. An identity test such as
    # `... .unique() is 0` can never be True for an array and must not be used here.
    if (df[feature] <= 0).any():
        continue
    # Work on a copy so the log transform used for plotting does not alter df
    data = df.copy()
    data[feature] = np.log(data[feature])
    data.boxplot(feature)
    plt.title(feature)
    plt.ylabel('count')
    plt.show()

In [None]:
# Pairwise scatter plots of all features, coloured by transaction year
sns.pairplot(data=df, hue='transaction_year')

**There is a slight linear relationship between house price and the other features.**

# Feature Engineering

In [None]:
# Log-transform every strictly positive numeric feature to reduce skew, then
# re-plot its distribution.
# NOTE: the columns of `df` are overwritten in place, so running this cell a
# second time applies the log again; re-run from the data-loading cell instead.

for feature in numeric_feature:
    # np.log yields -inf for 0 and NaN for negative values, so any feature
    # containing a non-positive value is left untouched.
    if (df[feature] <= 0).any():
        continue
    df[feature] = np.log(df[feature])
    # `palette` is ignored without `hue`, so it is not passed
    sns.histplot(data=df, x=feature, kde=True)
    plt.title(feature)
    plt.ylabel('count')
    plt.show()

In [None]:
# Preview the first five rows after the log transform
df.head(n=5)

In [None]:
# Box plots of the features that will be checked for (and cleaned of) outliers
features = [
    'house_age',
    'distance_to_nearest_station',
    'latitude',
    'house_price',
]
for feature in features:
    df.boxplot(column=feature)
    plt.title(feature)
    plt.ylabel('count')
    plt.show()

In [None]:
# Outlier handling: helper that returns the lower and upper IQR fences of a column
def outlier_treatment(datacolumn, factor=1.5):
    """Return the ``(lower_range, upper_range)`` IQR fences for ``datacolumn``.

    Parameters
    ----------
    datacolumn : array-like of numbers
        Values to analyse; passed straight to ``np.percentile``, which does not
        require its input to be sorted.
    factor : float, default 1.5
        Multiple of the inter-quartile range subtracted from Q1 / added to Q3.

    Returns
    -------
    tuple
        ``(Q1 - factor * IQR, Q3 + factor * IQR)``.
    """
    Q1, Q3 = np.percentile(datacolumn, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (factor * IQR)
    upper_range = Q3 + (factor * IQR)
    return lower_range, upper_range

In [None]:
# Report, for each selected feature, the percentage of rows outside its outlier bounds
for feature in features:
    data = df.copy()
    lowerbound, upperbound = outlier_treatment(data[feature])
    is_outlier = (data[feature] < lowerbound) | (data[feature] > upperbound)
    outlier_pct = np.round(data[is_outlier].shape[0] / data.shape[0] * 100, 4)
    print(f"Feature is {feature} and {outlier_pct}% outliers")
    print("------------------------------")

In [None]:
# Drop, feature by feature, every row whose value lies outside that feature's bounds
for feature in features:
    lowerbound, upperbound = outlier_treatment(df[feature])
    out_of_range = (df[feature] > upperbound) | (df[feature] < lowerbound)
    df.drop(df[out_of_range].index, inplace=True)
    # Recomputed after the drop with the same bounds, so this serves as a check
    # that no out-of-range rows are left for the feature.
    still_outside = (df[feature] < lowerbound) | (df[feature] > upperbound)
    remaining_pct = np.round(df[still_outside].shape[0] / df.shape[0] * 100, 4)
    print(f"Feature is {feature} and {remaining_pct}% outliers")
    print("---------------------------------")

In [None]:
# One-hot encode transaction_year into indicator columns prefixed with "year_"
year_values = df['transaction_year']
static = pd.get_dummies(year_values, prefix='year', prefix_sep='_')
static

In [None]:
# Append the year indicator columns to the main dataframe (column-wise concat)
df = pd.concat(objs=[df, static], axis='columns')

In [None]:
# The indicator columns replace transaction_year, so drop it and preview the result
df.drop(columns=['transaction_year'], inplace=True)
df.head(n=5)

In [None]:
# Frequency of each distinct number_of_convenience_stores value
df.loc[:, 'number_of_convenience_stores'].value_counts()

# Data Preparation

In [None]:
# Separate the predictors (X) from the target (y = house_price)
target_column = 'house_price'
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# Preview the first five rows of the predictors
X.head(n=5)

In [None]:
# Preview the first five values of the target
y.head(n=5)

In [None]:
# import minmaxscaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Create the min-max scaler (it is only instantiated here; fitting happens when it is applied to X)
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Scale the predictors with the min-max scaler and wrap the result back into a
# DataFrame. The original row index of X is kept: outlier removal left gaps in
# it, and y still carries those labels, so a fresh 0..n-1 index here would stop
# x and y from lining up in any index-aligned operation (e.g. pd.concat).
col = X.columns
x = pd.DataFrame(scaler.fit_transform(X), columns=col, index=X.index)
x.head()

In [None]:
# Combine the scaled predictors and the target into one frame.
# The target is attached by position (as a plain array), not by index label:
# both objects hold the same rows in the same order, but their index labels can
# differ, and an index-aligned concat would then produce NaN-filled rows.
X_df = x.copy()
y_df = y.copy()
df_1 = X_df.assign(house_price=y_df.to_numpy())
df_1.head()

# Feature Selection

In [None]:
# import train_test_split
from sklearn.model_selection import train_test_split

In [None]:
# Train/test split (70% / 30%). A fixed random_state makes the split, and
# therefore every downstream coefficient and metric, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Show the shape of each piece of the train/test split
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"shape of {split_name}:", split_part.shape)

In [None]:
# Importing statsmodels module as sm
import statsmodels.api as sm

# statsmodels OLS does not add an intercept by itself, so prepend a constant column
X_train = sm.add_constant(X_train)

# First model: target reshaped to an (n, 1) column vector, regressed on all training features
model = sm.OLS(endog=y_train.values.reshape(-1, 1), exog=X_train)
lm_1 = model.fit()

In [None]:
# Regression summary of the first linear model
summary_1 = lm_1.summary()
print(summary_1)

In [None]:
# import variance_inflation_factor module
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Helper that tabulates the variance inflation factor of every column of a feature matrix
def vif_score(X):
    """Return a DataFrame listing each column of ``X`` next to its VIF.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its ``.values`` array is handed to
        ``variance_inflation_factor`` once per column position.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'Variables'`` (the column labels of ``X``) and ``'VIF'``.
    """
    n_columns = len(X.columns)
    scores = [variance_inflation_factor(X.values, position) for position in range(n_columns)]
    return pd.DataFrame({'Variables': X.columns, 'VIF': scores})




# VIF of every input feature currently in x
vif_table = vif_score(x)
print(vif_table)

**The year_2012 and year_2013 columns are highly correlated with each other, as shown by their high VIF scores.**

In [None]:
# Drop the year_2012 and year_2013 columns from x (in place) and recompute the VIF scores
year_columns = ['year_2012', 'year_2013']
x.drop(columns=year_columns, inplace=True)
print(vif_score(x))

In [None]:
# Train/test split (70% / 30%) on the reduced feature set. A fixed random_state
# makes the split reproducible, so the fitted model and its metrics do not
# change from run to run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Importing statsmodels module as sm
import statsmodels.api as sm

# statsmodels OLS does not add an intercept by itself, so prepend a constant column
X_train = sm.add_constant(X_train)

# Second model: target reshaped to an (n, 1) column vector, regressed on the current training features
model = sm.OLS(endog=y_train.values.reshape(-1, 1), exog=X_train)
lm_2 = model.fit()

In [None]:
# Regression summary of the second linear model
summary_2 = lm_2.summary()
print(summary_2)

In [None]:
# Drop longitude from x (in place) to see whether the fit improves, then recompute the VIF scores
x.drop(columns=['longitude'], inplace=True)
vif_without_longitude = vif_score(x)
print(vif_without_longitude)

In [None]:
# Train/test split (70% / 30%) on the final feature set. A fixed random_state
# makes the split reproducible, so the evaluation below reports the same
# numbers on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Importing statsmodels module as sm
import statsmodels.api as sm

# statsmodels OLS does not add an intercept by itself, so prepend a constant column
X_train = sm.add_constant(X_train)

# Third model: target reshaped to an (n, 1) column vector, regressed on the current training features
model = sm.OLS(endog=y_train.values.reshape(-1, 1), exog=X_train)
lm_3 = model.fit()

In [None]:
# Regression summary of the third linear model
summary_3 = lm_3.summary()
print(summary_3)

In [None]:
# Add the intercept column to the test features so their layout matches the
# design matrix the model was fitted on.
X_test = sm.add_constant(X_test)

# No model is fitted in this cell on purpose: lm_3 must stay the model trained
# on the training split. Fitting OLS on (X_test, y_test) and then predicting
# X_test would score the model on the very rows it was trained on, which makes
# every metric and residual diagnostic that follows meaningless.

In [None]:
# Print the regression summary of the model currently bound to lm_3
current_summary = lm_3.summary()
print(current_summary)

In [None]:
# Predict the target for the test features with lm_3
y_pred = lm_3.predict(exog=X_test)

# Model Evaluation

In [None]:
# Actual vs predicted house price, plotted against the position in the test set.
# The x positions are derived from the size of y_test rather than a hard-coded
# count, so the plot keeps working when the number of test rows changes.
c = list(range(1, len(y_test) + 1))
fig = plt.figure()
# Actual values
plt.plot(c, y_test, label='actual')
# Predicted values
plt.plot(c, y_pred, color='red', label='predicted')
# Plot heading
plt.title('Actual vs Predicted', fontsize=20)
# X-label
plt.xlabel('Index', fontsize=15)
# Y-label
plt.ylabel('House price', fontsize=15)
# Legend so the two lines can be told apart
plt.legend()
# Show the plot
plt.show()

In [None]:
# Scatter of the actual values (x-axis) against the predictions (y-axis) to inspect the spread
fig = plt.figure()
plt.scatter(x=y_test, y=y_pred)
# Heading and axis labels
plt.title('y_test vs y_pred', fontsize=20)
plt.xlabel('y_test', fontsize=15)
plt.ylabel('y_pred', fontsize=15)
plt.show()

In [None]:
# Line chart of the error terms (actual minus predicted) over the test-set position
fig = plt.figure()
# x positions follow the actual number of test rows instead of a fixed count
c = list(range(1, len(y_test) + 1))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments
sns.lineplot(x=c, y=y_test.values - y_pred.values, color='blue')
# Plot heading
plt.title('error term', fontsize=20)
# X-label
plt.xlabel('index', fontsize=15)
# Y-label
plt.ylabel('y_test-y_pred', fontsize=15)
# Show the plot
plt.show()

In [None]:
# Distribution of the error terms (actual minus predicted)
fig = plt.figure()
# histplot with a KDE overlay on a density scale replaces the deprecated
# sns.distplot, which newer seaborn releases no longer provide
sns.histplot(y_test.values - y_pred.values, bins=20, kde=True, stat='density')
# Plot heading
plt.title('error term', fontsize=20)
# X-label
plt.xlabel('y_test-y_pred', fontsize=15)

# Y-label: the vertical axis shows density, not an index
plt.ylabel('density', fontsize=15)

# Show the plot
plt.show()

## Regression metrics

In [None]:
#import metrics module
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Mean squared error between actual and predicted values; its square root is the RMSE
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared error(RMSE) is: ", rmse)

In [None]:
# Report the mean squared error held in `mse`
mse_label = "Mean squared Error(MSE) is: "
print(mse_label, mse)

In [None]:
# Mean absolute error between actual and predicted values
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute error(MAE) is: ", mae)

# Verify Assumptions

### Multicollinearity

In [None]:
# VIF of the features that remain in x
final_vif = vif_score(x)
print(final_vif)

### Normality of residuals

In [None]:
# Residuals = actual minus predicted values on the test set
residual = y_test.values - y_pred.values
# histplot with a KDE overlay on a density scale replaces the deprecated
# sns.distplot, which newer seaborn releases no longer provide
sns.histplot(residual, kde=True, stat='density')

In [None]:
# Average of the residuals
residual.mean()

### Homoscedasticity

In [None]:
# Residuals plotted against the predicted values
fig, ax = plt.subplots(figsize=(6, 2.5))
_ = ax.scatter(x=y_pred, y=residual)

### No autocorrelation of residuals

In [None]:
import statsmodels.tsa.api as smt
# Autocorrelation plot of the residuals with a 95% confidence band (alpha=0.05).
# The number of lags is capped below the number of residuals so the call also
# works on small test sets.
acf = smt.graphics.plot_acf(residual, lags=min(40, len(residual) - 1), alpha=0.05)
# plt.show() is used instead of acf.show(): Figure.show() emits a warning on
# the non-GUI inline backend that notebooks use.
plt.show()