In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so every later matplotlib/seaborn figure shares one style.
sns.set()

# Get the Data

In [None]:
# Load the real-estate price CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = '/kaggle/input/real-estate-price-prediction/Real estate.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check the column names and value formats.
df.head()

In [None]:
# Print column dtypes and non-null counts to spot missing values or unexpected types.
df.info()

# Data Cleaning

In [None]:
# The 'No' and 'X1 transaction date' columns are not used as model inputs, so drop them.
# `columns=` already selects the axis, so the redundant `axis=1` is omitted, and the
# result is reassigned instead of mutating the loaded frame in place.
df = df.drop(columns=['No', 'X1 transaction date'])

# EDA

In [None]:
# Histogram of the target variable (house price of unit area).
df['Y house price of unit area'].hist(bins=200)
# Author's observation: the target does not look normally distributed.
# NOTE(review): the normality assumption of linear regression applies to the
# residuals, not to the raw target, so this plot alone does not demonstrate a
# violation; the residual plot at the end of the notebook is the relevant check.

In [None]:
# Pairwise correlations between all remaining columns, drawn as an annotated heatmap.
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm')
# Author's reading of the plot: X3 and X6 are strongly negatively correlated (-0.81),
# which conflicts with the no-multicollinearity assumption of linear regression.
# To deal with that, one of the two columns has to be dropped. X3 is also the one
# that is more correlated with the other columns, so X3 is dropped.

In [None]:
# Remove X3 to reduce the multicollinearity with X6 seen in the correlation heatmap.
# Reassign rather than mutate in place so the change to `df` is explicit.
df = df.drop(columns=['X3 distance to the nearest MRT station'])

In [None]:
# Pairwise scatter plots for every pair of columns, with per-column distributions
# on the diagonal.
sns.pairplot(df)
# Author's observation: the relationship between the output and the input
# parameters does not look linear, which would violate the linearity assumption
# of linear regression. PolynomialFeatures is used below to address this.

In [None]:
# Separate the output column (y) from the input columns (X).
target_column = 'Y house price of unit area'
y = df[target_column]
X = df.drop(columns=[target_column])

# Preprocessing

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Polynomial feature expander: generates all polynomial and interaction terms of
# the inputs up to degree 3.
poly = PolynomialFeatures(degree=3)
# Try different degree values to see which gives a better r2_score.
# NOTE(review): choosing the degree by its test-set score makes that score
# optimistic; consider a validation split or cross-validation for the comparison.

In [None]:
# Build the polynomial-expanded feature matrix from the raw inputs X.
X_poly = poly.fit_transform(X)

# Training and Testing Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.3,
    random_state=101,
)

# Training the Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit the regression on the polynomial-expanded training features.
model.fit(x_train,y_train)

In [None]:
# Predict the target for the held-out test rows.
pred_y = model.predict(x_test)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Predicted (x-axis) versus actual (y-axis) prices on the test set; points close
# to the diagonal are good predictions.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises a TypeError there.
sns.scatterplot(x=pred_y, y=y_test)

# Evaluating the Model

In [None]:
# Coefficient of determination (R^2) of the predictions on the test set.
r2_score(y_test,pred_y)

In [None]:
from sklearn import metrics

# Error metrics on the test set. The MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, pred_y)
mse = metrics.mean_squared_error(y_test, pred_y)
rmse = np.sqrt(mse)

print('MAE', mae)    # Mean absolute error (MAE)
print('MSE', mse)    # Mean squared error (MSE)
print('RMSE', rmse)  # Root mean squared error (RMSE)

# **Residual Plot**

In [None]:
# Distribution of the residuals (actual minus predicted) on the test set.
# `sns.distplot` is deprecated; a density-scaled `histplot` with a KDE overlay is
# the documented replacement for its default output (requires seaborn >= 0.11).
residuals = y_test - pred_y
sns.histplot(residuals, bins=200, kde=True, stat='density', kde_kws=dict(cut=3))

I am a beginner in machine learning, so if you find any mistakes in this notebook, please let me know.