In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Problem Statement
*This is a real-estate price dataset. The estates are located all over the city, and their prices reflect the convenience and facilities available nearby.
This is a regression problem: we have to predict the price of an estate from the conveniences a customer prefers.
Real-estate price prediction is a very common real-life problem that every real-estate agent faces at least once in their career. Done correctly, it can save many man-hours and yield accurate predictions.*

# Hypothesis generation
*There are some factors that should be brainstormed before getting into the code. Below are some factors that matter and carry weight in the required predictions:
        __House age:__ The age of the house is a very important feature: it tells how old the house is and how much useful life it has left.
        __Distance to the nearest MRT station:__ Houses nearest to an MRT station will be costlier because they are more convenient.
        __Convenience stores:__ The nearer the stores, the more convenient life gets; in an emergency, the products are available just a few steps away.
All the above factors make the difference between low- and high-priced houses.*

In [None]:
# Load the real-estate table; `file` is the DataFrame the EDA cells below work on.
file = pd.read_csv("Real estate.csv")

# Preview the first rows as the cell output.
file.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
file.describe()

In [None]:
def basic_info(df):
    """Print a quick structural overview of ``df``.

    Emits, in order: the ``DataFrame.info()`` report, the ``describe()``
    summary statistics, the per-column count of missing values, and the
    frame's ``(rows, columns)`` shape.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # info() writes its report itself and returns None, so it is called on its
    # own rather than wrapped in print() (which would emit a stray "None").
    df.info()
    # Report the shape of the argument, not of the notebook-level `file` frame.
    print('\n', df.describe(), '\n', df.isna().sum(), '\n', df.shape)

In [None]:
# Print the structure, summary statistics and null counts of the loaded data.
basic_info(file)

## Observations

*We can see that there are no null values.*

# EDA

In [None]:
# Grid of pairwise relationships between the numeric columns of `file`.
sns.pairplot(file)

In [None]:
# Distribution of `Y house price of unit area`: 25-bin histogram with a KDE overlay.
sns.displot(
    data=file,
    x="Y house price of unit area",
    bins=25,
    kde=True,
)

In [None]:
# Distribution of `X6 longitude`: 25-bin histogram with a KDE overlay.
sns.displot(
    data=file,
    x="X6 longitude",
    bins=25,
    kde=True,
)

In [None]:
# Distribution of `X5 latitude`: 25-bin histogram with a KDE overlay.
sns.displot(
    data=file,
    x="X5 latitude",
    bins=25,
    kde=True,
)

In [None]:
# Distribution of `X4 number of convenience stores`: 25-bin histogram with a KDE overlay.
sns.displot(
    data=file,
    x="X4 number of convenience stores",
    bins=25,
    kde=True,
)

In [None]:
# Distribution of `X3 distance to the nearest MRT station`: 25-bin histogram with a KDE overlay.
sns.displot(
    data=file,
    x="X3 distance to the nearest MRT station",
    bins=25,
    kde=True,
)

In [None]:
# Distribution of `X2 house age`: 25-bin histogram with a KDE overlay.
sns.displot(
    data=file,
    x="X2 house age",
    bins=25,
    kde=True,
)

In [None]:
# Distribution of `X1 transaction date`: 25-bin histogram with a KDE overlay.
sns.displot(
    data=file,
    x="X1 transaction date",
    bins=25,
    kde=True,
)

In [None]:
# Pairwise correlation matrix of the columns, drawn as an annotated heatmap.
corr_matrix = file.corr()
sns.heatmap(corr_matrix, cmap="inferno", annot=True)

EDA showed us that the following factors have a direct impact on the house price (__Price__, the target):

- __Longitude__
- __Latitude__
- __Distance to the nearest MRT station__
- __House age__

# Regression

In [None]:
# Read the CSV into a separate frame used for modelling.
dataset = pd.read_csv("Real estate.csv")

# Features: every column except the last. Target: the last column.
# NOTE(review): all leading columns are kept as features — confirm the CSV has
# no row-id/index column that should be excluded.
x = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

### Splitting the dataset into the Training set and Test set

In [None]:
# `train_test_split` is already imported in the notebook's first cell, so it is
# not re-imported here.
# Hold out 20% of the rows for testing; `random_state` fixes the shuffle so the
# split is reproducible across runs.
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, test_size=0.2, random_state=1
)

### Training the Multiple Linear Regression model on the Training set

In [None]:
# `LinearRegression` is already imported in the notebook's first cell, so it is
# not re-imported here.
# Fit a linear model on the training split.
regressor = LinearRegression()
regressor.fit(xTrain, yTrain)

In [None]:
# Fitted coefficient for each feature column, in the column order of `x`.
regressor.coef_

### Predicting the Test set results

In [None]:
# Predict the target for the held-out test rows.
yPred = regressor.predict(xTest)

In [None]:
# Print floats with 2 decimals; this NumPy setting is global and stays in
# effect for later cells.
np.set_printoptions(precision=2)

# Two-column view: predictions on the left, true test values on the right.
comparison = np.column_stack((yPred, yTest))
print(comparison)

### Evaluating the Model Performance

In [None]:
# `r2_score` is already imported in the notebook's first cell, so it is not
# re-imported here.
# Coefficient of determination (R²) of the predictions on the test set.
r2_score(yTest, yPred)