# Real Estate Price Prediction using Linear Regression 

You are given a real estate dataset.

Real estate is one of those examples that every regression course goes through, as it is extremely easy to understand and there is (almost always) a clear causal relationship to be found.

The data is located in the file: 'real_estate_price_size_year_view.csv'.

We are expected to create a multiple linear regression model using this data.

In this exercise, the dependent variable is 'price', while the independent variables are 'size', 'year', and 'view'.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file available under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration and Cleaning 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Silence every warning category for the rest of the session. NOTE(review): this also
# hides any pandas / scikit-learn warnings that later cells may raise.
warnings.filterwarnings('ignore')

In [None]:
# Load the real-estate CSV from the Kaggle input directory into a DataFrame.
real_estate = pd.read_csv(
    "../input/real-estate-price/real_estate_price_size_year_view.csv"
)

In [None]:
# Preview the first five rows (head() defaults to n=5).
real_estate.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
real_estate.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
real_estate.info()

In [None]:
# (number of rows, number of columns) of the loaded data.
real_estate.shape

## EDA

In [None]:
# Pairwise scatter plots / histograms of the numeric columns.
# pairplot is a figure-level seaborn function that creates its own figure, so a
# preceding plt.figure(...) call would only emit an extra empty figure and its
# figsize would not affect the grid.
sns.pairplot(real_estate)
plt.show()

In [None]:
# Side-by-side box plots of price grouped by construction year and by view.
fig, (ax_year, ax_view) = plt.subplots(1, 2, figsize=(20, 15))
sns.boxplot(x='year', y='price', data=real_estate, ax=ax_year)
sns.boxplot(x='view', y='price', data=real_estate, ax=ax_view)
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# At this point 'view' still holds strings, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so restrict the frame to numeric dtypes first.
plt.figure(figsize=(20,12))
sns.heatmap(real_estate.select_dtypes(include='number').corr())
plt.show()

## Creating Dummy Variables

In [None]:
# Encode the sea-view flag as 0/1 so it can be used as a numeric feature.
view_codes = {'No sea view': 0, 'Sea view': 1}
real_estate['view'] = real_estate['view'].map(view_codes)

In [None]:
# Treat the construction year as a categorical label rather than a continuous number.
real_estate['year']=real_estate['year'].astype("category")

In [None]:
# One-hot encode the year, dropping the first level as the baseline category.
# The prefix gives the dummy columns string labels ('year_<value>'); without it the
# labels are the raw year values, and a frame mixing string and non-string column
# names is rejected by scikit-learn >= 1.2 when the model is fitted.
year = pd.get_dummies(real_estate['year'], prefix='year', drop_first=True)

In [None]:
# Append the year dummy columns to the right of the existing columns.
real_estate = pd.concat((real_estate, year), axis='columns')

In [None]:
# The original 'year' column is now redundant with its dummy columns; remove it.
real_estate = real_estate.drop(columns=['year'])

In [None]:
# Check the encoded frame (head() defaults to five rows).
real_estate.head()

## Splitting the data set into train and test sets and scaling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# Later cells assign scaled values into both frames, so take explicit copies to keep
# those writes off pandas' chained-assignment (SettingWithCopy) path.
df_train, df_test = (
    part.copy()
    for part in train_test_split(real_estate, test_size=0.2, random_state=42)
)

In [None]:
# First five rows of the training split.
df_train.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler with default settings; it is fitted on the training split only.
scaler=MinMaxScaler()

In [None]:
# Continuous columns to rescale; the remaining columns are binary indicators.
num_vars=['price','size']

In [None]:
# Continuous columns before scaling (first five rows).
df_train[num_vars].head()

In [None]:
# Learn the min/max of each continuous column from the training split and rescale it.
scaled_train = scaler.fit_transform(df_train[num_vars])
df_train[num_vars] = scaled_train

In [None]:
# Apply the training-set scaling to the test split (no refitting, to avoid leakage).
scaled_test = scaler.transform(df_test[num_vars])
df_test[num_vars] = scaled_test

In [None]:
# Training split after scaling (first five rows).
df_train.head()

In [None]:
# Separate the target from the features without mutating df_train, so this cell can
# be re-run safely (pop() removes the column and raises KeyError on a second run).
y_train = df_train['price']
X_train = df_train.drop(columns=['price'])

# Creating the Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training features.
# fit() returns the estimator itself, so `model` and `lm` name the same fitted object.
lm = LinearRegression()
lm.fit(X_train, y_train)
model = lm

In [None]:
# In-sample predictions, used to measure the fit on the training data.
y_train_pred=model.predict(X_train)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) on the training split.
r2_score(y_true=y_train, y_pred=y_train_pred)

## Checking model performance on the test data set

In [None]:
# Separate the target from the features without mutating df_test, so this cell can
# be re-run safely (pop() removes the column and raises KeyError on a second run).
y_test = df_test['price']
X_test = df_test.drop(columns=['price'])

In [None]:
# Out-of-sample predictions on the held-out test features.
y_test_pred=model.predict(X_test)

In [None]:
# Coefficient of determination (R^2) on the held-out test split.
r2_score(y_true=y_test, y_pred=y_test_pred)