# Multiple Linear Regression with Dummies 

You are given a real estate dataset. 

Real estate is one of those examples that every regression course goes through, as it is extremely easy to understand and there is (almost always) a clear causal relationship to be found.

The data is located in the file: 'real_estate_price_size_year_view.csv'. 

We are expected to create a multiple linear regression, using the new data. 

In this exercise, the dependent variable is 'price', while the independent variables are 'size', 'year', and 'view'.


## Import the relevant libraries

In [None]:
# Importing all required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [None]:
estate = pd.DataFrame(pd.read_csv("../input/real-estate-price/real_estate_price_size_year_view.csv"))

In [None]:
estate.head()

In [None]:
estate.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sns.pairplot(estate)
plt.show()

In [None]:
plt.figure(figsize=(20, 12))
sns.regplot(x = 'size', y = 'price', data = estate)
plt.show()

In [None]:

# Box plots of price grouped by 'year' and by 'view', drawn in slots 2 and 3
# of a 2x3 subplot grid.
plt.figure(figsize=(20, 12))
for slot, feature in ((2, 'year'), (3, 'view')):
    plt.subplot(2, 3, slot)
    sns.boxplot(x=feature, y='price', data=estate)
plt.show()

## Create a dummy variable for 'view'

In [None]:
estate['view']=estate['view'].map({'No sea view':1,'Sea view':0})

In [None]:
estate.head()

In [None]:
estate['year'].value_counts()

In [None]:
year = pd.get_dummies(estate['year'])

In [None]:
year.head()

In [None]:
year = pd.get_dummies(estate['year'], drop_first = True)

In [None]:
estate = pd.concat([estate, year], axis = 1)

In [None]:
estate.head()

In [None]:
estate.drop(['year'], axis = 1, inplace = True)

In [None]:
estate.head()

## Create the regression

### Declare the dependent and the independent variables

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG and pin random_state so the 70/30 split selects the
# same rows on every run.
np.random.seed(0)
split_options = {'train_size': 0.7, 'test_size': 0.3, 'random_state': 100}
df_train, df_test = train_test_split(estate, **split_options)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
num_vars = ['size','price']

df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

In [None]:
df_train.head()

In [None]:
plt.figure(figsize=[10,10])
sns.heatmap(df_train.corr(), annot = True, cmap="YlGnBu")
plt.show()

In [None]:
plt.figure(figsize=[10,10])
sns.regplot(x = 'size', y = 'price', data = df_train)
plt.show()

In [None]:
# Separate target and predictors. pop() removes 'price' in place, so X_train
# and df_train are the same object and now hold only the predictors.
X_train = df_train
y_train = X_train.pop('price')

### Regression

In [None]:
estate.columns

In [None]:
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train)

lr= sm.OLS(y_train, X_train_lm).fit()

lr.params

In [None]:
print(lr.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor for each predictor of the full model, rounded to
# two decimals and listed from highest to lowest.
design = X_train.values
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(design, col) for col in range(design.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove the 2009 year dummy before refitting. The `columns=` keyword is used
# because pandas >= 2.0 no longer accepts `axis` as a positional argument.
X = X_train.drop(columns=[2009])

In [None]:
X_train_lm = sm.add_constant(X)

lr_2 = sm.OLS(y_train, X_train_lm).fit()

In [None]:
print(lr_2.summary())

In [None]:
# Recompute the variance inflation factors on the reduced predictor set X
# (the one lr_2 was fitted on); using X_train here would just repeat the
# table from the full model.
vif = pd.DataFrame()
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
y_train_price = lr_2.predict(X_train_lm)

In [None]:
fig = plt.figure()
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                  
plt.xlabel('Errors', fontsize = 18)
plt.show()

In [None]:
num_vars = ['size','price']

df_test[num_vars] = scaler.transform(df_test[num_vars])

In [None]:
df_test.head()

In [None]:
df_test.describe()

In [None]:
y_test = df_test.pop('price')
X_test = df_test

In [None]:
X_test_m2 = sm.add_constant(X_test)

In [None]:
y_pred_m2 = lr.predict(X_test_m2)

In [None]:
# Predicted vs. actual (scaled) prices on the test split. x and y are passed
# by keyword because seaborn >= 0.12 no longer accepts them positionally.
fig = plt.figure()
sns.regplot(x = y_test, y = y_pred_m2)
fig.suptitle('y_test vs y_pred', fontsize = 20)              # Plot heading
plt.xlabel('y_test', fontsize = 18)                          # X-label
plt.ylabel('y_pred', fontsize = 16)                          # Y-label
plt.show()

We can see that the equation of our fitted regression model (with price and size min-max scaled) is:

price = 0.6869 * size + 0.1089 * '2009' - 0.17213 * view  + 0.0698 * '2015' + 0.1009 * '2018' + 0.1685 


In [None]:
from sklearn import metrics

# Error metrics on the test split; both series are on the min-max scaled price.
mae = metrics.mean_absolute_error(y_test, y_pred_m2)
mse = metrics.mean_squared_error(y_test, y_pred_m2)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Side-by-side table of actual and predicted test values.
actual_values = y_test.values.flatten()
predicted_values = y_pred_m2.values.flatten()
df = pd.DataFrame({'Actual': actual_values, 'Predicted': predicted_values})
df

In [None]:
# Bar chart comparing actual and predicted values for the first 30 test rows.
df1 = df.head(30)
df1.plot(kind='bar',figsize=(16,10))
# linewidth is given as a number (it was previously the string '0.5').
plt.grid(which='major', linestyle='-', linewidth=0.5, color='green')
# NOTE(review): minor grid lines presumably only appear if minor ticks are enabled -- confirm.
plt.grid(which='minor', linestyle=':', linewidth=0.5, color='black')
plt.show()