# US Housing Data - Multiple Linear Regression

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session (presumably to keep the
# cell output free of library deprecation notices).
warnings.filterwarnings('ignore')

# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

In [None]:
# Read the housing CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(
    '../input/usa-housingcsv/USA_Housing.csv',
)
df.head(n=5)

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()
# No missing values found

In [None]:
# (rows, columns) of the dataset.
df.shape

In [None]:
# Need to look for outliers

In [None]:
sns.pairplot(df)

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(),annot=True)

In [None]:
# Price is correlated with all the independent variables as per the pair plot

In [None]:
# Box plots of each feature on a 2x3 grid, used to look for outliers.
boxplot_columns = [
    'Avg. Area Income',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Avg. Area House Age',
    'Area Population',
]

plt.figure(figsize=(16, 8))
for position, column in enumerate(boxplot_columns, start=1):
    plt.subplot(2, 3, position)
    # Pass the series as `x=` explicitly: a positional series is deprecated
    # in seaborn 0.11 and is interpreted as `data` (changing the plot
    # orientation) from 0.12 onwards.
    sns.boxplot(x=df[column])

In [None]:
#There are outliers in a number of variables, need outlier treatment

In [None]:
# Histogram of the target variable using 20 bins.
sns.displot(data=df, x='Price', bins=20)

In [None]:
# Remove the 'Address' column in place; it is not used as a model feature.
df.drop(columns=['Address'], inplace=True)

In [None]:
X_train,X_test=train_test_split(df,train_size=.7,test_size=.3,random_state=101)

In [None]:
cols=list(X_train.columns)
cols

In [None]:
# Min-max scale the training data.
# `cols` still includes 'Price', so the target is scaled together with the
# features. The scaler is fitted on the training split only and reused for
# the test split later.
scaler = MinMaxScaler()
scaler.fit(X_train[cols])
X_train[cols] = scaler.transform(X_train[cols])

In [None]:
# Inspect the scaled training frame.
X_train

In [None]:
X_train

In [None]:
# Separate the target: pop() removes 'Price' from X_train in place and
# returns it as a Series.
y_train=X_train.pop('Price')

In [None]:
# Inspect the (scaled) training target.
y_train

In [None]:
# Using scikit learn regression model

# Ordinary least squares regressor with scikit-learn's default settings.
lm=LinearRegression()

In [None]:
# Fit on the scaled training features and scaled target.
lm.fit(X_train,y_train)

In [None]:
# Fitted intercept term.
lm.intercept_

In [None]:
# Fitted coefficients, one per column of X_train (same column order).
lm.coef_

In [None]:
# Using stasmodel regression

X_train_sm=sm.add_constant(X_train)
X_train_sm

In [None]:
sm_lr=sm.OLS(y_train,X_train_sm).fit()

sm_lr.params

print(sm_lr.summary())

In [None]:
# The same intercept and coefficients are obtained with both regression models

In [None]:
# Create a dataframe that will contain the names of all the feature variables
# and their respective VIFs.
#
# variance_inflation_factor regresses each column on the remaining columns
# without adding an intercept of its own, so the design matrix passed to it
# must already contain a constant column. Without it the VIFs are inflated
# for features that are not centred on zero (here they are min-max scaled).
# The constant is only part of the design matrix; it is not reported.
vif_design = sm.add_constant(X_train)
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [
        variance_inflation_factor(
            vif_design.values, vif_design.columns.get_loc(feature)
        )
        for feature in X_train.columns
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
#Lets drop the varaible with the highest VIF and P-value

X_train=X_train.drop('Avg. Area Number of Bedrooms',axis=1)


In [None]:
X_train

In [None]:
# Refit the statsmodels OLS on the reduced feature set; the constant column
# is rebuilt because X_train changed.
X_train_sm = sm.add_constant(X_train)
sm_lr = sm.OLS(endog=y_train, exog=X_train_sm).fit()

sm_lr.params

# Regression report for the reduced model.
print(sm_lr.summary())

In [None]:
# P-values indicate that the variables are quite significant

In [None]:
# Create a dataframe that will contain the names of all the feature variables
# and their respective VIFs.
#
# variance_inflation_factor regresses each column on the remaining columns
# without adding an intercept of its own, so the design matrix passed to it
# must already contain a constant column. Without it the VIFs are inflated
# for features that are not centred on zero (here they are min-max scaled).
# The constant is only part of the design matrix; it is not reported.
vif_design = sm.add_constant(X_train)
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [
        variance_inflation_factor(
            vif_design.values, vif_design.columns.get_loc(feature)
        )
        for feature in X_train.columns
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
cols

In [None]:
X_test[cols]=scaler.transform(X_test[cols])
X_test

In [None]:
y_test=X_test.pop('Price')

In [None]:
X_test=X_test.drop('Avg. Area Number of Bedrooms',axis=1)

In [None]:
X_test

In [None]:
X_train

In [None]:
X_test_lm=sm.add_constant(X_test)

In [None]:
y_pred=sm_lr.predict(X_test_lm)

In [None]:
# Residuals, computed as (predicted - actual); they are used by the
# distribution plot in the following cell.
residuals = y_pred - y_test
# Actual vs. predicted prices. x and y are passed as keywords because
# seaborn >= 0.12 no longer accepts them positionally (raises TypeError).
sns.scatterplot(x=y_test, y=y_pred, s=100)
# Error terms appear to have a constant variance

In [None]:
# Distribution of the residuals (histogram with a KDE overlay, on a density
# scale). sns.distplot is deprecated and scheduled for removal; histplot is
# its axes-level replacement.
sns.histplot(residuals, kde=True, stat='density')

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
#Returns the mean squared error; we'll take a square root
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
r_squared = r2_score(y_test,y_pred)
r_squared