<p style = "color : #f54748; font-size : 40px; font-family : 'Comic Sans MS'; text-align : center;"><strong>Linear Regression and Regularization</strong></p>

In [None]:
# necessary imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use matplotlib's 'ggplot' style sheet for all figures that follow.
plt.style.use('ggplot')

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# importing it raises ImportError. Fall back to the original data source and
# rebuild an object exposing the same `.data`, `.target` and `.feature_names`
# attributes that the rest of the notebook reads.
try:
    from sklearn.datasets import load_boston

    data = load_boston() # reading data
except ImportError:
    from sklearn.utils import Bunch

    # Loading recipe taken from the scikit-learn deprecation notice: each record
    # is spread over two physical lines of the whitespace-separated file.
    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)

    data = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )

In [None]:
# display the loaded dataset object (its `data`, `feature_names` and `target` entries are used below)
data

<p style = "font-size : 30px; color : #4e8d7c ; font-family : 'Comic Sans MS';  "><strong>Data Description :-</strong></p>

<ul>
    <li style = "color : #03506f; font-size : 18px; font-family : 'Comic Sans MS';"><strong>Data :- Independent Variables also known as the x values.</strong></li>
    <li style = "color : #03506f; font-size : 18px; font-family : 'Comic Sans MS';"><strong>feature_names :- The column names of the data.</strong></li>
    <li style = "color : #03506f; font-size : 18px; font-family : 'Comic Sans MS';"><strong>target :- The target variable, i.e. the price of the houses (dependent variable), also known as the y value.</strong></li>
</ul>

In [None]:
# build a DataFrame from the feature matrix, labelling the columns with the dataset's feature names

df = pd.DataFrame(data=data.data, columns=data.feature_names)

In [None]:
# adding the target values to the frame as a new 'MEDV' column
# (mutates `df` in place, so every later cell sees features and target together)

df['MEDV'] = data.target

In [None]:
# preview the first rows of the assembled frame
df.head()

In [None]:
# summary statistics for every column
df.describe()

In [None]:
# column dtypes and non-null counts
df.info()

<p style = "font-size : 20px; color : #34656d ; font-family : 'Comic Sans MS'; "><strong>It seems there are no missing values in data.</strong></p> 

In [None]:
# count the missing entries in each column

df.isnull().sum()

<p style = "font-size : 20px; color : #34656d ; font-family : 'Comic Sans MS'; "><strong>There are no missing values in the data.</strong></p> 

<p style = "color : #f54748; font-size : 35px; font-family : 'Comic Sans MS';"><strong>EDA</strong></p>

In [None]:
# Let's see how data is distributed for every column
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# replacement and draws the same density-normalised histogram with a KDE curve.

fig, axes = plt.subplots(3, 5, figsize = (20, 15))
axes = axes.ravel()

# one panel per column, drawn on an explicit Axes instead of pyplot's implicit state
for ax, column in zip(axes, df.columns):
    sns.histplot(df[column], kde = True, stat = 'density', ax = ax)
    ax.set_xlabel(column, fontsize = 15)

# the 3 x 5 grid has more slots than there are columns; hide the unused panels
for ax in axes[len(df.columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Plotting the price (`MEDV`) against each of the remaining columns
# The target goes on the y-axis (the dependent variable), and MEDV itself is
# skipped so no panel is wasted on the degenerate MEDV-vs-MEDV plot.

feature_columns = [column for column in df.columns if column != 'MEDV']

fig, axes = plt.subplots(3, 5, figsize = (20, 15))
axes = axes.ravel()

for ax, column in zip(axes, feature_columns):
    sns.scatterplot(x = df[column], y = df['MEDV'], ax = ax)

# hide the grid slots that have no feature to show
for ax in axes[len(feature_columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# box plot of every column to spot outliers

fig, ax = plt.subplots(figsize = (20, 8))
sns.boxplot(data = df, width = 0.8, ax = ax)
plt.show()

<p style = "font-size : 20px; color : #34656d ; font-family : 'Comic Sans MS'; "><strong>There are some outliers in the data, so we use StandardScaler to bring the features onto a common scale.</strong></p> 

In [None]:
# separate the target vector (MEDV) from the feature matrix (all other columns)

y = df['MEDV']
X = df.drop(columns = ['MEDV'])

In [None]:
from sklearn.preprocessing import StandardScaler

# standardize every feature column; the scaler is fitted on the full feature matrix X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# inspect the standardized feature matrix (displays the array itself)
X_scaled

In [None]:
# multicollinearity check, part 1: variance inflation factor of each feature
# (the correlation matrix is examined separately)

from statsmodels.stats.outliers_influence import variance_inflation_factor

# one VIF per column of the standardized matrix, paired with the column's name
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(X_scaled, col_idx) for col_idx in range(X_scaled.shape[1])],
    'Features': X.columns,
})

vif

In [None]:
# Heatmap of the pairwise correlations between all columns (features and MEDV)

fig, ax = plt.subplots(figsize = (16, 8))
# draw on the Axes created above and use seaborn's documented `linewidths`
# parameter for the cell borders
sns.heatmap(df.corr(), annot = True, fmt = '1.2f', annot_kws = {'size' : 10}, linewidths = 1, ax = ax)
plt.show()

<p style = "font-size : 20px; color : #34656d ; font-family : 'Comic Sans MS'; "><strong>"RAD" and "TAX" columns are highly correlated which means multicollinearity is present so we have to remove one column.</strong></p> 

In [None]:
import statsmodels.formula.api as smf

# single-predictor OLS: how well does RAD alone explain MEDV?
lm = smf.ols('MEDV ~ RAD', df).fit()
lm.summary()

In [None]:
# single-predictor OLS: how well does TAX alone explain MEDV? (rebinds `lm`)
lm = smf.ols('MEDV ~ TAX', df).fit()
lm.summary()

<p style = "font-size : 20px; color : #34656d ; font-family : 'Comic Sans MS'; "><strong>From OLS Regression Results we can conclude that removing "RAD" column will be good.</strong></p> 

In [None]:
# removing "RAD" column
# Rebinding `df` with errors='ignore' keeps this cell re-runnable: the original
# in-place drop raises KeyError once RAD is already gone.
# NOTE: `X` and `X_scaled` were built before this cell and still contain RAD.

df = df.drop(columns = 'RAD', errors = 'ignore')

In [None]:
# preview the frame again to confirm that RAD is no longer among the columns
df.head()

In [None]:
# splitting data into training and test set

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# `X` / `X_scaled` were created before RAD was dropped from `df`, so rebuild the
# feature matrix from the current frame; otherwise RAD would still be in the model.
X_model = df.drop(columns = 'MEDV')

X_train, X_test, y_train, y_test = train_test_split(X_model, y, test_size = 0.30, random_state = 0)

# Fit the scaler on the training split only and reuse its statistics for the test
# split, so that no information about the test data leaks into training.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train)
X_test = split_scaler.transform(X_test)

In [None]:
# train an ordinary least-squares linear model on the training split

from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)
lr  # display the fitted estimator

In [None]:
# predictions of the linear model on the test split
# NOTE(review): `y_pred` is not referenced again in the visible part of the notebook.

y_pred = lr.predict(X_test)

In [None]:
# training R² of the model (for regressors `score` returns the coefficient of
# determination, not a classification accuracy)

lr.score(X_train, y_train)

In [None]:
# test R² of the model (for regressors `score` returns the coefficient of
# determination, not a classification accuracy)

lr.score(X_test, y_test)

In [None]:
# creating a function to compute adjusted R-Squared

def adj_r2(X, y, model):
    """Return the adjusted R² of a fitted model on the given data.

    Adjusted R² penalises R² for the number of predictors:
    ``1 - (1 - R²) * (n - 1) / (n - p - 1)``, where ``n`` is the number of
    samples and ``p`` the number of features.

    Parameters
    ----------
    X : array-like with a 2-D ``shape`` (n_samples, n_features)
        Feature matrix passed to ``model.score``.
    y : array-like
        True target values passed to ``model.score``.
    model : object
        Fitted estimator exposing ``score(X, y)`` that returns R².

    Returns
    -------
    float
        The adjusted R² value.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``; adjusted R² is undefined when there are not
        more samples than features plus one.
    """
    r2 = model.score(X, y)
    n, p = X.shape[0], X.shape[1]

    # residual degrees of freedom; must be positive for the formula to be meaningful
    dof = n - p - 1
    if dof <= 0:
        raise ValueError(
            f"adjusted R-squared is undefined for n={n} samples and p={p} features "
            "(need n > p + 1)"
        )

    return 1 - (1 - r2) * (n - 1) / dof

In [None]:
# adjusted R² of the linear model on the training split
print(adj_r2(X_train, y_train, lr))

In [None]:
# adjusted R² of the linear model on the test split
print(adj_r2(X_test, y_test, lr))

<p style = "font-size : 20px; color : #34656d ; font-family : 'Comic Sans MS'; "><strong>The model's R² score is lower on the test data than on the training data, so there is a chance of overfitting. Let's check this using regularization.</strong></p> 

<p style = "color : #f54748; font-size : 35px; font-family : 'Comic Sans MS';"><strong>Lasso Regression</strong></p>

In [None]:
from sklearn.linear_model import Lasso, LassoCV

# Cross-validated search for the Lasso penalty strength (alpha grid chosen automatically).
# `normalize=True` is intentionally not passed: X_train is already standardized,
# the parameter was deprecated in scikit-learn 1.0 and removed in 1.2, and the
# final Lasso is fitted without it, so alpha must be selected on the same scale.
lasso_cv = LassoCV(cv = 10, max_iter = 100000)
lasso_cv.fit(X_train, y_train)

In [None]:
# best alpha parameter found by the Lasso cross-validation
# (the name `alpha` is reused for each regularized model in this notebook)

alpha = lasso_cv.alpha_
alpha

In [None]:
# refit a plain Lasso on the training split using the cross-validated alpha
lasso = Lasso(alpha = lasso_cv.alpha_).fit(X_train, y_train)
lasso  # display the fitted estimator

In [None]:
# R² of the Lasso model on the training split
lasso.score(X_train, y_train)

In [None]:
# R² of the Lasso model on the test split
lasso.score(X_test, y_test)

In [None]:
# adjusted R² of the Lasso model on the training split
print(adj_r2(X_train, y_train, lasso))

In [None]:
# adjusted R² of the Lasso model on the test split
print(adj_r2(X_test, y_test, lasso))

<p style = "color : #f54748; font-size : 35px; font-family : 'Comic Sans MS';"><strong>Ridge Regression</strong></p>

In [None]:
from sklearn.linear_model import Ridge, RidgeCV

# Deterministic, log-spaced candidate penalties up to 10. The original drew them
# with an unseeded np.random.uniform, so the selected alpha changed on every run.
alphas = np.logspace(-3, 1, 50)

# `normalize=True` is intentionally not passed: X_train is already standardized,
# the parameter was deprecated in scikit-learn 1.0 and removed in 1.2, and the
# final Ridge is fitted without it, so alpha must be selected on the same scale.
ridge_cv = RidgeCV(alphas = alphas, cv = 10)
ridge_cv.fit(X_train, y_train)

In [None]:
# best alpha parameter found by the Ridge cross-validation
# (rebinds the `alpha` name used for the previous model)

alpha = ridge_cv.alpha_
alpha

In [None]:
# refit a plain Ridge on the training split using the cross-validated alpha
ridge = Ridge(alpha = ridge_cv.alpha_).fit(X_train, y_train)
ridge  # display the fitted estimator

In [None]:
# R² of the Ridge model on the training split
ridge.score(X_train, y_train)

In [None]:
# R² of the Ridge model on the test split
ridge.score(X_test, y_test)

In [None]:
# adjusted R² of the Ridge model on the training split
print(adj_r2(X_train, y_train, ridge))

In [None]:
# adjusted R² of the Ridge model on the test split
print(adj_r2(X_test, y_test, ridge))

<p style = "color : #f54748; font-size : 35px; font-family : 'Comic Sans MS';"><strong>Elastic Net</strong></p>

In [None]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

# Cross-validated search for the Elastic Net penalty strength (alpha grid chosen automatically).
# `normalize=True` is intentionally not passed: X_train is already standardized,
# the parameter was deprecated in scikit-learn 1.0 and removed in 1.2, and the
# final ElasticNet is fitted without it, so alpha must be selected on the same scale.
elastic_net_cv = ElasticNetCV(cv = 10, max_iter = 100000)
elastic_net_cv.fit(X_train, y_train)

In [None]:
# best alpha parameter found by the Elastic Net cross-validation
# (rebinds the `alpha` name used for the previous models)

alpha = elastic_net_cv.alpha_
alpha

In [None]:
# l1 ratio selected by cross-validation
# (`l1_ratio_` is the fitted attribute; `l1_ratio` without the underscore is only
# the constructor argument, i.e. the candidate value(s) that were passed in)

elastic_net_cv.l1_ratio_

In [None]:
# Refit a plain ElasticNet with the cross-validated hyper-parameters. Both values
# are read from the fitted attributes (trailing underscore); the constructor
# argument `l1_ratio` would be a list if several candidates were searched.
elastic_net = ElasticNet(alpha = elastic_net_cv.alpha_, l1_ratio = elastic_net_cv.l1_ratio_)
elastic_net.fit(X_train, y_train)

In [None]:
# R² of the Elastic Net model on the training split
elastic_net.score(X_train, y_train)

In [None]:
# R² of the Elastic Net model on the test split
elastic_net.score(X_test, y_test)

In [None]:
# adjusted R² of the Elastic Net model on the training split
print(adj_r2(X_train, y_train, elastic_net))

In [None]:
# adjusted R² of the Elastic Net model on the test split
print(adj_r2(X_test, y_test, elastic_net))

<p style = "font-size : 20px; color : #34656d ; font-family : 'Comic Sans MS'; "><strong>We are still getting the same R² score. That means our regression model has been well trained on the training data and there is no overfitting.</strong></p> 

<p style = "color : #f55c47; font-size : 35px; font-family : 'Comic Sans MS';"><strong>If you like this kernel, Please do upvote.</strong></p>