In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

Loading the Advertising Data

In [None]:
# Load the Advertising data set.
# A forward slash is used as the path separator: it works on every OS and
# avoids the invalid "\A" escape sequence that the backslash form contains.
df = pd.read_csv("ISL_DataSets/Advertising.csv")
print(df.shape)
print(df.columns)

Running Single Variable Regression

In [None]:
# Simple linear regression: sales explained by TV alone.
X = df.loc[:, ['TV']]
y = df.loc[:, ['sales']]
reg = LinearRegression()
reg.fit(X, y)
# In-sample R^2, coefficient array and intercept, one per line.
print(reg.score(X, y), reg.coef_, reg.intercept_, sep="\n")

In [None]:
# Predict on the same rows the model was fit on and report the fit metrics.
y_pred = reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y_true=y, y_pred=y_pred))
print("Coefficient of determination: %.2f" % r2_score(y_true=y, y_pred=y_pred))

In [None]:
# Observed points (black) with the fitted regression line (blue) on top.
plt.scatter(X, y, color="black")
plt.plot(X, y_pred, color="blue", linewidth=3)

# Label the axes from the column names of the frames being plotted so the
# figure can be read on its own.
plt.xlabel(X.columns[0])
plt.ylabel(y.columns[0])
plt.title("Linear regression fit")

plt.show()

Running the Regression on Test Data and Analyzing the Difference in R2 and MSE

In [None]:
# Hold out 30% of the rows for testing. No random seed is fixed, so every run
# draws a different split: repeat this several times and record the R2 and MSE
# of each run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)

In [None]:
# Fit on the training rows only, then evaluate on the held-out rows.
reg2 = LinearRegression().fit(X_train, y_train)
# Training R^2: scored on the rows the model was fit on (not the full data
# set), so it can be compared directly with the test R^2 printed below.
print(reg2.score(X_train, y_train))
print(reg2.coef_)
print(reg2.intercept_)
# Test-set metrics.
y_pred_test = reg2.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred_test))
print("Coefficient of determination: %.3f" % r2_score(y_test, y_pred_test))

In [None]:
#repeat the activity for radio and newspaper as well

Multiple Linear Regression

In [None]:
# Multiple regression with two predictors: TV and radio.
X = df.loc[:, ['TV', 'radio']]
y = df.loc[:, ['sales']]
reg = LinearRegression()
reg.fit(X, y)
# In-sample R^2, coefficient array and intercept, one per line.
print(reg.score(X, y), reg.coef_, reg.intercept_, sep="\n")
y_pred = reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y_true=y, y_pred=y_pred))
print("Coefficient of determination: %.3f" % r2_score(y_true=y, y_pred=y_pred))

In [None]:
# Two predictors (TV, radio) with a random 70/30 train/test split.
X = df.loc[:, ['TV', 'radio']]
y = df.loc[:, ['sales']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)
reg2 = LinearRegression()
reg2.fit(X_train, y_train)
# Training R^2, coefficient array and intercept, one per line.
print(reg2.score(X_train, y_train), reg2.coef_, reg2.intercept_, sep="\n")
# Metrics on the held-out rows.
y_pred_test = reg2.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_true=y_test, y_pred=y_pred_test))
print("Coefficient of determination: %.3f" % r2_score(y_true=y_test, y_pred=y_pred_test))

In [None]:
# Multiple regression with all three predictors: TV, radio and newspaper.
X = df.loc[:, ['TV', 'radio', 'newspaper']]
y = df.loc[:, ['sales']]
reg = LinearRegression()
reg.fit(X, y)
# In-sample R^2, coefficient array and intercept, one per line.
print(reg.score(X, y), reg.coef_, reg.intercept_, sep="\n")
y_pred = reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y_true=y, y_pred=y_pred))
print("Coefficient of determination: %.3f" % r2_score(y_true=y, y_pred=y_pred))

In [None]:
# Three predictors (TV, radio, newspaper) with a random 70/30 train/test split.
X = df.loc[:, ['TV', 'radio', 'newspaper']]
y = df.loc[:, ['sales']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)
reg2 = LinearRegression()
reg2.fit(X_train, y_train)
# Training R^2, coefficient array and intercept, one per line.
print(reg2.score(X_train, y_train), reg2.coef_, reg2.intercept_, sep="\n")
# Metrics on the held-out rows.
y_pred_test = reg2.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_true=y_test, y_pred=y_pred_test))
print("Coefficient of determination: %.3f" % r2_score(y_true=y_test, y_pred=y_pred_test))

In [None]:
# Pairwise correlation matrix of every column except the S_No column.
X2 = df.drop('S_No', axis=1)
X2.corr()

Using the Statsmodels Library

In [None]:
# Re-fit with statsmodels and print its OLS summary table.
# X and y are whatever the most recently run cell left behind; on a
# top-to-bottom run that is TV, radio and newspaper against sales.
X = sm.add_constant(X)  # statsmodels does not add an intercept on its own
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
print(res.summary())


Interaction Model

In [None]:
# Design matrix with an explicit TV x radio interaction column.
# .copy() makes X an independent frame, so adding a column to it neither
# triggers pandas' SettingWithCopyWarning nor can it touch df.
X = df[['TV','radio']].copy()
X['TV_Radio'] = df['TV']*df['radio']
print(X.shape)
X.head()

In [None]:
# Fit sales on the X built in the preceding cell (TV, radio and their product
# when the notebook is run top to bottom).
y = df.loc[:, ['sales']]
reg = LinearRegression()
reg.fit(X, y)
# In-sample R^2, coefficient array and intercept, one per line.
print(reg.score(X, y), reg.coef_, reg.intercept_, sep="\n")
y_pred = reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y_true=y, y_pred=y_pred))
print("Coefficient of determination: %.3f" % r2_score(y_true=y, y_pred=y_pred))

In [None]:
# statsmodels OLS summary for the current X and y (the interaction model when
# the notebook is run top to bottom).
X = sm.add_constant(X)  # statsmodels does not add an intercept on its own
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
print(res.summary())

In [None]:
#show it for test data as well

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Let PolynomialFeatures build the TV x radio interaction term automatically
# (interaction_only=True leaves out the squared terms).
poly = PolynomialFeatures(2,interaction_only=True)
X = df[['TV','radio']]
y = df[['sales']]
X2 = poly.fit_transform(X)
print(X2.shape)
print(poly.get_feature_names_out())
# NOTE(review): with the default include_bias=True the transform should already
# emit a column of ones, in which case add_constant (default
# has_constant='skip') leaves X2 unchanged -- confirm against the library docs.
X2 = sm.add_constant(X2)
mod = sm.OLS(y, X2)
res = mod.fit()
print(res.summary())

In [None]:
# Degree-2 polynomial expansion of TV and radio, fitted with statsmodels OLS.
poly = PolynomialFeatures(degree=2)
X = df.loc[:, ['TV', 'radio']]
y = df.loc[:, ['sales']]
X2 = poly.fit_transform(X)
# Shape of the expanded matrix, then the generated feature names.
print(X2.shape, poly.get_feature_names_out(), sep="\n")
X2 = sm.add_constant(X2)
mod = sm.OLS(endog=y, exog=X2)
res = mod.fit()
print(res.summary())

Feature Selection (Forward and Backward)

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

Multiple Regression and Feature Selection using Boston Data

In [None]:
# Load the Boston data set; from here on `df` refers to Boston, not Advertising.
# A forward slash is used as the path separator: it works on every OS and
# avoids the invalid "\B" escape sequence that the backslash form contains.
df = pd.read_csv("ISL_DataSets/Boston.csv")
print(df.shape)
print(df.columns)

In [None]:
# Predictors: every column except s_no and the target medv.
X = df.drop(['s_no', 'medv'], axis=1)
# Target kept as a one-column frame.
y = df.loc[:, ['medv']]
print(X.shape)

In [None]:
# statsmodels OLS summary using all predictors in X.
# The constant-augmented matrix gets its own name so that X itself stays free
# of the intercept column; the feature-selection cells below reuse X and
# should only see real predictors.
X_const = sm.add_constant(X)
mod = sm.OLS(y, X_const)
res = mod.fit()
print(res.summary())

In [None]:
# Forward sequential selection of 5 predictors for a linear model.
ols_reg = LinearRegression()
sfs = SequentialFeatureSelector(ols_reg, direction='forward',n_features_to_select=5)
# X may carry the 'const' column added for statsmodels; it is not a real
# predictor, so drop it if present (errors='ignore' makes this a no-op otherwise).
sfs.fit(X.drop(columns=['const'], errors='ignore'), y)
print(sfs.get_feature_names_out())

In [None]:
# Backward sequential selection down to 5 predictors for a linear model.
ols_reg = LinearRegression()
sfs = SequentialFeatureSelector(ols_reg, direction='backward',n_features_to_select=5)
# X may carry the 'const' column added for statsmodels; it is not a real
# predictor, so drop it if present (errors='ignore' makes this a no-op otherwise).
sfs.fit(X.drop(columns=['const'], errors='ignore'), y)
print(sfs.get_feature_names_out())

Automating Polynomial Regression and Interaction

In [None]:
# Degree-2 polynomial expansion of the Boston predictors, fitted with
# statsmodels OLS.
poly = PolynomialFeatures(degree=2)
X = df.drop(['s_no', 'medv'], axis=1)
y = df.loc[:, ['medv']]
X2 = poly.fit_transform(X)
# Shape of the expanded matrix, then the generated feature names.
print(X2.shape, poly.get_feature_names_out(), sep="\n")
X2 = sm.add_constant(X2)
mod = sm.OLS(endog=y, exog=X2)
res = mod.fit()
print(res.summary())

In [None]:
# Interaction-only degree-2 expansion of the Boston predictors, fitted with
# statsmodels OLS.
poly = PolynomialFeatures(degree=2, interaction_only=True)
X = df.drop(['s_no', 'medv'], axis=1)
y = df.loc[:, ['medv']]
X2 = poly.fit_transform(X)
# Shape of the expanded matrix, then the generated feature names.
print(X2.shape, poly.get_feature_names_out(), sep="\n")
X2 = sm.add_constant(X2)
mod = sm.OLS(endog=y, exog=X2)
res = mod.fit()
print(res.summary())

kNN Regression Method

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# kNN regression (k=5) of sales on TV and radio.
# `df` was re-bound to the Boston data earlier in the notebook, so indexing it
# for TV/radio/sales fails on a top-to-bottom run. The Advertising file is
# read into its own frame here instead, leaving `df` untouched.
ads_df = pd.read_csv("ISL_DataSets/Advertising.csv")
neigh = KNeighborsRegressor(n_neighbors=5)
X = ads_df[['TV','radio']]
y = ads_df[['sales']]
knn_reg = neigh.fit(X, y)
# In-sample R^2.
print(knn_reg.score(X, y))
y_pred = knn_reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

In [None]:
# kNN regression (k=3) of sales on TV and radio.
# `df` was re-bound to the Boston data earlier in the notebook, so indexing it
# for TV/radio/sales fails on a top-to-bottom run. The Advertising file is
# read into its own frame here so this cell does not depend on `df`.
ads_df = pd.read_csv("ISL_DataSets/Advertising.csv")
neigh = KNeighborsRegressor(n_neighbors=3)
X = ads_df[['TV','radio']]
y = ads_df[['sales']]
knn_reg = neigh.fit(X, y)
# In-sample R^2.
print(knn_reg.score(X, y))
y_pred = knn_reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

In [None]:
# Pick the single most useful predictor for the kNN model from the current X.
# No direction is passed, so the selector's default direction applies.
sfs = SequentialFeatureSelector(estimator=neigh, n_features_to_select=1)
sfs.fit(X, y)
# Boolean support mask, then the name of the selected column.
print(sfs.get_support(), sfs.get_feature_names_out(), sep="\n")

In [None]:
# kNN regression (k=3) on the Boston predictors.
# X and y are bound explicitly instead of relying on whatever an earlier cell
# left behind: the feature-selection cells that follow ask for 5 features,
# which the two-column Advertising X cannot provide. `df` is the Boston frame
# at this point in the notebook.
neigh = KNeighborsRegressor(n_neighbors=3)
X = df.drop(columns=['s_no','medv'])
y = df[['medv']]
knn_reg = neigh.fit(X, y)
# In-sample R^2.
print(knn_reg.score(X, y))
y_pred = knn_reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

In [None]:
# Forward sequential selection of 5 predictors, scored with the kNN regressor.
sfs = SequentialFeatureSelector(
    estimator=neigh,
    n_features_to_select=5,
    direction='forward',
)
sfs.fit(X, y)
print(sfs.get_feature_names_out())

In [None]:
# Backward sequential selection down to 5 predictors, scored with the kNN
# regressor.
sfs = SequentialFeatureSelector(
    estimator=neigh,
    n_features_to_select=5,
    direction='backward',
)
sfs.fit(X, y)
print(sfs.get_feature_names_out())