In [None]:
# import necessary libraries
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Global matplotlib defaults applied to every figure drawn in this notebook.
plt.rcParams['font.size'] = 14
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['lines.linewidth'] = 2
# The font name must be spelled 'Times New Roman'; a misspelled name cannot be
# matched to an installed font.
# NOTE(review): 'font.serif' is only consulted when 'font.family' is 'serif' --
# confirm whether the family should be switched as well.
plt.rcParams['font.serif'] = 'Times New Roman'

# Loading the dataset

In [None]:
# Load the advertising dataset from the notebook's input directory into a DataFrame.
# NOTE(review): if the CSV carries an unnamed index column it will be kept as a
# regular column and later end up among the features -- check the info() output.
adv_df = pd.read_csv('../input/advertising/Advertising.csv')
# Print column names, dtypes and non-null counts (used to spot missing values).
adv_df.info()

In [None]:
# Preview the first rows to sanity-check column names and value ranges.
adv_df.head()

In [None]:
# (rows, columns) of the loaded frame.
adv_df.shape

**features: TV, radio, newspaper**

**target: sales**

# EDA

In [None]:
# No missing values (author's observation from the info() output).
# Exploratory data analysis: pairwise scatter plots of every numeric column,
# used to check for outliers and to see how each column relates to sales.
# sns.pairplot builds its own figure, so plt.figure() is not called first --
# doing so would only leave an empty extra figure in the cell output.
sns.pairplot(data=adv_df)

In [None]:
# Correlation of every column with the target: the 'sales' column of the
# full correlation matrix.
adv_df.corr().loc[:, 'sales']

In [None]:
# Annotated heatmap of the full correlation matrix, drawn on an explicit axes.
corr_fig, corr_ax = plt.subplots()
sns.heatmap(adv_df.corr(), annot=True, cmap="Blues", ax=corr_ax)

The correlations show that all the features are correlated with sales. The most strongly correlated one is TV advertising.

# Determining the features and target variables

In [None]:
# Target: the 'sales' column. Features: every remaining column of adv_df.
y = adv_df['sales']
X = adv_df.drop(columns='sales')

# Preprocessing

In [None]:
# (rows, feature columns) before the polynomial expansion.
X.shape

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Polynomial regression: expand the raw features with all terms up to degree 2.
# include_bias=False leaves out the constant column.
poly_converter = PolynomialFeatures(include_bias=False, degree=2)
poly_converter.fit(X)
poly_features = poly_converter.transform(X)

In [None]:
# (rows, expanded feature columns) after the degree-2 expansion.
poly_features.shape

# split the data to train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across
# Restart & Run All; without it each run reports different errors.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, test_size=0.3, random_state=42
)

# train the model

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the polynomial training features. fit() returns the
# estimator itself, so construction and fitting are chained.
poly_model = LinearRegression().fit(X_train, y_train)
# Display the fitted estimator as the cell output.
poly_model

# Evaluating the model (residuals and metrics) and comparing it with simple regression

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predictions and residuals (actual - predicted) on the held-out rows.
y_pred_poly = poly_model.predict(X_test)
residuals_poly = y_test - y_pred_poly

# Error metrics of the polynomial model: MAE and MSE from sklearn, RMSE as the
# square root of the MSE.
MAE_poly, MSE_poly = (
    metric(y_test, y_pred_poly)
    for metric in (mean_absolute_error, mean_squared_error)
)
RMSE_poly = np.sqrt(MSE_poly)

In [None]:
# Baseline: plain linear regression on the raw (non-polynomial) features.
# The rows of the polynomial model's split are reused instead of drawing a new
# random split: two models scored on different test rows cannot be compared
# fairly. y_train / y_test keep the original row labels, so they select the
# matching rows of X.
X_train_s, X_test_s = X.loc[y_train.index], X.loc[y_test.index]
y_train_s, y_test_s = y_train, y_test

s_model = LinearRegression()
s_model.fit(X_train_s, y_train_s)

# Predictions and residuals (actual - predicted) on the held-out rows.
y_pred_simple = s_model.predict(X_test_s)
residuals_simple = y_test_s - y_pred_simple

# Error metrics of the simple model.
MAE_s = mean_absolute_error(y_test_s, y_pred_simple)
MSE_s = mean_squared_error(y_test_s, y_pred_simple)
RMSE_s = np.sqrt(MSE_s)

In [None]:
# Side-by-side error table: one column per model, one row per metric.
pd.DataFrame(
    {
        'simple model': [MAE_s, MSE_s, RMSE_s],
        'polynomial model': [MAE_poly, MSE_poly, RMSE_poly],
    },
    # Row labels carry no surrounding whitespace and use the same 'RMSE'
    # abbreviation as the RMSE_* variables they display.
    index=['MAE', 'MSE', 'RMSE'],
)


**The errors of the polynomial regression model are lower than the errors of the simple regression model.**

In [None]:
# Check the residuals.
# Left column: simple model; right column: polynomial model.
# Top row: residual distributions; bottom row: residuals against actual sales.
f, axes = plt.subplots(2, 2, figsize=(10, 10))

# histplot(kde=True, stat='density') draws a density histogram with a KDE
# curve and replaces the deprecated sns.distplot.
sns.histplot(residuals_simple, kde=True, stat='density', ax=axes[0, 0])
axes[0, 0].set(title='Simple model: residual distribution', xlabel='residual')
sns.histplot(residuals_poly, kde=True, stat='density', ax=axes[0, 1])
axes[0, 1].set(title='Polynomial model: residual distribution', xlabel='residual')

# Zero line plus residual scatter. Each reference line is drawn on its axes
# explicitly rather than through plt.axhline, which targets whatever axes
# happens to be current.
axes[1, 0].axhline(y=0, color='r')
sns.scatterplot(x=y_test_s, y=residuals_simple, ax=axes[1, 0])
axes[1, 0].set(title='Simple model: residuals vs. actual',
               xlabel='actual sales', ylabel='residual')

axes[1, 1].axhline(y=0, color='r')
sns.scatterplot(x=y_test, y=residuals_poly, ax=axes[1, 1])
axes[1, 1].set(title='Polynomial model: residuals vs. actual',
               xlabel='actual sales', ylabel='residual')

f.tight_layout()

**The residuals are nearly normally distributed and they are random.**

# Adjusting model parameters

In [None]:
# Find the optimal polynomial degree by comparing the root mean squared error
# (RMSE) on the training rows and on the test rows for degrees 1 through 9.
train_RMSE = []
test_RMSE = []
for i in range(1, 10):
    poly_converter_new = PolynomialFeatures(degree=i, include_bias=False)
    poly_features_new = poly_converter_new.fit_transform(X)

    # Train/test split. The same random_state is used for every degree, so all
    # degrees are trained and scored on the same rows; otherwise the RMSE
    # curves would mix the effect of the degree with split-to-split noise and
    # change on every run.
    X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
        poly_features_new, y, test_size=0.3, random_state=42
    )

    # Train the model.
    new_model = LinearRegression()
    new_model.fit(X_train_new, y_train_new)

    y_pred_train = new_model.predict(X_train_new)
    y_pred_test = new_model.predict(X_test_new)

    # Entry k of each list holds the RMSE for degree k + 1.
    train_RMSE.append(np.sqrt(mean_squared_error(y_train_new, y_pred_train)))
    test_RMSE.append(np.sqrt(mean_squared_error(y_test_new, y_pred_test)))
    


In [None]:

# Plot train and test RMSE against the polynomial degree. Only the first five
# degrees are shown.
# NOTE(review): presumably the higher degrees are left out because their test
# error dwarfs the rest of the curve -- confirm.
degree = list(range(1, 6))
rmse_fig, rmse_ax = plt.subplots()
rmse_ax.plot(degree, train_RMSE[:5], label='train RMSE')
rmse_ax.plot(degree, test_RMSE[:5], label='test RMSE')
rmse_ax.set_xlabel('Polynomial degree')
rmse_ax.set_ylabel('RMSE')
rmse_ax.legend(loc='upper right')

**The optimal model is the one with degree = 2, so the `poly_model` fitted above is our optimal model.**