In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the advertising CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/advertising-dataset/advertising.csv')

In [None]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

### 1.Linear regression

#### Feature selection

In [None]:
# Feature matrix: every column except the target column 'Sales'.
X = df.drop('Sales',axis=1)

In [None]:
# (n_rows, n_features) of the feature matrix.
X.shape

In [None]:
# Target vector: the 'Sales' column.
y = df['Sales']

In [None]:
# Number of target values; it should match the row count of X.
y.shape

### Linear regression model creation

In [None]:
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
# Linear regression estimator with its default settings.
model = LinearRegression()

In [None]:
# Fit on the training split only; the test split is kept aside for evaluation.
model.fit(X_train,y_train)

#### Predicting values the model has never seen before

In [None]:
# Predict Sales for the held-out test rows.
test_predictions = model.predict(X_test)
# Show a short preview rather than printing the whole array.
test_predictions[:5]

### 2.Let's test the model accuracy 

#### MAE

In [None]:
# Mean absolute error of the linear model on the test split.
mean_absolute_error(y_test,test_predictions)

#### RMSE

In [None]:
# Root mean squared error: square root of the test-split MSE.
np.sqrt(mean_squared_error(y_test,test_predictions))

#### MAPE

In [None]:
def mape(actual,pred):
    """Return the mean absolute percentage error between two sequences.

    Both arguments are converted to NumPy arrays. Each error is expressed as
    a percentage of the corresponding actual value, and the percentages are
    then averaged.
    """
    actual_arr = np.array(actual)
    pred_arr = np.array(pred)
    pct_errors = np.abs((actual_arr - pred_arr) / actual_arr) * 100
    return np.mean(pct_errors)

# MAPE of the linear model on the test split.
linear_mape = mape(y_test, test_predictions)
print('The Mean absolute percentage error is:- ', linear_mape)

#### R-2 score

In [None]:
# Coefficient of determination (R²) of the linear model on the test split.
r2_score(y_test,test_predictions)

### We can see that with Linear Regression the R² score is about 0.9185, i.e. the model explains roughly 91.85% of the variance in Sales on the test set

## Let's also see the residual plots

In [None]:
# Residual = actual - predicted, for every test row.
test_residuals = y_test - test_predictions
# Preview the first few residuals instead of printing the whole Series.
test_residuals.head()

In [None]:
# Residuals plotted against the actual test values; the red line marks zero error.
plt.figure(figsize=(7,6),dpi=90)
sns.scatterplot(x=y_test, y=test_residuals)
plt.ylabel('Residuals from the Linear regression model')
plt.title('Residual plot of Linear regression')
plt.axhline(y=0,color='red');

#### In the residual plot the points are spread on both sides of the zero line without an obvious pattern; some lie a bit far from the line, but overall this suggests that linear regression was a good choice of algorithm for this dataset.

### 3. Polynomial Regression - Selecting the degree of polynomial
#### Though we have good accuracy with Linear regression, this can be further increased with Polynomial regression

#### But we need a way to decide what the order of the polynomial should be to get maximum accuracy.

One way is to run a for loop over the entire process, from polynomial feature creation to testing the accuracy using RMSE, and then plot the results to see which degree gives the least error, so that we can create the final model with it.


In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Candidate polynomial degrees to compare (1 through 9).
degrees = range(1, 10)

train_set_rmse = []
test_set_rmse = []

# The split does not depend on the degree, so it is done once on the raw
# features before the loop. test_size and random_state are unchanged, so the
# same rows land in the training and test sets as before.
X_raw_train, X_raw_test, y_p_train, y_p_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

for d in degrees:

    # Creating polynomial features: the converter is fitted on the training
    # rows only and then applied as-is to the test rows.
    polynomial_converter = PolynomialFeatures(degree=d, include_bias=False)
    X_p_train = polynomial_converter.fit_transform(X_raw_train)
    X_p_test = polynomial_converter.transform(X_raw_test)

    # Polynomial model creation: a linear regression on the expanded features
    poly_model = LinearRegression()
    poly_model.fit(X_p_train, y_p_train)

    # Predictions of both train and test set
    train_set_preds = poly_model.predict(X_p_train)
    test_set_preds = poly_model.predict(X_p_test)

    # Calculating RMSE for both train and test predictions
    train_rmse = np.sqrt(mean_squared_error(y_p_train, train_set_preds))
    test_rmse = np.sqrt(mean_squared_error(y_p_test, test_set_preds))

    # Storing the RMSE values (index 0 is degree 1) to be used later for plotting
    train_set_rmse.append(train_rmse)
    test_set_rmse.append(test_rmse)

#### Now we have completed the whole process 

#### Let's see what the RMSE values are and plot them

In [None]:
# Training RMSE per degree; index 0 corresponds to degree 1.
train_set_rmse

#### From the training set RMSE values we can see that the errors keep going down as the order of the polynomial increases, though at one point, for the 7th degree, the error shoots up before gradually decreasing again, which is something of a red flag.

In [None]:
# Test RMSE per degree; index 0 corresponds to degree 1.
test_set_rmse

#### With the testing set RMSE values we can see that after degree 4 the errors shoot up suddenly to a very high value, giving us an idea where we should stop increasing the order

### 4. Let's visualize them

In [None]:
# Compare train and test RMSE for degrees 1 to 5 only (the first five entries).
degrees_shown = range(1,6)
fig, ax = plt.subplots(figsize=(7,6), dpi=90)
ax.plot(degrees_shown, train_set_rmse[0:5], label='Train RMSE')
ax.plot(degrees_shown, test_set_rmse[0:5], label='Test RMSE')
ax.set_xlabel('Model complexity')
ax.set_ylabel('RMSE values')
ax.set_title('Model complexity vs RMSE on both training and testing set')
ax.legend();

### Usually the model performs a bit better on the training set than on the test set; the same can be seen above

### Even though degree 4 has a lower error than degree 3 here, we have to ask whether it is worth the risk: right after degree 4 the test error shoots up very badly, so degree 4 sits at the edge of that jump. The safer suggestion is to go with degree 3.

### 5. So we'll go with degree 3 for our final Polynomial regression

### 6.Final model creation

In [None]:
# Degree-3 feature generator; include_bias=False leaves out the constant column.
final_poly_converter = PolynomialFeatures(degree=3,include_bias=False)

In [None]:
# Expand the full feature matrix into degree-3 polynomial features.
final_poly_features = final_poly_converter.fit_transform(X)

In [None]:
# Row count is unchanged; the column count is now the number of polynomial terms.
final_poly_features.shape

#### Initially we had just 3 features; as a result of using degree 3 we now have additional features (19 in total), which include the squared and cubed values of the original features and all their interaction terms up to degree 3

In [None]:
# Training and testing data: same 30% hold-out and random_state as for the linear model.
X_f_train, X_f_test, y_f_train, y_f_test = train_test_split(final_poly_features, y , test_size=0.3, random_state=101)

In [None]:
# A plain linear regression, this time to be fitted on the polynomial features.
final_poly_model = LinearRegression()

In [None]:
# Train on the polynomial training split.
final_poly_model.fit(X_f_train,y_f_train)

In [None]:
# Predict Sales for the held-out polynomial test split.
final_poly_predictions = final_poly_model.predict(X_f_test)

In [None]:
# Show a short preview rather than printing the whole array.
final_poly_predictions[:5]

### 7.Let's test the accuracy of the Polynomial regression model and compare it with Linear Regression

#### MAE - polynomial regression



In [None]:
# Mean absolute error of the degree-3 model on the test split.
poly_mae = mean_absolute_error(y_f_test, final_poly_predictions)
print('MAE of Polynomial Regression', poly_mae)

MAE of Linear regression - 1.21

#### RMSE - polynomial regression

In [None]:
# Root mean squared error of the degree-3 model on the test split.
poly_mse = mean_squared_error(y_f_test, final_poly_predictions)
print('RMSE of Polynomial Regression', np.sqrt(poly_mse))

RMSE of Linear regression - 1.51

We can see that both the error metrics of Polynomial regression are performing way better than Linear regression

#### R-2 score - Polynomial Regression

In [None]:
# R² of the degree-3 model on the test split.
poly_r2 = r2_score(y_f_test, final_poly_predictions)
print('The r2 score is', poly_r2)

### Polynomial Regression R² score - 0.9888 (about 98.88% of the variance explained)

Linear Regression R² score - 0.9185 (about 91.85% of the variance explained)

### Let's visualize the residual plots

In [None]:
# Residual = actual - predicted, for the polynomial model's test rows.
poly_test_residuals = y_f_test - final_poly_predictions

In [None]:
# Residuals of the polynomial model against the actual test values; the red line marks zero error.
fig, ax = plt.subplots(figsize=(8,6), dpi=90)
sns.scatterplot(x=y_f_test, y=poly_test_residuals, ax=ax)
ax.set_ylabel('Residuals from the Polynomial Regression model')
ax.set_title('Residual plot of Polynomial regression')
ax.axhline(y=0, color='red');

### Let's visualize the residual plots of Linear regression and Polynomial regression side by side

In [None]:
# One figure, two panels: linear model on the left, polynomial model on the right.
fig, (ax_linear, ax_poly) = plt.subplots(1, 2, figsize=(12,5), dpi=90)

# Left panel: residuals of the linear model.
sns.scatterplot(x=y_test, y=test_residuals, ax=ax_linear)
ax_linear.set_title('Residual plot of Linear regression')
ax_linear.set_ylabel('Residuals from the Linear Regression model')
ax_linear.axhline(y=0, color='red')

# Right panel: residuals of the polynomial model.
sns.scatterplot(x=y_f_test, y=poly_test_residuals, ax=ax_poly)
ax_poly.set_title('Residual plot of Polynomial regression')
ax_poly.set_ylabel('Residuals from the Polynomial Regression model')
ax_poly.axhline(y=0, color='red');

#### From the residual plots we can see that the residuals of the Polynomial regression lie closer to the zero line than those of the Linear regression, telling us that it makes smaller errors.

## Hence, from the above metrics, we can clearly see that using a higher-order polynomial gives better accuracy than Linear Regression on this dataset.

### We should keep in mind that this does not apply to every case, i.e. polynomial regression won't always give best accuracy, but we won't find out until we try it for ourselves.

In [None]:
# Saving the model

In [None]:
from joblib import dump

In [None]:
# Persist the fitted polynomial-regression model to a joblib file.
dump(final_poly_model,'Sales_prediction_model_poly_reg.joblib')

In [None]:
# Persist the feature converter too: the model was trained on its output, so new inputs need the same expansion.
dump(final_poly_converter,'Final_sales_features_converter.joblib')