In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.stats import norm

In [None]:
# Load the advertising CSV into `df`; the path is relative to the notebook's working directory.
df= pd.read_csv('../input/advertising-dataset/advertising.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the columns.
df.describe()

**Missing Values Count**


In [None]:
# Count missing values per column.
# Indexing with the boolean frame (`df[df.isna()]`) keeps only the cells that
# are NaN and masks everything else to NaN, so `.count()` on it is always 0.
# Summing the boolean mask gives the actual number of missing cells.
df.isna().sum()

## **Outlier Percentage**

In [None]:
def outlier(col, data=None):
    """Print the percentage and count of IQR outliers in one column.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile.

    Parameters
    ----------
    col : column label
        Column to inspect; it must support ``quantile`` and comparisons.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which keeps existing ``outlier(col)`` calls working.

    Returns
    -------
    None
        Prints ``'<col>= <percentage>'`` followed by the outlier count.
    """
    frame = df if data is None else data
    values = frame[col]
    total = len(frame)

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - (iqr * 1.5)
    upper = q3 + (iqr * 1.5)

    # Count directly from the boolean mask. The previous version passed the
    # integer positions returned by np.where to label-based .loc, which only
    # selects the right rows when the index is the default 0..n-1 range.
    n = int(((values > upper) | (values < lower)).sum())

    # An empty frame has no outliers; avoid dividing by zero.
    perc = (n / total) * 100 if total else 0.0
    print(f'{col}= {perc}')
    print(n)

In [None]:
# Report the outlier percentage and count for every column of the dataset.
for column_name in df.columns:
    outlier(column_name)


In [None]:
# Pairwise scatter plots of all columns, to eyeball relationships between them.
sns.pairplot(df)

Note:

The independent variables appear to be linearly related to the dependent variable (this will be checked further with Pearson's correlation), which makes the data a suitable candidate for a linear regression model.


In [None]:
# Correlation matrix of all columns, drawn as an annotated heatmap
# with thin white lines separating the cells.
correlations = df.corr()
sns.heatmap(correlations, annot=True, linewidths=.5, linecolor='white')

In [None]:
# Distribution of Sales with a fitted normal curve (fit=norm) overlaid.
# NOTE(review): seaborn documents distplot as deprecated — consider
# histplot/displot when upgrading seaborn.
sns.distplot(df['Sales'],fit=norm)

Dividing the dataset into independent and dependent variables

In [None]:
# Predictors: the three columns listed below; target: the Sales column.
feature_columns = ['Newspaper', 'Radio', 'TV']
x = df[feature_columns]
y = df['Sales']

Creating train and test sets for the independent and dependent variables

In [None]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size= 0.2,random_state= 101)

In [None]:
# Inspect the training target.
y_train

## Fit the linear regression model on the training set

x is the regressor and Beta is its coefficient.

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model with an intercept term, fitted on the training rows.
regressor = LinearRegression(fit_intercept= True)
regressor.fit(x_train,y_train)

In [None]:
# One row per predictor column, holding its fitted coefficient.
coeff_def = pd.DataFrame(
    data=regressor.coef_,
    index=x.columns,
    columns=['Coefficient'],
)
coeff_def

No sign reversal is observed in the coefficients, so multicollinearity is not evident as of now.

Reading off Beta0 (the intercept):


In [None]:
# Fitted intercept of the linear model.
regressor.intercept_

## Generating predicted values on x_train

In [None]:
# Fitted values on the training predictors (in-sample predictions).
y_pred= regressor.predict(x_train)
y_pred

In [None]:
# Side-by-side table of actual values, fitted values and their difference
# for the training rows.
train_preddf = pd.DataFrame({
    'Actual': y_train,
    'Predicted': y_pred,
    'Residuals': y_train - y_pred,
})

In [None]:
# Display the actual / predicted / residual table.
train_preddf

Calculating Residual value and plotting it against fitted :

In [None]:
# Training residuals: actual minus fitted.
residuals= y_train - y_pred

In [None]:
# Residuals-vs-fitted plot for the training data.
sns.scatterplot(x= y_pred, y= residuals)
plt.xlabel('Fitted')
plt.ylabel('Residuals')

In [None]:
# Distribution of the training residuals with a fitted normal curve overlaid.
# NOTE(review): seaborn documents distplot as deprecated — consider
# histplot/displot when upgrading seaborn.
sns.distplot(residuals,fit=norm)
plt.xlabel('Residuals')

In [None]:
from scipy import stats
# Probability plot of the training residuals (scipy's default reference distribution).
stats.probplot(residuals, plot=plt)
plt.show()

Adding a constant column (i.e. Beta0, the intercept) for the OLS regression analysis

In [None]:
import statsmodels.api as sm

In [None]:
# Add a constant column to the train and test predictors for statsmodels.
# NOTE: despite the "endog" in the names, these hold the predictor (exogenous) matrices.
x_endog= sm.add_constant(x_train)
x_endog1= sm.add_constant(x_test)

In [None]:
# `res` is the OLS model specification, not a fitted result; every `res.fit()`
# call (here and in later cells) estimates the model again.
res= sm.OLS(y_train,x_endog)
res.fit()

In [None]:
# Fit the OLS model and show its summary table.
res.fit().summary()

In [None]:
# Score the scikit-learn model on the held-out test rows.
y_pred1 = regressor.predict(x_test)

from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_pred1)
mse = metrics.mean_squared_error(y_test, y_pred1)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# In-sample (training) error of the statsmodels OLS fit.
y_pred2 = res.fit().predict(x_endog)

from sklearn import metrics
mae = metrics.mean_absolute_error(y_train, y_pred2)
mse = metrics.mean_squared_error(y_train, y_pred2)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Probability plot of the residuals on the held-out test rows.
TestResiduals=y_test - y_pred1
stats.probplot(TestResiduals, plot=plt)
plt.show()

## **Statistical Interaction Effect**

In [None]:
# Generate pairwise interaction terms

In [None]:
from sklearn.preprocessing import PolynomialFeatures 
from statsmodels.regression import linear_model

In [None]:
# Degree-2 expansion restricted to interaction terms, with no bias column.
interaction_builder = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
x_interaction = interaction_builder.fit_transform(x)

In [None]:
# Check the predictor column order before labelling the expanded matrix.
x.columns

In [None]:
# Wrap the expanded matrix in a labelled DataFrame.
# The names assume the transformer emits the original columns first and then
# the pairwise products in that same order — verify against x.columns.
# Reusing x's index keeps every row label-aligned with y; a fresh default
# index would only match when the source frame itself uses 0..n-1 labels.
interaction_df = pd.DataFrame(
    x_interaction,
    index=x.index,
    columns=['Newspaper', 'Radio', 'TV', 'Newspaper:Radio', 'Newspaper:TV', 'Radio:TV'],
)

In [None]:
# Fit OLS on the full data with all interaction terms.
# statsmodels' OLS does not add an intercept by itself, so the constant column
# is added explicitly (as the other statsmodels fits in this notebook do);
# without it the regression is forced through the origin and the p-values
# describe that constrained model.
interaction_model = linear_model.OLS(y, sm.add_constant(interaction_df)).fit()

In [None]:
# Keep only the terms whose p-value is below 0.05.
interaction_model.pvalues[interaction_model.pvalues<0.05]

In [None]:
# Split the interaction features using the same test_size and random_state as
# the earlier x/y split; later cells pair these frames with y_train / y_test.
# NOTE(review): this presumes identical settings yield the same row partition — confirm.
x_interactiontrain,x_interactiontest= train_test_split(interaction_df,test_size= 0.2,random_state= 101)

In [None]:
# Refit the same `regressor` object on the interaction features; this replaces
# the coefficients it learned earlier from x_train.
regressor.fit(x_interactiontrain,y_train)

In [None]:
# One row per feature (main effects and interactions) with its fitted coefficient.
coeff_def_interaction = pd.DataFrame(
    data=regressor.coef_,
    index=interaction_df.columns,
    columns=['Coefficient'],
)
coeff_def_interaction

In [None]:
# Fitted values of the interaction model on its training rows.
y_pred_interaction= regressor.predict(x_interactiontrain)
y_pred_interaction

In [None]:
# Actual values, fitted values and residuals of the interaction model
# on the training rows.
train_preddf_interaction = pd.DataFrame({
    'Actual': y_train,
    'Predicted': y_pred_interaction,
    'Residuals': y_train - y_pred_interaction,
})
train_preddf_interaction

In [None]:
# Training residuals of the interaction model: actual minus fitted.
residuals_interaction= y_train-y_pred_interaction

In [None]:
# Residuals-vs-fitted plot for the interaction model.
# The x-axis must use this model's own fitted values (y_pred_interaction);
# y_pred holds the fitted values of the earlier model without interaction
# terms, so plotting against it mixes two different models.
sns.scatterplot(x= y_pred_interaction, y= residuals_interaction)
plt.xlabel('Fitted')
plt.ylabel('Residuals')

In [None]:
# Distribution of the interaction model's training residuals with a fitted normal curve.
# NOTE(review): seaborn documents distplot as deprecated — consider
# histplot/displot when upgrading seaborn.
sns.distplot(residuals_interaction,fit=norm)
plt.xlabel('Residuals')

In [None]:
# Probability plot of the interaction model's training residuals.
stats.probplot(residuals_interaction, plot=plt)
plt.show()

In [None]:
# Add a constant column to the interaction train/test features for statsmodels.
# NOTE: despite the "endog" in the names, these hold the predictor (exogenous) matrices.
x_interaction_endog= sm.add_constant(x_interactiontrain)
x_interaction_endog1= sm.add_constant(x_interactiontest)

In [None]:
# statsmodels OLS on the training interaction features (constant included); show the fit summary.
res_interaction= sm.OLS(y_train,x_interaction_endog)
res_interaction.fit().summary()

According to the OLS regression result above, the Newspaper:Radio interaction is insignificant at alpha = 10%. We will compare the model's performance with and without it and then decide whether to include or exclude it.

With Newspaper:Radio

In [None]:
# Training-set error of the interaction model (Newspaper:Radio included).
y_pred2_interaction = regressor.predict(x_interactiontrain)

mae = metrics.mean_absolute_error(y_train, y_pred2_interaction)
mse = metrics.mean_squared_error(y_train, y_pred2_interaction)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Probability plot of the training residuals (Newspaper:Radio included).
stats.probplot(y_train-y_pred2_interaction, plot=plt)
plt.show()

In [None]:
# Test-set error of the interaction model (Newspaper:Radio included).
y_pred1_interaction = regressor.predict(x_interactiontest)

from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_pred1_interaction)
mse = metrics.mean_squared_error(y_test, y_pred1_interaction)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Probability plot of the test residuals (Newspaper:Radio included).
stats.probplot(y_test-y_pred1_interaction, plot=plt)
plt.show()

Without Newspaper:Radio

In [None]:
# Same feature table with the Newspaper:Radio interaction column removed.
interaction_df1= interaction_df.drop(columns='Newspaper:Radio')

In [None]:
# Fit OLS on the full data without the Newspaper:Radio term.
# statsmodels' OLS does not add an intercept by itself, so the constant column
# is added explicitly; without it the regression is forced through the origin.
interaction_model1 = linear_model.OLS(y, sm.add_constant(interaction_df1)).fit()

In [None]:
# Split the reduced feature table with the same test_size and random_state as
# the earlier splits; later cells pair these frames with y_train / y_test.
# NOTE(review): this presumes identical settings yield the same row partition — confirm.
x_interactiontrain1,x_interactiontest1= train_test_split(interaction_df1,test_size= 0.2,random_state= 101)

In [None]:
# Add a constant column to the reduced train/test features, then fit
# statsmodels OLS on the training rows and show the summary.
x_interaction_endog_wo= sm.add_constant(x_interactiontrain1)
x_interaction_endog1_wo= sm.add_constant(x_interactiontest1)
res_interaction1= sm.OLS(y_train,x_interaction_endog_wo)
res_interaction1.fit().summary()

In [None]:
# Refit the same `regressor` object on the reduced feature set; this replaces
# the coefficients from the previous fit.
regressor.fit(x_interactiontrain1,y_train)

In [None]:
# Training-set error of the model without Newspaper:Radio.
y_pred2_interaction1 = regressor.predict(x_interactiontrain1)

mae = metrics.mean_absolute_error(y_train, y_pred2_interaction1)
mse = metrics.mean_squared_error(y_train, y_pred2_interaction1)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Probability plot of the training residuals (Newspaper:Radio excluded).
stats.probplot(y_train-y_pred2_interaction1, plot=plt)
plt.show()

In [None]:
# Test-set error of the model without Newspaper:Radio.
y_pred1_interaction1 = regressor.predict(x_interactiontest1)

mae = metrics.mean_absolute_error(y_test, y_pred1_interaction1)
mse = metrics.mean_squared_error(y_test, y_pred1_interaction1)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Probability plot of the test residuals (Newspaper:Radio excluded).
stats.probplot(y_test-y_pred1_interaction1, plot=plt)
plt.show()