### Import libraries

In [None]:
import sklearn

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Record the installed scikit-learn version so results can be tied to a library release.
print(sklearn.__version__)

### Advertising and Sales
Source: https://www.kaggle.com/sazid28/advertising.csv/downloads/advertising.csv.zip/1

Data Fields:

TV -- amount spent on TV ads, in dollars.   
radio -- amount spent on radio ads, in dollars.   
newspaper -- amount spent on newspaper ads, in dollars.   
sales -- sales, in dollars.

In [None]:
# Load the advertising dataset, using the CSV's first column as the row index.
advertising_data = pd.read_csv(
    '../input/advertising.csv/Advertising.csv',
    index_col=0,
)

# Preview the first few rows.
advertising_data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
advertising_data.shape

In [None]:
# Summary statistics for every numeric column.
advertising_data.describe()

### Visualizing relationships

In [None]:
# Scatter plot of newspaper ad spend against sales.
plt.figure(figsize=(8, 8))

plt.scatter(advertising_data['newspaper'], advertising_data['sales'], c='y')

# Label the axes so the figure can be read on its own.
plt.xlabel("Money spent on newspaper ads ($)")
plt.ylabel("Sales ($)")

plt.show()

In [None]:
# Scatter plot of radio ad spend against sales.
plt.figure(figsize=(8, 8))

plt.scatter(advertising_data['radio'], advertising_data['sales'], c='y')

# Label the axes so the figure can be read on its own.
plt.xlabel("Money spent on radio ads ($)")
plt.ylabel("Sales ($)")

plt.show()

In [None]:
# Scatter plot of TV ad spend against sales.
plt.figure(figsize=(8, 8))

plt.scatter(advertising_data['TV'], advertising_data['sales'], c='y')

# Label the axes so the figure can be read on its own.
plt.xlabel("Money spent on TV ads ($)")
plt.ylabel("Sales ($)")

plt.show()

In [None]:
# Pairwise Pearson correlation between all columns (Pearson is the pandas default).
advertising_data_correlation = advertising_data.corr(method='pearson')

advertising_data_correlation

In [None]:
import seaborn as sns

# Draw the correlation matrix as an annotated heatmap on an 8x8 inch figure.
fig, ax = plt.subplots(figsize=(8, 8))

sns.heatmap(data=advertising_data_correlation, annot=True, ax=ax)

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

##### Here we use a single predictor, TV ad spend, to predict sales

In [None]:
# Reshape both columns into 2-D (n_samples, 1) arrays: TV spend is the only
# feature and sales is the target.
X = np.reshape(advertising_data['TV'].values, (-1, 1))

Y = np.reshape(advertising_data['sales'].values, (-1, 1))

In [None]:
# Confirm both arrays are 2-D with one column each after the reshape.
X.shape, Y.shape

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=0,
)

In [None]:
# Shapes of the training features and targets.
x_train.shape, y_train.shape

In [None]:
# Shapes of the held-out test features and targets.
x_test.shape, y_test.shape

### Statsmodels Regression Result

In [None]:
import statsmodels.api as sm

# Prepend a constant column so the OLS fit includes an intercept term.
x_train_with_intercept = sm.add_constant(x_train)
stats_model = sm.OLS(endog=y_train, exog=x_train_with_intercept)
fit_model = stats_model.fit()

# Full regression report for the fitted model.
print(fit_model.summary())

### Simple Linear Regression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
#### Train a model to predict sales

In [None]:
# Fit ordinary least squares on the training split.
# The `normalize` argument is no longer passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where it raises a TypeError. For plain
# least squares with an intercept it only rescaled the features internally and
# mapped the coefficients back, so predictions are unaffected by dropping it.
linear_reg = LinearRegression().fit(x_train, y_train)

linear_reg

### R² score
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html

In [None]:
# `score` on a regressor returns R², here evaluated on the training split.
print("Training_score : " , linear_reg.score(x_train, y_train))

In [None]:
# Predict sales for the held-out test features.
y_pred = linear_reg.predict(x_test)

In [None]:
from sklearn.metrics import r2_score

# R² of the predictions against the true values of the held-out test split.
print("Testing_score : ", r2_score(y_test, y_pred))

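##### Define a function that computes the adjusted R² score: $\bar{R}^2 = 1 - (1 - R^2)\,\frac{n - 1}{n - p - 1}$, where $n$ is the number of samples and $p$ is the number of features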

In [None]:
def adjusted_r2(r_square, labels, features):
    """Return the adjusted R^2 score.

    Computes ``1 - (1 - r_square) * (n - 1) / (n - p - 1)`` where ``n`` is
    ``len(labels)`` and ``p`` is the number of feature columns.

    Parameters
    ----------
    r_square : float
        Unadjusted R^2 of the model.
    labels : sized
        Target values; only their count is used.
    features : array-like
        Feature matrix of shape ``(n, p)``. A 1-D input is treated as a
        single feature (``p = 1``).

    Returns
    -------
    float
        The adjusted R^2.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``, in which case the adjusted score is undefined.
    """
    n_samples = len(labels)

    # np.shape reads `.shape` when present (arrays, DataFrames) and otherwise
    # converts the input, so plain sequences work as well.
    feature_shape = np.shape(features)
    n_features = feature_shape[1] if len(feature_shape) > 1 else 1

    degrees_of_freedom = n_samples - n_features - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            "adjusted R^2 is undefined: need more samples than features + 1 "
            "(got %d samples and %d features)" % (n_samples, n_features)
        )

    adj_r_square = 1 - ((1 - r_square) * (n_samples - 1)) / degrees_of_freedom

    return adj_r_square

In [None]:
# Adjusted R² on the test split, penalising the plain R² by the feature count of x_test.
print("Adjusted_r2_score : ", adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

In [None]:
# Held-out observations (black points) with the model's predictions drawn as a blue line.
plt.figure(figsize=(8, 8))

plt.scatter(x_test, y_test, c='black')
plt.plot(x_test, y_pred, c='blue', linewidth=2)

plt.xlabel("Money spent on TV ads ($)")
plt.ylabel("Sales ($)")

plt.show()

### Multiple Linear Regression

##### This time we use multiple predictors (TV, radio and newspaper ad spend) to predict sales

In [None]:
# Every column except the target becomes a feature; `sales` is the target.
X = advertising_data.drop(columns='sales')

Y = advertising_data['sales']

In [None]:
# Preview the feature frame.
X.head(5)

In [None]:
# Preview the target series.
Y.head()

In [None]:
# Hold out 30% of the rows for testing. The seed is fixed so the split, and
# every score computed from it, is identical on each run; 0 matches the seed
# used for the single-feature split earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=0)

In [None]:
# Shapes of the training features and targets.
x_train.shape, y_train.shape

In [None]:
# Shapes of the held-out test features and targets.
x_test.shape, y_test.shape

### Statsmodels Regression Result

In [None]:
# Prepend a constant column so the OLS fit includes an intercept term.
x_train_with_intercept = sm.add_constant(x_train)
stats_model = sm.OLS(endog=y_train, exog=x_train_with_intercept)
fit_model = stats_model.fit()

# Full regression report for the multi-feature model.
print(fit_model.summary())

In [None]:
# Fit ordinary least squares on all training features.
# The `normalize` argument is no longer passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where it raises a TypeError. For plain
# least squares with an intercept it only rescaled the features internally and
# mapped the coefficients back, so predictions are unaffected by dropping it.
linear_reg = LinearRegression().fit(x_train, y_train)

linear_reg

In [None]:
# `score` on a regressor returns R², here evaluated on the training split.
print("Training_score : " , linear_reg.score(x_train, y_train))

In [None]:
# Pair each fitted coefficient with its feature name, ordered from smallest to largest.
predictors = x_train.columns
coef = pd.Series(data=linear_reg.coef_, index=predictors).sort_values()

print(coef)

In [None]:
# Predict sales for the held-out test features.
y_pred = linear_reg.predict(x_test)

In [None]:
# R² of the predictions against the true values of the held-out test split.
print("Testing_score : ", r2_score(y_test, y_pred))

In [None]:
# Adjusted R² on the test split, penalising the plain R² by the feature count of x_test.
print("Adjusted_r2_score : ", adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

In [None]:
# Overlay predicted and actual sales for each test row, in test-set order.
plt.figure(figsize = (15, 8))

plt.plot(y_pred, label='Predicted')
# `.values` drops the pandas index so both lines share the same 0..n-1 x positions.
plt.plot(y_test.values, label='Actual')

# Both series are plotted against their position in the test set.
plt.xlabel("Test sample (position in test set)")
plt.ylabel("Sales ($)")
plt.legend()
plt.show()