In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Problem Statement ##

Build a model which predicts sales based on the money spent on different platforms for marketing.

## Data ##

Use the advertising dataset given in ISLR and analyse the relationship between 'TV advertising' and 'sales' using a simple linear regression model.

In this notebook, we'll build a linear regression model to predict Sales using an appropriate predictor variable.

In [None]:
import warnings
# Ignore every warning category from this point on.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries used below will not be shown either — consider narrowing it.
warnings.filterwarnings('ignore')

# Import the numpy and pandas package

import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Load the advertising dataset. pd.read_csv already returns a DataFrame,
# so wrapping the result in pd.DataFrame(...) is unnecessary.
advertising = pd.read_csv("/kaggle/input/advertising-dataset/advertising.csv")

# Preview the first rows to confirm the file was parsed as expected.
advertising.head()

## Data Exploration ##

In [None]:
# Dimensions of the dataset as (rows, columns).
advertising.shape

In [None]:
# Column names, dtypes and non-null counts for each column.
advertising.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
advertising.describe()

In [None]:
# Number of missing values in each column.
advertising.isna().sum()

In [None]:
# Scatter Sales against each advertising channel, one panel per channel.
channel_columns = ['TV', 'Newspaper', 'Radio']
sns.pairplot(
    advertising,
    x_vars=channel_columns,
    y_vars='Sales',
    height=4,
    aspect=1,
    kind='scatter',
)
plt.show()

In [None]:
# Pairwise correlations between all columns, shown as an annotated heatmap.
correlations = advertising.corr()
sns.heatmap(correlations, annot=True, cmap="YlGnBu")
plt.show()

As is visible from the pairplot and the heatmap, the variable TV seems to be most correlated with Sales. So let's go ahead and perform simple linear regression using TV as our feature variable.

## Model Building ##

## Performing Simple Linear Regression ##

Equation of linear regression

$$y = c + m_1 x_1 + m_2 x_2 + \dots + m_n x_n$$

y  is the response/target/dependent variable

c  is the intercept

m1  is the coefficient for the first feature

mn  is the coefficient for the nth feature

In our case:

y(Sales) =  c +  m1  * TV 

The  m  values are called the model coefficients or model parameters.

Generic steps in model building using scikit-learn

We first assign the feature variable, TV in this case, to the variable X and the response variable, Sales, to the variable y.


In [None]:
# Predictor (the TV column) and response (the Sales column), each as a Series.
X, y = advertising['TV'], advertising['Sales']

## Train-Test Split ##

You now need to split the data into training and testing sets. You'll do this by importing train_test_split from the sklearn.model_selection library. It is usually good practice to keep 70% of the data in your train dataset and the remaining 30% in your test dataset.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.3, 'random_state': 100}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Report the size of the training and test feature sets.
for split in (X_train, X_test):
    print(split.shape)

In [None]:
# scikit-learn expects a 2-D feature matrix, so turn each 1-D feature vector
# into a single-column array of shape (n_samples, 1).
# np.asarray accepts both a pandas Series and an ndarray, so this cell can be
# re-run safely; the previous `.values` access failed on a second run because
# X_train / X_test had already been replaced by ndarrays.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

In [None]:
# Confirm both feature arrays are now 2-D with a single column.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# The test features and test targets should have the same number of rows.
for held_out in (X_test, y_test):
    print(held_out.shape)

## Build a Model ##

In [None]:
from sklearn.linear_model import LinearRegression

# Create the estimator, then learn the intercept and slope from the training data.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)


In [None]:
reg.score(X_train,y_train)  # R^2 (coefficient of determination) on the training data; 1.0 is a perfect fit

In [None]:
# Fitted slope(s): one coefficient per feature column (a single value here, for TV).
reg.coef_

In [None]:
# Fitted intercept: the model's predicted Sales when TV is 0.
reg.intercept_

Y(Sales) = 6.9486 + 0.05454 * X(TV)


In [None]:
# Scatter the training points and overlay the best-fit line.
# The line comes from the model's own predictions instead of hand-copied,
# rounded coefficients, so it always matches the current fit exactly.
plt.scatter(X_train, y_train)
plt.plot(X_train, reg.predict(X_train), 'r')
plt.xlabel('TV')
plt.ylabel('Sales')
plt.title('Training data with best-fit line')
plt.show()

## Model Evaluation

## Residual analysis

To validate assumptions of the model, and hence the reliability for inference

Distribution of the error terms

We need to check whether the error terms are normally distributed (which is, in fact, one of the major assumptions of linear regression). Let us plot a histogram of the error terms and see what it looks like.


In [None]:
# Predictions on the training data and the residuals (true value minus predicted value).
y_train_pred = reg.predict(X_train)
res = y_train - y_train_pred

# Show the first five predictions, true values and residuals, in that order.
for preview in (y_train_pred[0:5], y_train[0:5], res[0:5]):
    print(preview)

In [None]:
# Histogram of the training residuals with a kernel density estimate on top.
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the equivalent density-scaled histogram plus KDE curve.
fig, ax = plt.subplots()
sns.histplot(res, bins=15, kde=True, stat='density', ax=ax)
fig.suptitle('Error Terms', fontsize=15)                  # Plot heading
ax.set_xlabel('y_train - y_train_pred', fontsize=15)      # X-label
plt.show()

###  The residuals are approximately normally distributed with a mean of 0. All good!

In [None]:
# Predict Sales for the held-out test features.
y_pred = reg.predict(X_test)

# Compare the first five true values with the first five predictions.
for preview in (y_test[0:5], y_pred[0:5]):
    print(preview)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# MSE  = mean of the squared differences (y_test_i - y_pred_i)^2 over the test rows
# RMSE = square root of MSE

## Root Mean Square Error


In [None]:
# mean_squared_error gives the MSE; its square root is the RMSE.
mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)

In [None]:
# R-squared on the test set: the share of the variability in Sales that the model explains.
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
r_squared

In [None]:
# Scatter the test points and overlay the fitted regression line.
# The line comes from the model's own predictions instead of hand-copied,
# rounded coefficients, so it always matches the current fit exactly.
plt.scatter(X_test, y_test)
plt.plot(X_test, reg.predict(X_test), 'r')
plt.xlabel('TV')
plt.ylabel('Sales')
plt.title('Test data with fitted line')
plt.show()