In [None]:
# Print a banner identifying this demo notebook and its author.
print("Welcome to Simple or Univariate Linear Regression Model Demo | Authored by Kunal Sharma")

# **Section A**
In this section we're learning about Exploratory Data Analysis and Data Preparation.

In [None]:
# Step 1
# Import all the important libraries like pandas, numpy, and visualization-related libraries for data preparation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 2
# Load the advertising dataset from CSV into a DataFrame.
# The path is relative to the notebook's working directory.
raw_data = pd.read_csv("../input/advertising-data/advertising.csv")

In [None]:
# Step 3
# First look at the data: display the first rows of the DataFrame.
raw_data.head()

In [None]:
# Show the (rows, columns) dimensions of the dataset.
raw_data.shape

In [None]:
# Step 4
# Describe the data: summary statistics for each column.
raw_data.describe()

In [None]:
# Step 5
# Check for unknown/null values: count the missing entries in each column.
# (raw_data.isnull() on its own would show the full element-wise boolean mask.)
raw_data.isnull().sum()
# If any value is null/NaN, there are several ways to handle it - https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/

In [None]:
# Step 6
# De-dup: count the rows that are exact duplicates of an earlier row.
# (raw_data.duplicated() on its own would show the per-row boolean flags.)
raw_data.duplicated().sum()
# To drop duplicates, use raw_data.drop_duplicates(), which returns a new DataFrame.

In [None]:
# NOTE(review): this cell only displays the scipy.stats.norm object and assigns nothing;
# it looks like leftover exploration. The same import is repeated in the next cell.
from scipy import stats
stats.norm

In [None]:
# Step 7
# Visualize the data
# Suppress warnings (note: this silences ALL warnings for the rest of the notebook)
import warnings
warnings.filterwarnings('ignore')

# Check whether the dependent variable (Sales) is normally distributed using
# a histogram with a fitted normal curve, followed by a normal probability plot.
from scipy import stats

sales = raw_data['Sales']

# sns.distplot is deprecated and scheduled for removal; sns.histplot plus an
# explicit normal fit reproduces the "histogram + KDE + fitted normal" figure.
hist_fig, hist_ax = plt.subplots()
sns.histplot(sales, stat='density', kde=True, ax=hist_ax)
mu, sigma = stats.norm.fit(sales)  # maximum-likelihood mean and standard deviation
support = np.linspace(sales.min(), sales.max(), 200)
hist_ax.plot(support, stats.norm.pdf(support, mu, sigma), color='black', label='Normal fit')
hist_ax.legend()

# Normal probability (Q-Q) plot on its own figure.
fig = plt.figure()
res = stats.probplot(sales, plot=plt)

Conclusion

The output variable here is approximately normally distributed. If it were not, we could transform it (for example with a log transform) to bring it closer to a normal distribution:
```
raw_data['Sales'] = np.log(raw_data['Sales'])

sns.distplot(raw_data['Sales'], fit=stats.norm)

fig = plt.figure()

res = stats.probplot(raw_data['Sales'], plot=plt)
```

In [None]:
# Step 8
# Outlier analysis: one horizontal boxplot per advertising channel.
fig, axs = plt.subplots(3, figsize = (5,5))
# Pass the column explicitly as `x=`. Newer seaborn versions treat the first
# positional argument as `data`, which changes the orientation (or warns),
# so the keyword keeps the plots horizontal across versions.
plt1, plt2, plt3 = (
    sns.boxplot(x = raw_data[column], ax = ax)
    for column, ax in zip(['TV', 'Newspaper', 'Radio'], axs)
)
plt.tight_layout()

In [None]:
# There are outliers in case of Newspaper, but those cases are not significant. The max value at Newspaper 114 seems to be an outlier.

In [None]:
# Scatter plots of Sales against each advertising channel, side by side.
predictor_columns = ['TV', 'Newspaper', 'Radio']
sns.pairplot(
    raw_data,
    x_vars=predictor_columns,
    y_vars='Sales',
    kind='scatter',
)
plt.show()

In [None]:
# A clear linear relationship can be seen between Sales and TV advertising

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlations = raw_data.corr()
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# The above correlation matrix confirms the relationship between Sales and TV advertising. This also means that a univariate/simple linear regression model can be used to further predict outcomes here.

# **Section B**
In this section we're going to build a Linear Regression Model and Train it for making predictions.

In [None]:
# Step 1
# Import sklearn to split the data
from sklearn.model_selection import train_test_split

In [None]:
# Step 2
# Build the feature (TV) and target (Sales) arrays, then split them 70/30 into
# training and test sets with a fixed seed.
# Both arrays are shaped as a single column (n, 1); see
# https://www.geeksforgeeks.org/python-linear-regression-using-sklearn/ for the error this avoids.
X = raw_data[["TV"]].to_numpy()
y = raw_data[["Sales"]].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Step 3
# Create the linear regression estimator and fit it on the training split.
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X_train, y_train)
# Display the fitted estimator as the cell output.
model

In [None]:
# Checking model fit on the held-out test data
# More to learn from https://scikit-learn.org/stable/modules/model_evaluation.html
print(model.score(X_test, y_test))
# For a regressor, score() is the coefficient of determination R^2 (see the docs linked above).
# Higher means a better fit on this data; compare against the training score to spot overfitting.

In [None]:
# Step 4
# Predict Sales for every TV value in the test split.
model_predictions = model.predict(X_test)

In [None]:
# Plot predicted Sales against the actual test values.
fig, ax = plt.subplots()
ax.scatter(x=y_test, y=model_predictions)
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')

In [None]:
# Step 5
# Score the test-set predictions with several regression metrics.
# Reference - https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score

# Printed in this order: MAE, MSE, explained variance, R^2.
regression_metrics = (
    mean_absolute_error,
    mean_squared_error,
    explained_variance_score,
    r2_score,
)
for metric in regression_metrics:
    print(metric(y_test, model_predictions))

In [None]:
# Training split: observed points in red, the model's fitted line in blue.
fig, ax = plt.subplots()
ax.scatter(X_train, y_train, color='red')
ax.plot(X_train, model.predict(X_train), color='blue')
ax.set_title("Visuals for Training Dataset with model fit")
ax.set_xlabel("TV")
ax.set_ylabel("Sales")
plt.show()

In [None]:
# Test split: observed points in red, the model's predictions in blue.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='red')
ax.plot(X_test, model.predict(X_test), color='blue')
ax.set_title("Visuals for Test Dataset with model fit")
ax.set_xlabel("TV")
ax.set_ylabel("Sales")
plt.show()

In [None]:
# Display the first rows of the dataset again for reference.
raw_data.head()

In [None]:
# Step 6
# Spot-check the model on a single TV value; predict() needs a 2-D (1, 1) input.
model.predict(np.array([[151]]))

In [None]:
# Predicted Sales for a single TV value of 155.
model.predict(np.array([[155]]))

In [None]:
# Predicted Sales for a single TV value of 160.
model.predict(np.array([[160]]))

In [None]:
# Print the fitted slope (coef_) and intercept, one per line.
print(model.coef_, model.intercept_, sep="\n")

In [None]:
# Step 7
# Review the model from a statistical perspective with statsmodels.
from statsmodels.api import OLS, add_constant

# statsmodels' OLS does NOT add an intercept on its own. Without add_constant
# the regression is forced through the origin and the reported R-squared is
# the uncentered one, which does not describe the model fitted above.
# Fit on the training split (the data the sklearn model was trained on) so the
# coefficients in the summary match model.coef_ and model.intercept_.
ols_results = OLS(y_train, add_constant(X_train)).fit()
ols_results.summary()

**Takeaways from Above Summary**

1) R-Squared is quite high. https://www.investopedia.com/terms/r/r-squared.asp

2) The p-value reported in the summary, 1.89e-33, is scientific notation for 1.89 × 10⁻³³ (the `e` denotes a power of 10, not Euler's number). This is far below the 0.05 significance level, so the result is statistically significant.

3) F-statistic is quite large. Meaning that the model fit is statistically significant, and the explained variance isn't purely by chance.

# **Conclusion**
The univariate regression model looks as follows.

Sales = 0.05473199 × TV + 7.14382225

y = bx + a

In [None]:
# Distribution of the observed Sales values across the whole dataset.
# Plot only the Sales column: passing the entire DataFrame mixes every column
# into one plot. sns.histplot replaces the deprecated sns.distplot;
# stat='density' with kde=True keeps the normalized histogram + KDE look.
ax = sns.histplot(raw_data['Sales'], bins = 50, kde = True, stat = 'density')
ax.set(xlabel = 'Sales', title = 'Observed Sales (full dataset)');

In [None]:
# Distribution of the actual Sales values in the test split.
# sns.histplot replaces the deprecated sns.distplot. y_test is a single-column
# 2-D array, so flatten it to 1-D; otherwise histplot treats it as wide-form data.
ax = sns.histplot(np.ravel(y_test), bins = 50, kde = True, stat = 'density')
ax.set(xlabel = 'Sales', title = 'Actual Sales (test set)');

In [None]:
# Distribution of the model's predicted Sales for the test split.
# sns.histplot replaces the deprecated sns.distplot. The predictions are a
# single-column 2-D array, so flatten them to 1-D before plotting.
ax = sns.histplot(np.ravel(model_predictions), bins = 50, kde = True, stat = 'density')
ax.set(xlabel = 'Sales', title = 'Predicted Sales (test set)');

In [None]:
# Joint plot of TV against Sales for the whole dataset.
sns.jointplot(x = 'TV', y = 'Sales', data = raw_data)
plt.show()