In [None]:
# data operations
import numpy as np
import pandas as pd

import os

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import axes3d

# For regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale
#import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf  #Provides a formula-based interface

%matplotlib inline
#plt.style.use('seaborn-white')

# Locate the advertising CSV under "<home>/datasets/".
# expanduser("~") is used instead of os.environ['HOME'] because HOME is not
# set on every platform (e.g. Windows) and the lookup would raise KeyError.
# Joining with "" keeps the trailing separator these path strings carried.
homedir = os.path.join(os.path.expanduser("~"), "")
datapath = os.path.join(homedir, "datasets", "")
advtdatafile = os.path.join(datapath, "Advertising.csv")

### Create pandas DataFrame

In [None]:
# Read the advertising CSV and keep a separate snapshot of the raw frame.
# .copy() makes dataset_org an independent object rather than a second name
# for `dataset`, so in-place edits to one can never show up in the other.
dataset = pd.read_csv(advtdatafile)
dataset_org = dataset.copy()

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts
dataset.info()

In [None]:
# Show the column labels exactly as read from the CSV
dataset.columns

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
dataset.describe()

In [None]:
# Count the missing entries in every column (all zeros means no nulls)
dataset.isna().sum()

### Column selection
* Select all rows
* Select relevant columns - TV, Radio, Newspaper, Sales

In [None]:
# Build the modelling frame from the preserved original, selecting columns by
# name. The earlier positional slice of `dataset` re-sliced its own output, so
# running the cell a second time silently dropped the TV column.
dataset = dataset_org.loc[:, ["TV", "Radio", "Newspaper", "Sales"]].copy()
# Preview only the first rows instead of dumping the whole frame
dataset.head()

### Quick check basics - random data generation

* Random normal distribution (1000 floats)

In [None]:
# Seed NumPy's global generator so this draw (and later np.random calls in the
# notebook) repeat exactly on Restart Kernel -> Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 1000 floats drawn from the standard normal distribution
rnd_normalf = np.random.randn(1000)
print(type(rnd_normalf))

* Choice of 5 samples from previous random samples of 1000

In [None]:
# Number of values to pick; picks are made with replacement, so a value of
# rnd_normalf may appear more than once.
replace_samples = 5
sample_data = np.random.choice(a=rnd_normalf, size=replace_samples, replace=True)
print(f"Sample data -> {sample_data}")

In [None]:
import random

# Draw num_smpls distinct integers from 1..999.
# A dedicated, seeded generator makes the sample identical on every run
# without touching the state of the module-level `random` functions.
num_smpls = 10
sample_rng = random.Random(42)
rnd_sample = sample_rng.sample(range(1, 1000), num_smpls)
print(type(rnd_sample))

#### Convert `python list` to `numpy ndarray`
* Python `List` to `numpy array` -> 1D array
* Add pad elements at the end of the array based on columns
* Reshape the array to suitable number of rows and columns
* There could be a better method - but this is one method


In [None]:
def padding_needed(length, num_cols):
    """Return how many pad elements make `length` a multiple of `num_cols`.

    Returns 0 when `length` already divides evenly.

    Raises:
        ValueError: if `num_cols` is smaller than 1.
    """
    if num_cols < 1:
        raise ValueError(f"num_cols must be >= 1, got {num_cols}")
    # (-n) % k is the distance from n up to the next multiple of k
    return (-length) % num_cols


# list
numlist = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 10 ]
print(f"numlist -> {numlist}, size -> {len(numlist)}")
num_cols = 2
# number of trailing elements required to fill the last row
pad = padding_needed(len(numlist), num_cols)
print(f"pad numbers -> {pad}")
# convert list to numpy array
numarr = np.array(numlist)
print(f"type numarr -> {type(numarr)}, shape -> {numarr.shape}")
print(f"numarr -> {numarr}")
# pad with the computed count (the remainder is only the right count when num_cols == 2)
numarr_pad = np.pad(numarr, pad_width=(0, pad), mode='constant', constant_values=0)
# reshape the numpy array to num_cols columns; the row count is inferred
numrshp = numarr_pad.reshape(-1, num_cols)
print(f"type numrshp -> {type(numrshp)}, shape -> {numrshp.shape}")

### Create Pandas DataFrame
* Convert the random sample list to numpy array
* Convert the numpy array to a pandas DataFrame

In [None]:
# Lay the sampled integers out two per row, then wrap the array in a DataFrame
rnd_pairs = np.asarray(rnd_sample).reshape(-1, 2)
rnd_df = pd.DataFrame(rnd_pairs)
print(f"rnd_df -> {rnd_df}")

### seaborn pairplot()
* Visualizing pairwise relationships between multiple variables in a dataset. 
* Creates a grid of subplots where each numeric variable is shared across the x and y axes
* Allows quick exploratory data analysis 

#### KDE plot - `Kernel Density Estimate`

In [None]:
# seaborn is already imported as `sns` in the notebook's first cell, so it is
# not imported again here.
# Toggle: set cell_enabled to False to skip drawing this figure.
cell_enabled = True
if cell_enabled:
    # Pairwise plots of the random frame, kernel density estimate on the diagonal
    sns.pairplot(rnd_df, diag_kind='kde')

#### Hist plot - `Histogram`

In [None]:
# seaborn is already imported as `sns` in the notebook's first cell, so it is
# not imported again here.
# Toggle: set cell_enabled to False to skip drawing this figure.
cell_enabled = True
if cell_enabled:
    # Pairwise plots of the padded/reshaped array, histograms on the diagonal
    tmpdf = pd.DataFrame(numrshp)
    sns.pairplot(tmpdf, diag_kind='hist')

In [None]:
# Check for linearity: the pairplot shows how every pair of columns in
# `dataset` relates, with a kernel density estimate per column on the diagonal.
# (seaborn is already imported as `sns` in the notebook's first cell.)
sns.pairplot(dataset, diag_kind='kde')

### Correlation Coefficients - `DataFrame.corr()`
* Computes the pairwise correlation coefficient between all numeric 
columns in a DataFrame.
* Returns a DataFrame known as the `Correlation Matrix`.
* Quantifies the strength and direction of the linear relationship between variables. 
* Each coefficient ranges from -1 to 1:
  - `1`: perfect positive linear relationship
  - `-1`: perfect negative linear relationship
  - `0`: no linear relationship


In [None]:
# Pairwise correlation matrix of the columns in `dataset`
# (DataFrame.corr uses the Pearson coefficient unless told otherwise)
corr = dataset.corr()
corr

### Correlation Coefficients `Heatmap`

In [None]:
# Annotated heatmap of the correlation matrix; colour scale pinned to [-1, 1]
sns.heatmap(corr, vmax=1, vmin=-1, annot=True, cmap='GnBu')

## Using the STATS Models

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

### OLS - `Ordinary Least Squares`
* Core functionality is to estimate the parameters of a linear 
regression model by minimizing the sum of the squared differences 
(residuals) between the observed and predicted values.
* Returns detailed statistical output.

In [None]:
# Regress Sales on TV and Radio spend through the formula interface
tvradio_formula = 'Sales ~ TV + Radio'
sales_on_tvradio = smf.ols(formula=tvradio_formula, data=dataset).fit()
# Second summary table: per-term coefficient estimates and their statistics
sales_on_tvradio.summary().tables[1]

### OLS return - Statistical Data
* Coefficient Estimation
* Statistical significance - `p-value, t-value`
* Model fit metrics - `R-squared`
* Other parameters

In [None]:
# Full regression report for the fitted model (all summary tables)
sales_on_tvradio.summary()

### Add Newspaper to the set of independent variables

In [None]:
# Add Newspaper to the set of independent variables
sales_on_tvradio = smf.ols('Sales ~ TV + Radio + Newspaper', dataset).fit()
sales_on_tvradio.summary().tables[1]

### `Model` Building with `{Training, Test}` Datasets

#### Column Split original data
* `Y`: dependent variable `{Sales}`
* `X`: independent variable `{TV, Radio, Newspaper}`

In [None]:
# X: independent variables, Y: dependent variable.
# The features are picked by name rather than by dropping columns, so a
# column added to dataset_org elsewhere can never slip into X when this cell
# is re-run.
X = dataset_org[["TV", "Radio", "Newspaper"]]
Y = dataset_org["Sales"]

#### Row Split original data {Training, Test}

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
Xtrn, Xtst, Ytrn, Ytst = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

#### LinearRegression

In [None]:
# Fit a linear regression on the training split; fit() returns the estimator
# itself, so construction and fitting can be chained.
reg_model = LinearRegression().fit(Xtrn, Ytrn)
reg_model

#### Result Interpretation

* Print the coefficients

In [None]:
# coef_ holds one value per training column, in column order
for col_name, coef_value in zip(Xtrn.columns, reg_model.coef_):
    print(f"Coefficent of {col_name} = {coef_value}")

* Create DataFrame and Print

In [None]:
print(f"{type(reg_model.coef_)}")
# One row per feature, labelled with the training column names
feature_names = list(Xtrn.columns)
model_coeff_df = pd.DataFrame(reg_model.coef_, index=feature_names)
print(f"\nCoefficients ->\n{model_coeff_df.to_string(header=False)}")
print(f"\nIntercept -> {reg_model.intercept_}")

### Prediction using Test Data
* Test data features {TV, Radio, Newspaper}
* Coefficients as calculated by the LR Model
* Sales = Intercept + (TV coef * TV) + (Radio coef * Radio) + (Newspaper coef * Newspaper)

In [None]:
# Predicted Sales for the held-out test features
Ypred = reg_model.predict(Xtst)

#### R2 Score 
* Coefficient of Determination 
* Represents the proportion of variance (of y) that has been explained by the independent variables in the model

In [None]:
# score() evaluates the fitted model on the test split; for a regressor this
# is the coefficient of determination (R^2)
score = reg_model.score(Xtst, Ytst)
print(f"R2 score for the LR model -> {score}")

## MLR with `Interaction`
* Interaction - the effect of one independent variable on the response depends on the level of another independent variable

In [None]:
# Create an Interaction effect between TV and Radio
data = dataset_org
data["TV_Radio_iact"] = data["TV"] * data["Radio"] 
#data

In [None]:
# The interaction column exists on `data`, not on `dataset` (which only holds
# TV, Radio, Newspaper and Sales), so indexing `dataset` here raised KeyError.
X = data[["TV", "Radio", "Newspaper", "TV_Radio_iact"]]
# Preview only the first rows
X.head()

In [None]:
# Fit Sales on TV, Radio and their interaction term.
# The model must be fitted on `data`, the frame that carries TV_Radio_iact;
# `dataset` has no such column, so the formula could not be evaluated on it.
sales_ols = smf.ols('Sales ~ TV + Radio + TV_Radio_iact', data).fit()
# Second summary table: per-term coefficient estimates and their statistics
sales_ols.summary().tables[1]

In [None]:
# Statistical inference metrics of the interaction model
p_value_F = sales_ols.f_pvalue            # p-value of the model's F-statistic
R_squared = sales_ols.rsquared            # R-squared
R_squared_adj = sales_ols.rsquared_adj    # R-squared adjusted for the number of predictors

# Report each metric as "<label> <value>"
for label, value in (
    ("p-value:", p_value_F),
    ("R-squared:", R_squared),
    ("Adjusted R-squared:", R_squared_adj),
):
    print(label, value)