In [None]:
import pandas                   as pd
import numpy                    as np
import matplotlib.pyplot        as plt
import seaborn                  as sns
import statsmodels.api          as sm
import statsmodels.stats.api    as sms
from   statsmodels.compat       import lzip
from   statsmodels.stats        import diagnostic as diag


from  statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
import os

# Directory that holds the input data, and the name of the CSV file in it.
folder = r'F:\DSE-Capstone\2023-Oct\PGPDSE_Online-Jun23-G3\Data'
file = 'Amazon_Sales_data_2024_03_08_19_04_14.csv'

# Switch the working directory to the data folder so that `file` can be
# opened by its bare name.
os.chdir(folder)

In [None]:
# Load the CSV into a DataFrame, decoding the bytes as Latin-1.
df = pd.read_csv(file, encoding = 'Latin-1')

# DataFrame.info() writes its report directly to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Preview the first rows, transposed so that each column of df is shown as a row.
df.head().transpose()

In [None]:
# Every column except the target "Qty" is used as a predictor.
feature_names = list(df.columns)
feature_names.remove("Qty")

# Predictor matrix and target vector.
X = df[feature_names]
y = df["Qty"]

In [None]:
# Design matrix: the predictors plus an explicitly added intercept column.
X_ = sm.add_constant(X, has_constant='add')

# Fit ordinary least squares with y as the response and X_ as the regressors.
model = sm.OLS(endog=y, exog=X_).fit()

# In-sample predictions from the fitted model.
predictions = model.predict(exog=X_)

# Display the regression summary table as the cell output.
model.summary()

## Check assumptions
### 1) No outliers

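First, we obtain the externally studentized residuals using `get_influence()`. Observations whose absolute studentized residual exceeds about 3 are commonly treated as potential outliers.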

In [None]:
# Influence diagnostics of the fitted OLS model.
influence = model.get_influence()

# Externally studentized residuals taken from the influence object.
resid_student = influence.resid_studentized_external

print(resid_student)

### 2) Constant variance

Checking for heteroscedasticity: we use the Goldfeld-Quandt test to check whether the error variance is constant.

* Null Hypothesis: Error terms are homoscedastic
* Alternative Hypothesis: Error terms are heteroscedastic.

In [None]:
# Goldfeld-Quandt test on the model residuals and its regressor matrix.
# Only the first two returned values are labelled; zip() drops anything
# beyond the two names.
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(y=model.resid, x=model.model.exog)
list(zip(name, test))

The p-value is 0.001, which is less than 0.05; hence we reject the null hypothesis and conclude that the residuals do not have constant variance.

## Constant variance assumption is not satisfied.

### 3) No autocorrelation

#### Checking for autocorrelation: to test for the absence of autocorrelation we use the Ljung-Box test.

####  Null Hypothesis: Autocorrelation is absent.
#### Alternative Hypothesis: Autocorrelation is present.

In [None]:
# Ljung-Box test for autocorrelation in the residuals, using lags up to 1.
diag.acorr_ljungbox(x=model.resid, lags=1)

Since the p-value of 0.6365 > 0.05, we fail to reject the null hypothesis; there is no evidence of autocorrelation in the residuals.

## No auto correlation assumption is satisfied

### 4) Normality of the residuals

#### We use the Jarque-Bera test, as reported in the statsmodels OLS summary table, to check the normality of residuals.

Most tests for normality are based either on comparing the empirical cumulative distribution with the theoretical normal cumulative distribution (Kolmogorov-Smirnov, Anderson-Darling, Chi-Square) or empirical quantiles with the theoretical normal quantiles (Shapiro-Wilk).

The Jarque-Bera test is based on the sample skewness and sample kurtosis. 


#### Null Hypothesis: The residuals are normally distributed.

####  Alternative Hypothesis: The residuals are not normally distributed.

From the earlier summary table, Prob(JB) = 0.172.
Since the p-value of 0.172 > 0.05, we do not have evidence to reject the null hypothesis; the residuals are consistent with a normal distribution.

## Normality of the residuals assumption is satisfied.

 ### 5) Linearity

The residual vs fitted values plot is used to check for constant variance and linearity, and to identify potential outliers in the data.

In [None]:
# Residuals-vs-fitted diagnostic plot for the OLS model.
residuals  =  model.resid
# The x-axis must be the model's fitted values (it is labelled as such below);
# the observed target y was previously plotted here by mistake.
fitted     =  model.fittedvalues

# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally. residplot regresses y on x and plots what is left over; OLS
# residuals are uncorrelated with the fitted values when the model has an
# intercept, so the points shown are the model residuals themselves.
# The LOWESS curve highlights any remaining non-linear pattern.
ax         = sns.residplot(x = fitted, y = residuals, lowess = True, color = "g")

ax.set(xlabel='Fitted Value', ylabel='Residuals', title = 'Residual Vs Fitted values PLOT \n')
plt.show()

The residual plot indicates that the model's residuals are, to a great extent, centred around a mean of zero, which suggests that the linearity assumption holds.

### 6) No multi-collinearity

This assumption is not required for Lasso or Ridge regression techniques.

In [None]:
# variance_inflation_factor regresses column j on the remaining columns of
# the matrix it is given and does not add an intercept of its own. Without a
# constant column those auxiliary regressions are uncentred and the VIFs are
# wrong, so an intercept is added to the design matrix here.
vif_exog = sm.add_constant(X, has_constant = 'add')

# Column 0 is the added constant; it is skipped so that `vif` keeps exactly
# one entry per predictor, in the same order as the columns of X.
vif = [variance_inflation_factor(vif_exog.values, j) for j in range(1, vif_exog.shape[1])]

In [None]:
# Show each VIF next to the name of the feature it belongs to.
vif_pairs = list(zip(vif, feature_names))
print(vif_pairs)