In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy as sc
import warnings
warnings.filterwarnings('ignore')
file_path = "D:/MBA/Term/MBA Term 4/BAE/Data/SalesData.xlsx"
data= pd.read_excel(file_path)

print(data.head())


   sales  price  advert
0   73.2   5.69     1.3
1   71.8   6.49     2.9
2   62.4   5.63     0.8
3   67.4   6.22     0.7
4   89.3   5.02     1.5


In [13]:

# Define the dependent variable
Y = data['sales']

# Define the independent variables and add a constant term
X = data[['price', 'advert']]
X = sm.add_constant(X)

# Fit the model
Model = sm.OLS(Y, X).fit()

# Print the summary
print(Model.summary())


                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.448
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     29.25
Date:                Wed, 21 Aug 2024   Prob (F-statistic):           5.04e-10
Time:                        22:56:11   Log-Likelihood:                -223.87
No. Observations:                  75   AIC:                             453.7
Df Residuals:                      72   BIC:                             460.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        118.9136      6.352     18.722      0.0

In [10]:
t_test_p= Model.t_test('price=0')
t_test_a= Model.t_test('advert=0')
print(t_test_p)
print(t_test_a)

                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -7.9079      1.096     -7.215      0.000     -10.093      -5.723
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0             1.8626      0.683      2.726      0.008       0.501       3.225


In [14]:
from scipy.stats import t
df=72
alpha=0.05
t_critical=t.ppf(1-alpha/2,df)
print(t_critical)

1.9934635662785827


In [12]:
t_stat_a = t_test_a.statistic
print(t_stat_a)

[[2.72628309]]


In [15]:
#Testing Hypothesis for H0

if abs(t_stat_a) > abs(t_critical):
    print("H0 is rejected under 5% level of significance")
else:
    print("H0 is not rejected under 5% level of significance")

H0 is rejected under 5% level of significance


In [17]:
#Extracting calculated t_statistic for the coefficient of advertisment variable from the model
t_stat_p = t_test_p.statistic
print(t_stat_p)

[[-7.21524149]]


In [18]:
if abs(t_stat_p) > abs(t_critical):
    print("H0 is rejected under 5% level of significance")
else:
    print("H0 is not rejected under 5% level of significance")

H0 is rejected under 5% level of significance


In [22]:
F_test= Model.f_test('price=0, advert=0')
print(F_test)

<F test: F=29.24785947967355, p=5.040856696802471e-10, df_denom=72, df_num=2>


In [23]:
F_stat=F_test.statistic
print(F_stat)

29.24785947967355


In [20]:
from scipy.stats import f

df_num = 2
df_denom = 72
alpha = 0.05
# Calculate the critical value for the F-distribution
F_critical = f.ppf(1 - alpha/2, df_num, df_denom)

print(F_critical)


3.884501393488646


In [24]:
#Testing joint hypothesis for H0 under 5% level of significance

if abs(F_stat) > abs(F_critical):
    print("H0 is rejected under 5% level of significance")
else:
    print("H0 is not rejected under 5% level of significance")

H0 is rejected under 5% level of significance


In [26]:
#Computing fitted values. Calculating RSS ESS TSS
Y_hat = Model.fittedvalues
Y_bar = np.mean(Y)

RSS = np.sum((Y-Y_hat)**2)
ESS = np.sum((Y_hat-Y_bar)**2)
TSS = np.sum((Y-Y_bar)**2)

In [None]:
print(RSS)
print(ESS)
print(RSS+ESS)
print(TSS)

R-squared= ESS/TSS= 1-RSS/TSS

In [28]:
#Extracting R-square from summary statistics
R_squared = Model.rsquared

#R-square using ESS and TSS
R_squared_e = ESS/TSS

#R-square using RSS and TSS
R_squared_r = 1-RSS/TSS

In [31]:
print(R_squared_r)
print(R_squared_e)
print(R_squared)

0.4482577622149436
0.44825776221494323
0.44825776221494373


Calculating Variance Inflating Factor(VIF)

VIF is a measure of regression multicollinearity i.e. dependency of one independent variable on another in the regression system.

VIF = 1: No correlation

1<VIF<5: Low multicollinearity

VIF>5: High multicollinearity


In [32]:
#Computing VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values,i)
                   for i in range(X.shape[1])]
print(vif_data)

  feature         VIF
0   const  126.737084
1   price    1.000696
2  advert    1.000696


In [33]:
#Prepairing for BP test

Mat_X = Model.model.exog
Y_resid = Model.resid

In [34]:
#Performing and presenting BP test of heteroschedasticity

from statsmodels.stats.diagnostic import het_breuschpagan

BP = het_breuschpagan(Y_resid,Mat_X)

BP_labels = {
    'LM Statistic':BP[0],
    'LM p-value': BP[1],
    'F statistic': BP[2],
    'F p-value': BP[3]
}

for label, value in BP_labels.items():
    print(f"{label}:{value}")

LM Statistic:2.5722229762860813
LM p-value:0.276343260613086
F statistic:1.278515383897261
F p-value:0.28469519043223634
