# Week 12 Assignment

### Importing Data

In [1]:
import numpy as np
from scipy import stats
from scipy import special
from pandas import DataFrame

In [2]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
# Make 'plotly_white' the default template for Plotly figures created after this point
pio.templates.default = 'plotly_white'

In [5]:
pip install patsy

Collecting patsy
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Installing collected packages: patsy
Successfully installed patsy-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.2 kB)
Downloading statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: statsmodels
Successfully installed statsmodels-0.14.4
Note: you may need to restart the kernel to use updated packages.


In [8]:
from patsy import dmatrices
import statsmodels.api as sm

In [9]:
# Fix the global NumPy seed so the simulated data set is reproducible
np.random.seed(12)

# Number of simulated observations
n = 100

# Predictors: WCC is drawn from a normal distribution (mean 15, SD 5) and rounded
# to whole numbers; CRP is twice the WCC plus rounded normal noise (mean 0, SD 10).
# The three draws below must stay in this order to reproduce the same values.
wcc = np.random.normal(15, 5, n).round(0)
crp_noise = np.random.normal(0, 10, n).round(0)
crp = 2 * wcc + crp_noise

# Response: Lipase is WCC + CRP plus rounded normal noise (mean 2, SD 10)
lipase_noise = np.random.normal(2, 10, n).round(0)
lipase = wcc + crp + lipase_noise

# Collect the three simulated variables in a pandas DataFrame object
df = DataFrame(dict(WCC=wcc, CRP=crp, Lipase=lipase))

In [11]:
# Pairwise scatter plots of every column in df (WCC, CRP, Lipase).
# NOTE: rendering the figure inline requires the `nbformat` package (>=4.2.0)
# to be installed in the kernel's environment.
scatter_fig = px.scatter_matrix(df, title='Scatter plot matrix')
scatter_fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [12]:
# Build the target vector (y) and the design matrix (X) from a patsy formula:
# Lipase is the response, WCC and CRP are the predictors
formula = 'Lipase ~ WCC + CRP'
y, X = dmatrices(formula, data=df)

### 1. Multiple Linear Regression

In [13]:
# Fit a multiple linear regression (ordinary least squares) with WCC and CRP
# as predictors of Lipase; y is the target vector and X the design matrix
model = sm.OLS(endog=y, exog=X).fit()

# Show the full regression summary table
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Lipase   R-squared:                       0.764
Model:                            OLS   Adj. R-squared:                  0.759
Method:                 Least Squares   F-statistic:                     157.3
Date:                Mon, 21 Apr 2025   Prob (F-statistic):           3.61e-31
Time:                        23:11:14   Log-Likelihood:                -374.15
No. Observations:                 100   AIC:                             754.3
Df Residuals:                      97   BIC:                             762.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.3741      3.013      0.788      0.4

### 2. Analyzing F Statistic

In [None]:
# Read the overall F statistic and its p-value off the fitted model
f_statistic, f_p_value = model.fvalue, model.f_pvalue

# Print both values, one per line
for label, value in (("F-statistic", f_statistic), ("F p-value", f_p_value)):
    print(f"{label}: {value}")

F-statistic: 157.28700016186937
F p-value: 3.6116396316538945e-31


#### The F-statistic is large (157.29) and its p-value (3.61e-31) is essentially 0. At the 5% level of significance we therefore reject the null hypothesis that the coefficients of WCC and CRP are both zero: there is strong evidence that at least one of WCC and CRP is a useful linear predictor of the lipase levels.

### 3. Individual Coefficients & P Values

In [20]:
# Print the fitted coefficients of the two predictors (WCC and CRP) and their
# p-values. Position 0 of params/pvalues is the intercept, so it is discarded.
_, wcc_coef, crp_coef = model.params
_, wcc_pvalue, crp_pvalue = model.pvalues

print(f"Coefficient for WCC: {wcc_coef}")
print(f"Coefficient for CRP: {crp_coef}")
print(f"p-value for WCC: {wcc_pvalue}")
print(f"p-value for CRP: {crp_pvalue}")


Coefficient for WCC: 0.5426102095697339
Coefficient for CRP: 1.1826836477074805
p-value for WCC: 0.04600098849856741
p-value for CRP: 3.6245638880913704e-20


In [26]:
# Test each predictor's coefficient against the 0.05 significance level:
# a p-value below alpha means the null hypothesis is rejected.
alpha = 0.05

# (predictor name, position of its coefficient in model.pvalues)
for predictor, position in (("WCC", 1), ("CRP", 2)):
    is_significant = model.pvalues[position] < alpha
    if is_significant:
        print(f"Reject the null hypothesis: {predictor} is significantly associated with Lipase.")
    else:
        print(f"Fail to reject the null hypothesis: {predictor} is not significantly associated with Lipase.")

Reject the null hypothesis: WCC is significantly associated with Lipase.
Reject the null hypothesis: CRP is significantly associated with Lipase.


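#### Both p-values (0.046 for WCC and 3.62e-20 for CRP) are below 0.05, so at the 5% significance level both coefficients differ significantly from zero: WCC and CRP are each linearly associated with Lipase when the other predictor is held fixed. Note that the evidence for WCC is borderline, since its p-value is only just below 0.05.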

### 4. R^2 Value

In [23]:
# What is the r-squared value of the model?
r_squared = model.rsquared
print(f"R-squared: {r_squared}")

R-squared: 0.7643194178356721


The R-squared value is 0.7643, which means that the model explains 76.43% of the variance in the Lipase levels. This indicates a strong linear relationship between the predictors (WCC and CRP) and the response variable (Lipase).