idea:

use generalized linear model to predict gap days as a function of drug, using total drug duration as an offset \
drug: factor with 16 levels

In [8]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import glm
import statsmodels.api as sm

In [44]:
df = pd.read_csv("../../dataset/small_focused_df.csv")
df = df.drop(columns=["eid"])
df.head()

Unnamed: 0,drug_concept_id,drug_era_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days,concept_name,atc_code
0,708298,1700807090784,12/12/2013,10/01/2014,1,0,midazolam,N05CD08
1,708298,1666447356341,29/04/2013,28/05/2013,1,0,midazolam,N05CD08
2,708298,1194000953715,05/09/2013,26/10/2013,4,0,midazolam,N05CD08
3,708298,1065151983292,16/02/2011,17/03/2011,1,0,midazolam,N05CD08
4,708298,996432425515,14/03/2016,12/04/2016,1,0,midazolam,N05CD08


In [36]:
# number of unique drugs
len(df["atc_code"].unique())

16

In [40]:
len(df)

17668

In [37]:
df["drug_era_start_date"] = pd.to_datetime(df["drug_era_start_date"], format="%d/%m/%Y")
df["drug_era_end_date"] = pd.to_datetime(df["drug_era_end_date"], format="%d/%m/%Y")
# duration = drug_era_end_date - drug_era_start_date + 1
df["duration"] = (df["drug_era_end_date"] - df["drug_era_start_date"]).dt.days + 1

In [45]:
df.head()

Unnamed: 0,drug_concept_id,drug_era_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days,concept_name,atc_code
0,708298,1700807090784,12/12/2013,10/01/2014,1,0,midazolam,N05CD08
1,708298,1666447356341,29/04/2013,28/05/2013,1,0,midazolam,N05CD08
2,708298,1194000953715,05/09/2013,26/10/2013,4,0,midazolam,N05CD08
3,708298,1065151983292,16/02/2011,17/03/2011,1,0,midazolam,N05CD08
4,708298,996432425515,14/03/2016,12/04/2016,1,0,midazolam,N05CD08


In [39]:
# Fit GLM with Poisson distribution
model = glm(
    formula="gap_days ~ C(atc_code)",
    data=df,
    family=sm.families.Poisson(link=sm.families.links.Log()),
    offset=np.log(df["duration"]),
)
results = model.fit()

# Display summary
print(results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               gap_days   No. Observations:                17668
Model:                            GLM   Df Residuals:                    17652
Model Family:                 Poisson   Df Model:                           15
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.2294e+05
Date:                Thu, 21 Nov 2024   Deviance:                   2.2328e+05
Time:                        16:59:19   Pearson chi2:                 2.39e+05
No. Iterations:                     7   Pseudo R-squ. (CS):             0.8248
Covariance Type:            nonrobust                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

log(E(gap_days)) = β₀ + β₁X₁ + β₂X₂ + ... + log(duration), which is equivalent to \
E(gap_days) = duration × exp(β₀ + β₁X₁ + β₂X₂ + ...)

log(λᵢ) = β₀ + β₁X₁ᵢ + β₂X₂ᵢ + ... + log(durationᵢ)

Where: \
E(gap_days) is the expected number of gap days \
β₀ is the intercept \
β₁, β₂, ... are coefficients for each drug type \
X₁, X₂, ... are dummy variables (0/1) for drug types

In [42]:
# Fit GLM with Poisson distribution, using duration as exposure
model = glm(
    formula="gap_days ~ C(atc_code)",
    data=df,
    family=sm.families.Poisson(link=sm.families.links.Log()),
    exposure=df["duration"],  # Changed from offset to exposure
)
results = model.fit()

print(results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               gap_days   No. Observations:                17668
Model:                            GLM   Df Residuals:                    17652
Model Family:                 Poisson   Df Model:                           15
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.2294e+05
Date:                Thu, 21 Nov 2024   Deviance:                   2.2328e+05
Time:                        17:10:04   Pearson chi2:                 2.39e+05
No. Iterations:                     7   Pseudo R-squ. (CS):             0.8248
Covariance Type:            nonrobust                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 