In [None]:
!pip install linearmodels

In [None]:
pip install scipy==1.10.1 statsmodels==0.13.5


In [None]:
import pandas as pd
import statsmodels.formula.api as smf

# Pooled-OLS difference-in-differences baseline for churn_rate.
# Loads the cleaned panel, derives the time / treatment indicators and fits an
# OLS model in which the coefficient on `post` is the DiD estimate.
df = pd.read_csv("../data/processed/merged_panel_clean_data.csv")

# Combined year/quarter identifier (jahr + "Q" + quartal), stored as a categorical.
df["zeit"] = df["jahr"].astype(str) + "Q" + df["quartal"].astype(str)
df["zeit"] = df["zeit"].astype("category")

df["treatment_flag"] = df["treatment_flag"].astype(int)

# Post-treatment indicator: 1 from the first period in which a fund's
# treatment_flag is set, onwards. A cumulative sum depends on row order, so it
# is taken over chronologically sorted rows; assignment aligns on the index,
# which leaves df itself in its original row order.
df["post"] = (
    df.sort_values(["jahr", "quartal"])
    .groupby("kasse_clean")["treatment_flag"]
    .cumsum()
    .gt(0)
    .astype(int)
)

# Group indicator: 1 on every row of a fund whose `post` is ever 1.
df["ever_treated"] = df.groupby("kasse_clean")["post"].transform("max")

# Coerce model variables to numeric; unparseable entries become NaN.
for numeric_col in ["churn_rate", "morbidity_index", "zusatzbeitrag_lag"]:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Estimation sample: rows with complete model variables.
df_model = df.dropna(
    subset=["churn_rate", "treatment_flag", "post", "morbidity_index", "zusatzbeitrag_lag"]
).copy()

# Drop quarter levels that have no rows left after dropna; a categorical level
# without observations would otherwise enter the design matrix as an all-zero
# dummy column.
df_model["zeit"] = df_model["zeit"].cat.remove_unused_categories()

# Why not `treatment_flag * post`: `post` is 1 on every row where treatment_flag
# is 1, so the product treatment_flag:post is identical to treatment_flag. The
# design matrix is then rank deficient and the interaction coefficient is not
# identified. `post` is already "treated fund AND after its own treatment
# start", i.e. the DiD term; `ever_treated` absorbs the level difference between
# groups and C(zeit) absorbs shocks common to all funds in a quarter.
did_model = smf.ols(
    formula="churn_rate ~ ever_treated + post + C(zeit) + morbidity_index + zusatzbeitrag_lag",
    data=df_model
).fit()

print(did_model.summary())


In [None]:
# Two-way fixed-effects (entity + time) panel regression of churn_rate.
# Requires the linearmodels package (install it with `%pip install linearmodels`
# if it is missing).

import pandas as pd
from linearmodels.panel import PanelOLS

# Load cleaned panel data
df = pd.read_csv("../data/processed/merged_panel_clean_data.csv")

# Combined time identifier (e.g., "2023Q1")
df["zeit"] = df["jahr"].astype(str) + "Q" + df["quartal"].astype(str)

# Parse the quarter label into a Period and convert it to a Timestamp, giving
# the panel a date-like time index.
df["zeit_dt"] = pd.PeriodIndex(df["zeit"].astype(str), freq="Q").to_timestamp()

# Entity identifier as string
df["kasse_clean"] = df["kasse_clean"].astype(str)

df["treatment_flag"] = df["treatment_flag"].astype(int)

# Post-treatment indicator: 1 from the first period in which a fund's
# treatment_flag is set, onwards. A cumulative sum depends on row order, so it
# is taken over rows sorted by time; assignment aligns on the index, which
# leaves df itself in its original row order.
df["post"] = (
    df.sort_values("zeit_dt")
    .groupby("kasse_clean")["treatment_flag"]
    .cumsum()
    .gt(0)
    .astype(int)
)

# Coerce model variables to numeric; unparseable entries become NaN.
for numeric_col in ["churn_rate", "morbidity_index", "zusatzbeitrag_lag"]:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Drop rows with missing values in the relevant columns
df_model = df.dropna(subset=["churn_rate", "treatment_flag", "post", "morbidity_index", "zusatzbeitrag_lag"])

# Multi-index required by PanelOLS (entity: kasse_clean, time: zeit_dt)
df_model = df_model.set_index(["kasse_clean", "zeit_dt"])

# churn_rate on treatment_flag and the two controls, with fund and quarter
# fixed effects.
# NOTE(review): `post` is constructed above but does not enter the formula. If
# treatment_flag marks only the period in which treatment starts, `post` is
# probably the intended DiD regressor - confirm against the data definition.
model = PanelOLS.from_formula(
    formula="churn_rate ~ treatment_flag + morbidity_index + zusatzbeitrag_lag + EntityEffects + TimeEffects",
    data=df_model
)

# Cluster standard errors by fund: repeated observations of the same fund are
# correlated over time, which the default unadjusted covariance ignores.
# Coefficient estimates are unaffected by this choice.
fe_results = model.fit(cov_type="clustered", cluster_entity=True)

# Print the summary of the fixed-effects model
print(fe_results.summary)