# In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

def run_causal_analysis(data_path):
    """Estimate the treatment effect on churn via propensity-score matching.

    Steps:
      1. Read the CSV, drop columns whose name contains "Unnamed", coerce
         ``morbidity_index`` to numeric and drop rows with a missing value in
         the outcome, the treatment flag or any covariate.
      2. Fit a logistic regression of ``treatment_flag`` on the covariates and
         use the predicted probability of class 1 as the propensity score.
      3. Match every treated row to the control row with the closest
         propensity score (1 nearest neighbour, controls may be reused).
      4. Return mean ``churn_rate`` of treated rows minus mean ``churn_rate``
         of their matched controls.

    NOTE(review): only treated rows are matched, so the estimate is the effect
    on the treated units; the printed label keeps the original "ATE" wording
    so existing output stays unchanged.

    Args:
        data_path: Anything ``pandas.read_csv`` accepts (file path or
            file-like object).

    Returns:
        The difference in mean ``churn_rate`` between treated rows and their
        matched control rows.

    Raises:
        KeyError: If a required column is absent from the CSV.
        ValueError: If, after cleaning, there are no treated rows or no
            control rows to match against.
    """
    covariates = [
        "zusatzbeitrag",
        "morbidity_index",
        "insured_lag",
        "marktanteil versicherte",
    ]

    # Load dataset
    df = pd.read_csv(data_path, low_memory=False)
    df = df.drop(columns=[col for col in df.columns if "Unnamed" in col])
    # Non-numeric morbidity values become NaN and are removed by dropna below.
    df["morbidity_index"] = pd.to_numeric(df["morbidity_index"], errors="coerce")
    df = df.dropna(subset=["churn_rate", "treatment_flag", *covariates])

    X_psm = df[covariates]
    y_psm = df["treatment_flag"].astype(int)

    # Fail early with a clear message: the logistic regression needs both
    # classes and the matcher needs at least one row on each side.
    n_treated = int((y_psm == 1).sum())
    n_control = int((y_psm == 0).sum())
    if n_treated == 0 or n_control == 0:
        raise ValueError(
            "Propensity-score matching needs both treated and control rows "
            f"after cleaning; found {n_treated} treated and {n_control} control."
        )

    # Propensity Score Estimation
    logit = LogisticRegression(max_iter=1000)
    logit.fit(X_psm, y_psm)
    df["propensity_score"] = logit.predict_proba(X_psm)[:, 1]

    # Matching: one nearest control (by propensity score) per treated row.
    treated = df[y_psm == 1]
    control = df[y_psm == 0]
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit(control[["propensity_score"]])
    _, indices = nn.kneighbors(treated[["propensity_score"]])
    # indices are positions within `control`, hence iloc; repeats are kept so
    # a control matched several times is weighted accordingly.
    matched_control = control.iloc[indices.ravel()]

    # Average Treatment Effect (ATE)
    ate = treated["churn_rate"].mean() - matched_control["churn_rate"].mean()
    print("ATE (Average Treatment Effect):", round(ate, 6))
    return ate


# Entry point: runs the analysis when this code is executed directly
# (i.e. when __name__ is "__main__") rather than imported.
if __name__ == "__main__":
    # Relative path, so it is resolved against the current working directory.
    data_path = "../data/processed/merged_panel_clean_data.csv"
    run_causal_analysis(data_path)


# Output of the cell above:
# ATE (Average Treatment Effect): -0.001538
