In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, brier_score_loss, precision_recall_curve
from sklearn.utils import class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

def run_predictive_model(data_path):
    # Load dataset
    df = pd.read_csv(data_path, low_memory=False)
    df = df.loc[:, ~df.columns.str.contains("Unnamed")]
    df["morbidity_index"] = pd.to_numeric(df["morbidity_index"], errors="coerce")
    
    # Drop rows with missing values in important columns including features and target
    feature_cols = ["zusatzbeitrag", "zusatzbeitrag_avg", "morbidity_index", "marktanteil versicherte", "insured_lag"]
    df_subset = df.dropna(subset=feature_cols + ["churn_rate", "treatment_flag"])
    
    # Extract features and target
    X = df_subset[feature_cols]
    y = df_subset["churn_rate"]
    
    # Binarize target: churn if churn_rate > 0.05
    y_binary = (y > 0.05).astype(int)
    
    # Check class distribution
    print("Class distribution in target:")
    print(y_binary.value_counts())
    
    # Split train/test data (stratify to maintain class balance)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_binary, test_size=0.2, random_state=42, stratify=y_binary
    )
    
    # Confirm no NaNs in train/test splits
    print("Any NaNs in X_train?", np.isnan(X_train).any())
    print("Any NaNs in X_test?", np.isnan(X_test).any())
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Check for NaNs after scaling (should not be any)
    print("Any NaNs in X_train_scaled?", np.isnan(X_train_scaled).any())
    print("Any NaNs in X_test_scaled?", np.isnan(X_test_scaled).any())
    
    # Compute class weights for imbalance
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    class_weights_dict = dict(enumerate(class_weights))
    print("Class weights:", class_weights_dict)
    
    # Build model with dropout for regularization
    model = Sequential([
        Input(shape=(X_train_scaled.shape[1],)),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train model
    history = model.fit(
        X_train_scaled,
        y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        class_weight=class_weights_dict,
        verbose=2
    )
    
    # Predict probabilities on test set
    y_pred_prob = model.predict(X_test_scaled).flatten()
    
    # Check for NaNs in predictions
    print("Any NaNs in predicted probabilities?", np.isnan(y_pred_prob).any())
    
    # Find best threshold by maximizing F1 score on test set
    precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_prob)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    best_idx = np.argmax(f1_scores)
    best_thresh = thresholds[best_idx]
    
    print(f"Best threshold by F1 score on test set: {best_thresh:.4f}")
    print(f"F1 score at best threshold: {f1_scores[best_idx]:.4f}")
    
    # Compute final metrics using best threshold
    y_pred_class = (y_pred_prob >= best_thresh).astype(int)
    print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob))
    print("F1 Score:", f1_score(y_test, y_pred_class))
    print("Brier Score:", brier_score_loss(y_test, y_pred_prob))

if __name__ == "__main__":
    data_path = "../data/processed/merged_panel_clean_data.csv"
    run_predictive_model(data_path)


Class distribution in target:
churn_rate
0    15196
1        6
Name: count, dtype: int64
Any NaNs in X_train? zusatzbeitrag              False
zusatzbeitrag_avg          False
morbidity_index            False
marktanteil versicherte    False
insured_lag                False
dtype: bool
Any NaNs in X_test? zusatzbeitrag              False
zusatzbeitrag_avg          False
morbidity_index            False
marktanteil versicherte    False
insured_lag                False
dtype: bool
Any NaNs in X_train_scaled? False
Any NaNs in X_test_scaled? False
Class weights: {0: np.float64(0.5002056597564989), 1: np.float64(1216.1)}
Epoch 1/50
304/304 - 1s - 2ms/step - accuracy: 0.9796 - loss: 2.5013 - val_accuracy: 1.0000 - val_loss: 0.0346
Epoch 2/50
304/304 - 0s - 534us/step - accuracy: 0.9995 - loss: 2.9719 - val_accuracy: 1.0000 - val_loss: 0.0456
Epoch 3/50
304/304 - 0s - 528us/step - accuracy: 0.9988 - loss: 1.9863 - val_accuracy: 1.0000 - val_loss: 0.0802
Epoch 4/50
304/304 - 0s - 554us/step -