## Data helper functions (used by all notebooks)

In [1]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE

def split_data(df, target, case_id=None):
    """Separate features from the target column and split into train/test sets.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: Name of the target column.
        case_id: Optional name of a case-identifier column, passed through
            unchanged to ``capstone_train_test_split``.

    Returns:
        Whatever ``capstone_train_test_split`` returns for the separated
        features and target.
    """
    # Every column except the target is treated as a feature.
    features = df.drop(columns=[target])
    labels = df[target]

    split_result = capstone_train_test_split(features, labels, case_id)
    return split_result

def split_data_apply_smote(df, target, case_id=None):
    """Oversample the data with SMOTE, then split it into train/test sets.

    Identifier columns are removed before resampling, so the rows returned by
    SMOTE carry no case identifier. For that reason the returned
    ``test_case_id`` is always ``None``.

    NOTE(review): SMOTE is fitted on the full data set *before* the train/test
    split, so the test set is drawn from the resampled data and can contain
    synthetic rows. Consider resampling only the training portion.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: Name of the target column.
        case_id: Optional name of the case-identifier column. It is dropped
            before resampling, as is a column literally named ``'case_id'``
            when present.

    Returns:
        Whatever ``capstone_train_test_split`` returns for the resampled
        features and target, called without a case-identifier column.
    """
    # SMOTE cannot work with string / GUID identifiers, so drop every
    # identifier column that is actually present in the frame.
    id_columns = [
        col
        for col in dict.fromkeys(("case_id", case_id))
        if col is not None and col != target and col in df.columns
    ]

    # Features: all columns except the target and the identifier columns
    X = df.drop(columns=[target, *id_columns])
    # Target variable
    y = df[target]

    sm = SMOTE(random_state=42)  # can have different parameters
    X_res, y_res = sm.fit_resample(X, y)

    # The identifier column no longer exists in X_res; forwarding its name
    # would make the splitter fail when it looks the column up.
    return capstone_train_test_split(X_res, y_res, None)

def capstone_train_test_split(X, y, case_id):
    """Split features and target into stratified 80/20 train and test sets.

    ``train_test_split`` and ``RANDOM_STATE`` are not defined in this block;
    they are expected to be available in the surrounding namespace.

    Args:
        X: Feature DataFrame. May still contain the ``case_id`` column.
        y: Target values; also used to stratify the split.
        case_id: Name of the case-identifier column in ``X``, or ``None``
            when there is no such column.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test, test_case_id)``.
        When ``case_id`` is given, that column is removed from the three
        returned feature frames and ``test_case_id`` holds the identifiers of
        the test rows; otherwise ``test_case_id`` is ``None``. The frame
        passed in by the caller is not modified.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    # Take out the case ID but keep it available for the testing data
    # (for initial validation).
    if case_id is not None:
        test_case_id = X_test[case_id]
        # Non-in-place drops: the caller's frame stays untouched and pandas
        # does not warn about modifying a slice of another frame.
        X = X.drop(columns=[case_id])
        X_train = X_train.drop(columns=[case_id])
        X_test = X_test.drop(columns=[case_id])
    else:
        test_case_id = None

    # Report the resulting shapes (80% training rows, 20% test rows).
    print(f"{X_train.shape=}")
    print(f"{X_test.shape=}")
    print(f"{y_train.shape=}")
    print(f"{y_test.shape=}")

    return X, y, X_train, X_test, y_train, y_test, test_case_id


def get_metrics(y_true, y_pred, y_prob=None):
    """Compute classification metrics for one set of predictions.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_prob: Optional scores passed to ``roc_auc_score``; when ``None``
            the ``"roc_auc"`` entry is ``None``.

    Returns:
        Dict with the keys ``accuracy``, ``recall``, ``precision``,
        ``f1_score``, ``roc_auc``, ``true_positive``, ``true_negative``,
        ``false_positive`` and ``false_negative``.
    """
    # Unpacking into four counts requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    scores = {}
    scores["accuracy"] = accuracy_score(y_true, y_pred)
    scores["recall"] = recall_score(y_true, y_pred)
    scores["precision"] = precision_score(y_true, y_pred)
    scores["f1_score"] = f1_score(y_true, y_pred)

    # ROC AUC needs scores, not hard labels, so it is optional.
    if y_prob is None:
        scores["roc_auc"] = None
    else:
        scores["roc_auc"] = roc_auc_score(y_true, y_prob)

    scores["true_positive"] = tp
    scores["true_negative"] = tn
    scores["false_positive"] = fp
    scores["false_negative"] = fn
    return scores

def get_cross_validation_metrics(model, X, y, cv=5):
    """Evaluate a model with stratified k-fold cross-validation.

    A fresh copy of ``model`` is fitted for every fold, so the estimator
    passed in by the caller is left exactly as it was (for example, still
    fitted on the caller's own training data).

    ``RANDOM_STATE`` and ``pd`` are not defined in this block; they are
    expected to be available in the surrounding namespace.

    Args:
        model: Estimator with ``fit`` and ``predict``. ``predict_proba`` is
            used when available to compute ROC AUC.
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Target Series (indexed positionally with ``.iloc``).
        cv: Number of folds.

    Returns:
        DataFrame with one row per fold, indexed by ``fold`` (starting at 1),
        whose columns are the metrics returned by ``get_metrics``.
    """
    # Imported locally so this block does not rely on a file-level import.
    from sklearn.base import clone

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    results = []

    # Fold IDs start at 1; ID 0 will be used for the initial testing data.
    for fold, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # Fit an unfitted copy so the caller's model is not overwritten.
        # safe=False falls back to a deep copy for objects that do not
        # implement the scikit-learn estimator interface.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_train_fold, y_train_fold)
        y_pred_fold = fold_model.predict(X_val_fold)

        # Not every classifier exposes probabilities; ROC AUC is skipped then.
        if hasattr(fold_model, "predict_proba"):
            y_prob_fold = fold_model.predict_proba(X_val_fold)[:, 1]
        else:
            y_prob_fold = None

        metrics = get_metrics(y_val_fold, y_pred_fold, y_prob_fold)
        metrics["fold"] = fold
        results.append(metrics)

    return pd.DataFrame(results).set_index("fold")

def print_evaluated_model_accuracy(y_test, y_pred):
    """Print the accuracy of ``y_pred`` against ``y_test`` to two decimals."""
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

def print_validated_model_accuracy(model, metrics):
    """Print per-fold accuracies and their mean for a validated model.

    Args:
        model: The validated model; only its class name is printed.
        metrics: Table with an ``"accuracy"`` column whose values support
            ``to_list()`` and ``mean()``.

    Returns:
        The ``metrics`` object, unchanged.
    """
    model_name = type(model).__name__
    print(f"Model validation for {model_name}:")

    fold_accuracy = metrics["accuracy"]
    print(fold_accuracy.to_list())

    mean_accuracy = fold_accuracy.mean()
    print(f"\nMean accuracy: {mean_accuracy:.4f}\n")
    return metrics