## Data helper functions (used by all notebooks)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from pandas import DataFrame, Series
from enum import StrEnum, auto

RANDOM_STATE = 42

TrainTestData = tuple[DataFrame, Series, DataFrame, DataFrame, Series, Series]
# Estimator types used as the `model` annotation by the validation helpers in this file.
Model = LogisticRegression | SVC | RandomForestClassifier

class FeatureVariant(StrEnum):
    RESEARCH_MANUAL = 'research_ManualAssessment'
    RESEARCH_BORUTA = 'research_BORUTA'
    RESEARCH_TREE = 'research_extraTrees'
    RESEARCH_LOG = 'research_log'
    RESEARCH_LOGSCALED = 'research_logScaled'
    RESEARCH_ANOVA = 'research_kBestANOVA'
    LITERATURE = auto()
    STATISTICAL = auto()
    AUTOMATED = auto()

    def print_info():
        print([key for key in FeatureVariant.__members__])

class ModelVariant(StrEnum):
    SVM = 'svm'
    RF = 'random_forest'
    LG = 'logistic_regression'

    def print_info():
        print([key for key in ModelVariant.__members__])


def split_data(df: DataFrame, target: str, case_id=None) -> TrainTestData:
    """Separate ``target`` from the remaining columns and split into train/test sets.

    Args:
        df: Frame holding the feature columns and the target column.
        target: Name of the target column.
        case_id: Forwarded as ``contains_case_id`` to
            ``capstone_train_test_split``; pass a truthy value when ``df``
            carries a ``case_id`` column that must be kept out of the features.

    Returns:
        Whatever ``capstone_train_test_split`` returns for the separated
        features and target.
    """
    # Every column but the target is treated as a feature.
    features = df.drop(columns=[target])
    labels = df[target]
    return capstone_train_test_split(features, labels, case_id)

def split_data_apply_smote(df: DataFrame, target: str) -> TrainTestData:
    """Resample the data with SMOTE, then split it into train/test sets.

    Args:
        df: Frame holding the feature columns, the target column and,
            optionally, a ``case_id`` column.
        target: Name of the target column.

    Returns:
        The result of ``capstone_train_test_split`` applied to the resampled
        features and target (case IDs are never carried through).
    """
    # Features: all columns except the target. SMOTE cannot work with
    # string / GUID values, so case_id is dropped too when it is present.
    X: DataFrame = df.drop(columns=[target]).drop(columns=['case_id'], errors='ignore')
    # Target variable
    y: Series = df[target]

    # NOTE(review): resampling happens before the train/test split, so
    # synthetic rows can end up in the test set - confirm this is intended.
    sm = SMOTE(random_state=RANDOM_STATE)  # can have different parameters
    X_res, y_res = sm.fit_resample(X, y)

    return capstone_train_test_split(X_res, y_res)

def capstone_train_test_split(X: DataFrame, y: Series, contains_case_id: bool = False) -> TrainTestData:
    """Create a stratified 80/20 train/test split and print the resulting shapes.

    Args:
        X: Feature frame. Must contain a ``case_id`` column when
            ``contains_case_id`` is truthy.
        y: Target values; also used to stratify the split.
        contains_case_id: When truthy, the ``case_id`` column is removed from
            the returned feature frames and the test rows' IDs are returned
            separately.

    Returns:
        ``(X, y, X_train, X_test, y_train, y_test, test_case_id)``.
        ``test_case_id`` is ``None`` unless ``contains_case_id`` is truthy.
        The frame passed in as ``X`` is never modified.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    test_case_id: Series | None = None
    if contains_case_id:
        case_id: str = "case_id"
        # Take out the case IDs but keep them available for the testing data
        # (for initial validation).
        test_case_id = X_test[case_id]
        # drop() returns new frames: the caller's X stays untouched and no
        # in-place write is made on the frames produced by the split.
        X = X.drop(columns=[case_id])
        X_train = X_train.drop(columns=[case_id])
        X_test = X_test.drop(columns=[case_id])

    # Example with 977 rows:
    # Training size = 0.8 * 977 ≈ 781
    # Test size = 0.2 * 977 ≈ 196
    print(f"{X_train.shape=}")
    print(f"{X_test.shape=}")
    print(f"{y_train.shape=}")
    print(f"{y_test.shape=}")

    return X, y, X_train, X_test, y_train, y_test, test_case_id


def get_metrics(y_true: Series, y_pred: Series, y_prob=None) -> dict[str, float | int | None]:
    """Compute classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_prob: Optional scores passed to ``roc_auc_score``; when omitted the
            ``"roc_auc"`` entry is ``None``.

    Returns:
        Dict with the keys ``accuracy``, ``recall``, ``precision``,
        ``f1_score``, ``roc_auc`` and the four confusion-matrix counts.
    """
    # The four-way unpacking requires a 2x2 confusion matrix
    # (assumes a binary target with both classes present - TODO confirm).
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics: dict[str, float | int | None] = {
        "accuracy": accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc": None if y_prob is None else roc_auc_score(y_true, y_prob),
        "true_positive": tp,
        "true_negative": tn,
        "false_positive": fp,
        "false_negative": fn,
    }
    return metrics

def get_cross_validation_metrics(model: Model, X: DataFrame, y: Series, cv: int = 5) -> DataFrame:
    """Evaluate ``model`` with shuffled, stratified k-fold cross-validation.

    Args:
        model: Estimator to fit on each fold. It is refit in place, so after
            the call it holds the fit from the last fold.
        X: Feature frame.
        y: Target values.
        cv: Number of folds.

    Returns:
        One row of ``get_metrics`` results per fold, indexed by ``fold``
        (numbered from 1).
    """
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    results = []

    # Folds are numbered from 1; ID 0 will be used for the initial testing data.
    for fold, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict(X_val_fold)

        if hasattr(model, "predict_proba"):
            y_score_fold = model.predict_proba(X_val_fold)[:, 1]
        elif hasattr(model, "decision_function"):
            # SVC only exposes predict_proba when built with probability=True;
            # roc_auc_score also accepts decision_function values.
            y_score_fold = model.decision_function(X_val_fold)
        else:
            y_score_fold = None

        metrics = get_metrics(y_val_fold, y_pred_fold, y_score_fold)
        metrics["fold"] = fold
        results.append(metrics)

    return DataFrame(results).set_index("fold")

def print_evaluated_model_accuracy(y_test: Series, y_pred: Series) -> None:
    """Print the accuracy of ``y_pred`` against ``y_test`` with two decimals."""
    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {score:.2f}")

def print_validated_model_accuracy(model: Model, metrics: DataFrame) -> DataFrame:
    """Print the per-fold accuracies and their mean for ``model``.

    Args:
        model: Estimator whose class name is shown in the heading.
        metrics: Frame with an ``accuracy`` column (one row per fold).

    Returns:
        ``metrics`` itself, unchanged.
    """
    model_name = type(model).__name__
    print(f"Model validation for {model_name}:")

    fold_accuracy = metrics["accuracy"]
    print(fold_accuracy.to_list())

    mean_accuracy = fold_accuracy.mean()
    print(f"\nMean accuracy: {mean_accuracy:.4f}\n")
    return metrics