In [1]:
import copy
from functools import partial
from typing import Literal

import numpy as np
import matplotlib.pyplot as plt
import shap
import pandas as pd
import yaml

from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_classif, mutual_info_regression
from sklearn.kernel_ridge import KernelRidge

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from pathlib import Path

# Locate the project root by walking upwards from the current directory until
# the folder that holds config/paths.yaml (read by a later cell) is found.
# Unlike an unconditional "go up one level", this is safe to re-run: once the
# working directory already is the project root, the search finds it again
# immediately instead of climbing one directory higher on every execution.
_start_dir = Path.cwd().resolve()
project_root = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "config" / "paths.yaml").is_file()
    ),
    _start_dir.parent,  # marker not found: fall back to the parent directory
)
os.chdir(project_root)  # Set this as the working directory

print("Current working directory set to:", os.getcwd())

Current working directory set to: C:\Users\risch\Desktop\GitHub\VitalTrack


In [3]:
# --- Tunable settings ---
SEED: int = 42       # random_state passed to the cross-validation splitters
N_SPLITS: int = 5    # number of cross-validation folds

# --- Dataset constants (fixed, not meant to be tuned) ---
NUM_CLF_LABELS: int = 11  # leading label columns treated as classification targets
NUM_RGR_LABELS: int = 4   # presumably the number of regression label columns -- TODO confirm

In [4]:
def _read_yaml(yaml_path):
    """Parse a single YAML file with PyYAML's FullLoader and return its content."""
    # NOTE(review): yaml.safe_load would be the stricter choice should these
    # files ever come from a source that is not fully trusted.
    with open(yaml_path, 'r') as handle:
        return yaml.load(handle, Loader=yaml.FullLoader)


# Project configuration: file locations and model settings
paths = _read_yaml('config/paths.yaml')
params = _read_yaml('config/params.yaml')

# Load the data
_train_paths = paths["train"]
X = pd.read_csv(_train_paths["final_train_file"])
y = pd.read_csv(_train_paths["train_labels"])
test_X = pd.read_csv(paths["test"]["final_test_file"])

# Column counts of the raw frames, exactly as read from disk
print(f"Number of Features: {X.shape[1]}")
print(f"Number of Labels: {y.shape[1]}")

Number of Features: 172
Number of Labels: 16


In [5]:
# Names of the estimator classes selected in config/params.yaml. They live in
# their own variables (instead of being overwritten by the instances created
# below) so that this cell can be re-run without the lookups failing.
clf_model_name = params["clf_model"]
regr_model_name = params["regr_model"]

# Hyperparameters are stored in params under the class name itself.
clf_params = params[clf_model_name]
regr_params = params[regr_model_name]


def _resolve_model_class(name):
    """Return the estimator class called `name` from the notebook namespace.

    A plain namespace lookup replaces eval(): the configured string can only
    pick something that has already been imported, it is never run as code.

    Raises:
    - ValueError: if `name` is not the name of a callable in the namespace
    """
    candidate = globals().get(name) if isinstance(name, str) else None
    if not callable(candidate):
        raise ValueError(
            f"Unknown model {name!r}: import its class in the first cell "
            "before selecting it in params.yaml"
        )
    return candidate


clf_model_class = _resolve_model_class(clf_model_name)
regr_model_class = _resolve_model_class(regr_model_name)

# Instantiate the model (an empty/None params entry means "use the defaults")
clf_model = clf_model_class(**(clf_params or {}))
regr_model = regr_model_class(**(regr_params or {}))

print(clf_model)
print(regr_model)

RandomForestClassifier(class_weight='balanced', max_depth=32, n_estimators=200)
KernelRidge(alpha=1.0, gamma=0.01, kernel='rbf')


In [6]:
def feature_selection(X, y, selector, model_class, model_params, splitter):
    """
    Cross-validates a feature selector and returns the indices of the features
    chosen in the best-scoring fold.

    For every split produced by `splitter`, an independent copy of `selector`
    is fitted on the training part, a new model is trained on the selected
    features and scored on the validation part. The selection made in the fold
    with the highest validation score is returned (the first such fold wins
    on ties).

    Parameters:
    - X: pd.DataFrame, input features
    - y: pd.Series, target labels
    - selector: feature selector (e.g., SelectFromModel, SelectKBest); it is
      only used as a template and is not fitted itself
    - model_class: class of the sklearn model to train
      (e.g., LogisticRegression, RandomForestClassifier)
    - model_params: dict or None, hyperparameters for the model
    - splitter: sklearn splitter (e.g., StratifiedKFold, KFold)

    Returns:
    - selected_indices: integer indices of the selected features, as returned
      by the selector's get_support(indices=True)

    Raises:
    - ValueError: if `splitter` yields no train/validation split
    """

    model_kwargs = model_params or {}

    best_score = None
    best_selector = None

    for train_idx, val_idx in splitter.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit a separate copy per fold. Refitting one shared object would
        # overwrite the earlier folds, so "the best selector" would silently
        # always be the one from the last fold.
        fold_selector = copy.deepcopy(selector)
        fold_selector.fit(X_train, y_train)

        X_train_selected = fold_selector.transform(X_train)
        X_val_selected = fold_selector.transform(X_val)

        # Recreate the model (reset it) each time in the loop
        model = model_class(**model_kwargs)
        model.fit(X_train_selected, y_train)

        # Evaluate the model on the validation data
        score = model.score(X_val_selected, y_val)

        # Compare against the best score seen so far (strictly greater, so
        # the earliest fold is kept when scores are equal)
        if best_score is None or score > best_score:
            best_score = score
            best_selector = fold_selector

    if best_selector is None:
        raise ValueError("splitter produced no train/validation splits")

    return best_selector.get_support(indices=True)


def create_feature_mask(
    X, y, selector, model_class, model_params, splitter, out_file=None
):
    """
    Builds a label-by-feature boolean table marking the features selected
    for each label.

    feature_selection is run once per column of `y`; the features it returns
    are flagged True in that label's row.

    Parameters:
    - X: pd.DataFrame, input features
    - y: pd.DataFrame, target labels (one column per label)
    - selector: feature selector (e.g., SelectFromModel, SelectKBest)
    - model_class: class of the sklearn model to train
    - model_params: dict, hyperparameters for the model
    - splitter: sklearn splitter (e.g., StratifiedKFold, KFold)
    - out_file: str, optional path; when given the mask is also written as CSV

    Returns:
    - mask: pd.DataFrame, boolean mask with labels as rows and features as columns
    """

    feature_names = X.columns

    # Start from an all-False grid: nothing is selected yet
    feature_mask = pd.DataFrame(False, index=y.columns, columns=feature_names)
    feature_mask.index.name = "label"

    # Arguments that are identical for every label
    shared_kwargs = dict(
        selector=selector,
        model_class=model_class,
        model_params=model_params,
        splitter=splitter,
    )

    for target_name in y.columns:
        chosen = feature_selection(X=X, y=y[target_name], **shared_kwargs)
        feature_mask.loc[target_name, feature_names[chosen]] = True

    # Persist the mask only when a destination was requested
    if out_file is not None:
        feature_mask.to_csv(out_file, index=True)

    return feature_mask

In [7]:
clf_mask_file = paths["clf_mask_file"]
regr_mask_file = paths["regr_mask_file"]

print("Classifier mask file:", clf_mask_file)
print("Regressor mask file:", regr_mask_file)

# For testing purposes
# X = X.iloc[:1000, :]
# y = y.iloc[:1000, :]

# Remove the 'pid' column from the features and labels.
# errors='ignore' keeps the cell re-runnable: an in-place drop raises a
# KeyError the second time because the column is already gone.
X = X.drop(columns='pid', errors='ignore')
y = y.drop(columns='pid', errors='ignore')

# The split below is purely positional, so make sure the label frame really
# has the expected number of columns before slicing it.
assert y.shape[1] == NUM_CLF_LABELS + NUM_RGR_LABELS, (
    f"expected {NUM_CLF_LABELS + NUM_RGR_LABELS} label columns, got {y.shape[1]}"
)

# First NUM_CLF_LABELS columns are classification targets, the rest regression
y_clf = y.iloc[:, :NUM_CLF_LABELS]
y_regr = y.iloc[:, NUM_CLF_LABELS:]

Classifier mask file: data/processed/clf_feature_mask.csv
Regressor mask file: data/processed/regr_feature_mask.csv


In [8]:
# Build the per-label feature mask for the classification targets.
# The selector wraps the configured classifier and keeps at most 25 features;
# folds are stratified and shuffled with the notebook-wide seed.
clf_selector = SelectFromModel(clf_model, max_features=25)
clf_splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
clf_mask = create_feature_mask(
    X,
    y_clf,
    clf_selector,
    clf_model_class,
    clf_params,
    clf_splitter,
    out_file=clf_mask_file,
)

In [None]:
# Create a mask of the selected features for the regressors
regr_splitter = KFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)
# mutual_info_regression is randomised (it accepts a random_state); binding it
# to SEED keeps the 30 selected features reproducible from run to run, in line
# with the seeded splitter above.
regr_selector = SelectKBest(partial(mutual_info_regression, random_state=SEED), k=30)
regr_mask = create_feature_mask(
    X=X,
    y=y_regr,
    selector=regr_selector,
    model_class=regr_model_class,
    model_params=regr_params,
    splitter=regr_splitter,
    out_file=regr_mask_file,
)