# Data Processing


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written
# through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dataset Preprocessing required libraries
import numpy as np
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder, StandardScaler, OneHotEncoder

# TabNet required libraries
!pip install pytorch_tabnet wget

%load_ext autoreload
%autoreload 2

from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score

import pandas as pd
import numpy as np
np.random.seed(0)

import os
import wget
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

from pytorch_tabnet.pretraining import TabNetPretrainer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:

# Read the analysis table and the companion weights.csv from Google Drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tabnet/dat_hyp_mice.csv')
weights = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tabnet/weights.csv')

# Passing sampling weights to TabNet raised an error, so the approach quoted from
# the following paper is cited instead:
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0280387
# "However, the error was mitigated if the F1 score was subsequently recalculated
# with observed outcomes from the weighted dataset"

# Columns dropped from the table before any further processing.
exclude = [
    'svy_subpop_htn', 'svy_subpop_chol',
    'DSD010.x', 'DRDINT.x', 'DRDINT.y',
    'LBDHDDSI', 'URXUMS', 'URXCRS',
    'BPQ050A', 'cc_cvd_stroke',
]

# Blood-pressure-control outcome columns.
labels = [
    'bp_control_jnc7',
    'bp_control_accaha',
    'bp_control_140_90',
    'bp_control_130_80',
]

# svy_year codes per period. Code 0 (1999-2000) is left out on purpose:
# remember not to use 1999-2000.
before_2013 = list(range(1, 7))   # codes 1..6
after_2013 = list(range(7, 10))   # codes 7..9

# Remove the excluded columns.
df = df.drop(columns=exclude)

# A column with at most `threshold` distinct values is treated as categorical;
# every other column is treated as continuous.
threshold = 10

categorical_columns = [col for col in df.columns if df[col].nunique() <= threshold]
continuous_columns = [col for col in df.columns if df[col].nunique() > threshold]

for col in categorical_columns:
    # Convert the categorical columns to categorical type
    df[col] = df[col].astype('category')

for col in continuous_columns:
    # astype(float) raises on values that cannot be parsed; nothing is coerced.
    df[col] = df[col].astype(float)

label_encoder = LabelEncoder()

# codebook[col] maps each integer code written into df[col] back to the original
# category value.
codebook = {}

# Resulting codes for 'svy_year' (the period strings sort chronologically):
#   0 => 1999-2000
#   1 => 2001-2002
#   2 => 2003-2004
#   3 => 2005-2006
#   4 => 2007-2008
#   5 => 2009-2010
#   6 => 2011-2012
#   7 => 2013-2014
#   8 => 2015-2016
#   9 => 2017-2020

for col in categorical_columns:
    if col == 'svy_psu':  # skip encoding of this category
        # NOTE(review): this column therefore keeps its 'category' dtype — confirm
        # that downstream consumers handle it as intended.
        continue

    # Fit and transform the data to apply label encoding
    df[col] = label_encoder.fit_transform(df[col])

    # LabelEncoder numbers classes in sorted order (exposed as `classes_`), not in
    # order of first appearance, so the mapping has to be read from the fitted
    # encoder. Building it from df[col].unique() mislabels every column whose
    # values do not happen to appear in sorted order.
    codebook[col] = dict(enumerate(label_encoder.classes_))


standard_scaler = StandardScaler()

# The sampling weight is used later as a per-row weight, so it has to keep its
# original non-negative scale. Standardising it would centre it on zero and turn
# roughly half of the weights negative.
unscaled_columns = {'svy_weight_mec'}

for col in continuous_columns:
    if col in unscaled_columns:
        continue
    # Fit and transform the data to apply standard scaling.
    # NOTE(review): the scaler is fitted on all rows before any train/test split,
    # so test-set statistics influence the scaling — confirm this is acceptable.
    df[col] = standard_scaler.fit_transform(df[col].values.reshape(-1, 1)).ravel()

# Print the codebook
print("\nCodebook:")
for col, mapping in codebook.items():
    print(f"Column '{col}':")
    for code, category in mapping.items():
        print(f"  {code} => {category}")


# Row masks selecting each survey period by its svy_year code.
in_before_2013 = df['svy_year'].isin(before_2013)
in_after_2013 = df['svy_year'].isin(after_2013)

df_before_2013 = df[in_before_2013]
df_after_2013 = df[in_after_2013]
# NOTE(review): unlike the two period subsets, this frame keeps every svy_year
# code, including 0 — confirm that is intended.
df_all_years = df.copy()

# (period name, frame) pairs, in the order the datasets are generated.
period_frames = [
    ('before_2013', df_before_2013),
    ('after_2013', df_after_2013),
    ('all_years', df_all_years),
]

# One dataset per (label, period) pair: 4 labels x 3 periods = 12 datasets.
datasets = {}
for label in labels:
    # Every outcome column except the one this dataset is built for.
    other_labels = [name for name in labels if name != label]

    for period, df_period in period_frames:
        # Skip the period if the label column is not present in it.
        if label not in df_period.columns:
            continue

        # drop() returns a new frame, so the period frame itself is left untouched.
        dataset = df_period.drop(columns=other_labels)
        datasets[f'{label}_{period}'] = dataset

# Fixed seed so that re-running this cell reproduces the same partitions.
SPLIT_SEED = 0

# splits[dataset_name] holds the train / valid / test partitions of one dataset:
#   'train': (X, y, sample_weights, feature_names)
#   'valid': (X, y, sample_weights)
#   'test':  (X, y, sample_weights)
# X, y and the weights are stored as NumPy arrays.
splits = {}
for dataset_name, dataset_df in datasets.items():
    # Identify the target variable (column starts with 'bp_control').
    target_candidates = [col for col in dataset_df.columns if col.startswith('bp_control')]
    if not target_candidates:
        raise ValueError(f"No 'bp_control*' target column found in dataset '{dataset_name}'")
    target_col = target_candidates[0]

    # Per-row sampling weight. Negative values are clamped to 0 because a sample
    # weight must be non-negative.
    sample_weights = np.maximum(dataset_df['svy_weight_mec'], 0)

    # Define the features and target; the weight column is not a feature.
    X = dataset_df.drop(columns=[target_col, 'svy_weight_mec'])
    y = dataset_df[target_col]
    feat_list = X.columns

    # First split: hold out 10% of the data as the test set.
    X_temp, X_test, y_temp, y_test, w_temp, w_test = train_test_split(
        X, y, sample_weights, test_size=0.1, random_state=SPLIT_SEED
    )

    # Second split: 1/9 of the remaining 90% becomes the validation set
    # (1/9 x 0.9 = 0.1), giving an overall 80% / 10% / 10% train / valid / test split.
    X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(
        X_temp, y_temp, w_temp, test_size=1/9, random_state=SPLIT_SEED
    )

    # Store the splits in the dictionary
    splits[dataset_name] = {
        'train': (X_train.to_numpy(), y_train.to_numpy(), w_train.to_numpy(), feat_list),
        'valid': (X_valid.to_numpy(), y_valid.to_numpy(), w_valid.to_numpy()),
        'test': (X_test.to_numpy(), y_test.to_numpy(), w_test.to_numpy()),
    }


Codebook:
Column 'svy_year':
  0 => 1999-2000
  1 => 2001-2002
  2 => 2003-2004
  3 => 2005-2006
  4 => 2007-2008
  5 => 2009-2010
  6 => 2011-2012
  7 => 2013-2014
  8 => 2015-2016
  9 => 2017-2020
Column 'demo_age_cat':
  0 => 18 to 44
  1 => 45 to 64
  2 => 65 to 74
  3 => 75+
Column 'demo_race':
  0 => Non-Hispanic White
  1 => Hispanic
  2 => Other
  3 => Non-Hispanic Black
  4 => Non-Hispanic Asian
Column 'demo_race_black':
  0 => No
  1 => Yes
Column 'demo_pregnant':
  0 => No
  1 => Yes
Column 'demo_gender':
  0 => Men
  1 => Women
Column 'bp_cat_meds_excluded':
  0 => SBP 160+ or DBP 100+ mm Hg
  1 => SBP of 130 to <140 or DBP 80 to <90 mm Hg
  2 => SBP of 140 to <160 or DBP 90 to <100 mm Hg
  3 => SBP <120 and DBP <80 mm Hg
  4 => SBP of 120 to <130 and DBP <80 mm Hg
Column 'bp_cat_meds_included':
  0 => taking antihypertensive medications
  1 => SBP of 130 to <140 or DBP 80 to <90 mm Hg
  2 => SBP of 140 to <160 or DBP 90 to <100 mm Hg
  3 => SBP 160+ or DBP 100+ mm Hg
Colu

# Running TabNet for all datasets

In [None]:
# Output folder for the per-dataset plots and CSV summaries; created if missing so
# that saving does not fail on a fresh Drive.
results_dir = Path('/content/drive/MyDrive/Colab Notebooks/tabnet/results')
results_dir.mkdir(parents=True, exist_ok=True)

max_epochs = 100   # upper bound on epochs for both pretraining and fine-tuning
top_n = 50         # number of most important features shown in each bar chart

# For every dataset: pretrain TabNet without labels, fine-tune a classifier from the
# pretrained model, score it on the test split, then save the feature importances.
for dataset_name in datasets.keys():
    X_train, y_train, w_train, feat_list = splits[dataset_name]['train']
    X_valid, y_valid, w_valid = splits[dataset_name]['valid']
    X_test, y_test, w_test = splits[dataset_name]['test']

    # pretraining
    unsupervised_model = TabNetPretrainer() # define sampling weights here
    unsupervised_model.fit(
        X_train=X_train,
        eval_set=[X_valid],
        max_epochs=max_epochs, patience=5,
        batch_size=2048, virtual_batch_size=128,
        num_workers=0,
        drop_last=False,
        pretraining_ratio=0.5,
    )

    #unsupervised_model.save_model('/content/drive/MyDrive/Colab Notebooks/tabnet/test_pretrain')
    #loaded_pretrain = TabNetPretrainer()
    #loaded_pretrain.load_model('/content/drive/MyDrive/Colab Notebooks/tabnet/test_pretrain.zip')

    clf = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                          optimizer_params=dict(lr=2e-3),
                          scheduler_params={"step_size":10, # how to use learning rate scheduler
                                            "gamma":0.9},
                          scheduler_fn=torch.optim.lr_scheduler.StepLR,
                          mask_type='sparsemax', # This will be overwritten if using pretrain model
                          verbose=5,
                          )
    clf.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['auc'],
        max_epochs=max_epochs, patience=20,
        batch_size=1024, virtual_batch_size=128,
        num_workers=0,
        # NOTE(review): per the pytorch-tabnet docs, weights=1 requests automated
        # sampling by inverse class occurrence; the survey weights are not used
        # during training — confirm this is intended.
        weights=1,
        drop_last=False,
        from_unsupervised=unsupervised_model,
    )

    # The survey sampling weights are applied only here, when scoring the test split.
    preds = clf.predict(X_test)
    f1 = f1_score(y_test, preds, sample_weight=w_test)

    # Order the features from most to least important.
    feature_importances = clf.feature_importances_
    sorted_indices = np.argsort(feature_importances)[::-1]
    sorted_importances = feature_importances[sorted_indices]
    feature_indices = feat_list[sorted_indices]

    fig = plt.figure(figsize=(12, 8))  # Adjust the figure size as necessary
    plt.title(dataset_name)
    plt.bar(feature_indices[:top_n], sorted_importances[:top_n], align='center')
    plt.xticks(feature_indices[:top_n], rotation='vertical')  # Label x-ticks with feature names
    plt.ylabel('Importance')
    plt.xlabel('Features')
    plt.tight_layout()
    plt.savefig(results_dir / f"{dataset_name}_feature_importances.png")
    plt.show()
    # Release the figure; otherwise one open figure accumulates per dataset.
    plt.close(fig)

    # Create a DataFrame with feature names and importances
    feature_data = pd.DataFrame({
        'Feature': feature_indices,
        'Importance': sorted_importances
    })

    # Append the weighted test F1 as one extra row in its own 'f1-score' column.
    # DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
    feature_data = pd.concat(
        [feature_data, pd.DataFrame([{'f1-score': f1}])],
        ignore_index=True,
    )

    # Save to CSV
    feature_data.to_csv(results_dir / f'{dataset_name}_feature_importances_scores.csv', index=False)




IndexError: ignored