In [None]:
import pandas as pd
import numpy as np
import os
import json
import joblib

import altair as alt
import vegafusion as vf

# Opt in to pandas copy-on-write semantics for the whole notebook
pd.options.mode.copy_on_write = True
# Route Altair chart data through the VegaFusion data transformer
alt.data_transformers.enable("vegafusion")

In [None]:
# Pre-define chart function
def chart(df, x, y, title, color=alt.value('steelblue'), width=480, height=320):
    """Build the notebook's standard Altair chart skeleton.

    The returned chart has encodings, title, size and theme applied but no
    mark yet; callers finish it with e.g. ``.mark_bar()``.

    Parameters
    ----------
    df : data source handed to ``alt.Chart``.
    x, y : encodings for the two positional channels (shorthand strings or
        ``alt.X`` / ``alt.Y`` objects).
    title : chart title.
    color : colour encoding; defaults to a constant steel blue.
    width, height : chart size passed to ``properties``.
    """
    # Minimal axis look: no domain line, no ticks, horizontal small gray labels
    axis_style = alt.AxisConfig(
        domain=False,
        ticks=False,
        labelAngle=0,
        labelColor='gray',
        labelFontSize=10,
    )
    # stroke=None removes the border drawn around the plotting area
    borderless_view = alt.ViewConfig(stroke=None)

    encoded = alt.Chart(df).encode(x=x, y=y, color=color)
    sized = encoded.properties(title=title, width=width, height=height)
    return sized.configure(axis=axis_style, font='Helvetica Neue', view=borderless_view)

## 1 - Load the training data

In [None]:
# Directory holding the cleaned data files, relative to this notebook
input_path = os.path.join('..', 'data', 'cleaned')

# Read the training split and preview its first rows
train_df = pd.read_parquet(os.path.join(input_path, 'train.parquet'))
train_df.head()

## 2 - Load the variables dictionary and define features for classification

In [None]:
# Load the variables dictionary (category name -> list of column names)
with open(os.path.join(input_path, 'variables.json'), 'r', encoding='utf-8') as f:
    variables = json.load(f)

print('Variable Categories:\n')
# The loop variable is deliberately not called `list`: a module-level loop
# variable stays bound after the loop and would shadow the builtin for every
# later cell.
for category, column_names in variables.items():
    print(f'{category}')
    print(f'{column_names}')

### 2.1 Defining features

##### !!! The statistical test results for the features should be consulted first

In [None]:
# Frequency of each distinct combination of values across the visit-reason columns
train_df[variables['visitReason']].value_counts()

In [None]:
# Independent variables used as classification features: demographics and
# tobacco use, then the visit-reason, past-visit, vital-sign, present-symptom
# and free-text column groups from the variables dictionary, in that order.
features = [
    'AGE', 'AGER', 'SEX', 'USETOBAC',
    *variables['visitReason'],
    'PASTVIS',
    *variables['vitalSigns'],
    *variables['presentSymptomsStatus'],
    *variables['textFeature'],
]

print(f'Features: {features}')
print(f'Number of Features: {len(features)}')

In [None]:
# Independent copy of the selected feature columns; all later feature engineering happens on X_train
X_train = train_df.loc[:, features].copy()

## 3 - Preprocess and engineer the features

 ### 3.1 - Bin the Reason for Visit variables into Modules
 RFV1, RFV2, RFV3

In [None]:
# Load the REASON FOR VISIT classification summary of codes
rfv_summary = pd.read_excel(os.path.join('..', 'data', 'raw', 'RFV_codes_summary.xlsx'))

# Split the 'CODE NUMBER' column on '-' into integer 'START' and 'END' columns
# (assumes every entry has the form '<start>-<end>' — TODO confirm against the sheet)
rfv_summary[['START', 'END']] = rfv_summary['CODE NUMBER'].str.split('-', expand=True).astype(int)

# Remove the leading and trailing whitespaces from `MODULE_1` and `MODULE_2` columns
rfv_summary['MODULE_1'] = rfv_summary['MODULE_1'].str.strip()
rfv_summary['MODULE_2'] = rfv_summary['MODULE_2'].str.strip()

# Preview the parsed ranges
rfv_summary.head(5)

In [None]:
# Find the `START` and `END` range, 
# and map the corresponding `MODULE_1` and `MODULE_2` to X_train as new columns `MODULE_1` and `MODULE_2`,
# according to the value of `RFV1`, `RFV2`, and `RFV3` columns
def get_module(code):
    """Look up the module labels whose [START, END] range contains `code`.

    Reads the notebook-level `rfv_summary` table. Returns the `MODULE_1` /
    `MODULE_2` values of the first matching row as a Series; when no range
    contains `code`, returns a Series of two `pd.NA` with the same labels.
    """
    in_range = rfv_summary['START'].le(code) & rfv_summary['END'].ge(code)
    matches = rfv_summary.loc[in_range, ['MODULE_1', 'MODULE_2']]
    if not matches.empty:
        return matches.iloc[0]
    return pd.Series([pd.NA, pd.NA], index=['MODULE_1', 'MODULE_2'])



def _rfv_to_modules(code):
    """Return the MODULE_1 / MODULE_2 pair for one raw RFV value.

    Missing values yield a pair of `pd.NA`; otherwise `get_module` is queried
    with the first four characters of the value, converted to an integer.
    """
    if pd.isna(code):
        return pd.Series([pd.NA, pd.NA], index=['MODULE_1', 'MODULE_2'])
    return get_module(int(str(code)[:4]))


# Same mapping for each of the three reason-for-visit columns: add
# `<RFV>_MOD1` / `<RFV>_MOD2` to X_train and report the MOD2 distribution.
for rfv_column in ('RFV1', 'RFV2', 'RFV3'):
    mod1_column, mod2_column = f'{rfv_column}_MOD1', f'{rfv_column}_MOD2'
    X_train[[mod1_column, mod2_column]] = X_train[rfv_column].apply(_rfv_to_modules)
    print(f'{rfv_column} unique values: \n{X_train[mod2_column].value_counts()}')

### 3.2 - Binning of quantitative variables to categorical features
Bin the following quantitative variables:

AGE, BMI, TEMPF, BPSYS, BPDIAS

#### 3.2.1 - Bin the AGE variable
Do we bin as recoded age groups (`AGER`) or as each 20 years?

In [None]:
# Distribution of `AGER` (recoded age groups):
#   1 = Under 15 years
#   2 = 15-24 years
#   3 = 25-44 years
#   4 = 45-64 years
#   5 = 65-74 years
#   6 = 75 years and over

# Bar chart of record counts per AGER code
chart(train_df, 'AGER:O', 'count()', 'Distribution of AGER').mark_bar()

In [None]:
# Bin the AGE variable into age groups (upper bounds are exclusive):
#   < 20  = Child or Teenager
#   20-40 = Adult
#   40-60 = Middle Aged
#   >= 60 = Senior
# `age_groups` also lists the finer Infant / Toddler / Child / Teenager labels;
# `bin_age` does not produce them, the list is only used to order the chart axis.

age_groups = ['Infant', 'Toddler', 'Child', 'Teenager', 'Child or Teenager', 'Adult', 'Middle Aged', 'Senior']

def bin_age(age):
    """Map an age to its age-group label; missing ages map to `pd.NA`."""
    if pd.isna(age):
        return pd.NA
    # Walk the exclusive upper bounds in ascending order; first match wins
    for upper_bound, label in ((20, 'Child or Teenager'), (40, 'Adult'), (60, 'Middle Aged')):
        if age < upper_bound:
            return label
    return 'Senior'
    

X_train['AGE_GROUP'] = X_train['AGE'].map(bin_age)

# Bar chart of the resulting age groups, with the axis ordered as in `age_groups`
chart(X_train, alt.X('AGE_GROUP:O', sort=age_groups), 'count()', 'Distribution of AGE GROUPS').mark_bar()

#### 3.2.2 - Bin the vitalSigns variables
BMI, TEMPF, BPSYS, BPDIAS

In [None]:
# Bin the BMI as weight status (upper bounds are exclusive):
#   < 18.5  = Underweight
#   18.5-25 = Normal weight
#   25-30   = Overweight
#   >= 30   = Obesity

bmi_groups = ['Underweight', 'Normal weight', 'Overweight', 'Obesity']

def bin_bmi(bmi):
    """Map a BMI value to its weight-status label; missing values map to `pd.NA`."""
    if pd.isna(bmi):
        return pd.NA
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal weight'
    if bmi < 30:
        return 'Overweight'
    return 'Obesity'

X_train['BMI_GROUP'] = X_train['BMI'].map(bin_bmi)

# Bar chart of the weight-status groups, with the axis ordered as in `bmi_groups`
chart(X_train, alt.X('BMI_GROUP:O', sort=bmi_groups), 'count()', 'Distribution of BMI GROUPS').mark_bar()

In [None]:
# Bin the TEMPF as fever status (upper bounds are exclusive):
#   < 95   = Hypothermia
#   95-99  = Normal temperature
#   99-103 = Fever
#   >= 103 = Hyperpyrexia
# 'Low grade fever' is still listed in `tempf_groups` (chart ordering only);
# `bin_tempf` folds that range into 'Fever'.

tempf_groups = ['Hypothermia', 'Normal temperature', 'Low grade fever', 'Fever', 'Hyperpyrexia']

def bin_tempf(tempf):
    """Map a temperature in Fahrenheit to its fever-status label; missing values map to `pd.NA`."""
    if pd.isna(tempf):
        return pd.NA
    upper_bounds = ((95, 'Hypothermia'), (99, 'Normal temperature'), (103, 'Fever'))
    # First bound the reading falls below decides the label; otherwise top bin
    return next((label for bound, label in upper_bounds if tempf < bound), 'Hyperpyrexia')

X_train['TEMPF_GROUP'] = X_train['TEMPF'].map(bin_tempf)

# Bar chart of the fever-status groups, with the axis ordered as in `tempf_groups`
chart(X_train, alt.X('TEMPF_GROUP:O', sort=tempf_groups), 'count()', 'Distribution of TEMPF GROUPS').mark_bar()

In [None]:
# Bin the BPSYS as systolic blood pressure status (upper bounds are exclusive):
#   < 90    = Hypotension
#   90-120  = Normal blood pressure
#   120-140 = Prehypertension
#   >= 140  = Hypertension

bpsys_groups = ['Hypotension', 'Normal blood pressure', 'Prehypertension', 'Hypertension']

def bin_bpsys(bpsys):
    """Map a systolic reading to its blood-pressure label; missing values map to `pd.NA`."""
    if pd.isna(bpsys):
        return pd.NA
    labels = ('Hypotension', 'Normal blood pressure', 'Prehypertension', 'Hypertension')
    # Number of thresholds reached == position of the label
    thresholds_reached = sum(1 for threshold in (90, 120, 140) if not bpsys < threshold)
    return labels[thresholds_reached]

X_train['BPSYS_GROUP'] = X_train['BPSYS'].map(bin_bpsys)

# Bar chart of the systolic groups, with the axis ordered as in `bpsys_groups`
chart(X_train, alt.X('BPSYS_GROUP:O', sort=bpsys_groups), 'count()', 'Distribution of BPSYS GROUPS').mark_bar()

In [None]:
# Bin the BPDIAS as diastolic blood pressure status (upper bounds are exclusive):
#   < 60   = Low diastolic blood pressure
#   60-90  = Normal diastolic blood pressure
#   90-110 = High diastolic blood pressure
#   >= 110 = Hypertension

bpdias_groups = [
    'Low diastolic blood pressure',
    'Normal diastolic blood pressure',
    'High diastolic blood pressure',
    'Hypertension',
]

def bin_bpdias(bpdias):
    """Map a diastolic reading to its blood-pressure label; missing values map to `pd.NA`."""
    if pd.isna(bpdias):
        return pd.NA
    return (
        'Low diastolic blood pressure' if bpdias < 60
        else 'Normal diastolic blood pressure' if bpdias < 90
        else 'High diastolic blood pressure' if bpdias < 110
        else 'Hypertension'
    )

X_train['BPDIAS_GROUP'] = X_train['BPDIAS'].map(bin_bpdias)

# Bar chart of the diastolic groups, ordered as in `bpdias_groups`;
# the long labels are tilted 45 degrees on the x axis
chart(
    X_train, alt.X('BPDIAS_GROUP:O', sort=bpdias_groups), 'count()', 'Distribution of BPDIAS GROUPS'
).mark_bar().configure_axisX(labelAngle=45)

### 3.3 - Create interaction features
AGE, HTIN, WTLB, BMI, BPSYS, BPDIAS, CEBVD, CHF, DIABETES, HYPLIPID, HTN, OBESITY

### 3.4 - Redefine the features for training

In [None]:
# Columns with exactly two distinct observed values are treated as binary.
# NOTE(review): `nunique()` ignores missing values by default and this cell runs
# before the sentinel fill in section 3.5, so a column with two observed values
# plus missing entries is also classified as binary here — confirm that is intended.
binary_features = [feature for feature in features if X_train[feature].nunique() == 2]
print(f'Binary Features: {binary_features}')

ordinal_features = ['CASTAGE']
print(f'Ordinal Features: {ordinal_features}')
print()

# With Binned Groups
# (quantitative columns that have no binned counterpart stay numeric)
quantitative_features_w_bin = ['PASTVIS', 'HTIN', 'WTLB']
print(f'Quantitative Features with Binned Groups: {quantitative_features_w_bin}')

# Nominal set: raw nominal columns + RFV module labels + the *_GROUP bins created above
nominal_features_w_bin = ['INJDET', 'MAJOR'] + ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1'] + ['RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2'] + ['AGE_GROUP', 'BMI_GROUP', 'TEMPF_GROUP', 'BPSYS_GROUP', 'BPDIAS_GROUP']
print(f'Nominal Features with Binned Groups: {nominal_features_w_bin}')

print(f'Number of Features with Binned Groups: {len(quantitative_features_w_bin + binary_features + ordinal_features + nominal_features_w_bin)}')
print()

# Without Binned Groups
# (original measurements and raw RFV codes instead of bins and modules)
quantitative_features_wo_bin = ['AGE', 'PASTVIS', 'HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS']
print(f'Quantitative Features without Binned Groups: {quantitative_features_wo_bin}')
nominal_features_wo_bin = ['INJDET', 'MAJOR'] + ['RFV1', 'RFV2', 'RFV3']
print(f'Nominal Features without Binned Groups: {nominal_features_wo_bin}')

print(f'Number of Features without Binned Groups: {len(quantitative_features_wo_bin + binary_features + ordinal_features + nominal_features_wo_bin)}')


### 3.5 - Handling missing values in categorical features

In [None]:
# Check the missing values in X_train
print(f'Missing Values in X_train: \n{X_train.isna().sum().where(lambda x: x > 0).dropna()}')
print()

# Sentinel used for missing entries, per categorical column:
#   -9    for 'CASTAGE' and the raw reason-for-visit codes
#   -999  for 'USETOBAC', 'INJDET', 'MAJOR'
#   'NA'  for the RFV module labels and the binned vital-sign groups
# NOTE(review): 'AGE_GROUP' is not filled here; confirm AGE has no missing values.
categorical_fill_values = {
    'CASTAGE': -9,
    'USETOBAC': -999, 'INJDET': -999, 'MAJOR': -999,
    'RFV1': -9, 'RFV2': -9, 'RFV3': -9,
    'RFV1_MOD1': 'NA', 'RFV2_MOD1': 'NA', 'RFV3_MOD1': 'NA',
    'RFV1_MOD2': 'NA', 'RFV2_MOD2': 'NA', 'RFV3_MOD2': 'NA',
    'BMI_GROUP': 'NA', 'TEMPF_GROUP': 'NA', 'BPSYS_GROUP': 'NA', 'BPDIAS_GROUP': 'NA',
}

# One non-mutating pass instead of four `inplace=True` calls: the cell stays
# idempotent and the frame is only rebound once the fill succeeded.
X_train = X_train.fillna(categorical_fill_values)

# Check the missing values in X_train after filling
print(f'Missing Values in X_train after Filling: \n{X_train.isna().sum().where(lambda x: x > 0).dropna()}')

## 4 - Prepare dependent variables

In [None]:
# Check the missing values in 'DIAG1', 'DIAG2', and 'DIAG3'
print(f'Missing Values in DIAG1: {train_df["DIAG1"].isna().sum()}')
print(f'Missing Values in DIAG2: {train_df["DIAG2"].isna().sum()}')
print(f'Missing Values in DIAG3: {train_df["DIAG3"].isna().sum()}')
print()

# Check the numbers of ruled out or questionable diagnoses
# (when 'PRDIAG1', 'PRDIAG2', and 'PRDIAG3' equals to 1).
# The rows are counted by comparing with 1 rather than summing the raw column,
# so the result matches the definition above whatever other codes the flag holds.
print(f'Number of Ruled Out Diagnoses in DIAG1: {train_df["PRDIAG1"].eq(1).sum()}')
print(f'Number of Ruled Out Diagnoses in DIAG2: {train_df["PRDIAG2"].eq(1).sum()}')
print(f'Number of Ruled Out Diagnoses in DIAG3: {train_df["PRDIAG3"].eq(1).sum()}')
print()

# Check the number of samples with missing 'DIAG1' and 'PRDIAG1' equals to 1
print(f'Number of Samples with Missing DIAG1 and PRDIAG1 equals to 1: {(train_df["DIAG1"].isna() & train_df["PRDIAG1"].eq(1)).sum()}')
print()

# Check the number of available dependent samples
# (when 'DIAG1' is not missing and 'PRDIAG1' is not 1)
print(f'Number of Available Dependent Samples: {(train_df["DIAG1"].notna() & train_df["PRDIAG1"].ne(1)).sum()}')

### 4.1 - Load the list of three-digit categories of ICD-9-CM

In [None]:
# Load the list of three-digit categories of ICD-9-CM.
# Every column is read as text (dtype=str) so the codes can be compared with
# string prefixes of the diagnosis codes.
icd9cm_3dcat = pd.read_excel(os.path.join('..', 'data', 'raw', 'ICD9CM_3DCat.xlsx'), dtype=str)

# Preview the lookup table
icd9cm_3dcat.head()

### 4.2 - Employing the hierarchical classifications of ICD-9-CM codes to prepare the target labels

In [None]:
# Map the three-digit categories of ICD-9-CM to 'DIAG1', 'DIAG2', and 'DIAG3',
# if 'PRDIAG1', 'PRDIAG2', and 'PRDIAG3' are not 1 respectively

def get_icd9cm_3dcat(diag, prdiag, category='CATEGORY_1'):
    """Map a diagnosis code to its ICD-9-CM three-digit category label.

    Parameters
    ----------
    diag : diagnosis code; its first three characters are matched against the
        '3D_CODE' column of the notebook-level `icd9cm_3dcat` table.
    prdiag : ruled-out / questionable flag for the diagnosis; a value of 1
        disqualifies the diagnosis, a missing flag does not.
    category : column of `icd9cm_3dcat` to return (e.g. 'CATEGORY_1').

    Returns
    -------
    The category label, the fixed label 'No diagnosis/disease or healthy' for
    the code 'V997-', or `pd.NA` when the diagnosis is missing, flagged as
    ruled out, or not present in the lookup table.
    """
    # `or` / `and` instead of `|`: the bitwise operator binds tighter than `!=`,
    # so the previous condition compared (isna | prdiag) with 1 and raised on a
    # NaN flag, which a bare `except` then turned into a silent None.
    ruled_out = pd.notna(prdiag) and prdiag == 1
    if pd.isna(diag) or ruled_out:
        return pd.NA

    code = str(diag)
    if code == 'V997-':
        return 'No diagnosis/disease or healthy'

    matches = icd9cm_3dcat.loc[icd9cm_3dcat['3D_CODE'] == code[:3], category]
    if matches.empty:
        # Unknown code: keep the diagnostic output, but return an explicit NA
        print(f'Error: {diag}')
        print(f'Error: {prdiag}')
        return pd.NA
    return matches.iloc[0]

    
# Smoke check: CATEGORY_2 label for the first training record
get_icd9cm_3dcat(train_df.iloc[0].DIAG1, train_df.iloc[0].PRDIAG1, category='CATEGORY_2')

In [None]:
# Build the target labels from the first diagnosis only: each row's 'DIAG1' is
# mapped to its ICD-9-CM three-digit category unless 'PRDIAG1' disqualifies it.
# Two label granularities are produced ('CATEGORY_1' and 'CATEGORY_2').

y_train = train_df.apply(lambda x: get_icd9cm_3dcat(x.DIAG1, x.PRDIAG1, category='CATEGORY_1'), axis=1)
y_train_cat2 = train_df.apply(lambda x: get_icd9cm_3dcat(x.DIAG1, x.PRDIAG1, category='CATEGORY_2'), axis=1)

# Both results are one-dimensional (one label per training row)
print(f'Dependent DataFrame with CATEGORY_1 Shape: {y_train.shape}')
print(f'Dependent DataFrame with CATEGORY_2 Shape: {y_train_cat2.shape}')

### 4.3 - Drop the rows from both X_train, y_train with NA in y_train

In [None]:
# Rows that received a CATEGORY_1 label; the same mask filters features and both targets
non_missing_mask = y_train.notna()

print(f'Number of available dependent samples: {non_missing_mask.sum()}')
print()

X_train = X_train.loc[non_missing_mask]
y_train_cat2 = y_train_cat2.loc[non_missing_mask]
y_train = y_train.loc[non_missing_mask]

print(f'X_train Shape: {X_train.shape}')
print(f'y_train with CATEGORY_1 Shape: {y_train.shape}')
print(f'y_train with CATEGORY_2 Shape: {y_train_cat2.shape}')

### 4.4 - Add in text feature
From 'RFV1_TEXT', 'RFV2_TEXT', 'RFV3_TEXT'

From combined textual feature

In [None]:
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Seed for the stochastic steps; bound as the default `random_state` of generate_topic_features
random_state = 42

#### 4.4.1 - Combine and preprocess textual features

In [None]:
# Combine 'RFV1_TEXT', 'RFV2_TEXT', 'RFV3_TEXT' into 'TEXT', separated by single
# spaces; a missing part contributes an empty string
X_train['TEXT'] = X_train['RFV1_TEXT'].fillna('') + ' ' + X_train['RFV2_TEXT'].fillna('') + ' ' + X_train['RFV3_TEXT'].fillna('')

In [None]:
# Preprocess the text features with spaCy: load the 'en_core_web_sm' pipeline
# once; preprocess_text reuses it for every record
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Normalise free text to space-joined, lower-cased lemmas.

    Runs `text` through the notebook-level `nlp` pipeline and keeps the lemma
    of every token that is neither a stop word nor punctuation.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return ' '.join(kept_lemmas)

# Overwrite TEXT with its preprocessed form, one record at a time
X_train['TEXT'] = X_train['TEXT'].apply(preprocess_text)

#### 4.4.2 - Generate text feature with LDA (Topic probabilities)

In [None]:
def generate_topic_features(df, n_topics=10, n_top_words=10, transform=False, random_state=random_state):
    """Generate topic features (topic probabilities) from text features using LDA.

    Parameters
    ----------
    df : DataFrame with a 'TEXT' column of preprocessed documents.
    n_topics : number of LDA components, i.e. of generated `TOPIC_<i>` columns.
    n_top_words : number of highest-weighted terms printed per topic.
    transform : optional rescaling of the topic columns. 'power' takes the
        square root, 'log' takes log(x + 0.0001); any other value leaves the
        values untouched.
    random_state : seed passed to the LDA model.

    Returns
    -------
    (df_with_topics, vectorizer, tf, lda, topic_features) where `tf` is the
    fitted TF-IDF document-term matrix and `topic_features` the list of new
    column names.
    """
    # TF-IDF vectorizer over unigrams, capped at the 1000 most frequent terms;
    # terms present in more than 95% of the documents are ignored
    vectorizer = TfidfVectorizer(
        #stop_words='english',
        ngram_range=(1, 1),
        max_features=1000,
        min_df=1,
        max_df=0.95,
    )
    tf = vectorizer.fit_transform(df['TEXT'])

    lda = LatentDirichletAllocation(n_components=n_topics, learning_method='batch', n_jobs=-1, random_state=random_state)
    lda.fit(tf)

    def display_topics(model, feature_names, n_top_words):
        """Print the `n_top_words` highest-weighted terms of every topic."""
        for topic_idx, topic in enumerate(model.components_):
            print(f'Topic {topic_idx}:')
            # argsort is ascending, so walk it backwards from the end
            print(' '.join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()

    display_topics(lda, vectorizer.get_feature_names_out(), n_top_words)

    # Per-document topic distribution, one column per topic
    topics = lda.transform(tf)
    topic_features = [f'TOPIC_{i}' for i in range(topics.shape[1])]
    print(f'Topic Features: {topic_features}')
    topics = pd.DataFrame(topics, columns=topic_features, index=df.index)

    # Optional rescaling of the topic values
    if transform == 'power':
        topics = np.sqrt(topics)
    elif transform == 'log':
        # small offset keeps the logarithm finite for zero values
        topics = np.log(topics + 0.0001)

    # Remove TOPIC_<i> columns left over from an earlier call before attaching
    # the new ones; otherwise re-running the calling cell on its own output
    # would leave duplicated column names in the frame.
    stale_topic_columns = df.filter(regex=r'^TOPIC_\d+$').columns
    df = pd.concat([df.drop(columns=stale_topic_columns), topics], axis=1)
    print(f'DataFrame Shape: {df.shape}')
    return df, vectorizer, tf, lda, topic_features

In [None]:
# Attach 10 log-transformed topic columns to X_train and keep the fitted
# vectorizer, document-term matrix and LDA model for the visualisation below
X_train, vectorizer, tf, lda, topic_features = generate_topic_features(
    X_train, n_topics=10, n_top_words=10, transform='log'
)

#### 4.4.3 - Visualize the topics

In [None]:
# Visualize the topics with pyLDAvis
import pyLDAvis
import pyLDAvis.lda_model

# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

In [None]:
# Prepare the interactive topic view from the fitted LDA model, the document-term
# matrix and the vectorizer, using t-SNE for the 2-D topic layout
lda_vis = pyLDAvis.lda_model.prepare(lda, tf, vectorizer, mds='tsne')
lda_vis

## 5 - Classification models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.decomposition import PCA
from sklearn.multiclass import OneVsRestClassifier

# Seed passed to the classifiers below (same value as set before the topic modelling)
random_state = 42

### 5.1 - Model selection

In [None]:
# Setup the pipeline
def set_pipeline(model, binary_features, ordinal_features, nominal_features, quantitative_features, imputer=None, n_neighbors=5, pca=False, ovr=False, passthrough_features=None):
    """Assemble the preprocessing + classification pipeline.

    Parameters
    ----------
    model : scikit-learn classifier placed at the end of the pipeline.
    binary_features : columns forwarded unchanged.
    ordinal_features : columns encoded with `OrdinalEncoder` (unseen
        categories are encoded as -9).
    nominal_features : columns one-hot encoded (first level dropped, unseen
        categories ignored).
    quantitative_features : numeric columns. With `imputer='knn'` they go
        through a `KNNImputer`; with `imputer=None` they are forwarded
        unchanged, so the caller must make sure they hold no missing values.
    imputer : None or 'knn'.
    n_neighbors : neighbours used by the KNN imputer.
    pca : if True, insert a default `PCA()` step between scaler and classifier.
    ovr : if True, wrap `model` in `OneVsRestClassifier`.
    passthrough_features : optional extra numeric columns forwarded unchanged
        (never imputed).

    Returns
    -------
    An unfitted `Pipeline` with steps 'preprocessor', 'scaler', optionally
    'pca', and 'classifier'.

    Raises
    ------
    ValueError : if `imputer` is neither None nor 'knn'.
    """
    # ColumnTransformer drops every column that no transformer claims, so the
    # quantitative columns need an explicit entry in both imputer modes;
    # without one they would silently vanish when no imputer is requested.
    if imputer is None:
        quantitative_step = ('quantitative', 'passthrough', quantitative_features)
    elif imputer == 'knn':
        quantitative_step = ('impute', KNNImputer(n_neighbors=n_neighbors), quantitative_features)
    else:
        raise ValueError(f"Unsupported imputer: {imputer!r} (expected None or 'knn')")

    transformers = [
        ('binary', 'passthrough', binary_features),
        quantitative_step,
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), nominal_features),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-9), ordinal_features),
    ]
    if passthrough_features:
        transformers.append(('extra', 'passthrough', list(passthrough_features)))

    # Define the column transformer for the independent variables
    preprocessor = ColumnTransformer(transformers=transformers)

    classifier = OneVsRestClassifier(model) if ovr else model

    # Define the pipeline: preprocess -> scale -> (optional PCA) -> classify
    steps = [
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
    ]
    if pca:
        steps.append(('pca', PCA()))
    steps.append(('classifier', classifier))

    return Pipeline(steps=steps)

#### 5.1.1 - Logistic Regression - Baseline model

##### 5.1.1.1 - Logistic Regression - Dropping the quantitative columns with missing values

In [None]:
# With Binned Groups and RFV Modules
quantitative_features = quantitative_features_w_bin
nominal_features = nominal_features_w_bin

# Drop the quantitative columns with missing values (this run uses no imputer).
# The NaN scan runs once, and `features` is rebuilt from the surviving columns
# so that the summary below and the frame handed to cross-validation agree.
dropped_columns = [feature for feature in quantitative_features if X_train[feature].isna().any()]
print(f'Dropping columns with missing values: {set(dropped_columns)}')
quantitative_features = [feature for feature in quantitative_features if feature not in dropped_columns]

features = binary_features + ordinal_features + nominal_features + quantitative_features

print()
print(f'Shape of X_train after dropping columns with missing values: {X_train[features].shape}')
print(f'Features after dropping columns with missing values: \n{X_train[features].columns}')
print()

# Define the model: class-balanced logistic regression as the baseline
pipeline = set_pipeline(
    model=LogisticRegression(
        random_state=random_state, n_jobs=-1,
        max_iter=1000, class_weight='balanced'
    ),
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
)

# Cross-validate the model (5 folds, weighted F1)
scores = cross_val_score(
    pipeline, X_train[features], y_train, cv=5, n_jobs=-1,
    scoring='f1_weighted'
)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

In [None]:
# With original quantitative data and RFV codes
quantitative_features = quantitative_features_wo_bin
nominal_features = nominal_features_wo_bin

# Drop the quantitative columns with missing values (this run uses no imputer).
# The NaN scan runs once, and `features` is rebuilt from the surviving columns
# so that the summary below and the frame handed to cross-validation agree.
dropped_columns = [feature for feature in quantitative_features if X_train[feature].isna().any()]
print(f'Dropping columns with missing values: {set(dropped_columns)}')
quantitative_features = [feature for feature in quantitative_features if feature not in dropped_columns]
print()

features = binary_features + ordinal_features + nominal_features + quantitative_features

print(f'Shape of X_train after dropping columns with missing values: {X_train[features].shape}')
print(f'Features after dropping columns with missing values: \n{X_train[features].columns}')
print()

# Define the model: class-balanced logistic regression as the baseline
pipeline = set_pipeline(
    model=LogisticRegression(
        random_state=random_state, n_jobs=-1,
        max_iter=1000, class_weight='balanced'
    ),
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
)

# Cross-validate the model (5 folds, weighted F1)
scores = cross_val_score(
    pipeline, X_train[features], y_train, cv=5, n_jobs=-1,
    scoring='f1_weighted'
)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

##### 5.1.1.2 - Logistic Regression - Imputing the missing values - KNN Imputer

In [None]:
# Feature set: original quantitative measurements and raw RFV codes;
# gaps in the quantitative columns are filled by KNN imputation
quantitative_features = quantitative_features_wo_bin
nominal_features = nominal_features_wo_bin

features = [*binary_features, *ordinal_features, *nominal_features, *quantitative_features]

# Report what is still missing and what will be fitted
print(f'Missing Values in X_train: \n{X_train[features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[features].shape}')
print(f'Features to be fit: \n{X_train[features].columns}')
print()

# Class-balanced logistic regression behind the shared preprocessing pipeline
n_neighbors = 5
pipeline = set_pipeline(
    LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=-1, random_state=random_state),
    binary_features,
    ordinal_features,
    nominal_features,
    quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors,
)

# 5-fold cross-validation scored with weighted F1
scores = cross_val_score(pipeline, X_train[features], y_train, scoring='f1_weighted', cv=5, n_jobs=-1)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

In [None]:
# Feature set: binned age / vital-sign groups with raw RFV codes;
# the remaining quantitative columns are filled by KNN imputation
quantitative_features = ['PASTVIS', 'HTIN', 'WTLB']
nominal_features = [
    'INJDET', 'MAJOR', 'RFV1', 'RFV2', 'RFV3',
    'AGE_GROUP', 'BMI_GROUP', 'TEMPF_GROUP', 'BPSYS_GROUP', 'BPDIAS_GROUP',
]

features = [*binary_features, *ordinal_features, *nominal_features, *quantitative_features]

# Report what is still missing and what will be fitted
print(f'Missing Values in X_train: \n{X_train[features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[features].shape}')
print(f'Features to be fit: \n{X_train[features].columns}')
print()

# Class-balanced logistic regression behind the shared preprocessing pipeline
n_neighbors = 5
pipeline = set_pipeline(
    LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=-1, random_state=random_state),
    binary_features,
    ordinal_features,
    nominal_features,
    quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors,
)

# 5-fold cross-validation scored with weighted F1
scores = cross_val_score(pipeline, X_train[features], y_train, scoring='f1_weighted', cv=5, n_jobs=-1)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

In [None]:
# Feature set: binned groups, raw RFV codes and both levels of RFV modules;
# the remaining quantitative columns are filled by KNN imputation
quantitative_features = ['PASTVIS', 'HTIN', 'WTLB']
nominal_features = [
    'INJDET', 'MAJOR', 'RFV1', 'RFV2', 'RFV3',
    'AGE_GROUP', 'BMI_GROUP', 'TEMPF_GROUP', 'BPSYS_GROUP', 'BPDIAS_GROUP',
    'RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1',
    'RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2',
]

features = [*binary_features, *ordinal_features, *nominal_features, *quantitative_features]

# Report what is still missing and what will be fitted
print(f'Missing Values in X_train: \n{X_train[features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[features].shape}')
print(f'Features to be fit: \n{X_train[features].columns}')
print()

# Class-balanced logistic regression behind the shared preprocessing pipeline
n_neighbors = 5
pipeline = set_pipeline(
    LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=-1, random_state=random_state),
    binary_features,
    ordinal_features,
    nominal_features,
    quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors,
)

# 5-fold cross-validation scored with weighted F1
scores = cross_val_score(pipeline, X_train[features], y_train, scoring='f1_weighted', cv=5, n_jobs=-1)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

In [None]:
# With Binned Groups and RFV codes + RFV Modules + Topic Features
quantitative_features = ['PASTVIS', 'HTIN', 'WTLB']
nominal_features = ['INJDET', 'MAJOR', 'RFV1', 'RFV2', 'RFV3', 'AGE_GROUP', 'BMI_GROUP', 'TEMPF_GROUP', 'BPSYS_GROUP', 'BPDIAS_GROUP']
nominal_features = nominal_features + ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1'] + ['RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2']

features = binary_features + ordinal_features + nominal_features + quantitative_features + topic_features

print(f'Missing Values in X_train: \n{X_train[features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[features].shape}')
print(f'Features to be fit: \n{X_train[features].columns}')
print()

# Define the model
n_neighbors = 5
pipeline = set_pipeline(
    model=LogisticRegression(
        random_state=random_state, n_jobs=-1,
        max_iter=1000, class_weight='balanced'
    ),
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors
)

# Route the topic probabilities through the preprocessor explicitly:
# ColumnTransformer drops columns that no transformer claims (remainder='drop'
# by default) and none of the four feature lists above contains them, so
# without this entry the topic columns would never reach the classifier.
pipeline.named_steps['preprocessor'].transformers.append(('topics', 'passthrough', topic_features))

# Cross-validate the model (5 folds, weighted F1)
scores = cross_val_score(
    pipeline, X_train[features], y_train, cv=5, n_jobs=-1,
    scoring='f1_weighted'
)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

In [None]:
# With original quantitative data and RFV codes + RFV Modules + Topic Features
quantitative_features = quantitative_features_wo_bin
nominal_features = nominal_features_wo_bin
nominal_features = nominal_features + ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1'] + ['RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2']

features = binary_features + ordinal_features + nominal_features + quantitative_features + topic_features

print(f'Missing Values in X_train: \n{X_train[features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[features].shape}')
print(f'Features to be fit: \n{X_train[features].columns}')
print()

# Define the model
n_neighbors = 5
pipeline = set_pipeline(
    model=LogisticRegression(
        random_state=random_state, n_jobs=-1,
        max_iter=1000, class_weight='balanced'
    ),
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors
)

# Route the topic probabilities through the preprocessor explicitly:
# ColumnTransformer drops columns that no transformer claims (remainder='drop'
# by default) and none of the four feature lists above contains them, so
# without this entry the topic columns would never reach the classifier.
pipeline.named_steps['preprocessor'].transformers.append(('topics', 'passthrough', topic_features))

# Cross-validate the model (5 folds, weighted F1)
scores = cross_val_score(
    pipeline, X_train[features], y_train, cv=5, n_jobs=-1,
    scoring='f1_weighted'
)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

#### 5.1.2 - Random Forest

In [None]:
# Feature set: binned groups, raw RFV codes and both levels of RFV modules;
# the remaining quantitative columns are filled by KNN imputation
quantitative_features = ['PASTVIS', 'HTIN', 'WTLB']
nominal_features = [
    'INJDET', 'MAJOR', 'RFV1', 'RFV2', 'RFV3',
    'AGE_GROUP', 'BMI_GROUP', 'TEMPF_GROUP', 'BPSYS_GROUP', 'BPDIAS_GROUP',
    'RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1',
    'RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2',
]

features = [*binary_features, *ordinal_features, *nominal_features, *quantitative_features]

# Report what is still missing and what will be fitted
print(f'Missing Values in X_train: \n{X_train[features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[features].shape}')
print(f'Features to be fit: \n{X_train[features].columns}')
print()

# Class-balanced random forest (1000 entropy-split trees) behind the shared pipeline
n_neighbors = 5
pipeline = set_pipeline(
    RandomForestClassifier(
        n_estimators=1000, criterion='entropy', class_weight='balanced',
        n_jobs=-1, random_state=random_state,
    ),
    binary_features,
    ordinal_features,
    nominal_features,
    quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors,
)

# 5-fold cross-validation scored with weighted F1
scores = cross_val_score(pipeline, X_train[features], y_train, scoring='f1_weighted', cv=5, n_jobs=-1)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

In [None]:
# With Binned Groups and RFV codes + RFV Modules + Topic Features
quantitative_features = ['PASTVIS', 'HTIN', 'WTLB']
nominal_features = ['INJDET', 'MAJOR', 'RFV1', 'RFV2', 'RFV3', 'AGE_GROUP', 'BMI_GROUP', 'TEMPF_GROUP', 'BPSYS_GROUP', 'BPDIAS_GROUP']
nominal_features = nominal_features + ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1'] + ['RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2']

features = binary_features + ordinal_features + nominal_features + quantitative_features + topic_features

print(f'Missing Values in X_train: \n{X_train[features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[features].shape}')
print(f'Features to be fit: \n{X_train[features].columns}')
print()

# Define the model
n_neighbors = 5
pipeline = set_pipeline(
    model=RandomForestClassifier(
        n_estimators=1000,
        criterion='entropy',
        class_weight='balanced',
        n_jobs=-1,
        random_state=random_state
    ),
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors
)

# Route the topic probabilities through the preprocessor explicitly:
# ColumnTransformer drops columns that no transformer claims (remainder='drop'
# by default) and none of the four feature lists above contains them, so
# without this entry the topic columns would never reach the classifier.
pipeline.named_steps['preprocessor'].transformers.append(('topics', 'passthrough', topic_features))

# Cross-validate the model (5 folds, weighted F1)
scores = cross_val_score(
    pipeline, X_train[features], y_train, cv=5, n_jobs=-1,
    scoring='f1_weighted'
)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

In [None]:
# With original quantitative data and RFV codes + RFV Modules + Topic Features
quantitative_features = quantitative_features_wo_bin
nominal_features = nominal_features_wo_bin
nominal_features = nominal_features + ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1'] + ['RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2']

features = binary_features + ordinal_features + nominal_features + quantitative_features + topic_features

print(f'Missing Values in X_train: \n{X_train[features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[features].shape}')
print(f'Features to be fit: \n{X_train[features].columns}')
print()

# Define the model
n_neighbors = 5
pipeline = set_pipeline(
    model=RandomForestClassifier(
        n_estimators=1000,
        criterion='entropy',
        class_weight='balanced',
        n_jobs=-1,
        random_state=random_state
    ),
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors
)

# Route the topic probabilities through the preprocessor explicitly:
# ColumnTransformer drops columns that no transformer claims (remainder='drop'
# by default) and none of the four feature lists above contains them, so
# without this entry the topic columns would never reach the classifier.
pipeline.named_steps['preprocessor'].transformers.append(('topics', 'passthrough', topic_features))

# Cross-validate the model (5 folds, weighted F1)
scores = cross_val_score(
    pipeline, X_train[features], y_train, cv=5, n_jobs=-1,
    scoring='f1_weighted'
)

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

#### 5.1.3 - Multi-layer Perceptron Classifier

In [None]:
# Feature set: original (un-binned) quantitative data, RFV codes, both RFV module
# levels and the topic features
rfv_module_features = ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1', 'RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2']
quantitative_features = quantitative_features_wo_bin
nominal_features = nominal_features_wo_bin + rfv_module_features

features = binary_features + ordinal_features + nominal_features + quantitative_features + topic_features
cv_inputs = X_train[features]

print(f'Missing Values in X_train: \n{cv_inputs.isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {cv_inputs.shape}')
print(f'Features to be fit: \n{cv_inputs.columns}')
print()

# Multi-layer perceptron with library defaults; only the seed is pinned.
# Candidate settings that were tried and left disabled:
#   hidden_layer_sizes=(100, 50), activation='tanh', solver='adam',
#   alpha=0.0001, max_iter=1000
n_neighbors = 5
preprocessing_options = dict(
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors,
)
pipeline = set_pipeline(model=MLPClassifier(random_state=random_state), **preprocessing_options)

# 2-fold cross-validation scored with weighted F1
# (switch scoring to 'accuracy' to compare on plain accuracy)
scores = cross_val_score(pipeline, cv_inputs, y_train, cv=2, n_jobs=-1, scoring='f1_weighted')

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

#### 5.1.4 - Histogram-based Gradient Boosting Classification Tree

In [None]:
# Feature set: original (un-binned) quantitative data, RFV codes, both RFV module
# levels and the topic features
rfv_module_features = ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1', 'RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2']
quantitative_features = quantitative_features_wo_bin
nominal_features = nominal_features_wo_bin + rfv_module_features

features = binary_features + ordinal_features + nominal_features + quantitative_features + topic_features
cv_inputs = X_train[features]

print(f'Missing Values in X_train: \n{cv_inputs.isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {cv_inputs.shape}')
print(f'Features to be fit: \n{cv_inputs.columns}')
print()

# Histogram-based gradient boosting with explicitly spelled-out hyper-parameters.
# Options left disabled on purpose: max_features=1.0,
# categorical_features=binary_features + ordinal_features + nominal_features,
# interaction_cst='pairwise'
n_neighbors = 5
hist_gb = HistGradientBoostingClassifier(
    loss='log_loss',
    learning_rate=0.1,
    max_iter=100,
    max_leaf_nodes=31,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0,
    max_bins=255,
    random_state=random_state,
    class_weight='balanced',
)
pipeline = set_pipeline(
    model=hist_gb,
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors,
)

# 5-fold cross-validation scored with weighted F1
# (switch scoring to 'accuracy' to compare on plain accuracy)
scores = cross_val_score(pipeline, cv_inputs, y_train, cv=5, n_jobs=-1, scoring='f1_weighted')

print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

#### 5.1.5 - Gradient Boosting Classifier

In [None]:
# Feature set: binned groups, RFV codes and both RFV module levels (no topic features here)
rfv_module_features = ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1', 'RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2']
quantitative_features = ['PASTVIS', 'HTIN', 'WTLB']
nominal_features = [
    'INJDET', 'MAJOR', 'RFV1', 'RFV2', 'RFV3',
    'AGE_GROUP', 'BMI_GROUP', 'TEMPF_GROUP', 'BPSYS_GROUP', 'BPDIAS_GROUP',
] + rfv_module_features

features = binary_features + ordinal_features + nominal_features + quantitative_features
cv_inputs = X_train[features]

print(f'Missing Values in X_train: \n{cv_inputs.isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {cv_inputs.shape}')
print(f'Features to be fit: \n{cv_inputs.columns}')
print()

# Gradient boosting wrapped by set_pipeline (configured with imputer='knn')
n_neighbors = 5
gradient_boosting = GradientBoostingClassifier(
    loss='log_loss',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=3,
    random_state=random_state,
)
pipeline = set_pipeline(
    model=gradient_boosting,
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors,
)

# Cross-validation is disabled for this model: the pipeline is only constructed.
# Uncomment to evaluate it like the other candidates.
#scores = cross_val_score(
#    pipeline, X_train[features], y_train, cv=5, n_jobs=-1,
#    scoring='f1_weighted'
#    #scoring='accuracy'
#)

#print(f'Cross-Validation Scores: {scores}')
#print(f'Mean Cross-Validation Score: {scores.mean()}')

### 5.2 - Model evaluation

In [None]:
# Make the project's feature-engineering module (build_features) importable
import sys
import importlib

# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path
features_src_dir = '../src/features/'
if features_src_dir not in sys.path:
    sys.path.append(features_src_dir)

import build_features
# Reload so edits made to build_features.py after the first import are picked up
importlib.reload(build_features)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, label_ranking_average_precision_score, classification_report

In [None]:
# Data locations: cleaned splits and the variables dictionary live in ../data/cleaned,
# the reference spreadsheets in ../data/raw
input_path = os.path.join('..', 'data', 'cleaned')
raw_path = os.path.join('..', 'data', 'raw')

# Variables dictionary -> list of features used by build_features
varaibles_path = os.path.join(input_path, 'variables.json')
features = build_features.load_features(varaibles_path)

# REASON FOR VISIT classification: summary of codes
rfv_path = os.path.join(raw_path, 'RFV_codes_summary.xlsx')
rfv_df = build_features.load_rfv(rfv_path)

# Three-digit categories of ICD-9-CM
icd9cm_path = os.path.join(raw_path, 'ICD9CM_3DCat.xlsx')
icd9cm_df = build_features.load_icd9cm(icd9cm_path)

# Category column passed to build_features.build_features as `category`
icd9cm_category = 'CATEGORY_1'

#### 5.2.1 - Load and prepare the training dataset

In [None]:
# Read the cleaned training split, then derive the model inputs and labels from it
train_path = os.path.join(input_path, 'train.parquet')
train_df = pd.read_parquet(train_path)

X_train, y_train = build_features.build_features(
    train_df, features, rfv_df, icd9cm_df, category=icd9cm_category
)

In [None]:
# Build 'TEXT' by joining the three reason-for-visit texts with single spaces
# (missing texts count as empty strings), then clean it with preprocess_text
text_columns = ['RFV1_TEXT', 'RFV2_TEXT', 'RFV3_TEXT']
combined_text = X_train[text_columns[0]].fillna('')
for text_col in text_columns[1:]:
    combined_text = combined_text + ' ' + X_train[text_col].fillna('')

X_train['TEXT'] = combined_text.apply(preprocess_text)

# Fit the topic model and attach the topic features to X_train; the fitted
# vectorizer and lda objects are kept so the test set can be transformed later
X_train, vectorizer, tf, lda, topic_features = generate_topic_features(
    X_train,
    n_topics=10,
    n_top_words=10,
    transform='log',
)

In [None]:
# Heat map of the mean topic probability per label in the training dataset (Altair).
# The frame is melted to one row per (sample, topic), so the colour channel must
# aggregate: without mean(...) every sample draws its own rectangle on top of the
# previous ones and the visible colour is just whichever row was drawn last.
topic_df = pd.concat(
    [X_train[topic_features], y_train], axis=1
).rename(columns={0: 'CATEGORY'}).melt(id_vars='CATEGORY', var_name='Topic', value_name='Probability')

chart(
    df=topic_df,
    y='CATEGORY:N',
    x='Topic:N',
    color='mean(Probability):Q',
    title='Mean Topic Probability per Label in the Training Dataset',
).mark_rect().configure_axisY(
    labelLimit=300, title=None
).configure_axisX(
    title=None
).properties(width=480, height=480)

In [None]:
# Share of each label in the training dataset, shown as a bar chart
label_share = y_train.value_counts(normalize=True)

(
    chart(
        df=label_share.reset_index(),
        x='index:N',
        y='proportion:Q',
        title='Distribution of the Labels in the Training Dataset',
    )
    .mark_bar()
    .configure_axisX(labelAngle=45, labelLimit=300, title=None)
    .configure_axisY(title=None)
)

##### 5.2.1.1 - Account for the imbalance in the labels

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Random Undersampling
def random_undersampling(X_train, y_train):
    """Resample the training data with imblearn's RandomUnderSampler.

    The sampler is created with sampling_strategy='auto' and seeded with the
    notebook-level `random_state`.

    Returns the pair (X_train_resampled, y_train_resampled) produced by
    `fit_resample`.
    """
    return RandomUnderSampler(
        sampling_strategy='auto',
        random_state=random_state,
    ).fit_resample(X_train, y_train)


#X_train, y_train = random_undersampling(X_train, y_train)

##### 5.2.1.2 - Confirm the distribution and percentage of the labels in the training dataset

In [None]:
# Share of each label in the (possibly resampled) training dataset, shown as a bar chart
label_share = y_train.value_counts(normalize=True)

(
    chart(
        df=label_share.reset_index(),
        x='index:N',
        y='proportion:Q',
        title='Distribution of the Labels in the Training Dataset',
    )
    .mark_bar()
    .configure_axisX(labelAngle=45, labelLimit=300, title=None)
    .configure_axisY(title=None)
)

#### 5.2.2 - Model training

In [None]:
# Train the model
# With original quantitative data and RFV codes + RFV Modules + Topic Features
quantitative_features = quantitative_features_wo_bin
nominal_features = nominal_features_wo_bin
nominal_features = nominal_features + ['RFV1_MOD1', 'RFV2_MOD1', 'RFV3_MOD1'] + ['RFV1_MOD2', 'RFV2_MOD2', 'RFV3_MOD2']

# Columns the classifier is fitted on; the same list must be used at prediction time
clf_features = binary_features + ordinal_features + nominal_features + quantitative_features + topic_features

print(f'Missing Values in X_train: \n{X_train[clf_features].isna().sum().where(lambda x: x > 0).dropna()}')
print()
print(f'Shape of X_train: {X_train[clf_features].shape}')
print(f'Features to be fit: \n{X_train[clf_features].columns}')
print()

# Define the model
n_neighbors = 5
clf_pipeline = set_pipeline(
    model=RandomForestClassifier(
        n_estimators=1000,
        criterion='entropy',
        #class_weight='balanced',
        n_jobs=-1,
        random_state=random_state
    ),
    binary_features=binary_features,
    ordinal_features=ordinal_features,
    nominal_features=nominal_features,
    quantitative_features=quantitative_features,
    imputer='knn',
    n_neighbors=n_neighbors,
    #pca=True,
    #ovr=True
)

# Human-readable model name. When the final step is a one-vs-rest wrapper, report
# the wrapped base estimator instead. The wrapped model is an attribute of the
# classifier step (`estimator`), not of the pipeline; the previous lookup
# `clf_pipeline.esitmator` was misspelled and raised AttributeError in that case.
classifier_step = clf_pipeline.named_steps['classifier']
clf_model_name = classifier_step.__class__.__name__
if 'one' in clf_model_name.lower():
    clf_model_name = f'OneVsRest {type(classifier_step.estimator).__name__}'
print(f'Classifier: {clf_model_name}')

clf_pipeline.fit(X_train[clf_features], y_train)

#### 5.2.3 - Load and prepare the test dataset

In [None]:
# Read the cleaned test split and build its inputs / labels with the same
# feature list and reference tables as the training split
test_path = os.path.join(input_path, 'test.parquet')
test_df = pd.read_parquet(test_path)

X_test, y_test = build_features.build_features(
    test_df, features, rfv_df, icd9cm_df, category=icd9cm_category
)

In [None]:
# Combine 'RFV1_TEXT', 'RFV2_TEXT', 'RFV3_TEXT' into 'TEXT'
X_test['TEXT'] = X_test['RFV1_TEXT'].fillna('') + ' ' + X_test['RFV2_TEXT'].fillna('') + ' ' + X_test['RFV3_TEXT'].fillna('')

# Preprocess the text features with Spacy
X_test['TEXT'] = X_test['TEXT'].apply(preprocess_text)

# Transform the text features with the vectorizer and lda fitted on the training set
# (transform only, never re-fit on test data)
test_tf = vectorizer.transform(X_test['TEXT'])
# NOTE(review): the training topic features come from
# generate_topic_features(..., transform='log'), while the raw lda.transform output
# is used here. Confirm whether the same transform has to be applied to the test
# topics so that train and test features are on the same scale.
test_topics = lda.transform(test_tf)
test_topics = pd.DataFrame(test_topics, columns=topic_features, index=X_test.index)

# Drop topic columns left over from a previous run of this cell before attaching
# the new ones; otherwise re-running the cell duplicates the topic columns.
X_test = pd.concat([X_test.drop(columns=topic_features, errors='ignore'), test_topics], axis=1)
print(f'X_test Shape: {X_test.shape}')

In [None]:
# Share of each true label in the test dataset, shown as a bar chart
true_label_share = y_test.value_counts(normalize=True)

(
    chart(
        df=true_label_share.reset_index(),
        x='index:N',
        y='proportion:Q',
        title='Distribution of True Labels',
    )
    .mark_bar()
    .configure_axisX(labelAngle=45)
)

#### 5.2.4 - Prediction and Metrics

In [None]:
# Predict with exactly the columns (and column order) the pipeline was fitted on;
# X_test also carries helper columns (e.g. the raw text) that were never part of
# the training matrix X_train[clf_features].
y_pred = clf_pipeline.predict(X_test[clf_features])

In [None]:
# Encode labels as integers. The encoder is fitted on the union of true and
# predicted labels: fitting on y_test alone makes le.transform(y_pred) raise
# "previously unseen labels" whenever the model predicts a class that does not
# occur in the test set.
le = LabelEncoder()
le.fit(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
y_test_encoded = le.transform(y_test)
y_pred_encoded = le.transform(y_pred)

# One indicator column per class for both the truth and the prediction
n_classes = len(le.classes_)
y_test_binarized = label_binarize(y_test_encoded, classes=range(n_classes))
y_pred_binarized = label_binarize(y_pred_encoded, classes=range(n_classes))

# NOTE(review): label_ranking_average_precision_score expects per-class scores as
# its second argument; here it receives one-hot hard predictions. Consider passing
# clf_pipeline.predict_proba(...) instead if a genuine ranking metric is intended.
lrap = label_ranking_average_precision_score(y_test_binarized, y_pred_binarized)
print(f'Label Ranking Average Precision Score: {lrap}')

In [None]:
# Evaluate every metric on the same rows: only samples that have a true label.
# Previously the mask was applied to the classification report only, so accuracy
# and F1 could be computed on a different set of rows than the report.
labelled_mask = y_test.notna().to_numpy()
y_test_labelled = y_test[labelled_mask]
y_pred_labelled = y_pred[labelled_mask]

accuracy = accuracy_score(y_test_labelled, y_pred_labelled)
print(f'Prediction Accuracy: {accuracy}')

f1 = f1_score(y_test_labelled, y_pred_labelled, average='weighted')
print(f'Weighted Prediction F1 Score: {f1}')

print(classification_report(y_test_labelled, y_pred_labelled))

#### 5.2.5 - Confusion matrix

In [None]:
# Plot the confusion matrix with percentages
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Build the matrix over the classifier's own class list so that rows/columns line
# up with the tick labels below. Without `labels`, the matrix is built from the
# labels that happen to occur in y_test / y_pred, which can differ from classes_.
class_labels = clf_pipeline.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Row-normalise (share of each true label). A class with no true samples in the
# test set has a zero row total; leave that row at 0 instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

plt.figure(figsize=(16, 12))
sns.heatmap(cm_percent, annot=True, fmt=".2f", cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Percentages')
plt.show()

## 6 - Extract text features from each group

In [None]:
# Load custom function to combine text features
import sys

# Only extend sys.path when the directory is not on it yet, so re-running this
# cell does not keep adding duplicate entries
features_src_dir = '../src/features/'
if features_src_dir not in sys.path:
    sys.path.append(features_src_dir)

from combine_textual import combine_features

#### 6.1 - Aggregate text data by group

In [None]:
# Columns that combine_features merges into one text per row, grouped by theme
textual_features = (
    ['AGE', 'SEX', 'USETOBAC']
    + ['MAJOR', 'RFV1', 'RFV2', 'RFV3']
    + ['BMI', 'TEMPF', 'BPSYS', 'BPDIAS']
    + ['ARTHRTIS', 'ASTHMA', 'CANCER', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'DMP']
    + ['DIAG1', 'DIAG2', 'DIAG3']
)

# Persist the list as JSON
# NOTE(review): `file_path` is not defined in this part of the notebook - confirm
# it is set by an earlier cell
textual_features_file = os.path.join(file_path, 'textual_features.json')
with open(textual_features_file, 'w') as f:
    json.dump(textual_features, f)

# Row-wise: build the combined text from the listed columns
train_df['CombinedText'] = train_df.apply(combine_features, axis=1, args=(textual_features,))

train_df['CombinedText'].head()

In [None]:
# Number of rows with a non-null CombinedText
train_df.CombinedText.notna().sum()

#### 6.2 - Preprocess text data

In [None]:
import spacy
import re

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline once; preprocess_text reuses this `nlp` object
nlp = spacy.load('en_core_web_sm')

def preprocess_text(row):
    """Normalise one text value into a space-separated string of lower-cased lemmas.

    Steps:
      1. Remove the comma between two digit groups ('1,000' -> '1000').
      2. Replace the hyphen between two digit groups with an underscore
         ('10-20' -> '10_20').
      3. Run the text through the module-level spaCy pipeline `nlp` and keep the
         lower-cased lemma of every token that is neither a stop word nor
         punctuation.

    The underscore introduced in step 2 is kept in the returned text. The original
    code had a trailing substitution that turned it back into a hyphen, but it was
    applied to the input string after the result had been built and never reached
    the output, so it has been removed as dead code.

    Parameters
    ----------
    row : str
        Raw text. Any non-string value (e.g. None or a float NaN coming from a
        pandas column with missing values) is treated as empty text.

    Returns
    -------
    str
        The processed text; '' for non-string input.
    """
    # Missing values reach this function through Series.apply; re.sub would raise
    # TypeError on them, so map them to an empty document instead.
    if not isinstance(row, str):
        return ''

    row = re.sub(r'(\d+),(\d+)', r'\1\2', row)
    row = re.sub(r'(\d+)-(\d+)', r'\1_\2', row)
    doc = nlp(row)
    return ' '.join(
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct
    )

In [None]:
# Apply preprocess_text to every combined text and store the result next to the raw text
train_df['ProcessedText'] = train_df['CombinedText'].apply(preprocess_text)

In [None]:
# Preview the first processed texts
train_df.ProcessedText.head()

In [None]:
# Save the preprocessed DataFrame
processed_file_path = os.path.join('..', 'data', 'processed')
# Create the output directory on first use; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout
os.makedirs(processed_file_path, exist_ok=True)
# NOTE(review): `clustering_model_name` is not defined in this part of the
# notebook - confirm it is set by an earlier cell
train_df.to_csv(os.path.join(processed_file_path, f'train_{clustering_model_name}.csv'), index=False)

#### 6.3 - Calculate term frequencies

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Method 1
# Calculate the TF-IDF for each cluster,
# taking the ProcessedText of each cluster as the documents,
# and the ProcessedText of the entire dataset as the corpus

#clustered_text = train_df.groupby('cluster')['ProcessedText'].apply(lambda row: ' '.join(row)).reset_index()

#vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000, min_df=5, max_df=0.7)
#tfidf_matrix = vectorizer.fit_transform(clustered_text['ProcessedText'])

#print(tfidf_matrix)

#tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
#tfidf_df

In [None]:
# Method 2
# Calculate the TF-IDF of each row within each cluster
# Calculate the average TF-IDF for each cluster

# NOTE(review): this rebinds `vectorizer`, the name that also holds the fitted
# vectorizer returned by generate_topic_features and used to transform the test
# text. Re-running that test cell after this one would pick up the TF-IDF
# vectorizer instead - consider a distinct name.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000, min_df=5, max_df=0.7)
tfidf_matrix = vectorizer.fit_transform(train_df['ProcessedText'])

# Calculate the average TF-IDF for each cluster.
# The TF-IDF frame must carry train_df's index: pd.concat(axis=1) aligns on index
# labels, so a fresh 0..n-1 index would pair TF-IDF rows with the wrong cluster
# labels (or with NaN) whenever train_df does not have a default RangeIndex.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=train_df.index,
)
cluster_tfidf = pd.concat([train_df['cluster'], tfidf_df], axis=1).groupby('cluster').mean()

cluster_tfidf

In [None]:
# Penalize terms matching the regex pattern '\d+_year_old' by multiplying their weight by 0.5

#tfidf_df = tfidf_df.apply(lambda row: row * 0.5 if re.match(r'\d+_year_old', row.name) else row)
#cluster_tfidf = cluster_tfidf.apply(lambda row: row * 0.5 if re.match(r'\d+_year_old', row.name) else row)

#cluster_tfidf

#### 6.4 - Generate word clouds for each group

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# One word cloud per cluster, sized by the cluster's average TF-IDF weights
for cluster_id in range(n_clusters):
    term_weights = cluster_tfidf.loc[cluster_id]
    wordcloud = WordCloud(width=800, height=400, background_color='white')
    wordcloud.generate_from_frequencies(term_weights)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Cluster {cluster_id}')
    plt.show()

## 7 - Medical pathways

In [None]:
# 'services' features
binary_services = [feature for feature in variables['services'] if train_df[feature].nunique() <= 2]
print(f'Binary Services: {binary_services}')

nominal_services = [feature for feature in variables['services'] if feature not in binary_services]
print(f'Nominal Services: {nominal_services}')
print()

# 'medicationsAndImmunizations' features
quantitative_med = ['NUMNEW', 'NUMCONT']
print(f'Quantitative Medications: {quantitative_med}')

binary_med = [feature for feature in variables['medicationsAndImmunizations'] if train_df[feature].nunique() <= 2 and feature not in quantitative_med]
print(f'Binary Medications: {binary_med}')

nominal_med = [feature for feature in variables['medicationsAndImmunizations'] if feature not in binary_med and feature not in quantitative_med]
print(f'Nominal Medications: {nominal_med}')
print()

# 'providersSeen' features
binary_ps = [feature for feature in variables['providersSeen'] if train_df[feature].nunique() <= 2]
print(f'Binary Providers Seen: {binary_ps}')
print()

# 'visitDisposition' features
binary_vd = [feature for feature in variables['visitDisposition'] if train_df[feature].nunique() <= 2]
print(f'Binary Visit Disposition: {binary_vd}')
print()

# 'DIAG' features
nominal_diag = ['DIAG1', 'DIAG1_CAT1', 'DIAG1_CAT2']
print(f'Nominal Diagnosis: {nominal_diag}')
text_diag = 'DIAG1_TEXT'
print(f'Text Diagnosis: {text_diag}')