## Model Training – ML & Deep Learning

In [2]:
import pandas as pd
import numpy as np

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score 
from sklearn.feature_extraction.text import TfidfVectorizer

# Deep Learning imports (TensorFlow/Keras)
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [4]:
# Load the processed patient-behaviour dataset and print a column/dtype summary.
df = pd.read_csv(
    '../data/processed/processed_patient_behavior_data.csv'
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   patient_id       19999 non-null  int64  
 1   gender           19999 non-null  object 
 2   medication       19999 non-null  object 
 3   dose             16651 non-null  object 
 4   name             19999 non-null  object 
 5   surname          19999 non-null  object 
 6   bmi              19999 non-null  float64
 7   weight           19999 non-null  float64
 8   height           19999 non-null  float64
 9   systolic         19999 non-null  int64  
 10  diastolic        19999 non-null  int64  
 11  concentration    19999 non-null  int64  
 12  distractibility  19999 non-null  int64  
 13  impulsivity      19999 non-null  int64  
 14  hyperactivity    19999 non-null  int64  
 15  sleep            19999 non-null  int64  
 16  mood             19999 non-null  int64  
 17  appetite    

### 3.1 Feature Engineering

In [5]:
# Prepare features: encode categorical and normalise numeric
def prepare_features_for_tabular_model(df_input, target_column):
    """
    Build the tabular feature frame, integer class target and an unfitted
    preprocessor for a classification model.

    Steps:
      1. Copy ``df_input`` so the caller's frame is never modified.
      2. Take ``target_column`` as the target. If its minimum is negative the
         values are shifted by +2 (the notebook's targets range from -2 to 2,
         giving classes 0..4); in every case the target is cast to ``int``.
      3. Drop identifier, free-text and dose columns, the target itself and all
         other behavioural targets, then keep only numeric and
         object/category columns (numeric first, then categorical).
      4. Build a ``ColumnTransformer`` that standard-scales the numeric columns
         and one-hot encodes the categorical ones. It is returned unfitted so
         the caller can fit it on the training split only.

    Args:
        df_input (pd.DataFrame): Processed dataset containing ``target_column``.
        target_column (str): Name of the column to predict.

    Returns:
        tuple: ``(X_tabular, y, preprocessor)``.
            * ``(None, None, None)`` when the target column is missing, the
              target contains missing values, or no usable feature columns
              remain.
            * ``(X_tabular, y, None)`` when feature columns exist but the frame
              has no rows.
    """
    print(f"\nPreparing features for predicting '{target_column}'...")
    df = df_input.copy()

    if target_column not in df.columns:
        print(f"Error: Target column '{target_column}' not found in DataFrame.")
        return None, None, None

    y = df[target_column]

    # astype(int) cannot represent NaN; report it through the same error
    # convention as the other failure modes instead of a low-level cast error.
    if y.isna().any():
        print(f"Error: Target column '{target_column}' contains missing values; drop or impute them before preparing features.")
        return None, None, None

    # Adjust target for classification: original range is -2 to 2, so adding 2
    # yields class ids 0 to 4.
    if y.min() < 0:
        print(f"Shifting target '{target_column}' by adding 2 and converting to int for classification.")
        y = (y + 2).astype(int)
    else:
        y = y.astype(int)
    print(f"  Target '{target_column}' unique values after adjustment: {np.sort(y.unique())}")

    # Every behavioural score other than the current target is excluded from X.
    behavioral_targets = ['concentration', 'impulsivity', 'mood', 'sleep', 'appetite', 'distractibility', 'hyperactivity']
    potential_other_targets = [col for col in behavioral_targets if col != target_column]

    columns_to_drop_for_X = ['patient_id', 'name', 'surname',
                             'doctor_notes', 'dose',
                             'processed_notes', 'processed_notes_nltk',
                             target_column
                             ] + potential_other_targets

    cols_to_drop_existing = [col for col in columns_to_drop_for_X if col in df.columns]
    X_tabular = df.drop(columns=cols_to_drop_existing)

    categorical_features = X_tabular.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X_tabular.select_dtypes(include=np.number).columns.tolist()

    # Keep only the columns the preprocessor knows how to handle; any other
    # dtype (e.g. bool, datetime) is discarded here.
    X_tabular = X_tabular[numerical_features + categorical_features].copy()

    print(f"  Features for tabular model: {X_tabular.columns.tolist()}")
    print(f"  Identified categorical features: {categorical_features}")
    print(f"  Identified numerical features: {numerical_features}")

    if not numerical_features and not categorical_features:
        print("Error: Feature set X_tabular is empty after dropping columns and identifying types.")
        return None, None, None

    # Feature columns exist but there are no rows to fit a preprocessor on.
    if X_tabular.empty:
        print("Error: Cannot create preprocessor as X_tabular is empty.")
        return X_tabular, y, None

    transformers_list = []
    if numerical_features:
        transformers_list.append(('num', StandardScaler(), numerical_features))
    if categorical_features:
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        transformers_list.append(('cat', categorical_transformer, categorical_features))

    preprocessor = ColumnTransformer(
        transformers=transformers_list,
        remainder='passthrough'
    )

    return X_tabular, y, preprocessor

# Build the tabular features, shifted integer target and unfitted preprocessor
# used by the Random Forest baseline.
X_tab_rf, y_rf, preprocessor_rf = prepare_features_for_tabular_model(
    df,
    target_column='concentration',
)

# Confirm the target's class range and dtype before training.
if y_rf is not None:
    print(
        f"Target 'concentration' (y_rf) unique values for RF model: {np.sort(y_rf.unique())}"
    )
    print(
        f"Target 'concentration' (y_rf) dtype: {y_rf.dtype}"
    )


Preparing features for predicting 'concentration'...
Shifting target 'concentration' by adding 2 and converting to int for classification.
  Target 'concentration' unique values after adjustment: [0 1 2 3 4]
  Features for tabular model: ['bmi', 'weight', 'height', 'systolic', 'diastolic', 'is_medicated', 'dose_mg', 'gender', 'medication', 'bmi_category', 'bp_category']
  Identified categorical features: ['gender', 'medication', 'bmi_category', 'bp_category']
  Identified numerical features: ['bmi', 'weight', 'height', 'systolic', 'diastolic', 'is_medicated', 'dose_mg']
Target 'concentration' (y_rf) unique values for RF model: [0 1 2 3 4]
Target 'concentration' (y_rf) dtype: int64


In [6]:
# Preview the first three rows of the tabular feature frame.
X_tab_rf.head(n=3)

Unnamed: 0,bmi,weight,height,systolic,diastolic,is_medicated,dose_mg,gender,medication,bmi_category,bp_category
0,26.8,82.9,1.76,113,88,1,15.0,Other,Adderall,Overweight,Hypertension Stage 1
1,17.3,52.3,1.74,136,72,0,0.0,Other,NoMedication,Underweight,Hypertension Stage 1
2,35.5,114.9,1.8,128,77,1,15.0,Male,Vyvanse,Obese,Elevated


### 3.2 Baseline: Predict `concentration` using Random Forest

In [7]:
def train_random_forest_baseline(X_tabular_input, y_target_input, preprocessor_input):
    """
    Train and evaluate a Random Forest classifier on the tabular features.

    The data is split 80/20 (stratified on the target when it has more than one
    class), a ``Pipeline`` of ``preprocessor_input`` followed by a
    ``RandomForestClassifier`` is fitted on the training split, and accuracy,
    weighted F1, a classification report and the ten largest feature
    importances are printed for the test split.

    Args:
        X_tabular_input (pd.DataFrame): Raw (unscaled, unencoded) feature frame.
        y_target_input (pd.Series): Integer class labels; its ``name`` is used
            in the printed messages.
        preprocessor_input: Transformer placed as the first pipeline step. It is
            fitted as part of the pipeline (the pipeline uses this object
            directly, not a copy).

    Returns:
        sklearn.pipeline.Pipeline: The fitted preprocessing + classifier pipeline.
    """
    target_name = y_target_input.name
    print(f"\n--- Training Random Forest Baseline to predict '{target_name}' (Classification) ---")

    # Stratify so every class keeps its share in both splits; impossible with a single class.
    stratify_labels = y_target_input if y_target_input.nunique() > 1 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X_tabular_input, y_target_input,
        test_size=0.2, random_state=42, stratify=stratify_labels
    )

    # class_weight='balanced' re-weights classes inversely to their frequency.
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced')

    pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor_input),
                                  ('classifier', rf_model)])

    print("Training Random Forest Classifier...")
    pipeline_rf.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred_rf = pipeline_rf.predict(X_test)

    print(f"\nRandom Forest Classifier Evaluation for '{target_name}':")

    accuracy = accuracy_score(y_test, y_pred_rf)
    # The target is multi-class, so F1 needs an explicit averaging mode.
    f1 = f1_score(y_test, y_pred_rf, average='weighted')

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1 Score (weighted): {f1:.4f}")
    print("\nClassification Report:")
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    print(classification_report(y_test, y_pred_rf, zero_division=0))

    # Feature importances are a best-effort diagnostic: a failure here is
    # reported but must not discard the trained pipeline.
    try:
        fitted_preprocessor = pipeline_rf.named_steps['preprocessor']
        # Ask the fitted preprocessor for its output column names so the labels
        # follow the real output order (including any passthrough columns)
        # instead of being re-derived from input dtypes.
        all_feature_names = list(fitted_preprocessor.get_feature_names_out())
        if getattr(fitted_preprocessor, 'verbose_feature_names_out', False) is True:
            # ColumnTransformer prefixes names with "<transformer>__"; strip it
            # so the labels read like the original column names.
            all_feature_names = [name.split('__', 1)[-1] for name in all_feature_names]

        importances = pipeline_rf.named_steps['classifier'].feature_importances_
        feature_importance_df = pd.DataFrame({'feature': all_feature_names, 'importance': importances})
        feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
        print("\nTop 10 Feature Importances (Random Forest Classifier):")
        print(feature_importance_df.head(10))
    except Exception as e:
        print(f"Could not retrieve feature importances: {e}")

    return pipeline_rf

# Train the baseline only when feature preparation produced all three inputs.
# rf_pipeline_trained is defined up front so it exists (as None) even when
# training is skipped, mirroring how the LSTM cell handles its skip path.
rf_pipeline_trained = None
if X_tab_rf is not None and y_rf is not None and preprocessor_rf is not None:
    # Safeguard: prepare_features_for_tabular_model already returns an integer
    # target, so the conversion branch is only a fallback.
    if not pd.api.types.is_integer_dtype(y_rf):
        print("Warning: y_rf is not integer type. Attempting conversion (this should have been handled in prepare_features).")
        y_rf_class = (y_rf + (2 if y_rf.min() < 0 else 0)).astype(int)
    else:
        y_rf_class = y_rf

    print(f"Unique values in y_rf_class for RF training: {np.sort(y_rf_class.unique())}")
    rf_pipeline_trained = train_random_forest_baseline(X_tab_rf, y_rf_class, preprocessor_rf)
else:
    print("Skipping Random Forest training as inputs are not ready.")


Unique values in y_rf_class for RF training: [0 1 2 3 4]

--- Training Random Forest Baseline to predict 'concentration' (Classification) ---
Training Random Forest Classifier...

Random Forest Classifier Evaluation for 'concentration':
  Accuracy: 0.2003
  F1 Score (weighted): 0.2002

Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.20      0.20       799
           1       0.20      0.20      0.20       803
           2       0.20      0.20      0.20       800
           3       0.23      0.22      0.22       792
           4       0.19      0.17      0.18       806

    accuracy                           0.20      4000
   macro avg       0.20      0.20      0.20      4000
weighted avg       0.20      0.20      0.20      4000


Top 10 Feature Importances (Random Forest Classifier):
                 feature  importance
1                 weight    0.158980
0                    bmi    0.155571
2                 height    0.1431

### 3.3.1 Deep models: Predict `impulsivity` from the doctor notes (`processed_notes`) using LSTM


In [8]:
# --- Deep models: LSTM : Predict impulsivity from doctor_notes ---
MAX_VOCAB_SIZE_LSTM = 10000      # Tokenizer keeps only the most frequent words up to this cap
MAX_SEQUENCE_LENGTH_LSTM = 150   # Max length of sequences (notes); longer notes are truncated
EMBEDDING_DIM_LSTM = 100         # Dimension of word embeddings

def train_lstm_model(df_input, target_column='impulsivity'):
    """
    Train an LSTM text classifier that predicts ``target_column`` from the
    'processed_notes' column.

    The notes are tokenised, post-padded/truncated to
    ``MAX_SEQUENCE_LENGTH_LSTM`` and split 80/20. The network is
    Embedding -> LSTM(64) -> Dense(32) -> Dropout -> softmax, trained with
    sparse categorical cross-entropy and early stopping on validation loss.
    Test loss, accuracy, weighted F1 and a classification report are printed.

    Args:
        df_input (pd.DataFrame): Frame with 'processed_notes' and
            ``target_column``; it is copied, not modified.
        target_column (str): Column holding the class labels. If its minimum is
            negative the labels are shifted by +2 before being cast to int.

    Returns:
        tuple: ``(model, tokenizer, history)``, or ``(None, None, None)`` when a
        required column is missing or the frame has no rows.
    """
    print(f"\n--- Training LSTM Model to predict '{target_column}' (Classification) ---")
    df = df_input.copy()

    if target_column not in df.columns or 'processed_notes' not in df.columns:
        print(f"Error: Missing '{target_column}' or 'processed_notes' in DataFrame for LSTM.")
        return None, None, None

    # Without rows there is nothing to tokenise or split, and .min() below would raise.
    if df.empty:
        print("Error: No rows available for LSTM training.")
        return None, None, None

    texts = df['processed_notes'].astype(str).values  # Ensure string type
    labels_original = df[target_column].values

    # Adjust labels for classification: shift if negative, then convert to int
    if labels_original.min() < 0:
        print(f"Shifting target '{target_column}' for LSTM by adding 2 and converting to int.")
        labels_adjusted = (labels_original + 2).astype(int)
    else:
        labels_adjusted = labels_original.astype(int)

    unique_classes = np.unique(labels_adjusted)  # np.unique returns sorted values
    # sparse_categorical_crossentropy uses each label as an index into the
    # softmax output, so the layer must be max(label) + 1 wide even if some
    # class id is absent from the data.
    num_classes = int(labels_adjusted.max()) + 1
    print(f"  Target '{target_column}' for LSTM - Unique classes: {unique_classes}, Num classes: {num_classes}")

    # Tokenizer: Convert text to sequences of integers
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE_LSTM, oov_token="<oov>")
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print(f"  Found {len(word_index)} unique tokens.")

    # Pad sequences to ensure uniform length
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH_LSTM, padding='post', truncating='post')
    print(f"  Shape of padded sequences: {padded_sequences.shape}")

    y_lstm = labels_adjusted

    # Split data (stratified whenever more than one class is present)
    X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
        padded_sequences, y_lstm, test_size=0.2, random_state=42,
        stratify=y_lstm if len(unique_classes) > 1 else None
    )

    # Build LSTM model
    print("  Building LSTM model architecture for classification...")
    input_lstm = Input(shape=(MAX_SEQUENCE_LENGTH_LSTM,))
    # mask_zero=True marks index 0 (the pad value) as padding so the LSTM skips
    # the trailing pad steps instead of treating them as real tokens. The
    # sequence length already comes from the Input layer, so the deprecated
    # `input_length` argument is not passed.
    embedding_layer = Embedding(input_dim=min(MAX_VOCAB_SIZE_LSTM, len(word_index) + 1),
                                output_dim=EMBEDDING_DIM_LSTM,
                                mask_zero=True)(input_lstm)
    lstm_layer = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)
    dense_layer = Dense(32, activation='relu')(lstm_layer)
    dropout_layer = Dropout(0.3)(dense_layer)
    # Output layer for multi-class classification
    output_layer = Dense(num_classes, activation='softmax')(dropout_layer)

    model_lstm = Model(inputs=input_lstm, outputs=output_layer)
    model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model_lstm.summary()

    print("\n  Training LSTM model...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history_lstm = model_lstm.fit(
        X_train_lstm, y_train_lstm,
        epochs=30,
        batch_size=32,
        validation_split=0.1,
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate LSTM model
    print("\n  LSTM Model Evaluation on Test Set (Classification):")
    loss_lstm, accuracy_lstm = model_lstm.evaluate(X_test_lstm, y_test_lstm, verbose=0)
    print(f"  Test Loss: {loss_lstm:.4f}")
    print(f"  Test Accuracy: {accuracy_lstm:.4f}")

    y_pred_probs_lstm = model_lstm.predict(X_test_lstm)
    y_pred_lstm = np.argmax(y_pred_probs_lstm, axis=1)

    # zero_division=0 matches the classification report below and avoids
    # warnings when a class is never predicted.
    f1_lstm = f1_score(y_test_lstm, y_pred_lstm, average='weighted', zero_division=0)
    print(f"  Test F1 Score (weighted): {f1_lstm:.4f}")
    print("\n  Classification Report (LSTM):")
    print(classification_report(y_test_lstm, y_pred_lstm, zero_division=0))

    return model_lstm, tokenizer, history_lstm

# Run the LSTM model (predicting impulsivity from the processed doctor notes).
# Only rows that have both a note and a target value are kept.
df_for_lstm_prep = df.loc[:, ['processed_notes', 'impulsivity']].dropna()

if df_for_lstm_prep.empty:
    print("DataFrame for LSTM is empty after dropping NaNs. Skipping LSTM training.")
    lstm_model_trained, lstm_tokenizer, lstm_history = None, None, None
else:
    lstm_model_trained, lstm_tokenizer, lstm_history = train_lstm_model(
        df_for_lstm_prep,
        target_column='impulsivity',
    )


--- Training LSTM Model to predict 'impulsivity' (Classification) ---
Shifting target 'impulsivity' for LSTM by adding 2 and converting to int.
  Target 'impulsivity' for LSTM - Unique classes: [0 1 2 3 4], Num classes: 5
  Found 822 unique tokens.
  Shape of padded sequences: (19999, 150)
  Building LSTM model architecture for classification...





  Training LSTM model...
Epoch 1/30
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 107ms/step - accuracy: 0.2139 - loss: 1.6101 - val_accuracy: 0.2113 - val_loss: 1.6090
Epoch 2/30
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 103ms/step - accuracy: 0.1945 - loss: 1.6100 - val_accuracy: 0.2113 - val_loss: 1.6091
Epoch 3/30
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 102ms/step - accuracy: 0.2025 - loss: 1.6095 - val_accuracy: 0.2013 - val_loss: 1.6091
Epoch 4/30
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 104ms/step - accuracy: 0.2034 - loss: 1.6095 - val_accuracy: 0.2013 - val_loss: 1.6092
Epoch 5/30
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 105ms/step - accuracy: 0.2027 - loss: 1.6096 - val_accuracy: 0.2113 - val_loss: 1.6089
Epoch 6/30
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 106ms/step - accuracy: 0.1988 - loss: 1.6098 - val_accuracy: 0.2113 - v

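### 3.3.2 Deep models: Predict `concentration` from the doctor notes (`processed_notes`) and tabular features using CLSTM (CNN + LSTM)

Note: the text branch implemented below uses an embedding followed by an LSTM; it does not include a convolutional layer.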

In [9]:
# --- CLSTM (CNN + LSTM) : Combine text and tabular features ---

MAX_VOCAB_SIZE_CLSTM = 10000
MAX_SEQUENCE_LENGTH_CLSTM = 150 # Should be same as LSTM if using its tokenizer
EMBEDDING_DIM_CLSTM = 100     # Should be same as LSTM if using its tokenizer

def train_clstm_model(df_input, target_column='concentration', text_tokenizer=None):
    """
    Trains a two-branch model combining text ('processed_notes') and tabular
    features for a classification task.

    The text branch is Embedding -> LSTM(64) -> Dense(32); the tabular branch is
    Dense(64) -> Dense(32) -> Dropout. Both are concatenated and fed through
    Dense(64) -> Dropout -> softmax. The tabular preprocessor is fitted on the
    training split only and then applied to the test split.

    Args:
        df_input (pd.DataFrame): The fully processed DataFrame (copied, not modified).
        target_column (str): The name of the target column to predict.
        text_tokenizer (tf.keras.preprocessing.text.Tokenizer, optional):
                         A pre-fitted tokenizer. If None, a new one will be fitted.

    Returns:
        tuple: ``(model, tokenizer, fitted_preprocessor, history)``, or four
        ``None`` values when tabular preparation fails or 'processed_notes' is
        missing.
    """
    print(f"\n--- Training CLSTM Model to predict '{target_column}' (Classification) ---")
    df_clstm = df_input.copy()

    # 1. Prepare Tabular Features
    # prepare_features_for_tabular_model already shifts the target and converts it to int.
    X_tabular_clstm, y_clstm, preprocessor_clstm = prepare_features_for_tabular_model(df_clstm, target_column)

    if X_tabular_clstm is None or y_clstm is None or preprocessor_clstm is None:
        print("Error: Tabular feature preparation failed for CLSTM. Aborting.")
        return None, None, None, None

    # Safeguard only: the target should already be integer at this point.
    if not pd.api.types.is_integer_dtype(y_clstm):
         print(f"Warning: y_clstm for CLSTM is not integer. Type: {y_clstm.dtype}. This should be handled earlier.")
         # Attempt to fix, assuming it was float from -2 to 2
         y_clstm = (y_clstm + (2 if y_clstm.min() < 0 else 0)).astype(int)

    unique_classes_clstm = np.unique(y_clstm)  # np.unique returns sorted values
    # sparse_categorical_crossentropy uses each label as an index into the
    # softmax output, so the layer must be max(label) + 1 wide even if some
    # class id is absent from the data.
    num_classes_clstm = int(y_clstm.max()) + 1
    print(f"  Target '{target_column}' for CLSTM - Unique classes: {unique_classes_clstm}, Num classes: {num_classes_clstm}")

    # 2. Prepare Text Features
    if 'processed_notes' not in df_clstm.columns:
        print("Error: 'processed_notes' column missing for CLSTM text features.")
        return None, None, None, None

    # Select notes by the tabular frame's index so text rows stay aligned with X and y.
    texts_clstm = df_clstm.loc[X_tabular_clstm.index, 'processed_notes'].astype(str).values

    current_tokenizer_clstm = None
    if text_tokenizer is None:
        print("  Fitting new tokenizer for CLSTM text data...")
        current_tokenizer_clstm = Tokenizer(num_words=MAX_VOCAB_SIZE_CLSTM, oov_token="<oov>")
        current_tokenizer_clstm.fit_on_texts(texts_clstm)
    else:
        print("  Using provided tokenizer for CLSTM text data...")
        current_tokenizer_clstm = text_tokenizer

    sequences_clstm = current_tokenizer_clstm.texts_to_sequences(texts_clstm)
    word_index_clstm = current_tokenizer_clstm.word_index
    print(f"  Found {len(word_index_clstm)} unique tokens for CLSTM (using {'provided' if text_tokenizer else 'new'} tokenizer).")

    padded_sequences_clstm = pad_sequences(sequences_clstm, maxlen=MAX_SEQUENCE_LENGTH_CLSTM, padding='post', truncating='post')
    print(f"  Shape of padded text sequences for CLSTM: {padded_sequences_clstm.shape}")

    # 3. Split Data (tabular data is preprocessed after the split to avoid leakage)
    stratify_clstm = y_clstm.values if len(unique_classes_clstm) > 1 else None
    X_train_tab_raw, X_test_tab_raw, \
    X_train_text, X_test_text, \
    y_train_clstm, y_test_clstm = train_test_split(
        X_tabular_clstm, padded_sequences_clstm, y_clstm.values,
        test_size=0.2, random_state=42, stratify=stratify_clstm
    )

    # Apply preprocessing to tabular data (fit on train, transform test)
    print("  Preprocessing CLSTM tabular data (fitting on train split)...")
    X_train_tab_processed = preprocessor_clstm.fit_transform(X_train_tab_raw)
    X_test_tab_processed = preprocessor_clstm.transform(X_test_tab_raw)
    # Keras generally expects dense arrays
    if hasattr(X_train_tab_processed, "toarray"):
        X_train_tab_processed = X_train_tab_processed.toarray()
        X_test_tab_processed = X_test_tab_processed.toarray()

    print(f"  CLSTM Train shapes: Tabular {X_train_tab_processed.shape}, Text {X_train_text.shape}, Target {y_train_clstm.shape}")
    print(f"  CLSTM Test shapes: Tabular {X_test_tab_processed.shape}, Text {X_test_text.shape}, Target {y_test_clstm.shape}")

    # 4. Build CLSTM Model Architecture
    print("  Building CLSTM model architecture for classification...")

    # Text Input Branch (LSTM on embeddings; no convolution/pooling stage)
    input_text = Input(shape=(MAX_SEQUENCE_LENGTH_CLSTM,), name='text_input')
    # The embedding must cover every index the tokenizer can emit. That limit is
    # set by the tokenizer's own `num_words` (which may differ from
    # MAX_VOCAB_SIZE_CLSTM when a pre-fitted tokenizer is supplied), or by the
    # full vocabulary when the tokenizer has no cap.
    tokenizer_cap = getattr(current_tokenizer_clstm, 'num_words', None)
    if tokenizer_cap is None:
        embedding_vocab_size = len(word_index_clstm) + 1
    else:
        embedding_vocab_size = min(tokenizer_cap, len(word_index_clstm) + 1)
    # mask_zero=True marks index 0 (the pad value) as padding so the LSTM skips
    # the trailing pad steps. The sequence length already comes from the Input
    # layer, so the deprecated `input_length` argument is not passed.
    embedding_text = Embedding(input_dim=embedding_vocab_size,
                               output_dim=EMBEDDING_DIM_CLSTM,
                               mask_zero=True)(input_text)
    lstm_text_branch = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_text)
    text_features = Dense(32, activation='relu')(lstm_text_branch)

    # Tabular Input Branch
    input_tabular = Input(shape=(X_train_tab_processed.shape[1],), name='tabular_input')
    dense_tabular_branch = Dense(64, activation='relu')(input_tabular)
    tabular_features_branch = Dense(32, activation='relu')(dense_tabular_branch)
    tabular_features_dropout = Dropout(0.3)(tabular_features_branch)

    # Concatenate features
    combined_features = concatenate([text_features, tabular_features_dropout])

    # Output layers
    combined_dense = Dense(64, activation='relu')(combined_features)
    combined_dropout = Dropout(0.4)(combined_dense)
    output_clstm = Dense(num_classes_clstm, activation='softmax')(combined_dropout)

    model_clstm = Model(inputs=[input_text, input_tabular], outputs=output_clstm)
    model_clstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model_clstm.summary()

    # 5. Train CLSTM Model
    print("\n  Training CLSTM model...")
    early_stopping_clstm = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
    history_clstm = model_clstm.fit(
        [X_train_text, X_train_tab_processed], y_train_clstm,  # input order must match Model(inputs=...)
        epochs=40,
        batch_size=32,
        validation_split=0.1,
        callbacks=[early_stopping_clstm],
        verbose=1
    )

    # 6. Evaluate CLSTM Model
    print("\n  CLSTM Model Evaluation on Test Set (Classification):")
    loss_clstm, accuracy_clstm = model_clstm.evaluate([X_test_text, X_test_tab_processed], y_test_clstm, verbose=0)
    print(f"  Test Loss: {loss_clstm:.4f}")
    print(f"  Test Accuracy: {accuracy_clstm:.4f}")

    y_pred_probs_clstm = model_clstm.predict([X_test_text, X_test_tab_processed])
    y_pred_clstm = np.argmax(y_pred_probs_clstm, axis=1)

    # zero_division=0 matches the classification report below and avoids
    # warnings when a class is never predicted.
    f1_clstm = f1_score(y_test_clstm, y_pred_clstm, average='weighted', zero_division=0)
    print(f"  Test F1 Score (weighted): {f1_clstm:.4f}")
    print("\n  Classification Report (CLSTM):")
    print(classification_report(y_test_clstm, y_pred_clstm, zero_division=0))

    return model_clstm, current_tokenizer_clstm, preprocessor_clstm, history_clstm


# Run CLSTM Model (Predicting concentration using both text and tabular data)
# Put the text and target columns first, then drop rows missing either of them.
# dropna returns a new frame, so no in-place mutation is needed.
df_for_clstm_prep = (
    df[['processed_notes', 'concentration'] + df.columns.drop(['processed_notes', 'concentration'], errors='ignore').tolist()]
    .dropna(subset=['concentration', 'processed_notes'])
)

# Filter out rows whose note is empty or whitespace-only.
df_for_clstm_prep = df_for_clstm_prep[df_for_clstm_prep['processed_notes'].str.strip().astype(bool)]

# Define the results up front so they exist (as None) even when training is
# skipped, mirroring how the LSTM cell handles its skip path.
clstm_model_trained, clstm_tokenizer_used, clstm_preprocessor_used, clstm_history = None, None, None, None

if not df_for_clstm_prep.empty:
    # Reuse the tokenizer fitted by the LSTM cell when it exists (it was fitted
    # on processed_notes); globals().get yields None when that cell was not run
    # or produced no tokenizer, in which case train_clstm_model fits a new one.
    clstm_model_trained, clstm_tokenizer_used, clstm_preprocessor_used, clstm_history = train_clstm_model(
        df_for_clstm_prep,
        target_column='concentration',
        text_tokenizer=globals().get('lstm_tokenizer')
    )
else:
    print("DataFrame for CLSTM is empty after preprocessing. Skipping CLSTM training.")



--- Training CLSTM Model to predict 'concentration' (Classification) ---

Preparing features for predicting 'concentration'...
Shifting target 'concentration' by adding 2 and converting to int for classification.
  Target 'concentration' unique values after adjustment: [0 1 2 3 4]
  Features for tabular model: ['bmi', 'weight', 'height', 'systolic', 'diastolic', 'is_medicated', 'dose_mg', 'gender', 'medication', 'bmi_category', 'bp_category']
  Identified categorical features: ['gender', 'medication', 'bmi_category', 'bp_category']
  Identified numerical features: ['bmi', 'weight', 'height', 'systolic', 'diastolic', 'is_medicated', 'dose_mg']
  Target 'concentration' for CLSTM - Unique classes: [0 1 2 3 4], Num classes: 5
  Using provided tokenizer for CLSTM text data...
  Found 822 unique tokens for CLSTM (using provided tokenizer).
  Shape of padded text sequences for CLSTM: (19999, 150)
  Preprocessing CLSTM tabular data (fitting on train split)...
  CLSTM Train shapes: Tabular (15




  Training CLSTM model...
Epoch 1/40
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 112ms/step - accuracy: 0.2022 - loss: 1.6197 - val_accuracy: 0.2094 - val_loss: 1.6095
Epoch 2/40
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 133ms/step - accuracy: 0.2058 - loss: 1.6105 - val_accuracy: 0.2025 - val_loss: 1.6095
Epoch 3/40
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 134ms/step - accuracy: 0.2064 - loss: 1.6090 - val_accuracy: 0.1950 - val_loss: 1.6109
Epoch 4/40
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 128ms/step - accuracy: 0.2104 - loss: 1.6083 - val_accuracy: 0.1963 - val_loss: 1.6107
Epoch 5/40
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 136ms/step - accuracy: 0.2087 - loss: 1.6086 - val_accuracy: 0.2181 - val_loss: 1.6090
Epoch 6/40
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 138ms/step - accuracy: 0.2151 - loss: 1.6076 - val_accuracy: 0.1919 - 