## Model Training – ML & Deep Learning

In [None]:
import pandas as pd
import numpy as np

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer

# Deep Learning imports (TensorFlow/Keras)
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
#load processed data
# Load the data
df = pd.read_csv('../data/processed/patient_behavior_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   patient_id       19999 non-null  int64  
 1   gender           19999 non-null  object 
 2   medication       19999 non-null  object 
 3   dose             16651 non-null  object 
 4   name             19999 non-null  object 
 5   surname          19999 non-null  object 
 6   bmi              19999 non-null  float64
 7   weight           19999 non-null  float64
 8   height           19999 non-null  float64
 9   systolic         19999 non-null  int64  
 10  diastolic        19999 non-null  int64  
 11  concentration    19999 non-null  int64  
 12  distractibility  19999 non-null  int64  
 13  impulsivity      19999 non-null  int64  
 14  hyperactivity    19999 non-null  int64  
 15  sleep            19999 non-null  int64  
 16  mood             19999 non-null  int64  
 17  appetite    

### 3.1 Feature Engineering

In [None]:
# Prepare features: encode categorical and normalise numeric 
def prepare_features_for_tabular_model(df_input, target_column):
    """
    Prepares tabular features for ML models.
    Separates features and target, encodes categorical, scales numerical.
    """
    print(f"\nPreparing features for predicting '{target_column}'...")
    df = df_input.copy()

    if target_column not in df.columns:
        print(f"Error: Target column '{target_column}' not found in DataFrame.")
        return None, None, None
    y = df[target_column] # Define y first

    # Define features (X)
    # Columns to exclude from features for X_tabular:
    # - Identifiers, raw text, original dose string
    # - The preprocessed text notes (used by text models)
    # - The target column itself
    # - Other potential behavioral target columns
    behavioral_targets = ['concentration', 'impulsivity', 'mood', 'sleep', 'appetite', 'distractibility', 'hyperactivity']
    potential_other_targets = [col for col in behavioral_targets if col != target_column]
    
    columns_to_drop_for_X = ['patient_id', 'name', 'surname', # Assuming these might exist
                                   'doctor_notes', 'dose', 
                                   'processed_notes', 'processed_notes_nltk', # Exclude all versions of processed notes
                                   target_column # CRITICAL: Ensure target column is dropped from features
                                  ] + potential_other_targets
    
    # Ensure only existing columns are dropped
    cols_to_drop_existing = [col for col in columns_to_drop_for_X if col in df.columns]
    X_tabular = df.drop(columns=cols_to_drop_existing, errors='ignore')
    
    # Identify categorical and numerical features from the remaining columns in X_tabular
    categorical_features = X_tabular.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X_tabular.select_dtypes(include=np.number).columns.tolist()

    # This check ensures we only try to process columns that are actually in X_tabular
    # Create a copy to avoid SettingWithCopyWarning if X_tabular is a slice
    X_tabular = X_tabular[numerical_features + categorical_features].copy() 


    print(f"  Features for tabular model: {X_tabular.columns.tolist()}")
    print(f"  Identified categorical features: {categorical_features}")
    print(f"  Identified numerical features: {numerical_features}")
    
    if X_tabular.empty and not (numerical_features or categorical_features) :
        print("Error: Feature set X_tabular is empty after dropping columns and identifying types.")
        return None, None, None
    elif not numerical_features and not categorical_features and not X_tabular.empty :
         print("Warning: No specific numerical or categorical features identified by dtype, but X_tabular is not empty. ColumnTransformer might process based on column names if they were manually assigned.")
    
    # Create preprocessing pipelines for numerical and categorical features
    transformers_list = []
    if numerical_features: # Only add transformer if there are numerical features
        numerical_transformer = StandardScaler()
        transformers_list.append(('num', numerical_transformer, numerical_features))
    if categorical_features: # Only add transformer if there are categorical features
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # sparse_output=False for RF
        transformers_list.append(('cat', categorical_transformer, categorical_features))

    if not transformers_list and not X_tabular.empty: 
        print("Warning: No transformers created (no numerical or categorical features found by dtype to process specifically). Preprocessor will use remainder='passthrough'.")
        preprocessor = ColumnTransformer(transformers=[], remainder='passthrough') 
    elif not X_tabular.empty:
        preprocessor = ColumnTransformer(
            transformers=transformers_list, 
            remainder='passthrough' 
        )
    else: # X_tabular is empty, cannot create preprocessor
        preprocessor = None
        print("Error: Cannot create preprocessor as X_tabular is empty.")

    
    return X_tabular, y, preprocessor

X_tab_rf, y_rf, preprocessor_rf = prepare_features_for_tabular_model(df, target_column='concentration')


Preparing features for predicting 'concentration'...
  Features for tabular model: ['bmi', 'weight', 'height', 'systolic', 'diastolic', 'is_medicated', 'dose_mg', 'gender', 'medication', 'bmi_category', 'bp_category']
  Identified categorical features: ['gender', 'medication', 'bmi_category', 'bp_category']
  Identified numerical features: ['bmi', 'weight', 'height', 'systolic', 'diastolic', 'is_medicated', 'dose_mg']


In [4]:
X_tab_rf

Unnamed: 0,bmi,weight,height,systolic,diastolic,is_medicated,dose_mg,gender,medication,bmi_category,bp_category
0,26.8,82.9,1.76,113,88,1,15.0,Other,Adderall,Overweight,Hypertension Stage 1
1,17.3,52.3,1.74,136,72,0,0.0,Other,NoMedication,Underweight,Hypertension Stage 1
2,35.5,114.9,1.80,128,77,1,15.0,Male,Vyvanse,Obese,Elevated
3,36.5,91.2,1.58,101,78,0,5.0,Female,NoMedication,Obese,Normal
4,17.7,40.4,1.51,111,81,1,30.0,Female,Vyvanse,Underweight,Hypertension Stage 1
...,...,...,...,...,...,...,...,...,...,...,...
19994,23.1,80.8,1.87,131,77,1,10.0,Other,Concerta,Normal weight,Hypertension Stage 1
19995,30.9,116.2,1.94,120,71,1,20.0,Female,Adderall,Obese,Elevated
19996,11.5,44.8,1.97,91,69,1,30.0,Male,Adderall,Underweight,Normal
19997,19.9,75.0,1.94,138,89,0,5.0,Other,NoMedication,Normal weight,Hypertension Stage 1


### 3.2 2.	Baseline: Predict `concentration` using Random Forest

In [None]:
def train_random_forest_baseline(X_tabular_input, y_target_input, preprocessor_input):
    """
    Trains a Random Forest Regressor to predict 'concentration'.
    """
    target_name = y_target_input.name
    print(f"\n--- Training Random Forest Baseline to predict '{target_name}' ---")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_tabular_input, y_target_input, test_size=0.2, random_state=42)

    # Create the full pipeline including preprocessing and the model
    # RandomForestRegressor as concentration is a numerical rating (-2 to 2)
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
                                        # n_estimators=500,  # Increased number of trees
                                        # max_depth=15,      # Allowing deeper trees
                                        # min_samples_split=3,  # More aggressive splitting
                                        # min_samples_leaf=1,   # More granular leaf nodes
                                        # max_features='sqrt',  # Standard RF feature selection
                                        # random_state=42,
                                        # n_jobs=-1  # Use all available cores
                                        # )
    
    pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor_input),
                                  ('regressor', rf_model)])

    print("Training Random Forest Regressor...")
    pipeline_rf.fit(X_train, y_train)

    # Evaluate the model
    y_pred_rf = pipeline_rf.predict(X_test)
    
    print("\nRandom Forest Regressor Evaluation:")
    mse = mean_squared_error(y_test, y_pred_rf)
    mae = mean_absolute_error(y_test, y_pred_rf) # MAE is required by assessment
    r2 = r2_score(y_test, y_pred_rf)

    # For F1 score, we need to convert regression to classification
    # Let's create binary classes based on median value for demonstration
    y_test_binary = (y_test > y_test.median()).astype(int)
    y_pred_binary = (y_pred_rf> y_test.median()).astype(int)
    f1 = f1_score(y_test_binary, y_pred_binary)

    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Mean Absolute Error (MAE): {mae:.4f}") # Key metric
    print(f"  R-squared (R2): {r2:.4f}")
    print(f"F1 Score (binary): {f1:.4f}")

    # Feature importances
    try:
        # Get feature names after one-hot encoding
        ohe_feature_names = pipeline_rf.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(
            X_train.select_dtypes(include=['object', 'category']).columns.tolist()
        )
        num_feature_names = X_train.select_dtypes(include=np.number).columns.tolist()
        all_feature_names = np.concatenate([num_feature_names, ohe_feature_names])
        
        importances = pipeline_rf.named_steps['regressor'].feature_importances_
        feature_importance_df = pd.DataFrame({'feature': all_feature_names, 'importance': importances})
        feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
        print("\nTop 10 Feature Importances (Random Forest):")
        print(feature_importance_df.head(10))
    except Exception as e:
        print(f"Could not retrieve feature importances: {e}")
        
    return pipeline_rf # Return the trained pipeline

rf_pipeline_trained = train_random_forest_baseline(X_tab_rf, y_rf, preprocessor_rf)


--- Training Random Forest Baseline to predict 'concentration' ---
Training Random Forest Regressor...

Random Forest Regressor Evaluation:
  Mean Squared Error (MSE): 2.1203
  Mean Absolute Error (MAE): 1.2581
  R-squared (R2): -0.0596
F1 Score (binary): 0.4238

Top 10 Feature Importances (Random Forest):
                feature  importance
1                weight    0.187710
0                   bmi    0.163330
3              systolic    0.148484
4             diastolic    0.132025
2                height    0.125509
6               dose_mg    0.068058
13   medication_Ritalin    0.016243
8           gender_Male    0.015886
15   medication_Vyvanse    0.015445
11  medication_Concerta    0.015329


### 3.3.1	Deep models: Predict `impulsivity` from doctor_notes using LSTM


In [None]:
# --- Deep models: LSTM : Predict impulsivity from doctor_notes ---
MAX_VOCAB_SIZE_LSTM = 10000
MAX_SEQUENCE_LENGTH_LSTM = 150 # Max length of sequences (notes)
EMBEDDING_DIM_LSTM = 100 # Dimension of word embeddings

def train_lstm_model(df_input, target_column='impulsivity'):
    """
    Trains an LSTM model to predict a target (e.g., 'impulsivity') from 'processed_notes'.
    """
    print(f"\n--- Training LSTM Model to predict '{target_column}' ---")
    df = df_input.copy()
    
    texts = df['processed_notes'].values
    labels = df[target_column].values

    # Tokenizer: Convert text to sequences of integers
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE_LSTM, oov_token="<oov>")
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print(f"  Found {len(word_index)} unique tokens.")

    # Pad sequences to ensure uniform length
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH_LSTM, padding='post', truncating='post')
    print(f"  Shape of padded sequences: {padded_sequences.shape}")

    # Prepare labels for regression (LSTM can output continuous values)
    # If labels are categorical, you'd one-hot encode or use sparse_categorical_crossentropy
    # For regression target like -2 to 2, no change needed for y, but ensure it's float
    y_lstm = labels.astype(float)

    # Split data
    X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
        padded_sequences, y_lstm, test_size=0.2, random_state=42
    )

    # Build LSTM model
    print("  Building LSTM model architecture...")
    input_lstm = Input(shape=(MAX_SEQUENCE_LENGTH_LSTM,))
    embedding_layer = Embedding(input_dim=min(MAX_VOCAB_SIZE_LSTM, len(word_index) + 1), # Vocab size +1 for padding
                                output_dim=EMBEDDING_DIM_LSTM, 
                                input_length=MAX_SEQUENCE_LENGTH_LSTM)(input_lstm)
    lstm_layer = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_layer) # 64 units
    dense_layer = Dense(32, activation='relu')(lstm_layer)
    dropout_layer = Dropout(0.3)(dense_layer)
    output_layer = Dense(1, activation='linear')(dropout_layer) # Linear activation for regression

    model_lstm = Model(inputs=input_lstm, outputs=output_layer)
    model_lstm.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    model_lstm.summary()

    print("\n  Training LSTM model...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history_lstm = model_lstm.fit(
        X_train_lstm, y_train_lstm,
        epochs=30, # Adjust epochs
        batch_size=32,
        validation_split=0.1, # Use part of training data for validation during training
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate LSTM model
    print("\n  LSTM Model Evaluation on Test Set:")
    loss_lstm, mae_lstm = model_lstm.evaluate(X_test_lstm, y_test_lstm, verbose=0)
    print(f"  Test Loss (MSE): {loss_lstm:.4f}")
    print(f"  Test Mean Absolute Error (MAE): {mae_lstm:.4f}") # Key metric

    # To get R2, we need predictions
    y_pred_lstm_test = model_lstm.predict(X_test_lstm).flatten()
    r2_lstm = r2_score(y_test_lstm, y_pred_lstm_test)
    print(f"  Test R-squared (R2): {r2_lstm:.4f}")
    
    return model_lstm, tokenizer, history_lstm # Return tokenizer for CLSTM if needed

# Run LSTM Model (Predicting impulsivity from doctor_notes)
df_for_lstm = df[['processed_notes', 'impulsivity']].copy().dropna()
lstm_model_trained, lstm_tokenizer, _ = train_lstm_model(df_for_lstm, target_column='impulsivity')


--- Training LSTM Model to predict 'impulsivity' ---
  Found 822 unique tokens.
  Shape of padded sequences: (19999, 150)
  Building LSTM model architecture...
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 150)]             0         
                                                                 
 embedding (Embedding)       (None, 150, 100)          82300     
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (No

### 3.3.2	Deep models: Predict `concentration` from doctor_notes using CLSTM (CNN + LSTM)

In [None]:
# --- Section 3.4: CLSTM (CNN + LSTM) : Combine text and tabular features ---
# This is more complex. Let's define a function stub and outline steps.
# We need to decide on the target for CLSTM. Let's assume 'concentration' for comparison.

MAX_VOCAB_SIZE_CLSTM = 10000
MAX_SEQUENCE_LENGTH_CLSTM = 150
EMBEDDING_DIM_CLSTM = 100

def train_clstm_model(df_input, target_column='concentration', text_tokenizer=None):
    """
    Trains a CLSTM model combining text ('processed_notes') and tabular features.
    Args:
        df_input (pd.DataFrame): The fully processed DataFrame.
        target_column (str): The name of the target column to predict.
        text_tokenizer (tf.keras.preprocessing.text.Tokenizer, optional): 
                         A pre-fitted tokenizer. If None, a new one will be fitted.
    """
    print(f"\n--- Training CLSTM Model to predict '{target_column}' ---")
    df = df_input.copy()

    # Prepare Tabular Features
    X_tabular_clstm, y_clstm, preprocessor_clstm = prepare_features_for_tabular_model(df, target_column)
    
    # Apply preprocessing to the entire tabular dataset
    X_tabular_processed = preprocessor_clstm.fit_transform(X_tabular_clstm)
    print(f"  Shape of processed tabular data: {X_tabular_processed.shape}")

    # Prepare Text Features
    texts_clstm = df.loc[X_tabular_clstm.index, 'processed_notes'].values # Ensure alignment with tabular data
    
    if text_tokenizer is None:
        print("  Fitting new tokenizer for CLSTM text data...")
        tokenizer_clstm = Tokenizer(num_words=MAX_VOCAB_SIZE_CLSTM, oov_token="<oov>")
        tokenizer_clstm.fit_on_texts(texts_clstm)
    else:
        print("  Using provided tokenizer for CLSTM text data...")
        tokenizer_clstm = text_tokenizer
        
    sequences_clstm = tokenizer_clstm.texts_to_sequences(texts_clstm)
    word_index_clstm = tokenizer_clstm.word_index
    print(f"  Found {len(word_index_clstm)} unique tokens for CLSTM.")
    
    padded_sequences_clstm = pad_sequences(sequences_clstm, maxlen=MAX_SEQUENCE_LENGTH_CLSTM, padding='post', truncating='post')
    print(f"  Shape of padded text sequences for CLSTM: {padded_sequences_clstm.shape}")

    # Ensure y_clstm is float for regression
    y_clstm = y_clstm.astype(float).values

    # Split Data (for both Tabular and Text)
    X_train_tab, X_test_tab, \
    X_train_text, X_test_text, \
    y_train_clstm, y_test_clstm = train_test_split(
        X_tabular_processed, padded_sequences_clstm, y_clstm,
        test_size=0.2, random_state=42
    )
    print(f"  CLSTM Train shapes: Tabular {X_train_tab.shape}, Text {X_train_text.shape}, Target {y_train_clstm.shape}")
    print(f"  CLSTM Test shapes: Tabular {X_test_tab.shape}, Text {X_test_text.shape}, Target {y_test_clstm.shape}")

    # Build CLSTM Model Architecture
    print("  Building CLSTM model architecture...")
    
    # Text Input Branch (CNN + LSTM)
    input_text = Input(shape=(MAX_SEQUENCE_LENGTH_CLSTM,), name='text_input')
    embedding_text = Embedding(input_dim=min(MAX_VOCAB_SIZE_CLSTM, len(word_index_clstm) + 1),
                               output_dim=EMBEDDING_DIM_CLSTM,
                               input_length=MAX_SEQUENCE_LENGTH_CLSTM)(input_text)
    conv_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_text)
    pool_layer = GlobalMaxPooling1D()(conv_layer) # Or LSTM after Conv1D
    
    # Text Input Branch (LSTM)
    # input_text = Input(shape=(MAX_SEQUENCE_LENGTH_CLSTM,), name='text_input')
    # embedding_text = Embedding(...)(input_text)
    lstm_text = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_text)
    text_features = Dense(32, activation='relu')(lstm_text)

    # Tabular Input Branch
    input_tabular = Input(shape=(X_train_tab.shape[1],), name='tabular_input')
    dense_tabular = Dense(64, activation='relu')(input_tabular)
    tabular_features = Dense(32, activation='relu')(dense_tabular)
    tabular_features = Dropout(0.3)(tabular_features)

    # Concatenate features
    combined_features = concatenate([text_features, tabular_features])
    
    # Output layers
    combined_dense = Dense(64, activation='relu')(combined_features)
    combined_dropout = Dropout(0.4)(combined_dense)
    output_clstm = Dense(1, activation='linear')(combined_dropout) # Regression

    model_clstm = Model(inputs=[input_text, input_tabular], outputs=output_clstm)
    model_clstm.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    model_clstm.summary()

    # Train CLSTM Model
    print("\n  Training CLSTM model...")
    early_stopping_clstm = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
    history_clstm = model_clstm.fit(
        [X_train_text, X_train_tab], y_train_clstm,
        epochs=40, # Adjust epochs
        batch_size=32,
        validation_split=0.1,
        callbacks=[early_stopping_clstm],
        verbose=1
    )

    # Evaluate CLSTM Model
    print("\n  CLSTM Model Evaluation on Test Set:")
    loss_clstm, mae_clstm = model_clstm.evaluate([X_test_text, X_test_tab], y_test_clstm, verbose=0)
    print(f"  Test Loss (MSE): {loss_clstm:.4f}")
    print(f"  Test Mean Absolute Error (MAE): {mae_clstm:.4f}")

    y_pred_clstm_test = model_clstm.predict([X_test_text, X_test_tab]).flatten()
    r2_clstm = r2_score(y_test_clstm, y_pred_clstm_test)
    print(f"  Test R-squared (R2): {r2_clstm:.4f}")

    return model_clstm, tokenizer_clstm, preprocessor_clstm, history_clstm


# Run CLSTM Model (Predicting concentration using both text and tabular data)
df_for_clstm = df.copy().dropna(subset=['concentration', 'processed_notes'])
df_for_clstm = df_for_clstm[df_for_clstm['processed_notes'].str.strip().astype(bool)]

clstm_model_trained, _, _, _ = train_clstm_model(df_for_clstm, target_column='concentration', text_tokenizer=lstm_tokenizer)


--- Training CLSTM Model to predict 'concentration' ---

Preparing features for predicting 'concentration'...
  Features for tabular model: ['bmi', 'weight', 'height', 'systolic', 'diastolic', 'is_medicated', 'dose_mg', 'gender', 'medication', 'bmi_category', 'bp_category']
  Identified categorical features: ['gender', 'medication', 'bmi_category', 'bp_category']
  Identified numerical features: ['bmi', 'weight', 'height', 'systolic', 'diastolic', 'is_medicated', 'dose_mg']
  Shape of processed tabular data: (19999, 24)
  Using provided tokenizer for CLSTM text data...
  Found 822 unique tokens for CLSTM.
  Shape of padded text sequences for CLSTM: (19999, 150)
  CLSTM Train shapes: Tabular (15999, 24), Text (15999, 150), Target (15999,)
  CLSTM Test shapes: Tabular (4000, 24), Text (4000, 150), Target (4000,)
  Building CLSTM model architecture...
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)         