1. Load Data (stratified sample)
2. train-val-test split
4. clean/tokenize, not stop, not lemma
5. convert to tf.dataset
6. run models

In [None]:
# loading sample Data
%load_ext autoreload
%autoreload 2


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Sequential
import time
import nltk
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)

def prepare_tf_dataset(X, y, batch_size, is_training=False):
    """
    Prepares a TensorFlow dataset for efficient training or evaluation.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    if is_training:
        dataset = dataset.shuffle(10000)  # Shuffle only if dataset is for training
    return dataset.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

def load_data(path, include_validation=False, sample_size=0):
    """Load review data from a CSV file, with optional sampling and validation split."""
    df = pd.read_csv(path)[['Review_Title', 'Review', 'Recommended']]
    if sample_size > 0:
        df = df.sample(sample_size)
    
    X = df['Review_Title'] + ' ' + df['Review']
    y = df['Recommended'].map({'yes': 1, 'no': 0})
    
    if include_validation:
        return split_data_with_validation(X, y)
    else:
        return train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

def split_data_with_validation(X, y):
    """Split data into training, validation, and test sets."""
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test

def get_wordnet_pos(treebank_tag):
    """Map POS tag to first character lemmatize() accepts."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_text(review, stop_words=None, lemmatize=True):
    """Clean and preprocess a single review text."""
    tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    lemmatizer = WordNetLemmatizer() if lemmatize else None
    
    tokens = tokenizer.tokenize(review.lower())
    if lemmatize:
        pos_tags = pos_tag(tokens)
        tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    if stop_words:
        tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

def preprocess_texts(reviews, stop_words=None, lemmatize=False):
    """Apply text cleaning and preprocessing to a list of texts."""
    return [clean_text(review, stop_words=stop_words, lemmatize=lemmatize) for review in reviews]

def load_and_prepare_data(path, include_validation=True, sample_size=5000, 
                          stop_words=None, lemmatize=False, 
                          max_tokens=10000, percentile_len=0.9, batch_size=64):
    """
    Load, clean, and prepare data for training and validation.
    """
    # Load the data with validation split and optional sampling
    X_train, X_val, X_test, y_train, y_val, y_test = load_data(
        path, include_validation=include_validation, sample_size=sample_size)
    
    # Clean the text data
    X_train = preprocess_texts(X_train, stop_words=stop_words, lemmatize=lemmatize)
    X_val = preprocess_texts(X_val, stop_words=stop_words, lemmatize=lemmatize)
    
    # Calculating percentile sequence length and vocabulary size for training set
    lengths = pd.Series([len(review.split()) for review in X_train])
    sequence_length = int(lengths.quantile(percentile_len))
    vocab_size = min(max_tokens, len(set(word for review in X_train for word in review.split())))
    
    # Initialize and adapt the TextVectorization layer
    text_vectorization = tf.keras.layers.TextVectorization(
        standardize=None,
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=sequence_length)
    
    text_vectorization.adapt(X_train)
    X_train = text_vectorization(X_train)
    X_val = text_vectorization(X_val)
    
    # Prepare the datasets
    train_ds = prepare_tf_dataset(X_train, y_train, batch_size, is_training=True)
    val_ds = prepare_tf_dataset(X_val, y_val, batch_size)

    return train_ds, val_ds, vocab_size, sequence_length, batch_size

def add_rnn_layer(model, units, rnn_type='gru', bidirectional=False, return_sequences=False):
    LayerClass = layers.GRU if rnn_type == 'gru' else layers.LSTM
    layer = LayerClass(units, return_sequences=return_sequences)
    if bidirectional:
        layer = layers.Bidirectional(layer)
    model.add(layer)

def build_rnn_model(rnn_layers, dense_layers, recurrent_type, bi_directional, dropout_rate, units, sequence_length, vocab_size):

    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=units, input_length=sequence_length))

    for i in range(rnn_layers):
        add_rnn_layer(model, units, rnn_type=recurrent_type, bidirectional=bi_directional, 
                      return_sequences=(i < rnn_layers - 1))
        model.add(layers.Dropout(dropout_rate))
        units = max(2, units // 2)

    for _ in range(dense_layers):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.Dropout(dropout_rate))
        units = max(2, units // 2)

    model.add(layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name="auc")])
    return model

def extract_performance_metrics(history, callbacks, duration):
    early_stopping = next(
        (cb for cb in callbacks if isinstance(cb, tf.keras.callbacks.EarlyStopping)), 
        None
    )
    if early_stopping and early_stopping.stopped_epoch > 0:
        adjusted_epoch = early_stopping.stopped_epoch - early_stopping.patience
        max_epoch_index = len(history.history['loss']) - 1
        best_epoch = max(0, min(adjusted_epoch, max_epoch_index))
    else:
        best_epoch = len(history.history['loss']) - 1

    metrics = {
        'loss': history.history['loss'][best_epoch],
        'val_loss': history.history['val_loss'][best_epoch],
        'val_accuracy': history.history.get('val_accuracy', [None])[best_epoch],
        'val_auc': history.history.get('val_auc', [None])[best_epoch],
        'duration': duration
    }
    return metrics

def train_and_evaluate(model_function, train_ds, val_ds, epochs):
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=0.01,
            patience=5, 
            restore_best_weights=True,
            verbose=0
        )
    ]
    
    model = model_function()    
    start_time = time.time()
    history = model.fit(
        train_ds, 
        epochs=epochs, 
        validation_data=val_ds,
        callbacks=callbacks,
        verbose=0
    )
    duration = time.time() - start_time
    return extract_performance_metrics(history, callbacks, duration)

def calculate_average_metrics(runs, model_function, train_ds, val_ds, epochs):
    metrics_aggregate = {'loss': 0, 'val_loss': 0, 'val_accuracy': 0, 'val_auc': 0, 'duration': 0}

    for _ in range(runs):
        metrics = train_and_evaluate(model_function, train_ds, val_ds, epochs)
        for key in metrics_aggregate:
            metrics_aggregate[key] += metrics[key]

    return {key: val / runs for key, val in metrics_aggregate.items()}

2024-04-01 12:11:34.989486: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load and Prepare the Data
train_ds, val_ds, vocab_size, sequence_length, batch_size = load_and_prepare_data(
    path='data/Airline_review.csv', 
    include_validation=True, 
    sample_size=5000, 
    stop_words=None, 
    lemmatize=False, 
    max_tokens=10000, 
    percentile_len=0.9, 
    batch_size=64
)

In [3]:
# Model and Score the data
runs = 3
epochs = 2

model_function = lambda: build_rnn_model(
    rnn_layers=1,
    dense_layers=1,
    recurrent_type='gru',
    bi_directional=False,
    dropout_rate=0.2,
    units=64,
    sequence_length=sequence_length,
    vocab_size=vocab_size
)  

average_metrics_result = calculate_average_metrics(runs, model_function, train_ds, val_ds, epochs)
print(average_metrics_result)

{'loss': 0.6415860652923584, 'val_loss': 0.6530152360598246, 'val_accuracy': 0.6539999842643738, 'val_auc': 0.5341134667396545, 'duration': 12.352736314137777}


In [6]:
pd.DataFrame(average_metrics_result)

ValueError: If using all scalar values, you must pass an index

In [None]:
runs = 3
epochs = 2
for i in [True, False:
    for j in ['gru','lstm']:
        model_function = lambda: build_rnn_model(
            rnn_layers=1,
            dense_layers=1,
            recurrent_type=j,
            bi_directional=i,
            dropout_rate=0.2,
            units=64,
            sequence_length=sequence_length,
            vocab_size=vocab_size
        )  

        average_metrics_result = calculate_average_metrics(runs, model_function, train_ds, val_ds, epochs)

In [9]:
import pandas as pd

def run_experiment(recurrent_type, bi_directional, runs, train_ds, val_ds, epochs, sequence_length, vocab_size):
    """
    Runs the experiment for a specific configuration and returns the average metrics.
    """
    model_function = lambda: build_rnn_model(
        rnn_layers=1,
        dense_layers=1,
        recurrent_type=recurrent_type,
        bi_directional=bi_directional,
        dropout_rate=0.2,
        units=64,
        sequence_length=sequence_length,
        vocab_size=vocab_size
    )
    return calculate_average_metrics(runs, model_function, train_ds, val_ds, epochs)

# Configuration options
configurations = [
    {'recurrent_type': 'gru', 'bi_directional': True},
    {'recurrent_type': 'gru', 'bi_directional': False},
    {'recurrent_type': 'lstm', 'bi_directional': True},
    {'recurrent_type': 'lstm', 'bi_directional': False},
]

results = []
runs = 3
epochs = 20

# Run experiments
for config in configurations:
    metrics = run_experiment(
        config['recurrent_type'],
        config['bi_directional'],
        runs,
        train_ds,
        val_ds,
        epochs,
        sequence_length=sequence_length,
        vocab_size=vocab_size
    )
    results.append({
        **metrics,
        **config  # Unpack configuration into the results
    })

# Create DataFrame and format
df = pd.DataFrame(results)
df = df.round({'loss': 4, 'val_loss': 4, 'val_accuracy': 4, 'val_auc': 4})
df['duration'] = df['duration'].round(0).astype(int)

print(df)


     loss  val_loss  val_accuracy  val_auc  duration recurrent_type  \
0  0.3116    0.3601        0.8493   0.9267        59            gru   
1  0.6576    0.6476        0.6540   0.4842        35            gru   
2  0.2927    0.3230        0.8647   0.9367       147           lstm   
3  0.6587    0.6483        0.6540   0.5155        56           lstm   

   bi_directional  
0            True  
1           False  
2            True  
3           False  


In [8]:
df

Unnamed: 0,loss,val_loss,val_accuracy,val_auc,duration,recurrent_type,bi_directional
0,0.3028,0.3411,0.8473,0.9232,17,gru,True
1,0.6418,0.6508,0.654,0.5349,12,gru,False
2,0.2748,0.3133,0.8673,0.9384,31,lstm,True
3,0.6397,0.6493,0.654,0.5447,19,lstm,False
