1. Load data (stratified sample)
2. Train/validation/test split
3. Clean and tokenize (no stop-word removal, no lemmatization)
4. Convert to tf.data.Dataset
5. Run models

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Sequential
import time
import nltk
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)

def prepare_tf_dataset(X, y, batch_size, is_training=False, shuffle_buffer=10000):
    """
    Prepares a TensorFlow dataset for efficient training or evaluation.

    Args:
        X: Features; sliced along the first axis together with ``y``.
        y: Labels aligned with ``X``.
        batch_size: Number of examples per batch.
        is_training: When True the examples are shuffled.
        shuffle_buffer: Size of the shuffle buffer (only used when
            ``is_training`` is True).

    Returns:
        A batched, prefetched ``tf.data.Dataset`` of ``(X, y)`` pairs.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    # Cache BEFORE shuffling: a cache placed after shuffle/batch stores the
    # first epoch's order and batch composition and replays it unchanged on
    # every later epoch, which silently disables per-epoch reshuffling.
    dataset = dataset.cache()
    if is_training:
        dataset = dataset.shuffle(shuffle_buffer)  # Shuffle only if dataset is for training
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

def load_data(path, include_validation=False, sample_size=0):
    """Load review data from a CSV file, with optional sampling and validation split.

    Args:
        path: CSV file containing the 'Review_Title', 'Review' and
            'Recommended' columns.
        include_validation: When True, delegate to
            ``split_data_with_validation`` and return a six-element
            train/val/test split; otherwise return the four-element
            train/test split produced by ``train_test_split``.
        sample_size: If greater than 0, number of rows to sample before
            splitting (capped at the number of rows available).

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` when
        ``include_validation`` is True, else
        ``(X_train, X_test, y_train, y_test)``.
    """
    df = pd.read_csv(path)[['Review_Title', 'Review', 'Recommended']]
    if sample_size > 0:
        # Seeded so the sample is reproducible like the seeded splits below;
        # capped because DataFrame.sample raises when asked for more rows
        # than exist (without replacement).
        df = df.sample(min(sample_size, len(df)), random_state=42)

    # A missing title or body would turn the whole concatenation into NaN,
    # which later breaks string-based cleaning; treat missing parts as empty.
    X = df['Review_Title'].fillna('') + ' ' + df['Review'].fillna('')
    y = df['Recommended'].map({'yes': 1, 'no': 0})

    if include_validation:
        return split_data_with_validation(X, y)
    return train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

def split_data_with_validation(X, y):
    """Produce stratified train / validation / test partitions.

    20% of the data is held out first, then that hold-out is divided in half
    into the validation and test sets. Both splits use random_state=42.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test
    """
    seed = 42

    # Step 1: carve the hold-out portion off the training data.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )

    # Step 2: halve the hold-out into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=seed
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the WordNet POS constant used for lemmatizing.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def clean_text(review, stop_words=None, lemmatize=True):
    """Normalise one review into a space-separated string of tokens.

    The review is lower-cased and tokenized with a regular expression that
    keeps alphabetic words. Tokens are optionally lemmatized using their POS
    tags, and then optionally filtered against ``stop_words``.
    """
    word_pattern = r"([a-zA-Z]+(?:’[a-z]+)?)"
    words = RegexpTokenizer(word_pattern).tokenize(review.lower())

    if lemmatize:
        wnl = WordNetLemmatizer()
        words = [
            wnl.lemmatize(token, get_wordnet_pos(tag))
            for token, tag in pos_tag(words)
        ]

    if stop_words:
        # Stop words are removed after lemmatization, so the filter sees lemmas.
        words = [token for token in words if token not in stop_words]

    return ' '.join(words)

def preprocess_texts(reviews, stop_words=None, lemmatize=False):
    """Run ``clean_text`` over every review and return the results as a list."""
    def _clean(text):
        return clean_text(text, stop_words=stop_words, lemmatize=lemmatize)

    return list(map(_clean, reviews))

def create_embedding_matrix(glove_path, text_vectorization, embedding_dim=300):
    """Build an embedding matrix aligned with a vectorizer's vocabulary.

    Row ``i`` of the result holds the GloVe vector of the ``i``-th entry of
    ``text_vectorization.get_vocabulary()``; entries without a GloVe vector
    stay all-zero.

    Args:
        glove_path: Path to a GloVe text file (one ``word v1 v2 ...`` per line).
        text_vectorization: Object exposing ``get_vocabulary()``.
        embedding_dim: Expected length of each GloVe vector.

    Returns:
        ``(embedding_matrix, vocab_size)`` where ``embedding_matrix`` has
        shape ``(vocab_size, embedding_dim)``.

    Raises:
        ValueError: If a vector for a vocabulary word does not have
            ``embedding_dim`` components.
    """
    vocab = text_vectorization.get_vocabulary()
    vocab_size = len(vocab)  # This is the actual vocabulary size, including special tokens.

    # Map each vocabulary word to the row(s) it occupies so the GloVe file
    # can be streamed once without holding every vector in memory.
    rows_by_word = {}
    for row, word in enumerate(vocab):
        rows_by_word.setdefault(word, []).append(row)

    # Initialize the embedding matrix with zeros.
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(glove_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                continue  # Tolerate blank lines instead of failing on values[0].
            rows = rows_by_word.get(values[0])
            if rows is None:
                continue  # Not in our vocabulary: skip the float parsing entirely.
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] != embedding_dim:
                raise ValueError(
                    f"GloVe vector for {values[0]!r} has {vector.shape[0]} "
                    f"dimensions, expected {embedding_dim}"
                )
            embedding_matrix[rows] = vector

    return embedding_matrix, vocab_size

def load_and_prepare_data(path, include_validation=True, sample_size=5000, 
                          stop_words=None, lemmatize=False, 
                          max_tokens=10000, percentile_len=0.9, batch_size=64,
                          glove_path='data/glove.6B.300d.txt',
                          return_embedding_matrix=False):
    """
    Load, clean, and prepare data for training and validation.

    Args:
        path: CSV file passed to ``load_data``.
        include_validation: Must be True; this function builds a validation
            dataset and therefore needs the train/val/test split.
        sample_size: Number of rows to sample (forwarded to ``load_data``).
        stop_words: Optional collection of words to drop during cleaning.
        lemmatize: Whether to lemmatize tokens during cleaning.
        max_tokens: Upper bound on the vectorizer's vocabulary size.
        percentile_len: Quantile of training review lengths (in tokens) used
            as the fixed output sequence length.
        batch_size: Batch size of the returned datasets.
        glove_path: GloVe file used when ``return_embedding_matrix`` is True.
        return_embedding_matrix: When True, also build the GloVe embedding
            matrix and append it to the returned tuple.

    Returns:
        ``(train_ds, val_ds, vocab_size, sequence_length, batch_size)``, plus
        ``embedding_matrix`` as a sixth element when
        ``return_embedding_matrix`` is True.

    Raises:
        ValueError: If ``include_validation`` is False.
    """
    if not include_validation:
        # Without a validation split load_data returns four values and there
        # is nothing to build val_ds from; fail early with a clear message.
        raise ValueError(
            "load_and_prepare_data requires include_validation=True because "
            "it builds a validation dataset"
        )

    # Load the data with validation split and optional sampling
    X_train, X_val, _X_test, y_train, y_val, _y_test = load_data(
        path, include_validation=True, sample_size=sample_size)
    
    # Clean the text data
    X_train = preprocess_texts(X_train, stop_words=stop_words, lemmatize=lemmatize)
    X_val = preprocess_texts(X_val, stop_words=stop_words, lemmatize=lemmatize)
    
    # Percentile sequence length of the training set (at least 1 so the
    # vectorizer never produces zero-length sequences).
    lengths = pd.Series([len(review.split()) for review in X_train])
    sequence_length = max(1, int(lengths.quantile(percentile_len)))
  
    # TextVectorization. max_tokens is passed through unchanged: the layer
    # already caps the vocabulary itself, and its limit includes the reserved
    # padding and OOV entries, so shrinking it to the raw unique-word count
    # would push the two rarest words out of the vocabulary.
    text_vectorization = tf.keras.layers.TextVectorization(
        standardize=None,
        max_tokens=max_tokens,
        output_mode='int',
        output_sequence_length=sequence_length)
    
    text_vectorization.adapt(X_train)
    X_train = text_vectorization(X_train)
    X_val = text_vectorization(X_val)

    # The GloVe file is only read when the caller actually wants the matrix;
    # otherwise the vocabulary size comes straight from the vectorizer.
    embedding_matrix = None
    if return_embedding_matrix:
        embedding_matrix, vocab_size = create_embedding_matrix(glove_path, text_vectorization)
    else:
        vocab_size = len(text_vectorization.get_vocabulary())
        
    # Prepare the datasets
    train_ds = prepare_tf_dataset(X_train, y_train, batch_size, is_training=True)
    val_ds = prepare_tf_dataset(X_val, y_val, batch_size)

    if return_embedding_matrix:
        return train_ds, val_ds, vocab_size, sequence_length, batch_size, embedding_matrix
    return train_ds, val_ds, vocab_size, sequence_length, batch_size

def add_rnn_layer(model, units, rnn_type='gru', bidirectional=False, return_sequences=False):
    """Append one recurrent layer to ``model`` in place.

    A GRU is used when ``rnn_type`` equals 'gru'; any other value selects an
    LSTM. The layer is wrapped in ``Bidirectional`` when requested.
    """
    if rnn_type == 'gru':
        cell_cls = layers.GRU
    else:
        cell_cls = layers.LSTM

    recurrent = cell_cls(units, return_sequences=return_sequences)
    model.add(layers.Bidirectional(recurrent) if bidirectional else recurrent)

def build_rnn_model(rnn_layers, dense_layers, recurrent_type, bi_directional, dropout_rate, units, sequence_length, vocab_size, embedding_matrix=None):
    """Build and compile a binary-classification RNN.

    Architecture: Embedding -> ``rnn_layers`` x (RNN + Dropout) ->
    ``dense_layers`` x (Dense relu + Dropout) -> Dense(1, sigmoid). The unit
    count is halved (never below 2) after every RNN and Dense layer.

    Args:
        rnn_layers: Number of stacked recurrent layers.
        dense_layers: Number of hidden dense layers after the RNN stack.
        recurrent_type: Forwarded to ``add_rnn_layer`` as ``rnn_type``.
        bi_directional: Whether each recurrent layer is bidirectional.
        dropout_rate: Dropout rate applied after every RNN/Dense layer.
        units: Width of the first recurrent layer.
        sequence_length: Length of the integer input sequences.
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: Optional pretrained matrix of shape
            ``(vocab_size, embedding_dim)``. When given, the embedding is
            initialised from it and frozen; when None, a trainable embedding
            of width ``units`` is learned from scratch.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` is not 2-D with ``vocab_size`` rows.
    """
    model = Sequential()

    if embedding_matrix is None:
        model.add(layers.Embedding(input_dim=vocab_size, output_dim=units,
                                   input_length=sequence_length))
    else:
        embedding_matrix = np.asarray(embedding_matrix)
        if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] != vocab_size:
            raise ValueError(
                f"embedding_matrix must have shape (vocab_size={vocab_size}, dim), "
                f"got {embedding_matrix.shape}"
            )
        # The embedding width must come from the pretrained matrix, not from
        # `units`; otherwise the weights cannot be loaded unless they happen
        # to be equal.
        model.add(layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_matrix.shape[1],
            input_length=sequence_length,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False))

    for i in range(rnn_layers):
        # Every recurrent layer except the last must emit full sequences so
        # the next recurrent layer has a sequence to consume.
        add_rnn_layer(model, units, rnn_type=recurrent_type, bidirectional=bi_directional, 
                      return_sequences=(i < rnn_layers - 1))
        model.add(layers.Dropout(dropout_rate))
        units = max(2, units // 2)

    for _ in range(dense_layers):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.Dropout(dropout_rate))
        units = max(2, units // 2)

    model.add(layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name="auc")])
    return model

def extract_performance_metrics(history, callbacks, duration):
    """Pick the metrics of the "best" epoch out of a Keras training history.

    If an ``EarlyStopping`` callback is present and it stopped training, the
    best epoch is taken as ``stopped_epoch - patience`` (clamped to the
    recorded range); otherwise the last recorded epoch is used.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch lists; 'loss' and 'val_loss' are required.
        callbacks: Callbacks that were passed to ``fit``.
        duration: Training time, copied into the result unchanged.

    Returns:
        Dict with 'loss', 'val_loss', 'val_accuracy', 'val_auc' and
        'duration'. 'val_accuracy' / 'val_auc' are None when the history does
        not contain them.
    """
    records = history.history
    last_epoch = len(records['loss']) - 1

    early_stopping = next(
        (cb for cb in callbacks if isinstance(cb, tf.keras.callbacks.EarlyStopping)),
        None
    )
    if early_stopping is not None and early_stopping.stopped_epoch > 0:
        adjusted_epoch = early_stopping.stopped_epoch - early_stopping.patience
        best_epoch = max(0, min(adjusted_epoch, last_epoch))
    else:
        best_epoch = last_epoch

    def optional_metric(name):
        # Indexing a one-element [None] default with best_epoch > 0 would
        # raise IndexError; report None for metrics that were not recorded.
        values = records.get(name)
        if values is None or best_epoch >= len(values):
            return None
        return values[best_epoch]

    return {
        'loss': records['loss'][best_epoch],
        'val_loss': records['val_loss'][best_epoch],
        'val_accuracy': optional_metric('val_accuracy'),
        'val_auc': optional_metric('val_auc'),
        'duration': duration
    }

def train_and_evaluate(model_function, train_ds, val_ds, epochs):
    """Train a freshly built model once and summarise the run.

    ``model_function`` is called with no arguments to obtain the model, which
    is then fitted with early stopping on 'val_loss'. The wall-clock time of
    ``fit`` is measured and handed, together with the history and callbacks,
    to ``extract_performance_metrics``, whose result is returned.
    """
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0.01,
        patience=5,
        restore_best_weights=True,
        verbose=0,
    )
    callback_list = [early_stop]

    network = model_function()

    # Only the fit call is timed; model construction is excluded.
    started = time.time()
    fit_history = network.fit(
        train_ds,
        epochs=epochs,
        validation_data=val_ds,
        callbacks=callback_list,
        verbose=0,
    )
    elapsed = time.time() - started

    return extract_performance_metrics(fit_history, callback_list, elapsed)

def calculate_average_metrics(runs, model_function, train_ds, val_ds, epochs):
    """Repeat ``train_and_evaluate`` ``runs`` times and average each metric.

    Returns a dict with the mean 'loss', 'val_loss', 'val_accuracy',
    'val_auc' and 'duration' over all runs.
    """
    tracked = ('loss', 'val_loss', 'val_accuracy', 'val_auc', 'duration')
    totals = dict.fromkeys(tracked, 0)

    for _run in range(runs):
        run_metrics = train_and_evaluate(model_function, train_ds, val_ds, epochs)
        # Fold this run's values into the running totals.
        totals = {name: totals[name] + run_metrics[name] for name in tracked}

    averages = {}
    for name in tracked:
        averages[name] = totals[name] / runs
    return averages

In [2]:
# Load and Prepare the Data
# All pipeline settings for this run are gathered in one place and then
# unpacked into the call.
data_settings = {
    'path': 'data/Airline_review.csv',
    'include_validation': True,
    'sample_size': 1000,
    'stop_words': None,
    'lemmatize': False,
    'max_tokens': 5000,
    'percentile_len': 0.9,
    'batch_size': 64,
}
train_ds, val_ds, vocab_size, sequence_length, batch_size = load_and_prepare_data(**data_settings)

In [3]:
from tqdm.notebook import tqdm


def run_experiment(recurrent_type, bi_directional, runs, train_ds, val_ds, epochs, sequence_length, vocab_size, embedding_matrix=None):
    """
    Runs the experiment for a specific configuration and returns the average metrics.

    Args:
        recurrent_type: Recurrent cell type forwarded to ``build_rnn_model``.
        bi_directional: Whether the recurrent layer is bidirectional.
        runs: Number of independent training runs to average over.
        train_ds: Training dataset.
        val_ds: Validation dataset.
        epochs: Maximum number of epochs per run.
        sequence_length: Input sequence length of the model.
        vocab_size: Vocabulary size of the embedding layer.
        embedding_matrix: Optional pretrained embedding matrix; forwarded to
            ``build_rnn_model`` only when provided.

    Returns:
        The dict of averaged metrics from ``calculate_average_metrics``.
    """
    model_kwargs = {
        'rnn_layers': 1,
        'dense_layers': 1,
        'recurrent_type': recurrent_type,
        'bi_directional': bi_directional,
        'dropout_rate': 0.2,
        'units': 64,
        'sequence_length': sequence_length,
        'vocab_size': vocab_size,
    }
    # Forward the matrix only when the caller supplied one, so the call made
    # without it is exactly the one this function made before.
    if embedding_matrix is not None:
        model_kwargs['embedding_matrix'] = embedding_matrix

    def model_function():
        # A fresh model is built for every run so runs do not share weights.
        return build_rnn_model(**model_kwargs)

    return calculate_average_metrics(runs, model_function, train_ds, val_ds, epochs)

# Configuration options: every combination of cell type and direction,
# in the order gru/bi, gru/uni, lstm/bi, lstm/uni.
configurations = [
    {'recurrent_type': cell_type, 'bi_directional': both_ways}
    for cell_type in ('gru', 'lstm')
    for both_ways in (True, False)
]

runs, epochs = 2, 2
results = []

# Run experiments
for config in tqdm(configurations, desc="Running Configurations"):
    # The config keys match run_experiment's first two parameter names,
    # so the dict can be unpacked straight into the call.
    metrics = run_experiment(
        runs=runs,
        train_ds=train_ds,
        val_ds=val_ds,
        epochs=epochs,
        sequence_length=sequence_length,
        vocab_size=vocab_size,
        **config
    )

    # One result row = averaged metrics followed by the configuration.
    results.append(dict(metrics, **config))

# Create DataFrame and format
four_decimals = dict.fromkeys(['loss', 'val_loss', 'val_accuracy', 'val_auc'], 4)
df = pd.DataFrame(results).round(four_decimals)
df['duration'] = df['duration'].round(0).astype(int)

print(df)


Running Configurations:   0%|          | 0/4 [00:00<?, ?it/s]

     loss  val_loss  val_accuracy  val_auc  duration recurrent_type  \
0  0.6231    0.6046          0.68   0.8290         9            gru   
1  0.6417    0.6328          0.68   0.4730         5            gru   
2  0.6048    0.5143          0.68   0.9604        11           lstm   
3  0.6507    0.6359          0.68   0.4730         7           lstm   

   bi_directional  
0            True  
1           False  
2            True  
3           False  


In [None]:
# Bare expression: in a notebook this displays the results DataFrame as the cell output.
df

In [None]:
# Scratch cell: measures the training vocabulary and builds a GloVe embedding
# matrix from it.
# NOTE(review): X_train_clean_sampled, y_train_sampled and
# TextCleanerTransformer are not defined in the visible part of this file —
# presumably left over from an earlier notebook version; confirm before running.

# Choose the input: sampled or full training data?
X_input = X_train_clean_sampled
y_input = y_train_sampled

# Find the vocabulary size of the cleaned training text
text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)
X_train_clean = text_cleaner.transform(X_input)
X_train_tokenized = [review.split() for review in X_train_clean]
train_vocabulary = set(word for review in X_train_tokenized for word in review)
# Unique word count plus one (presumably for a padding index — TODO confirm);
# not used again in the visible lines.
vocab_size = len(train_vocabulary) + 1

# Set the important dimensions: vocabulary cap, sequence length, embedding width
vocab=min(15000,len(train_vocabulary) + 1)
length=500
embedding_dim = 300

# Load every GloVe vector into a word -> vector dict
glove_embeddings = {}
with open('../data/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

# Initialize the embedding matrix with zeros
embedding_matrix = np.zeros((vocab, embedding_dim))

# Fill in the matrix with GloVe embeddings
# NOTE(review): train_vocabulary is a set, so enumerate() assigns row indices
# in arbitrary set-iteration order; nothing visible here ties row i to the
# integer id a tokenizer/vectorizer would give the same word — verify against
# whatever consumes this matrix.
for i, word in enumerate(train_vocabulary):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        # Ensure the index i does not exceed the vocab size limitation
        if i < vocab:
            embedding_matrix[i] = embedding_vector