In [1]:
# Standard library imports
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Third-party imports
import numpy as np
import pandas as pd
import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from tensorflow.keras import Sequential, layers

# Global constants
STOPWORDS = stopwords.words('english')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

df = pd.read_csv('../data/Airline_review.csv')[['Review_Title','Review','Recommended']]
reviews = df['Review_Title'] + ' ' + df['Review']
labels = df['Recommended'].map({'yes':1,'no':0})
train_reviews, temp_reviews, train_labels, temp_labels = train_test_split(reviews, labels, test_size=0.2, stratify=labels, random_state=42)
val_reviews, test_reviews, val_labels, test_labels = train_test_split(temp_reviews, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42)

# Concatenating the valildation set as I don't need it here. 90-10 split
X_train = pd.concat([train_reviews, val_reviews])
y_train = pd.concat([train_labels, val_labels])

2024-03-30 09:35:53.713971: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

STOPWORDS = stopwords.words('english')
text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)
X_train_clean = text_cleaner.transform(X_train)
#For Sequence models later
from tensorflow.keras.layers import TextVectorization
max_features = 20000 # 28593 Unlemmatized, 23171 lemmatized
sequence_length = 500 # more than 98% are less than this anyway

vectorize_layer = TextVectorization(
    standardize=None, # already done by transformation.
    split='whitespace',
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Learning the vocabulary
vectorize_layer.adapt(X_train_clean) 

# Transforming to sequence vectors
X_train_sequence_vec = vectorize_layer(X_train_clean)
X_train_np = X_train_sequence_vec.numpy()

In [3]:
from tensorflow.keras import layers, Sequential

def add_rnn_layer(model, units, rnn_type='gru', bidirectional=False, return_sequences=False):
    """
    Adds an RNN layer to the model with specified configurations.
    
    Parameters:
    - model: Sequential model to which the layer is added
    - units: Number of units in the RNN layer
    - rnn_type: Type of RNN ('gru' or 'lstm')
    - bidirectional: If True, adds a bidirectional wrapper to the layer
    - return_sequences: Whether to return the last output in the output sequence
    """
    LayerClass = layers.GRU if rnn_type == 'gru' else layers.LSTM
    layer = LayerClass(units, return_sequences=return_sequences)
    if bidirectional:
        layer = layers.Bidirectional(layer)
    model.add(layer)

def build_rnn_model(rnn_layers=1, dense_layers=1, recurrent_type='gru', bi_directional=False, dropout_rate=0.2, units=64, sequence_length=500, vocab_size=20000):

    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=units, input_length=sequence_length))

    for i in range(rnn_layers):
        add_rnn_layer(model, units, rnn_type=recurrent_type, bidirectional=bi_directional, 
                      return_sequences=(i < rnn_layers - 1))
        model.add(layers.Dropout(dropout_rate))
        units //= 2

    for _ in range(dense_layers):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.Dropout(dropout_rate))
        units //= 2

    model.add(layers.Dense(1, activation='sigmoid'))
    return model

model_wrapper = KerasClassifier(build_fn=build_rnn_model,
                                random_state=42,
                                optimizer='adam',
                                loss='binary_crossentropy',
                                metrics=['accuracy'],
                                batch_size=64,
                                verbose=1,
                                callbacks=None,
                                shuffle=True,
                                epochs=2)

In [4]:
# just Gridserching the model
params = {
    'model__recurrent_type': ['gru'],  
    'model__units': [32],
    'model__bi_directional': [True, False],
    'model__rnn_layers': [1],
    'model__dense_layers': [1],
}

gs = GridSearchCV(estimator=model_wrapper, 
                  param_grid=params,
                  scoring='balanced_accuracy',
                  cv=2,
                  verbose=3,
                  error_score='raise')

grid_search = gs.fit(X_train_np, y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


  X, y = self._initialize(X, y)


Epoch 1/2
Epoch 2/2
[CV 1/2] END model__bi_directional=True, model__dense_layers=1, model__recurrent_type=gru, model__rnn_layers=1, model__units=32;, score=0.867 total time= 1.1min


  X, y = self._initialize(X, y)


Epoch 1/2
Epoch 2/2
[CV 2/2] END model__bi_directional=True, model__dense_layers=1, model__recurrent_type=gru, model__rnn_layers=1, model__units=32;, score=0.868 total time= 1.1min


  X, y = self._initialize(X, y)


Epoch 1/2
Epoch 2/2
[CV 1/2] END model__bi_directional=False, model__dense_layers=1, model__recurrent_type=gru, model__rnn_layers=1, model__units=32;, score=0.500 total time=  47.6s


  X, y = self._initialize(X, y)


Epoch 1/2
Epoch 2/2
[CV 2/2] END model__bi_directional=False, model__dense_layers=1, model__recurrent_type=gru, model__rnn_layers=1, model__units=32;, score=0.500 total time=  49.9s


  X, y = self._initialize(X, y)


Epoch 1/2
Epoch 2/2


In [None]:
recurrent_type: ['gru','lstm']
units: [256,128,64,32,16]
bi_directional: [True,False]
rnn_layers: [1,2]
dense_layers: [1,2]

### Incorporating TextVectorization inside sklearn pipeline

In [6]:
# Making a custom transformer to adapt input data.
from sklearn.base import BaseEstimator, TransformerMixin
import tensorflow as tf

class KerasTextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, max_tokens=20000, output_sequence_length=100):
        self.max_tokens = max_tokens
        self.output_sequence_length = output_sequence_length
        self.text_vectorization = tf.keras.layers.TextVectorization(
            max_tokens=self.max_tokens,
            output_sequence_length=self.output_sequence_length)

    def fit(self, X, y=None):
        self.text_vectorization.adapt(X)
        return self  # Return self to allow chaining

    def transform(self, X, y=None):
        return self.text_vectorization(X).numpy()  # Convert to numpy for sklearn compatibility

# Functions for building RNN Model
def add_rnn_layer(model, units, rnn_type='gru', bidirectional=False, return_sequences=False):
    LayerClass = layers.GRU if rnn_type == 'gru' else layers.LSTM
    layer = LayerClass(units, return_sequences=return_sequences)
    if bidirectional:
        layer = layers.Bidirectional(layer)
    model.add(layer)

def build_rnn_model(rnn_layers=1, dense_layers=1, recurrent_type='gru', bi_directional=False, dropout_rate=0.2, units=64, sequence_length=500, vocab_size=20000):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=units, input_length=sequence_length))

    for i in range(rnn_layers):
        add_rnn_layer(model, units, rnn_type=recurrent_type, bidirectional=bi_directional, 
                      return_sequences=(i < rnn_layers - 1))
        model.add(layers.Dropout(dropout_rate))
        units //= 2

    for _ in range(dense_layers):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.Dropout(dropout_rate))
        units //= 2

    model.add(layers.Dense(1, activation='sigmoid'))
    return model

In [8]:
text_vectorizer = KerasTextVectorizer(max_tokens=20000, output_sequence_length=500)
model_wrapper = KerasClassifier(build_fn=build_rnn_model,
                                random_state=42,
                                optimizer='adam',
                                loss='binary_crossentropy',
                                metrics=['accuracy'],
                                batch_size=64,
                                verbose=1,
                                callbacks=None,
                                shuffle=True,
                                epochs=1)

pipe = Pipeline([
    ('text_vect', text_vectorizer),
    ('rnn', model_wrapper)
])

params = {
    'rnn__model__recurrent_type': ['gru'],  
    'rnn__model__units': [32],
    'rnn__model__bi_directional': [True, False],
    'rnn__model__rnn_layers': [1],
    'rnn__model__dense_layers': [1],
}
gs = GridSearchCV(estimator=pipe, 
                  param_grid=params,
                  scoring='balanced_accuracy',
                  cv=2,
                  verbose=3,
                  error_score='raise')

grid_search = gs.fit(X_train_clean, y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


  X, y = self._initialize(X, y)


[CV 1/2] END rnn__model__bi_directional=True, rnn__model__dense_layers=1, rnn__model__recurrent_type=gru, rnn__model__rnn_layers=1, rnn__model__units=32;, score=0.871 total time=  41.4s


  X, y = self._initialize(X, y)


[CV 2/2] END rnn__model__bi_directional=True, rnn__model__dense_layers=1, rnn__model__recurrent_type=gru, rnn__model__rnn_layers=1, rnn__model__units=32;, score=0.861 total time=  43.9s


  X, y = self._initialize(X, y)


[CV 1/2] END rnn__model__bi_directional=False, rnn__model__dense_layers=1, rnn__model__recurrent_type=gru, rnn__model__rnn_layers=1, rnn__model__units=32;, score=0.500 total time=  31.2s


  X, y = self._initialize(X, y)


[CV 2/2] END rnn__model__bi_directional=False, rnn__model__dense_layers=1, rnn__model__recurrent_type=gru, rnn__model__rnn_layers=1, rnn__model__units=32;, score=0.500 total time=  30.4s


  X, y = self._initialize(X, y)




KeyboardInterrupt: 

In [9]:
text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)
text_vectorizer = KerasTextVectorizer(max_tokens=20000, output_sequence_length=500)
model_wrapper = KerasClassifier(model=build_rnn_model,
                                random_state=42,
                                optimizer='adam',
                                loss='binary_crossentropy',
                                metrics=['accuracy'],
                                batch_size=64,
                                verbose=1,
                                callbacks=None,
                                shuffle=True,
                                epochs=1)

pipe = Pipeline([
    ('text_cleaner', text_cleaner),
    ('text_vect', text_vectorizer),
    ('rnn', model_wrapper)
])

params = {
    'rnn__model__recurrent_type': ['gru'],  
    'rnn__model__units': [32],
    'rnn__model__bi_directional': [True, False],
    'rnn__model__rnn_layers': [1],
    'rnn__model__dense_layers': [1],
}
gs = GridSearchCV(estimator=pipe, 
                  param_grid=params,
                  scoring='balanced_accuracy',
                  cv=2,
                  verbose=3,
                  error_score='raise')

grid_search = gs.fit(X_train, y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2] END rnn__model__bi_directional=True, rnn__model__dense_layers=1, rnn__model__recurrent_type=gru, rnn__model__rnn_layers=1, rnn__model__units=32;, score=0.871 total time=  39.3s
[CV 2/2] END rnn__model__bi_directional=True, rnn__model__dense_layers=1, rnn__model__recurrent_type=gru, rnn__model__rnn_layers=1, rnn__model__units=32;, score=0.861 total time=  40.3s
[CV 1/2] END rnn__model__bi_directional=False, rnn__model__dense_layers=1, rnn__model__recurrent_type=gru, rnn__model__rnn_layers=1, rnn__model__units=32;, score=0.500 total time=  32.9s
[CV 2/2] END rnn__model__bi_directional=False, rnn__model__dense_layers=1, rnn__model__recurrent_type=gru, rnn__model__rnn_layers=1, rnn__model__units=32;, score=0.500 total time=  31.0s


In [14]:
pd.DataFrame(grid_search.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_rnn__model__bi_directional', 'param_rnn__model__dense_layers',
       'param_rnn__model__recurrent_type', 'param_rnn__model__rnn_layers',
       'param_rnn__model__units', 'params', 'split0_test_score',
       'split1_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

In [15]:
def summarize_rnn_grid_search_results(grid_search):
    columns_to_extract = [
        ('mean_fit_time', 'fit_time'),
        ('mean_score_time', 'score_time'),
        ('param_rnn__model__bi_directional', 'bi_directional'),
        ('param_rnn__model__dense_layers', 'num_dense_layers'),
        ('param_rnn__model__recurrent_type', 'recurrent_type'),
        ('param_rnn__model__rnn_layers', 'num_rnn_layers'),
        ('param_rnn__model__units', 'units'),
        ('param_rnn__model__dropout_rate', 'dropout_rate'),
        ('mean_test_score', 'balanced_accuracy')
    ]
    summary_df = pd.DataFrame(grid_search.cv_results_)[[original for original, renamed in columns_to_extract]]

    summary_df.columns = [renamed for original, renamed in columns_to_extract]
    
    # Calculate total time and convert to int
    summary_df['time'] = (summary_df['fit_time'] + summary_df['score_time']).astype(int)
    
    # Reorder and select final columns for the output
    final_columns = ['balanced_accuracy', 'time', 'units', 'bi_directional', 'recurrent_type', 'num_rnn_layers', 'num_dense_layers', 'dropout_rate']
    final_df = summary_df[final_columns]
    sorted_df = final_df.sort_values(by=['balanced_accuracy', 'time'], ascending=[False, True])
    
    return sorted_df
summarize_rnn_grid_search_results(grid_search)

Unnamed: 0,balanced_accuracy,time,units,bi_directional,recurrent_type,num_rnn_layers,num_dense_layers
0,0.866113,39,32,True,gru,1,1
1,0.5,31,32,False,gru,1,1
