In [33]:
import nltk  # Added this import
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from gensim.models import word2vec

# Ensure stopwords are downloaded only once (not a coding error, but a good practice)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

STOPWORDS = stopwords.words('english')

# Third-party imports
import numpy as np
import pandas as pd
import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from tensorflow.keras import Sequential, layers


sample_size = 5000
df = pd.read_csv('../data/Airline_review.csv')[['Review_Title','Review','Recommended']]
X = df['Review_Title'] + ' ' + df['Review']
y = df['Recommended'].map({'yes':1,'no':0})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, test_size=(len(X_train)-sample_size)/len(X_train), stratify=y_train, random_state=42)


In [34]:
class TextCleanerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words=None, lemmatize=True):
        self.stop_words = set(stop_words) if stop_words else None
        self.lemmatize = lemmatize
        self.lemmatizer = WordNetLemmatizer() if lemmatize else None
        self.tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
        
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        return [self.clean_text(review) for review in X]
    
    def clean_text(self, review):
        tokens = self.tokenizer.tokenize(review.lower())  # Tokenize and lowercase
        if self.lemmatize:
            pos_tags = pos_tag(tokens)
            tokens = [self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag))
                      for word, tag in pos_tags]
        if self.stop_words:
            tokens = [word for word in tokens if word not in self.stop_words]
        return ' '.join(tokens)
    
    def get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

def summarize_glv_grid_search_results(grid_search):
    columns_to_extract = [
        ('mean_fit_time', 'fit_time'),
        ('mean_score_time', 'score_time'),
        ('param_glv__model__bi_directional', 'bi_directional'),
        ('param_glv__model__dense_layers', 'num_dense_layers'),
        ('param_glv__model__recurrent_type', 'recurrent_type'),
        ('param_glv__model__rnn_layers', 'num_rnn_layers'),
        ('param_glv__model__units', 'units'),
        ('param_glv__model__dropout_rate', 'dropout_rate'),
        ('mean_train_score', 'train_score'),
        ('mean_test_score', 'test_score')
    ]
    summary_df = pd.DataFrame(grid_search.cv_results_)[[original for original, renamed in columns_to_extract]]

    summary_df.columns = [renamed for original, renamed in columns_to_extract]
    
    # Calculate total time and convert to int
    summary_df['time'] = (summary_df['fit_time'] + summary_df['score_time']).astype(int)
    
    # Reorder and select final columns for the output
    final_columns = ['train_score', 'test_score', 'time', 'units', 'bi_directional', 'recurrent_type', 'num_rnn_layers', 'num_dense_layers', 'dropout_rate']
    final_df = summary_df[final_columns]
    sorted_df = final_df.sort_values(by=['test_score', 'time'], ascending=[False, True])
    
    return sorted_df

text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)
X_train_clean = text_cleaner.transform(X_train)
X_train_clean_sampled = text_cleaner.transform(X_train_sampled)

In [2]:
glove_embeddings = {}
with open('../data/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector


In [42]:
text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)
X_train_clean = text_cleaner.transform(X_train_sampled)
X_train_tokenized = [review.split() for review in X_train_clean]
train_vocabulary = set(word for review in X_train_tokenized for word in review)
len(train_vocabulary)

15257

In [14]:
glove_embeddings = {}
with open('../data/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

vocab = min(15000, len(train_vocabulary) + 1)
length = 500
embedding_dim = 300

# Initialize the embedding matrix with zeros
embedding_matrix = np.zeros((vocab, embedding_dim))

# Fill in the matrix with GloVe embeddings
for i, word in enumerate(train_vocabulary):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        # Ensure the index i does not exceed the vocab size limitation
        if i < vocab:
            embedding_matrix[i] = embedding_vector


In [53]:
# sampled or not?
X_input = X_train_clean_sampled
y_input = y_train_sampled

# finding vocab size
text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)
X_train_clean = text_cleaner.transform(X_input)
X_train_tokenized = [review.split() for review in X_train_clean]
train_vocabulary = set(word for review in X_train_tokenized for word in review)
vocab_size = len(train_vocabulary) + 1

#setting important dimensions
vocab=min(15000,len(train_vocabulary) + 1)
length=500
embedding_dim = 300

#making embedding matrix
glove_embeddings = {}
with open('../data/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

# Initialize the embedding matrix with zeros
embedding_matrix = np.zeros((vocab, embedding_dim))

# Fill in the matrix with GloVe embeddings
for i, word in enumerate(train_vocabulary):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        # Ensure the index i does not exceed the vocab size limitation
        if i < vocab:
            embedding_matrix[i] = embedding_vector

In [54]:
# sampled or not?
X_input = X_train_clean_sampled
y_input = y_train_sampled

# finding vocab size
text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)
X_train_clean = text_cleaner.transform(X_input)
X_train_tokenized = [review.split() for review in X_train_clean]
train_vocabulary = set(word for review in X_train_tokenized for word in review)
vocab_size = len(train_vocabulary) + 1

#setting important dimensions
vocab=min(15000,len(train_vocabulary) + 1)
length=500
embedding_dim = 300

#making embedding matrix
glove_embeddings = {}
with open('../data/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

# Initialize the embedding matrix with zeros
embedding_matrix = np.zeros((vocab, embedding_dim))

# Fill in the matrix with GloVe embeddings
for i, word in enumerate(train_vocabulary):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        # Ensure the index i does not exceed the vocab size limitation
        if i < vocab:
            embedding_matrix[i] = embedding_vector
class KerasTextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, max_tokens=vocab, output_sequence_length=length):
        self.max_tokens = max_tokens
        self.output_sequence_length = output_sequence_length
        self.text_vectorization = tf.keras.layers.TextVectorization(
            max_tokens=self.max_tokens,
            output_sequence_length=self.output_sequence_length)

    def fit(self, X, y=None):
        self.text_vectorization.adapt(X)
        return self  # Return self to allow chaining

    def transform(self, X, y=None):
        return self.text_vectorization(X).numpy()  

def add_rnn_layer(model, units, rnn_type='gru', bidirectional=False, return_sequences=False):
    LayerClass = layers.GRU if rnn_type == 'gru' else layers.LSTM
    layer = LayerClass(units, return_sequences=return_sequences)
    if bidirectional:
        layer = layers.Bidirectional(layer)
    model.add(layer)

def build_glove_model(rnn_layers=1, dense_layers=1, recurrent_type='gru', bi_directional=False, dropout_rate=0.2, units=64, sequence_length=length, vocab_size=vocab, embedding_dim=embedding_dim):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length,
                     weights=[embedding_matrix], trainable=False))
    for i in range(rnn_layers):
        add_rnn_layer(model, units, rnn_type=recurrent_type, bidirectional=bi_directional, 
                      return_sequences=(i < rnn_layers - 1))
        model.add(layers.Dropout(dropout_rate))
        units = max(8, units // 2)

    for _ in range(dense_layers):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.Dropout(dropout_rate))
        units = max(8, units // 2)

    model.add(layers.Dense(1, activation='sigmoid'))
    return model

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
CALLBACKS = [tf.keras.callbacks.EarlyStopping(monitor='loss',
                                              min_delta=0.01,
                                              patience=2, 
                                              restore_best_weights=True,
                                              verbose=0)]

# Instantiate transformers
text_vectorizer = KerasTextVectorizer(max_tokens=vocab, output_sequence_length=length)
glv_model_wrapper = KerasClassifier(model=build_glove_model,
                                    random_state=42,
                                    optimizer='adam',
                                    loss='binary_crossentropy',
                                    metrics=['accuracy'],
                                    batch_size=64,
                                    verbose=0,
                                    callbacks=CALLBACKS,
                                    shuffle=True,
                                    epochs=2)


pipe = Pipeline([
    ('text_vect', text_vectorizer),
    ('glv', glv_model_wrapper)
])

params = {
    'glv__model__recurrent_type': ['gru'],  
    'glv__model__units': [32,64],
    'glv__model__bi_directional': [True],
    'glv__model__rnn_layers': [1,2],
    'glv__model__dense_layers': [1,2],
    'glv__model__dropout_rate': [0.25]
}

gs = GridSearchCV(estimator=pipe,
                    param_grid=params,
                    scoring='balanced_accuracy',
                    cv=skf,
                    verbose=4,
                    n_jobs= 1,
                    return_train_score=True)

glv_grid_search = gs.fit(X_input, y_input)
summarize_glv_grid_search_results(glv_grid_search)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END glv__model__bi_directional=True, glv__model__dense_layers=1, glv__model__dropout_rate=0.25, glv__model__recurrent_type=gru, glv__model__rnn_layers=1, glv__model__units=32;, score=(train=0.503, test=0.503) total time=  36.3s
[CV 2/2] END glv__model__bi_directional=True, glv__model__dense_layers=1, glv__model__dropout_rate=0.25, glv__model__recurrent_type=gru, glv__model__rnn_layers=1, glv__model__units=32;, score=(train=0.610, test=0.616) total time=  35.7s
[CV 1/2] END glv__model__bi_directional=True, glv__model__dense_layers=1, glv__model__dropout_rate=0.25, glv__model__recurrent_type=gru, glv__model__rnn_layers=1, glv__model__units=64;, score=(train=0.715, test=0.671) total time=  40.6s
[CV 2/2] END glv__model__bi_directional=True, glv__model__dense_layers=1, glv__model__dropout_rate=0.25, glv__model__recurrent_type=gru, glv__model__rnn_layers=1, glv__model__units=64;, score=(train=0.730, test=0.685) total time=

Unnamed: 0,train_score,test_score,time,units,bi_directional,recurrent_type,num_rnn_layers,num_dense_layers,dropout_rate
1,0.722576,0.677848,46,64,True,gru,1,1,0.25
3,0.73195,0.675982,70,64,True,gru,2,1,0.25
7,0.694882,0.670068,69,64,True,gru,2,2,0.25
2,0.616576,0.597944,57,32,True,gru,2,1,0.25
0,0.556593,0.55941,36,32,True,gru,1,1,0.25
5,0.555165,0.547657,40,64,True,gru,1,2,0.25
6,0.505188,0.506662,56,32,True,gru,2,2,0.25
4,0.505625,0.503388,35,32,True,gru,1,2,0.25


In [55]:
# sampled or not?
X_input = X_train
y_input = y_train

# finding vocab size
text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)
X_train_clean = text_cleaner.transform(X_input)
X_train_tokenized = [review.split() for review in X_train_clean]
train_vocabulary = set(word for review in X_train_tokenized for word in review)
vocab_size = len(train_vocabulary) + 1

#setting important dimensions
vocab=min(20000,len(train_vocabulary) + 1)
length=500
embedding_dim = 300

#making embedding matrix
glove_embeddings = {}
with open('../data/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

# Initialize the embedding matrix with zeros
embedding_matrix = np.zeros((vocab, embedding_dim))

# Fill in the matrix with GloVe embeddings
for i, word in enumerate(train_vocabulary):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        # Ensure the index i does not exceed the vocab size limitation
        if i < vocab:
            embedding_matrix[i] = embedding_vector
class KerasTextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, max_tokens=vocab, output_sequence_length=length):
        self.max_tokens = max_tokens
        self.output_sequence_length = output_sequence_length
        self.text_vectorization = tf.keras.layers.TextVectorization(
            max_tokens=self.max_tokens,
            output_sequence_length=self.output_sequence_length)

    def fit(self, X, y=None):
        self.text_vectorization.adapt(X)
        return self  # Return self to allow chaining

    def transform(self, X, y=None):
        return self.text_vectorization(X).numpy()  

def add_rnn_layer(model, units, rnn_type='gru', bidirectional=False, return_sequences=False):
    LayerClass = layers.GRU if rnn_type == 'gru' else layers.LSTM
    layer = LayerClass(units, return_sequences=return_sequences)
    if bidirectional:
        layer = layers.Bidirectional(layer)
    model.add(layer)

def build_glove_model(rnn_layers=1, dense_layers=1, recurrent_type='gru', bi_directional=False, dropout_rate=0.2, units=64, sequence_length=length, vocab_size=vocab, embedding_dim=embedding_dim):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length,
                     weights=[embedding_matrix], trainable=False))
    for i in range(rnn_layers):
        add_rnn_layer(model, units, rnn_type=recurrent_type, bidirectional=bi_directional, 
                      return_sequences=(i < rnn_layers - 1))
        model.add(layers.Dropout(dropout_rate))
        units = max(8, units // 2)

    for _ in range(dense_layers):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.Dropout(dropout_rate))
        units = max(8, units // 2)

    model.add(layers.Dense(1, activation='sigmoid'))
    return model

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
CALLBACKS = [tf.keras.callbacks.EarlyStopping(monitor='loss',
                                              min_delta=0.01,
                                              patience=2, 
                                              restore_best_weights=True,
                                              verbose=0)]

# Instantiate transformers
text_vectorizer = KerasTextVectorizer(max_tokens=vocab, output_sequence_length=length)
glv_model_wrapper = KerasClassifier(model=build_glove_model,
                                    random_state=42,
                                    optimizer='adam',
                                    loss='binary_crossentropy',
                                    metrics=['accuracy'],
                                    batch_size=64,
                                    verbose=0,
                                    callbacks=CALLBACKS,
                                    shuffle=True,
                                    epochs=2)


pipe = Pipeline([
    ('text_vect', text_vectorizer),
    ('glv', glv_model_wrapper)
])

params = {
    'glv__model__recurrent_type': ['gru'],  
    'glv__model__units': [32,64],
    'glv__model__bi_directional': [True],
    'glv__model__rnn_layers': [1,2],
    'glv__model__dense_layers': [1,2],
    'glv__model__dropout_rate': [0.25]
}

gs = GridSearchCV(estimator=pipe,
                    param_grid=params,
                    scoring='balanced_accuracy',
                    cv=skf,
                    verbose=4,
                    n_jobs= 1,
                    return_train_score=True)

glv_grid_search = gs.fit(X_input, y_input)
summarize_glv_grid_search_results(glv_grid_search)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END glv__model__bi_directional=True, glv__model__dense_layers=1, glv__model__dropout_rate=0.25, glv__model__recurrent_type=gru, glv__model__rnn_layers=1, glv__model__units=32;, score=(train=0.842, test=0.817) total time= 2.0min
[CV 2/2] END glv__model__bi_directional=True, glv__model__dense_layers=1, glv__model__dropout_rate=0.25, glv__model__recurrent_type=gru, glv__model__rnn_layers=1, glv__model__units=32;, score=(train=0.831, test=0.836) total time= 2.1min
[CV 1/2] END glv__model__bi_directional=True, glv__model__dense_layers=1, glv__model__dropout_rate=0.25, glv__model__recurrent_type=gru, glv__model__rnn_layers=1, glv__model__units=64;, score=(train=0.865, test=0.847) total time= 2.7min
[CV 2/2] END glv__model__bi_directional=True, glv__model__dense_layers=1, glv__model__dropout_rate=0.25, glv__model__recurrent_type=gru, glv__model__rnn_layers=1, glv__model__units=64;, score=(train=0.853, test=0.849) total time=

Unnamed: 0,train_score,test_score,time,units,bi_directional,recurrent_type,num_rnn_layers,num_dense_layers,dropout_rate
1,0.859047,0.848073,161,64,True,gru,1,1,0.25
7,0.86075,0.847363,228,64,True,gru,2,2,0.25
3,0.851308,0.832618,258,64,True,gru,2,1,0.25
0,0.836592,0.826778,124,32,True,gru,1,1,0.25
6,0.828239,0.813345,186,32,True,gru,2,2,0.25
4,0.811458,0.805816,118,32,True,gru,1,2,0.25
5,0.814215,0.80396,135,64,True,gru,1,2,0.25
2,0.739192,0.731675,192,32,True,gru,2,1,0.25
