In [1]:
import nltk  # Added this import
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK resources used below. nltk.download is called with quiet=True
# so the calls do not clutter the cell output.
for _nltk_resource in ('stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

# English stop-word list as returned by NLTK (a list of lowercase strings).
STOPWORDS = stopwords.words('english')

# Third-party imports
import numpy as np
import pandas as pd
import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from tensorflow.keras import Sequential, layers

# Load only the three columns used for modelling.
df = pd.read_csv('../data/Airline_review.csv')[['Review_Title','Review','Recommended']]

# Title and body are concatenated into one document per review. fillna('')
# keeps a row whose title or body is missing as text instead of letting the
# concatenation turn the whole document into NaN (which is not a string and
# would break any downstream step that calls str methods on it).
X = df['Review_Title'].fillna('') + ' ' + df['Review'].fillna('')

# Binary target: 'yes' -> 1, 'no' -> 0.
y = df['Recommended'].map({'yes':1,'no':0})

# 80/20 stratified hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Stratified 5000-row subsample of the training split. An integer train_size
# requests exactly 5000 rows; expressing the same thing as a float test_size
# fraction is subject to rounding and can come out one row short.
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=5000, stratify=y_train, random_state=42)





In [2]:
class TextCleanerTransformer(BaseEstimator, TransformerMixin):
    """Stateless text-cleaning step usable inside a scikit-learn Pipeline.

    Each review is lowercased, tokenized with a regular expression,
    optionally lemmatized (POS-aware, via WordNet) and optionally filtered
    against a stop-word collection, then re-joined with single spaces.

    Parameters
    ----------
    stop_words : iterable of str or None, default=None
        Words removed from the token stream (after lemmatization). ``None``
        or an empty collection disables filtering.
    lemmatize : bool, default=True
        Whether tokens are POS-tagged and lemmatized.
    """

    def __init__(self, stop_words=None, lemmatize=True):
        # Constructor arguments are stored untouched. sklearn's clone()
        # rebuilds the estimator from get_params() and raises if __init__
        # swapped a parameter for a different object, which is what
        # converting stop_words to a set here would do.
        self.stop_words = stop_words
        self.lemmatize = lemmatize
        self.lemmatizer = WordNetLemmatizer() if lemmatize else None
        # NOTE(review): only the typographic apostrophe (’) is accepted inside
        # a word, so a contraction written with a straight quote is split into
        # two tokens - confirm this is intended.
        self.tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X, y=None):
        """Clean every review in ``X`` and return the results as a list of str."""
        # Build the lookup set once per call rather than once per review.
        stop_word_set = self._stop_word_set()
        return [self._clean(review, stop_word_set) for review in X]

    def clean_text(self, review):
        """Clean a single review string and return the space-joined tokens."""
        return self._clean(review, self._stop_word_set())

    def _stop_word_set(self):
        """Return the stop words as a set, or None when filtering is disabled."""
        return set(self.stop_words) if self.stop_words else None

    def _clean(self, review, stop_word_set):
        """Tokenize, optionally lemmatize, optionally filter, then re-join."""
        tokens = self.tokenizer.tokenize(review.lower())
        if self.lemmatize:
            # Created on demand so that set_params(lemmatize=True) on an
            # instance built with lemmatize=False still works.
            if self.lemmatizer is None:
                self.lemmatizer = WordNetLemmatizer()
            tokens = [self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag))
                      for word, tag in pos_tag(tokens)]
        if stop_word_set:
            tokens = [word for word in tokens if word not in stop_word_set]
        return ' '.join(tokens)

    def get_wordnet_pos(self, treebank_tag):
        """Map a Penn Treebank tag to a WordNet POS constant (noun by default)."""
        prefix_to_pos = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)

def summarize_mlp_grid_search_results(grid_search):
    """Condense the ``cv_results_`` of an MLP grid search into a ranked table.

    Parameters
    ----------
    grid_search : object
        Anything exposing a ``cv_results_`` mapping that contains the timing
        columns, ``mean_test_score`` and the ``param_mlp__model__*`` columns
        read below.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with the columns ``balanced_accuracy``,
        ``time`` (mean fit + score time, truncated to int), ``num_layers``,
        ``units``, ``dropout_rate`` and ``initializer``; ordered by score
        (descending) and then by time (ascending).
    """
    # cv_results_ column -> short display name
    short_names = {
        'mean_fit_time': 'fit_time',
        'mean_score_time': 'score_time',
        'param_mlp__model__num_layers': 'num_layers',
        'param_mlp__model__units': 'units',
        'mean_test_score': 'balanced_accuracy',
        'param_mlp__model__initializer': 'initializer',
        'param_mlp__model__dropout_rate': 'dropout_rate',
    }

    results = pd.DataFrame(grid_search.cv_results_)
    table = results[list(short_names)].rename(columns=short_names)

    # Total wall time per candidate, truncated to whole numbers.
    table = table.assign(time=(table['fit_time'] + table['score_time']).astype(int))

    report = table[['balanced_accuracy', 'time', 'num_layers', 'units', 'dropout_rate', 'initializer']]
    return report.sort_values(by=['balanced_accuracy', 'time'], ascending=[False, True])

def summarize_rnn_grid_search_results(grid_search):
    """Condense the ``cv_results_`` of an RNN grid search into a ranked table.

    Parameters
    ----------
    grid_search : object
        Anything exposing a ``cv_results_`` mapping that contains the timing
        columns, ``mean_train_score``, ``mean_test_score`` and the
        ``param_rnn__model__*`` columns read below.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with the columns ``train_score``,
        ``test_score``, ``time`` (mean fit + score time, truncated to int),
        ``units``, ``bi_directional``, ``recurrent_type``, ``num_rnn_layers``,
        ``num_dense_layers`` and ``dropout_rate``; ordered by test score
        (descending) and then by time (ascending).
    """
    # cv_results_ column -> short display name
    short_names = {
        'mean_fit_time': 'fit_time',
        'mean_score_time': 'score_time',
        'param_rnn__model__bi_directional': 'bi_directional',
        'param_rnn__model__dense_layers': 'num_dense_layers',
        'param_rnn__model__recurrent_type': 'recurrent_type',
        'param_rnn__model__rnn_layers': 'num_rnn_layers',
        'param_rnn__model__units': 'units',
        'param_rnn__model__dropout_rate': 'dropout_rate',
        'mean_train_score': 'train_score',
        'mean_test_score': 'test_score',
    }

    results = pd.DataFrame(grid_search.cv_results_)
    table = results[list(short_names)].rename(columns=short_names)

    # Total wall time per candidate, truncated to whole numbers.
    table = table.assign(time=(table['fit_time'] + table['score_time']).astype(int))

    report = table[[
        'train_score', 'test_score', 'time', 'units', 'bi_directional',
        'recurrent_type', 'num_rnn_layers', 'num_dense_layers', 'dropout_rate',
    ]]
    return report.sort_values(by=['test_score', 'time'], ascending=[False, True])

# Cleaner configured for plain tokenize-and-rejoin: no stop-word removal and
# no lemmatization.
text_cleaner = TextCleanerTransformer(lemmatize=False, stop_words=None)

# fit() on this transformer is a no-op, so transform() is called directly,
# first on the full training split and then on the 5000-row subsample.
X_train_clean, X_train_clean_sampled = (
    text_cleaner.transform(split) for split in (X_train, X_train_sampled)
)

# MLP

In [4]:
# Define model
def build_mlp_model(num_layers=1, units=64, initializer=None, dropout_rate=0.2, input_dim=20000):
    """Build an uncompiled feed-forward binary classifier.

    Parameters
    ----------
    num_layers : int, default=1
        Number of hidden Dense + Dropout pairs.
    units : int, default=64
        Width of the first hidden layer. Each following hidden layer has half
        the width of the previous one, never going below 8.
    initializer : str or None, default=None
        Passed unchanged as ``kernel_initializer`` of every hidden Dense layer.
    dropout_rate : float, default=0.2
        Rate of the Dropout layer placed after every hidden Dense layer.
    input_dim : int, default=20000
        Number of input features. The default keeps the previous hard-coded
        input width, so existing callers are unaffected.

    Returns
    -------
    Sequential
        The model ending in a single sigmoid unit. It is returned without
        being compiled.
    """
    model = Sequential()
    model.add(layers.InputLayer(input_shape=(input_dim,)))

    # Track the current layer width in a local instead of reassigning the
    # `units` argument.
    width = units
    for _ in range(num_layers):
        model.add(layers.Dense(width, activation="relu", kernel_initializer=initializer))
        model.add(layers.Dropout(dropout_rate))
        width = max(8, width // 2)

    model.add(layers.Dense(1, activation='sigmoid'))
    return model
    
# Setting parameters
# k must equal the input width the MLP is built with (20000).
k_best = SelectKBest(k=20000)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# Early stopping watches the training loss ('loss'); no validation split is
# configured on the classifier below.
CALLBACKS = [tf.keras.callbacks.EarlyStopping(monitor='loss',
                                              min_delta=0.001,
                                              patience=5, 
                                              restore_best_weights=True,
                                              verbose=1)]

# Instantiate transformers
vectorizer = CountVectorizer(decode_error='replace', strip_accents='unicode', stop_words=None, ngram_range=(1, 2), max_df=0.95, min_df=2)
tf_idf = TfidfTransformer()
# `model=` is SciKeras' current name for the model-building callable;
# `build_fn=` is its deprecated alias and emits a warning on every fit.
# The `model__*` entries in the parameter grid are routed to build_mlp_model.
nlp_model_wrapper = KerasClassifier(model=build_mlp_model,
                                random_state=42,
                                optimizer='adam',
                                loss='binary_crossentropy',
                                metrics=['accuracy'],
                                batch_size=64,
                                verbose=1,
                                callbacks=CALLBACKS,
                                shuffle=True,
                                epochs=20)
# Bag-of-ngrams counts -> TF-IDF weights -> top-k feature selection -> MLP.
pipe = Pipeline([
    ("count", vectorizer),
    ('tf_idf', tf_idf),
    ('feature_selection', k_best),
    ('mlp', nlp_model_wrapper)
])

# 2 * 4 * 2 * 2 = 32 candidates, each evaluated on the 3 folds of `skf`.
params = {
    'mlp__model__num_layers': [1,2],  
    'mlp__model__units': [8, 16, 32, 64],
    'mlp__model__initializer': [None,'he_normal'],
    'mlp__model__dropout_rate': [0.25, 0.5]}

# error_score=0: a candidate whose fit fails is scored 0 instead of raising,
# so it ends up at the bottom of the ranking.
gs_1 = GridSearchCV(estimator=pipe, 
                  param_grid=params,
                  scoring='balanced_accuracy',
                  cv=skf,
                  verbose=3,
                  error_score=0,
                  n_jobs= -1)

mlp_grid_search = gs_1.fit(X_train_clean, y_train)
summarize_mlp_grid_search_results(mlp_grid_search)

Fitting 3 folds for each of 32 candidates, totalling 96 fits



  X, y = self._initialize(X, y)



Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Unnamed: 0,balanced_accuracy,time,num_layers,units,dropout_rate,initializer
16,0.903777,200,1,8,0.5,
24,0.903689,242,1,8,0.5,he_normal
20,0.901034,238,2,8,0.5,
28,0.899201,244,2,8,0.5,he_normal
8,0.898304,160,1,8,0.25,he_normal
0,0.897982,238,1,8,0.25,
4,0.896635,161,2,8,0.25,
12,0.895369,221,2,8,0.25,he_normal
25,0.893785,244,1,16,0.5,he_normal
17,0.893504,202,1,16,0.5,
