In [1]:
# Standard library imports
import os
import pickle
import warnings

# Related third-party imports
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import RegexpTokenizer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from tensorflow.data import AUTOTUNE as tf_AUTOTUNE, Dataset as tf_Dataset
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, GRU, TextVectorization
from tensorflow.keras.models import Sequential

# Local application/library specific imports would go here

# Additional setup
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
STOPWORDS = stopwords.words('english')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logging (1: INFO, 2: WARNING, 3: ERROR)
warnings.filterwarnings('ignore')  # Suppress Python warnings




In [2]:
def get_wordnet_pos_optimized(treebank_tag):
    """Map POS tag to the first character that lemmatize() accepts."""
    tag_dict = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }
    # Default to NOUN if not found
    return tag_dict.get(treebank_tag[0], wordnet.NOUN)

def clean_text(review, tokenizer, stop_words=None, lemmatize=False, tokenize=False):
    """Clean and preprocess a single review text."""
    tokens = tokenizer.tokenize(review.lower())
    
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        pos_tags = pos_tag(tokens)
        tokens = [lemmatizer.lemmatize(word, get_wordnet_pos_optimized(tag)) for word, tag in pos_tags]
    
    if stop_words:
        tokens = [word for word in tokens if word not in stop_words]
    
    return tokens if tokenize else ' '.join(tokens)

def preprocess_texts(reviews, tokenizer, stop_words=None, lemmatize=False, tokenize=False):
    """Apply text cleaning and preprocessing to a list of texts."""
    return [clean_text(review, tokenizer, stop_words, lemmatize, tokenize) for review in reviews]

def prepare_tf_dataset(X, y, batch_size, is_training=False):
    """Prepare a TensorFlow dataset for training or evaluation."""
    dataset = tf_Dataset.from_tensor_slices((X, y))
    if is_training:
        dataset = dataset.shuffle(10000)
    return dataset.batch(batch_size).cache().prefetch(tf_AUTOTUNE)

def extract_performance_metrics(history, callbacks):
    """Extract best epoch performance metrics."""
    early_stopping = next((cb for cb in callbacks if isinstance(cb, EarlyStopping)), None)
    if early_stopping and early_stopping.stopped_epoch > 0:
        best_epoch = max(0, early_stopping.stopped_epoch - early_stopping.patience)
    else:
        best_epoch = len(history.history['loss']) - 1

    metrics = {
            'loss': history.history['loss'][best_epoch],
            'val_loss': history.history['val_loss'][best_epoch],
            'val_accuracy': history.history.get('val_accuracy', [None])[best_epoch],
            'val_auc': history.history.get('val_auc', [None])[best_epoch]
        }
    return metrics   
    
def bag_of_words_cv(model_name, model, pipe, X_train, y_train, cv, model_score_list):
    """Cross-validate and save model and performance metrics."""
    model_pipeline = Pipeline(pipe.steps + [(model_name, model)])
    cv_results = cross_validate(model_pipeline, X_train, y_train, cv=cv, scoring=['accuracy', 'roc_auc'], return_train_score=False)
    
    accuracy, auc = cv_results['test_accuracy'].mean(), cv_results['test_roc_auc'].mean()
    score_list = [model_name, accuracy, auc]
    model_score_list.append(score_list)
    with open(f'{model_name}_scores.pkl', 'wb') as file:
        pickle.dump(score_list, file)
    
    model_pipeline.fit(X_train, y_train)
    dump(model_pipeline, f'{model_name}_pipeline.joblib')
    df = pd.DataFrame([[accuracy, auc]], columns=['Accuracy', 'AUC'], index=[model_name])
    return model_score_list, df

In [3]:
# Initialize tokenizer
tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")

# Load dataset and select relevant columns
df = pd.read_csv('data/Airline_review.csv')[['Review_Title', 'Review', 'Recommended']]

# Combine 'Review_Title' and 'Review' for feature X, map 'Recommended' to binary for target y
X = df['Review_Title'] + ' ' + df['Review']
y = df['Recommended'].map({'yes': 1, 'no': 0})

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [4]:
# Define the TF-IDF Vectorizer configuration
tf_idf = TfidfVectorizer(
    decode_error='replace',
    strip_accents='unicode',
    ngram_range=(1, 2),
    max_df=0.95,
    min_df=2
)

# Initialize SelectKBest for feature selection
k_best = SelectKBest(k=20000)

# Setup the pipeline for bag-of-words model preparation
bow_pipe = Pipeline([
    ("tf_idf", tf_idf),
    ('feature_selection', k_best)
])

# Preprocess texts for bag-of-words model, including lemmatization
X_train_bow = preprocess_texts(X_train, tokenizer, stop_words=None, lemmatize=True)

# Preprocess texts for sequence models without lemmatization
X_train_seq = preprocess_texts(X_train, tokenizer, stop_words=None, lemmatize=False)

In [5]:
# Initialize Stratified K-Fold for cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define callbacks for model training
CALLBACKS = [
    EarlyStopping(
        monitor='val_loss',
        min_delta=0.001,
        patience=5,
        restore_best_weights=True,
        verbose=0
    )
]

# Initialize a list to store model performance metrics
model_score_list = []

In [6]:
# Dummy Model
dummy_model = DummyClassifier(strategy='uniform', random_state=42)

# Perform cross-validation and evaluate the Dummy model using the bag-of-words pipeline
model_score_list, df= bag_of_words_cv('Dummy', dummy_model, bow_pipe, 
                                      X_train_bow, y_train, skf, model_score_list)

df

Unnamed: 0,Accuracy,AUC
Dummy,0.499832,0.5


In [7]:
# Baseline Model
baseline_model = LogisticRegression(max_iter=1000)

model_score_list, df= bag_of_words_cv('Baseline', baseline_model, bow_pipe, 
                                      X_train_bow, y_train, skf, model_score_list)
df

Unnamed: 0,Accuracy,AUC
Baseline,0.905481,0.96144


In [8]:
# Logistic Regression
log_reg = LogisticRegression(
    C=9.42012179027564, 
    max_iter=100, 
    solver='newton-cg'
)
model_score_list, df= bag_of_words_cv('Logistic_Regression', log_reg, bow_pipe, 
                                      X_train_bow, y_train, skf, model_score_list)
df

Unnamed: 0,Accuracy,AUC
Logistic_Regression,0.916031,0.967635


In [9]:
# Multinomial Naive Bayes
mnb_model = MultinomialNB(
    fit_prior=False,
    class_prior=None,
    alpha=0.01
)
model_score_list, df= bag_of_words_cv('MultinomialNB', mnb_model, bow_pipe, 
                                      X_train_bow, y_train, skf, model_score_list)
df

Unnamed: 0,Accuracy,AUC
MultinomialNB,0.887738,0.948371


In [10]:
# Gradient Boosting Classifier
gbc_model = GradientBoostingClassifier(
    subsample=0.8,
    n_estimators=300,
    min_samples_split=2,
    max_features='sqrt',
    max_depth=6,
    learning_rate=0.1
)
model_score_list, df= bag_of_words_cv('Gradient_Boosting_Classifier', gbc_model, bow_pipe, 
                                      X_train_bow, y_train, skf, model_score_list)
df

Unnamed: 0,Accuracy,AUC
Gradient_Boosting_Classifier,0.90222,0.958531


In [11]:
# Random Forest Classifier
rfc_model = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=None,
    bootstrap=True
)
model_score_list, df= bag_of_words_cv('Random_Forest_Classifier', rfc_model, bow_pipe, 
                                      X_train_bow, y_train, skf, model_score_list)
df

Unnamed: 0,Accuracy,AUC
Random_Forest_Classifier,0.886971,0.949088


In [12]:
# Multi-Layer Perceptions - Cross Validation
tf_idf = TfidfVectorizer(
    decode_error='replace', strip_accents='unicode', stop_words=None,
    ngram_range=(1, 2), max_df=0.95, min_df=2
)
k_best = SelectKBest(k=20000)

metrics_aggregate = {'loss': 0, 'val_loss': 0, 'val_accuracy': 0, 'val_auc': 0}
runs = 0

for train_index, validation_index in skf.split(X, y):
    runs += 1
    X_train_split, X_val_split = X[train_index], X[validation_index]
    y_train_split, y_val_split = y[train_index], y[validation_index]

    # Apply transformations
    X_train_split = k_best.fit_transform(tf_idf.fit_transform(X_train_split), y_train_split)
    X_val_split = k_best.transform(tf_idf.transform(X_val_split))

    # Prepare datasets
    train_ds = prepare_tf_dataset(X_train_split.toarray(), y_train_split, batch_size=256, is_training=True)
    val_ds = prepare_tf_dataset(X_val_split.toarray(), y_val_split, batch_size=256)

    # Model definition
    mlp_model = Sequential([
        Dense(128, activation='relu', input_shape=(20000,)),
        Dropout(0.8),
        Dense(1, activation='sigmoid')
    ])

    mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

    # Model training
    history = mlp_model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=100,
        verbose=0,
        callbacks=CALLBACKS
    )
    
    # Extract and aggregate performance metrics
    metrics = extract_performance_metrics(history, CALLBACKS)
    for key, value in metrics.items():
        metrics_aggregate[key] += value

# Calculate average metrics over all runs
results_dict = {key: value / runs for key, value in metrics_aggregate.items()}
accuracy = results_dict['val_accuracy']
auc = results_dict['val_auc']

# Append results to the model score list and save to file
score_list = ['MLP', accuracy, auc]
model_score_list.append(score_list)
with open('MLP_scores.pkl', 'wb') as file:
    pickle.dump(score_list, file)

# Create and display a DataFrame with the results
df = pd.DataFrame([[accuracy, auc]], columns=['Accuracy', 'AUC'], index=['MLP'])
df







Unnamed: 0,Accuracy,AUC
MLP,0.917915,0.969525


In [13]:
# MLP - Fitting and Saving
tf_idf = TfidfVectorizer(decode_error='replace', strip_accents='unicode', stop_words=None, ngram_range=(1, 2), max_df=0.95, min_df=2)
k_best = SelectKBest(k=20000)

# Assuming X_train_bow and y_train are your features and labels
X = np.array(X_train_bow)
y = np.array(y_train)

# Splitting dataset
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Feature extraction and selection
X_train_split = k_best.fit_transform(tf_idf.fit_transform(X_train_split), y_train_split)
X_val_split = k_best.transform(tf_idf.transform(X_val_split))

# Prepare datasets
train_ds = prepare_tf_dataset(X_train_split.toarray(), y_train_split, batch_size=256, is_training=True)
val_ds = prepare_tf_dataset(X_val_split.toarray(), y_val_split, batch_size=256)

# Model definition
mlp_model = Sequential([
    Dense(128, activation='relu', input_shape=(20000,)),
    Dropout(0.8),
    Dense(1, activation='sigmoid')
])

# Model compilation
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

# Model fitting
mlp_model.fit(train_ds, validation_data=val_ds, epochs=100, verbose=0, callbacks=CALLBACKS)

# Save the model
mlp_model.save('MLP_model.h5')

In [14]:
# RNN Model - Cross Validation
metrics_aggregate = {'loss': 0, 'val_loss': 0, 'val_accuracy': 0, 'val_auc': 0}
X = np.array(X_train_seq)
y = np.array(y_train)

# Configure text vectorization
text_vectorization = TextVectorization(
    standardize=None,
    max_tokens=20000,
    output_mode='int',
    output_sequence_length=200
)

# Run cross-validation
runs = 0
for train_index, validation_index in skf.split(X, y):
    runs += 1
    X_train_split, X_val_split = X[train_index], X[validation_index]
    y_train_split, y_val_split = y[train_index], y[validation_index]

    # Adapt the text vectorization layer and transform inputs
    text_vectorization.adapt(X_train_split)
    X_train_split = text_vectorization(X_train_split)
    X_val_split = text_vectorization(X_val_split)

    # Prepare datasets
    train_ds = prepare_tf_dataset(X_train_split, y_train_split, batch_size=256, is_training=True)
    val_ds = prepare_tf_dataset(X_val_split, y_val_split, batch_size=256)

    # Define the RNN model
    rnn_model = Sequential([
        Embedding(input_dim=20000, output_dim=32, input_length=200),
        Bidirectional(GRU(16)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

    # Train the model
    history = rnn_model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=100,
        verbose=0,
        callbacks=CALLBACKS
    )
    
    # Extract and aggregate performance metrics
    metrics = extract_performance_metrics(history, CALLBACKS)
    for key in metrics_aggregate:
        metrics_aggregate[key] += metrics[key]

# Calculate average metrics over all runs
results_dict = {key: val / runs for key, val in metrics_aggregate.items()}
accuracy = results_dict['val_accuracy']
auc = results_dict['val_auc']

# Append results and save to a file
score_list = ['RNN', accuracy, auc]
model_score_list.append(score_list)
with open('RNN_scores.pkl', 'wb') as file:
    pickle.dump(score_list, file)

# Create and display a DataFrame with the results
df = pd.DataFrame([[accuracy, auc]], columns=['Accuracy', 'AUC'], index=['RNN'])
df

Unnamed: 0,Accuracy,AUC
RNN,0.890759,0.951667


In [15]:
# RNN - Fitting and Saving
text_vectorization = TextVectorization(
    standardize=None,
    max_tokens=20000,
    output_mode='int',
    output_sequence_length=200
)

# Convert text data and labels to numpy arrays for TensorFlow processing
X = np.array(X_train_seq)
y = np.array(y_train)

# Split data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Adapt the text vectorization layer to the training data
text_vectorization.adapt(X_train_split)

# Vectorize the training and validation text data
X_train_split = text_vectorization(X_train_split)
X_val_split = text_vectorization(X_val_split)

# Prepare TensorFlow datasets for training and validation
train_ds = prepare_tf_dataset(X_train_split, y_train_split, batch_size=256, is_training=True)
val_ds = prepare_tf_dataset(X_val_split, y_val_split, batch_size=256)

# Define the RNN model architecture
rnn_model = Sequential([
    Embedding(input_dim=20000, output_dim=32, input_length=200),
    Bidirectional(GRU(16)),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model with optimizer, loss function, and metrics
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

# Fit the model on the training data and validate using the validation data
rnn_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100,
    verbose=0,
    callbacks=CALLBACKS
)

# Save the trained model to an H5 file
rnn_model.save('RNN_model.h5')

In [16]:
# GloVe Model - Cross Validation
text_vectorization = TextVectorization(standardize=None, max_tokens=20000, output_mode='int', output_sequence_length=200)
X = np.array(X_train_seq)
y = np.array(y_train)
text_vectorization.adapt(X)

# Load GloVe embeddings for words in the vocabulary
vocabulary = text_vectorization.get_vocabulary()
vocab_size = len(vocabulary)
embedding_matrix = np.zeros((vocab_size, 300))
glove_path = 'data/glove.6B.300d.txt'

glove_embeddings = {}
with open(glove_path, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word, vector = values[0], np.asarray(values[1:], dtype='float32')
        if word in vocabulary:
            glove_embeddings[word] = vector

for i, word in enumerate(vocabulary):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define metrics aggregation
metrics_aggregate = {'loss': 0, 'val_loss': 0, 'val_accuracy': 0, 'val_auc': 0}
runs = 0

for train_index, validation_index in skf.split(X, y):
    runs += 1
    X_train_split, X_val_split, y_train_split, y_val_split = X[train_index], X[validation_index], y[train_index], y[validation_index]

    # Vectorize text data
    X_train_split = text_vectorization(X_train_split)
    X_val_split = text_vectorization(X_val_split)

    # Prepare datasets
    train_ds = prepare_tf_dataset(X_train_split, y_train_split, 256, is_training=True)
    val_ds = prepare_tf_dataset(X_val_split, y_val_split, 256)

    # Initialize and compile the model
    glv_model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=300, input_length=200, weights=[embedding_matrix], trainable=False),
        Bidirectional(GRU(32)),
        Dropout(0.4),
        Dense(16, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])
    glv_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

    # Train the model
    history = glv_model.fit(train_ds, validation_data=val_ds, epochs=100, verbose=0, callbacks=CALLBACKS)
    
    # Extract and aggregate performance metrics
    metrics = extract_performance_metrics(history, CALLBACKS)
    for key, value in metrics.items():
        metrics_aggregate[key] += value

# Calculate and display average metrics over all runs
results_dict = {key: value / runs for key, value in metrics_aggregate.items()}
accuracy, auc = results_dict['val_accuracy'], results_dict['val_auc']

# Save results
score_list = ['GloVe', accuracy, auc]
model_score_list.append(score_list)
with open('GloVe_scores.pkl', 'wb') as file:
    pickle.dump(score_list, file)

df = pd.DataFrame([[accuracy, auc]], columns=['Accuracy', 'AUC'], index=['GloVe'])
df

Unnamed: 0,Accuracy,AUC
GloVe,0.909557,0.96396


In [17]:
#GloVe model - Fitting and Saving
text_vectorization = TextVectorization(standardize=None,
                                       max_tokens=20000,
                                       output_mode='int',
                                       output_sequence_length=200)

X = np.array(X_train_seq)
y = np.array(y_train)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

text_vectorization.adapt(X_train_split)
X_train_split = text_vectorization(X_train_split)
X_val_split = text_vectorization(X_val_split)

train_ds = prepare_tf_dataset(X_train_split, y_train_split, 256, is_training=True)
val_ds = prepare_tf_dataset(X_val_split, y_val_split, 256)

vocabulary = text_vectorization.get_vocabulary()
vocab_size = len(vocabulary)

# Load GloVe embeddings from file.
glove_embeddings = {}
with open('data/glove.6B.300d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

# Initialize the embedding matrix with zeros.
embedding_matrix = np.zeros((vocab_size, 300))

# Populate the embedding matrix with GloVe vectors.
for i, word in enumerate(vocabulary):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
glv_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=300, input_length=200, weights=[embedding_matrix], trainable=False),
    Bidirectional(GRU(32)),
    Dropout(0.4),
    Dense(16, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')
    ])

glv_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', 'AUC'])

results = glv_model.fit(train_ds,
                    validation_data= val_ds,
                    epochs=100,
                    verbose=0,
                    callbacks=CALLBACKS)

glv_model.save('GloVe_model.h5', save_format='h5')

In [18]:
# Create a DataFrame from the scores list
scores_df = pd.DataFrame(model_score_list, columns=['Model', 'Accuracy', 'AUC'])

# Save the DataFrame to a pickle file for later use
scores_df.to_pickle('model_scores_df.pkl')

# Display the DataFrame
scores_df

Unnamed: 0,Model,Accuracy,AUC
0,Dummy,0.499832,0.5
1,Baseline,0.905481,0.96144
2,Logistic_Regression,0.916031,0.967635
3,MultinomialNB,0.887738,0.948371
4,Gradient_Boosting_Classifier,0.90222,0.958531
5,Random_Forest_Classifier,0.886971,0.949088
6,MLP,0.917915,0.969525
7,RNN,0.890759,0.951667
8,GloVe,0.909557,0.96396
