In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
STOPWORDS = stopwords.words('english')


df = pd.read_csv('../data/Airline_review.csv')[['Review_Title','Review','Recommended']]
reviews = df['Review_Title'] + ' ' + df['Review']
labels = df['Recommended'].map({'yes':1,'no':0})
train_reviews, temp_reviews, train_labels, temp_labels = train_test_split(reviews, labels, test_size=0.2, stratify=labels, random_state=42)
val_reviews, test_reviews, val_labels, test_labels = train_test_split(temp_reviews, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42)

# Concatenating the valildation set as I don't need it here. 90-10 split
X_train = pd.concat([train_reviews, val_reviews])
y_train = pd.concat([train_labels, val_labels])

2024-03-28 20:57:45.796678: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
class TextCleanerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words=None, lemmatize=True):
        self.stop_words = stop_words
        self.lemmatize = lemmatize
        
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        cleaned_reviews = []
        for review in X:
            cleaned_reviews.append(self.clean_text(review, self.stop_words, self.lemmatize))
        return cleaned_reviews
    
    def clean_text(self, review, stop_words, lemmatize):
        tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
        tokens = tokenizer.tokenize(review)
        if stop_words is None:
            tokens = [word.lower() for word in tokens]
        else:
            tokens = [word.lower() for word in tokens if word.lower() not in stop_words]

        if lemmatize:
            pos_tags = pos_tag(tokens)
            wordnet_tags = [(word, self.get_wordnet_pos(tag)) for word, tag in pos_tags]
            lemmatizer = WordNetLemmatizer()
            lemmatized_tokens = [lemmatizer.lemmatize(word, tag) for word, tag in wordnet_tags]
            return ' '.join(lemmatized_tokens)
        else:
            return ' '.join(tokens)
    
    def get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

In [14]:
#For Sequence models later
from tensorflow.keras.layers import TextVectorization
max_features = 20000 # 28593 Unlemmatized, 23171 lemmatized
sequence_length = 500 # more than 98% are less than this anyway

vectorize_layer = TextVectorization(
    standardize=None, # already done by transformation.
    split='whitespace',
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Learning the vocabulary
vectorize_layer.adapt(X_train_clean) 

# Transforming to sequence vectors
X_train_sequence_vec = vectorize_layer(X_train_clean)

I still question whether I shoud force everything into an sklearn pipeline or not. While ellegant, it doesn't feel as compatable with tensorflow as I want it to be, especially considering validation scores. At least during trial and error, I don't think i will convert to sklearn, maybe I will once I decide on a final model. 


In [20]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Instantiating processing transformers
text_cleaner = TextCleanerTransformer(stop_words=STOPWORDS, lemmatize=True)
vectorizer = CountVectorizer(decode_error='replace', strip_accents='unicode', stop_words=None, ngram_range=(1, 2), max_df=0.95, min_df=2)
tf_idf = TfidfTransformer()
k_best = SelectKBest(k=20000)

# Setting parameters globally
SCORING = {'accuracy': 'accuracy', 'roc_auc': 'roc_auc'}
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
# CALLBACKS = [tf.keras.callbacks.EarlyStopping(monitor='loss', 
#                                               patience=2, 
#                                               restore_best_weights=True,
#                                               verbose=1,
                                              # start_from_epoch=5)]

# Clean data before entering the pipeline for efficiency
X_train_clean = text_cleaner.transform(X_train)

def build_mlp_model(input_shape, num_layers, units, initializer=None):
    model = Sequential()
    model.add(layers.InputLayer(input_shape=(input_shape,)))
    for _ in range(num_layers - 1):
        model.add(layers.Dense(units, activation="relu", kernel_initializer=initializer))
        units = units // 2
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC()])
    return model

model_wrapper = KerasClassifier(
    build_fn=build_mlp_model,
    input_shape=20000, 
    epochs=20,
    random_state=42,
    num_layers=1,
    units=64,
    initializer=None,
    verbose=4,
    callbacks=None
)

pipe = Pipeline([
    ("count", vectorizer),
    ('tf_idf', tf_idf),
    ('feature_selection', k_best),
    ('mlp', model_wrapper)
])

params = {
    'mlp__num_layers': [1, 2, 3],  
    'mlp__units': [8, 16, 32, 64], 
    'mlp__initializer': [None, 'he_normal']
}

gs = GridSearchCV(estimator=pipe, 
                  param_grid=params,
                  scoring=SCORING, 
                  n_jobs=-1, 
                  refit='accuracy',
                  cv=2, 
                  verbose=4, 
                  error_score='raise')

# Assuming X_train_clean and y_train are defined
grid_search = gs.fit(X_train_clean, y_train)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


ValueError: Could not interpret metric identifier: loss

In [9]:
import scikeras
print(scikeras.__version__)

0.12.0


In [5]:
text_cleaner = TextCleanerTransformer(stop_words=STOPWORDS, lemmatize=True)

# Clean data before entering the pipeline for efficiency
X_train_clean = text_cleaner.transform(X_train)

In [7]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
STOPWORDS = stopwords.words('english')

def build_mlp_model(input_shape, num_layers, units, initializer=None):
    model = Sequential()
    model.add(layers.InputLayer(input_shape=(input_shape,)))
    for _ in range(num_layers - 1):
        model.add(layers.Dense(units, activation="relu", kernel_initializer=initializer))
        units = units // 2  # Reduce the units by half for each subsequent layer
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# CALLBACKS list for model training
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2, restore_best_weights=True, verbose=1)]

model_wrapper = KerasClassifier(
    build_fn=build_mlp_model,
    input_shape=20000, 
    num_layers=1,
    units=64,
    initializer=None,
    epochs=20,
    random_state=42,
    callbacks=callbacks)

pipe = Pipeline([
    ("vectorizer", CountVectorizer(decode_error='replace', strip_accents='unicode', stop_words=None, ngram_range=(1, 2), max_df=0.95, min_df=2)),
    ('tf_idf', TfidfTransformer()),
    ('feature_selection', SelectKBest(k=20000)),
    ('mlp', model_wrapper)
])

params = {
    'mlp__num_layers': [1, 2, 3],  
    'mlp__units': [8, 16, 32, 64], 
    'mlp__initializer': [None, 'he_normal']
}

gs = GridSearchCV(estimator=pipe, 
                  param_grid=params,
                  scoring='accuracy',
                  n_jobs=-1,
                  cv=StratifiedKFold(n_splits=2, shuffle=True, random_state=42),
                  verbose=4, 
                  error_score='raise')

# The fitting process would be initiated with actual data
grid_search = gs.fit(X_train_clean, y_train)


Fitting 2 folds for each of 24 candidates, totalling 48 fits


2024-03-28 21:00:56.586164: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-28 21:00:56.586391: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-28 21:00:56.586813: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuil

Epoch 1/20
Epoch 1/20
Epoch 1/20
Epoch 1/20
Epoch 1/20
Epoch 1/20
Epoch 1/20
Epoch 1/20
Epoch 2/20
  1/326 [..............................] - ETA: 12s - loss: 0.6016 - accuracy: 0.5938Epoch 2/20
Epoch 2/20
Epoch 2/20
 20/326 [>.............................] - ETA: 0s - loss: 0.5669 - accuracy: 0.6516 Epoch 2/20
Epoch 2/20
Epoch 3/20
Epoch 3/20
Epoch 3/20
Epoch 3/20
Epoch 3/20
Epoch 3/20
Epoch 4/20
Epoch 4/20
Epoch 4/20
Epoch 4/20
Epoch 4/20
  1/326 [..............................] - ETA: 12s - loss: 0.4339 - accuracy: 0.8438Epoch 4/20
Epoch 3/20
Epoch 5/20
Epoch 5/20
Epoch 5/20
Epoch 5/20
Epoch 5/20
Epoch 5/20
 37/326 [==>...........................] - ETA: 0s - loss: 0.3929 - accuracy: 0.8742Epoch 5/20
Epoch 6/20
Epoch 6/20
 14/326 [>.............................] - ETA: 1s - loss: 0.3711 - accuracy: 0.8884 Epoch 6/20
  1/326 [..............................] - ETA: 13s - loss: 0.4625 - accuracy: 0.8438Epoch 6/20
 48/326 [===>..........................] - ETA: 0s - loss: 0.3627 - accur

  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)


[CV 1/2] END mlp__initializer=None, mlp__num_layers=1, mlp__units=64;, score=0.897 total time=  33.9s
Epoch 1/20
[CV 2/2] END mlp__initializer=None, mlp__num_layers=1, mlp__units=64;, score=0.899 total time=  34.0s
Epoch 1/20
[CV 2/2] END mlp__initializer=None, mlp__num_layers=1, mlp__units=32;, score=0.899 total time=  34.0s
Epoch 1/20


  X, y = self._initialize(X, y)


[CV 1/2] END mlp__initializer=None, mlp__num_layers=1, mlp__units=32;, score=0.897 total time=  34.2s
Epoch 1/20


  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)


[CV 2/2] END mlp__initializer=None, mlp__num_layers=1, mlp__units=8;, score=0.899 total time=  34.8s
Epoch 1/20
[CV 1/2] END mlp__initializer=None, mlp__num_layers=1, mlp__units=16;, score=0.897 total time=  34.8s
Epoch 1/20
[CV 1/2] END mlp__initializer=None, mlp__num_layers=1, mlp__units=8;, score=0.897 total time=  34.9s
Epoch 1/20
Epoch 2/20


  X, y = self._initialize(X, y)


  1/326 [..............................] - ETA: 14:40 - loss: 0.6936 - accuracy: 0.3438Epoch 2/20
 54/326 [===>..........................] - ETA: 2s - loss: 0.2482 - accuracy: 0.9271Epoch 2/20
Epoch 1/20
 75/326 [=====>........................] - ETA: 2s - loss: 0.2136 - accuracy: 0.9296Epoch 3/20
Epoch 4/20
 16/326 [>.............................] - ETA: 6s - loss: 0.1580 - accuracy: 0.9609Epoch 3/20
Epoch 3/20
 53/326 [===>..........................] - ETA: 3s - loss: 0.1216 - accuracy: 0.9617Epoch 4/20
  4/326 [..............................] - ETA: 6s - loss: 0.1119 - accuracy: 0.9844 Epoch 5/20
Epoch 5/20
 69/326 [=====>........................] - ETA: 4s - loss: 0.0530 - accuracy: 0.9891Epoch 6/20.] - ETA: 7s - loss: 0.1732 - accuracy: 0.94
Epoch 7/20
Epoch 6/20
Epoch 6/20
 64/326 [====>.........................] - ETA: 6s - loss: 0.0436 - accuracy: 0.9927Epoch 4/20
 43/326 [==>...........................] - ETA: 7s - loss: 0.0428 - accuracy: 0.9949Epoch 3/20
Epoch 8/20
  1/326 [

KeyboardInterrupt: 