In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline


# Load only the two text columns and the recommendation flag.
df = pd.read_csv('data/Airline_review.csv')[
    ['Review_Title', 'Review', 'Recommended']
]

# The model input is the title and the body joined by a single space;
# the target is the yes/no flag encoded as 1/0.
reviews = df['Review_Title'] + ' ' + df['Review']
labels = df['Recommended'].map({'yes': 1, 'no': 0})

# Stratified split: hold out 20%, then cut that hold-out in half to get
# validation and test parts (80/10/10 overall).
train_reviews, temp_reviews, train_labels, temp_labels = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
val_reviews, test_reviews, val_labels, test_labels = train_test_split(
    temp_reviews,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

# The validation set is not needed here, so it is folded back into the
# training data, giving a 90/10 train/test split.
X_train = pd.concat([train_reviews, val_reviews])
y_train = pd.concat([train_labels, val_labels])

In [24]:
class TextCleanerTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw review text.

    Each document is tokenized with a regular expression, lower-cased,
    optionally filtered against a stop-word collection and optionally
    lemmatized. The cleaned tokens are joined back into one
    space-separated string per document.

    Parameters
    ----------
    stop_words : collection of str or None, default=None
        Lower-case tokens to drop. ``None`` keeps every token.
    lemmatize : bool, default=True
        When true, tokens are POS-tagged with ``nltk.pos_tag`` and
        lemmatized with ``WordNetLemmatizer``.
    """

    # Runs of ASCII letters, optionally followed by a curly apostrophe and
    # lower-case letters. Built once and shared by every call instead of being
    # re-created for each review.
    _tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")

    def __init__(self, stop_words=None, lemmatize=True):
        # Parameters are stored exactly as given so that get_params / clone
        # from BaseEstimator keep working.
        self.stop_words = stop_words
        self.lemmatize = lemmatize

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X, y=None):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.
        y : ignored

        Returns
        -------
        list of str
            One cleaned string per input document, in input order.
        """
        stop_words = self.stop_words
        # Membership is tested once per token, so a list/tuple of stop words
        # is converted to a hashed collection for constant-time lookups.
        if isinstance(stop_words, (list, tuple)):
            stop_words = frozenset(stop_words)
        return [self.clean_text(review, stop_words, self.lemmatize) for review in X]

    def clean_text(self, review, stop_words, lemmatize):
        """Clean a single document.

        Parameters
        ----------
        review : str
            Raw text of one document.
        stop_words : collection of str or None
            Lower-case tokens to remove; ``None`` disables filtering.
        lemmatize : bool
            Whether to lemmatize the remaining tokens.

        Returns
        -------
        str
            The processed tokens joined by single spaces.
        """
        tokens = [token.lower() for token in self._tokenizer.tokenize(review)]
        if stop_words is not None:
            tokens = [token for token in tokens if token not in stop_words]

        if not lemmatize:
            return ' '.join(tokens)

        # The lemmatizer needs a WordNet part of speech, so tag first and
        # translate each tag before lemmatizing.
        lemmatizer = WordNetLemmatizer()
        return ' '.join(
            lemmatizer.lemmatize(word, self.get_wordnet_pos(tag))
            for word, tag in pos_tag(tokens)
        )

    def get_wordnet_pos(self, treebank_tag):
        """Map a POS tag to a WordNet part-of-speech constant by its prefix.

        Tags starting with ``J``, ``V``, ``N`` and ``R`` map to adjective,
        verb, noun and adverb respectively; anything else falls back to noun.
        """
        # The WordNet constants are looked up here rather than at class level
        # so that nothing is read from the corpus when lemmatize is False.
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        # 'N' tags and every unrecognised tag are treated as nouns.
        return wordnet.NOUN

In [None]:
# Instantiating the transformer; stop words are kept and lemmatization is skipped to preserve word order and context
text_cleaner = TextCleanerTransformer(stop_words=None, lemmatize=False)

# The search cells below fit on X_train_clean, which no other visible cell
# assigns; build it here so the notebook runs top to bottom on a fresh kernel.
X_train_clean = text_cleaner.fit_transform(X_train)


In [25]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier
# Instantiating processing transformers

# Unigram + bigram counts; terms found in more than 95% of the documents or in
# fewer than 2 documents are dropped. Both objects are needed by the pipeline
# assembled in the following cell.
vectorizer = CountVectorizer(decode_error='replace', strip_accents='unicode', stop_words=None, ngram_range=(1, 2), max_df=0.95, min_df=2)
tf_idf = TfidfTransformer()
# Keep the 10,000 best-scoring features; the network's input width must match this k.
k_best = SelectKBest(k=10000)

# Setting CV parameters
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
SCORING = {'accuracy':'accuracy',
           'roc_auc':'roc_auc'}

# Early stopping watches the training loss (monitor='loss', not a validation
# metric): monitoring starts at epoch 5, training stops after 5 epochs without
# improvement, and the best weights seen are restored.
CALLBACKS = [tf.keras.callbacks.EarlyStopping(monitor='loss', 
                                              patience=5, 
                                              restore_best_weights=True,
                                              verbose=1,
                                              start_from_epoch=5)]


def build_mlp_model(input_shape, num_layers, units, kernel_initializer=None, initializer=None):
    """Build and compile a feed-forward binary classifier.

    Parameters
    ----------
    input_shape : int
        Number of input features per sample.
    num_layers : int
        Total number of Dense layers including the sigmoid output layer, so
        ``num_layers - 1`` hidden ReLU layers are created.
    units : int
        Width of the first hidden layer; each following hidden layer is half
        as wide (integer division).
    kernel_initializer : str or None, default=None
        Weight initializer for the hidden layers. ``None`` selects
        ``'glorot_uniform'``, the Keras ``Dense`` default.
    initializer : str or None, default=None
        Alternative name for ``kernel_initializer``, accepted because other
        cells of this notebook pass the hyperparameter under this name.
        Takes precedence when it is not ``None``.

    Returns
    -------
    A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
    loss, accuracy and AUC metrics).
    """
    hidden_initializer = initializer if initializer is not None else kernel_initializer
    if hidden_initializer is None:
        hidden_initializer = 'glorot_uniform'

    model = Sequential()
    model.add(layers.InputLayer(input_shape=(input_shape,)))
    for _ in range(num_layers - 1):
        model.add(layers.Dense(units, activation="relu", kernel_initializer=hidden_initializer))
        units = units // 2  # halve the width for the next hidden layer
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC()])
    return model


I still question whether I should force everything into an sklearn pipeline or not. While elegant, it doesn't feel as compatible with TensorFlow as I want it to be, especially considering validation scores. At least during trial and error, I don't think I will convert to sklearn; maybe I will once I decide on a final model.


In [None]:

# Wrap the Keras builder as a scikit-learn classifier. build_mlp_model has no
# defaults for num_layers and units, so they are supplied here (same values as
# the wrappers in the later cells); input_shape must equal SelectKBest's k.
model = KerasClassifier(model=build_mlp_model, epochs=100, verbose=1,
                        callbacks=CALLBACKS,
                        input_shape=10000,
                        num_layers=1,
                        units=64)

# counts -> tf-idf weighting -> feature selection -> MLP
pipe = Pipeline([
    ("count", vectorizer),
    ('tf_idf',tf_idf),
    ('feature_selection', k_best),
    ('simple_mlp', model)
])

In [None]:
# Candidate hyperparameter values for the MLP search.
# NOTE(review): these lines are bare annotations, not assignments, so they bind
# no variables; they read as notes for the search grid — confirm intent.
num_layers: [1,2,3]
units: [64, 32, 16, 8]
initializer: [None, 'he_normal']

In [None]:
# Randomized hyperparameter search over `pipe`, scored with SCORING and refit
# on ROC AUC.
# NOTE(review): every key below targets a pipeline step named 'mnb', but the
# `pipe` objects built in the visible cells only contain the steps
# count / tf_idf / feature_selection / simple_mlp (or mlp). Presumably this
# cell was carried over from a Multinomial Naive Bayes experiment — confirm,
# or retarget the keys before running.
params = {
    'mnb__alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0],
    'mnb__fit_prior': [True, False],
    'mnb__class_prior': [None,[0.66,0.34]]
}

# NOTE(review): RandomizedSearchCV is first imported in a later cell, and
# RandCV_scores is not defined in any visible cell.
rs = RandomizedSearchCV(estimator= pipe, 
                        param_distributions= params,
                        n_iter=100, 
                        cv=5,
                        verbose=1,
                        n_jobs=-1,
                        scoring=SCORING,
                        refit= 'roc_auc',
                        random_state=42)
rand_search = rs.fit(X_train_clean, y_train)
RandCV_scores('mnb',rand_search)

In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np

# Feature selection: keep the 10,000 best-scoring features.
k_best = SelectKBest(k=10000)

# Metrics reported by the hyperparameter search.
SCORING = dict(accuracy='accuracy', roc_auc='roc_auc')

# Early stopping on the training loss: monitoring starts at epoch 5, training
# stops after 5 epochs without improvement, and the best weights are restored.
CALLBACKS = [
    tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        start_from_epoch=5,
        patience=5,
        restore_best_weights=True,
        verbose=1,
    )
]

def build_mlp_model(input_shape, num_layers, units, kernel_initializer=None, initializer=None):
    """Build and compile a feed-forward binary classifier.

    Parameters
    ----------
    input_shape : int
        Number of input features per sample.
    num_layers : int
        Total number of Dense layers including the sigmoid output layer, so
        ``num_layers - 1`` hidden ReLU layers are created.
    units : int
        Width of the first hidden layer; each following hidden layer is half
        as wide (integer division).
    kernel_initializer : str or None, default=None
        Weight initializer for the hidden layers. ``None`` selects
        ``'glorot_uniform'``, the Keras ``Dense`` default.
    initializer : str or None, default=None
        Alternative name for ``kernel_initializer``. The classifier wrapper
        and the search grid in this cell pass the hyperparameter as
        ``initializer``, so it is accepted here and takes precedence when it
        is not ``None``.

    Returns
    -------
    A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
    loss, accuracy and AUC metrics).
    """
    hidden_initializer = initializer if initializer is not None else kernel_initializer
    if hidden_initializer is None:
        hidden_initializer = 'glorot_uniform'

    model = Sequential()
    model.add(layers.InputLayer(input_shape=(input_shape,)))
    for _ in range(num_layers - 1):
        model.add(layers.Dense(units, activation="relu", kernel_initializer=hidden_initializer))
        units = units // 2  # halve the width for the next hidden layer
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC()])
    return model

# Wrap the Keras builder as a scikit-learn classifier. The keyword names match
# build_mlp_model's parameters; input_shape must equal the k of SelectKBest,
# the step that feeds the network.
model_wrapper = KerasClassifier(
    model=build_mlp_model,
    input_shape=10000,
    num_layers=1,
    units=64,
    initializer='he_uniform',
    epochs=100,
    verbose=1
)

# counts -> tf-idf weighting -> feature selection -> MLP
pipe = Pipeline([
    ("count", vectorizer),
    ('tf_idf',tf_idf),
    ('feature_selection', k_best),
    ('mlp', model_wrapper)
])

# Grid over the 'mlp' step: 3 x 4 x 2 = 24 candidate configurations.
params = {
    'mlp__num_layers': [1, 2, 3],
    'mlp__units': [8, 16, 32, 64],
    'mlp__initializer': [None, 'he_normal']
}
from sklearn.model_selection import GridSearchCV

# SCORING holds several metrics, so refit has to name the one used to pick
# (and refit) the best configuration.
gs = GridSearchCV(estimator=pipe,
                  param_grid=params,
                  scoring=SCORING,
                  n_jobs=None,
                  refit='accuracy',
                  cv=None,
                  verbose=0,
                  pre_dispatch='2*n_jobs',
                  error_score=np.nan,
                  return_train_score=False)

grid_search = gs.fit(X_train_clean, y_train)

In [51]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Instantiating processing transformers
# Unigram + bigram counts; terms found in more than 95% of the documents or in
# fewer than 2 documents are dropped.
vectorizer = CountVectorizer(
    decode_error='replace',
    strip_accents='unicode',
    stop_words=None,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
)
tf_idf = TfidfTransformer()
# Keep the 10,000 best-scoring features.
k_best = SelectKBest(k=10000)
# Metrics reported by the grid search.
SCORING = dict(accuracy='accuracy', roc_auc='roc_auc')

def build_mlp_model(input_shape, num_layers, units, initializer):
    """Assemble and compile a feed-forward binary classifier.

    Parameters
    ----------
    input_shape : int
        Number of input features per sample.
    num_layers : int
        Total number of Dense layers including the sigmoid output layer;
        ``num_layers - 1`` hidden ReLU layers are stacked before it.
    units : int
        Width of the first hidden layer. Every following hidden layer is half
        as wide as the previous one (integer division).
    initializer
        Passed to each hidden layer as ``kernel_initializer``.

    Returns
    -------
    The compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
    loss, accuracy and AUC metrics).
    """
    network = Sequential()
    network.add(layers.InputLayer(input_shape=(input_shape,)))

    # Hidden stack: widths shrink by half from one layer to the next.
    width = units
    hidden_layer_count = num_layers - 1
    for _ in range(hidden_layer_count):
        hidden = layers.Dense(width, activation="relu", kernel_initializer=initializer)
        network.add(hidden)
        width = width // 2

    # Single sigmoid unit for the binary target.
    network.add(layers.Dense(1, activation='sigmoid'))
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', tf.keras.metrics.AUC()],
    )
    return network

# Wrap the Keras builder so it can be used as the last step of a
# scikit-learn pipeline.
model_wrapper = KerasClassifier(
    model=build_mlp_model,
    # Keywords named after build_mlp_model's parameters
    input_shape=10000,
    num_layers=1,
    units=64,
    initializer='he_uniform',
    # Training settings
    epochs=20,
    verbose=1,
    random_state=42,
)

# counts -> tf-idf weighting -> feature selection -> MLP
pipe = Pipeline(steps=[
    ("count", vectorizer),
    ('tf_idf', tf_idf),
    ('feature_selection', k_best),
    ('mlp', model_wrapper),
])

# Grid over the 'mlp' step: 3 x 4 x 2 = 24 candidate configurations.
params = {
    'mlp__num_layers': [1, 2, 3],
    'mlp__units': [8, 16, 32, 64],
    'mlp__initializer': ['he_uniform', 'he_normal'],
}

# Both metrics in SCORING are recorded; the best configuration is chosen and
# refit by accuracy. Failing fits raise instead of being scored.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring=SCORING,
    refit='accuracy',
    cv=None,
    n_jobs=-1,
    verbose=0,
    error_score='raise',
    return_train_score=False,
)

# Assuming X_train_clean and y_train are defined
grid_search = gs.fit(X_train_clean, y_train)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

KeyboardInterrupt: 