In [17]:
import os
import re

import emoji
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# tf and keras
import tensorflow as tf
from tensorflow.keras import Sequential, layers, losses
from tensorflow.keras.layers import (
    Dense,
    Embedding,
    GlobalAveragePooling1D,
    Dropout,
    TextVectorization,
    Input,
    Conv1D,
    LSTM,
    MaxPooling1D,
    Bidirectional,
)
from tensorflow.keras.models import Model
# import tensorflow_datasets as tfds

pd.set_option('display.max_colwidth', 100) 

In [18]:
def read_files():
    X_train = pd.read_csv('./data/final/X_train.csv')
    y_train = pd.read_csv('./data/final/y_train.csv')
    X_val = pd.read_csv('./data/final/X_val.csv')
    y_val = pd.read_csv('./data/final/y_val.csv')
    
    train_not_na_indices = (X_train['selftext'].notna())
    val_not_na_indices = (X_val['selftext'].notna())
    
    X_train = X_train[train_not_na_indices]
    X_val = X_val[val_not_na_indices]

    y_train = y_train[train_not_na_indices]
    y_val = y_val[val_not_na_indices]

    return X_train, y_train, X_val, y_val

In [3]:
# # try and except the TF tokenizer
# try:
#     tokenizer = tfds.features.text.Tokenizer()
# except AttributeError:
#     tokenizer = tfds.deprecated.text.Tokenizer()

# # create an instance of the Counter class
# token_counts = Counter()

# for example in data_tf_train:
#     tokens = tokenizer.tokenize(example[0].numpy()[0])
#     token_counts.update(tokens)

# print('Size of training vocabulary:', len(token_counts))

# UTILS

# Model 1: Tokenizer and pad_sequences

In [5]:
X_train, y_train, X_val, y_val = read_files()

In [6]:
X_train = X_train[X_train['selftext'].notna()]

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

def get_vectorization_layer(df, column, max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_tokens,
        output_mode='int',
        output_sequence_length=output_sequence_length)

    df[column] = df[column].astype(str)
    vectorize_layer.adapt(df[column].values)

    return vectorize_layer

vectorize_layer = get_vectorization_layer(X_train, 'selftext')

def vectorize_text(text):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text)



X_train_v = X_train['selftext'].apply(vectorize_text)

# Tokenizer setup
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(X_train['selftext'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(X_train['selftext'])
# Pad sequences to ensure uniform length
max_sequence_length = 250
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

vectorized_text = vectorize_layer(text_data)

NameError: name 'text_data' is not defined

In [None]:
embedding_dim=16

In [None]:
model1 = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    layers.LSTM(128, return_sequences=True),
    Dropout(0.5),
    layers.LSTM(64),
    Dropout(0.5),
    Dense(1) # regression doesn't use an activation function
])


# Compile the model
model1.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
model1.summary()

# Train the model
model1.fit(vectorized_text, y_train, epochs=5, batch_size=2, validation_data=(X_val, y_val), verbose=1)

# Model 2: TextVectorization Layer, Basic Embedding Model, Two Hidden Dense Layers (64, 32)

In [19]:
X_train, y_train, X_val, y_val = read_files()

In [20]:
def get_vectorization_layer(df, column, max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_tokens,
        output_mode='int',
        output_sequence_length=output_sequence_length)

    df[column] = df[column].astype(str)
    vectorize_layer.adapt(df[column].values)

    return vectorize_layer

def get_embedding_layer(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    return Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)

def build_model_2(output_sequence_length=250):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = embedding_layer(inputs)

    x = GlobalAveragePooling1D()(x)

    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    
    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [21]:
text_data = tf.constant(X_train['selftext'].values)
vectorize_layer = get_vectorization_layer(X_train, 'selftext')
vectorized_text = vectorize_layer(text_data)
embedding_layer = get_embedding_layer()
embedded_text = embedding_layer(vectorized_text)

In [22]:
model_2 = build_model_2()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_2 = model_2.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping])
pd.DataFrame(history_2.history)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Unnamed: 0,loss,mean_absolute_error,accuracy
0,841420.0625,125.875099,0.130496
1,821035.6875,120.802795,0.130496
2,792405.625,118.600639,0.130496
3,754908.5625,115.370438,0.130496
4,705723.0,111.921883,0.130496


In [23]:
def build_full_model_2(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)

    x = GlobalAveragePooling1D()(x)

    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    
    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [24]:
text_data = tf.constant(X_train['selftext'].values)
vectorize_layer = get_vectorization_layer(X_train, 'selftext')
vectorized_text = vectorize_layer(text_data)

model_2_full = build_full_model_2()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_2 = model_2_full.fit(vectorized_text, y_train, epochs=5, batch_size=2,verbose=1, callbacks=[early_stopping])
pd.DataFrame(history_2.history)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Unnamed: 0,loss,mean_absolute_error,accuracy
0,841941.1875,125.811234,0.130261
1,823003.3125,121.104996,0.130496
2,796639.5,119.501534,0.130496
3,763079.25,116.235962,0.130496
4,718906.75,113.606369,0.130496


# Model 3: TextVectorization Layer, Convolutional NN Embedding Model, Two Hidden Dense Layers (64, 32)

In [25]:
X_train, y_train, X_val, y_val = read_files()

In [26]:
def build_model_3(output_sequence_length=250):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))

    x = embedding_layer(inputs)
    
    x = layers.Conv1D(32, 4, activation='relu')(x)
    x = layers.MaxPooling1D()(x)
    
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [27]:
text_data = tf.constant(X_train['selftext'].values)
vectorize_layer = get_vectorization_layer(X_train, 'selftext')
vectorized_text = vectorize_layer(text_data)
embedding_layer = get_embedding_layer()
embedded_text = embedding_layer(vectorized_text)

In [29]:
model_3 = build_model_3()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_3 = model_3.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping])
pd.DataFrame(history_3.history)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Unnamed: 0,loss,mean_absolute_error,accuracy
0,843427.9375,128.294571,0.13038
1,841724.6875,132.104477,0.130496
2,840341.875,132.194839,0.130496
3,839900.0,133.079559,0.130496
4,839023.1875,133.237381,0.130496


In [32]:
pd.DataFrame(history_3.history)

Unnamed: 0,loss,mean_absolute_error,accuracy
0,843427.9375,128.294571,0.13038
1,841724.6875,132.104477,0.130496
2,840341.875,132.194839,0.130496
3,839900.0,133.079559,0.130496
4,839023.1875,133.237381,0.130496


# Model 4: TextVectorization Layer, LSTM RNN Embedding Model, Two Hidden Dense Layers (64, 32)

In [33]:
X_train, y_train, X_val, y_val = read_files()

In [37]:
def build_model_4(output_sequence_length=250):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))

    x = embedding_layer(inputs)
    
    x = layers.LSTM(32, activation='relu')(x)
    # x = layers.MaxPooling1D()(x)
    
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [38]:
text_data = tf.constant(X_train['selftext'].values)
vectorize_layer = get_vectorization_layer(X_train, 'selftext')
vectorized_text = vectorize_layer(text_data)
embedding_layer = get_embedding_layer()
embedded_text = embedding_layer(vectorized_text)

In [39]:
model_4 = build_model_4()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_4 = model_4.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping])
pd.DataFrame(history_4.history)

Epoch 1/5

KeyboardInterrupt: 

# Model 5: TextVectorization Layer, Bi-Directional LSTM RNN Embedding Model, Two Hidden Dense Layers (64, 32)

In [40]:
X_train, y_train, X_val, y_val = read_files()

In [41]:
def build_model_5(output_sequence_length=250):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))

    x = embedding_layer(inputs)
    
    x = Bidirectional(LSTM(32, activation='relu'))(x)
    # x = layers.MaxPooling1D()(x)
    
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [42]:
text_data = tf.constant(X_train['selftext'].values)
vectorize_layer = get_vectorization_layer(X_train, 'selftext')
vectorized_text = vectorize_layer(text_data)
embedding_layer = get_embedding_layer()
embedded_text = embedding_layer(vectorized_text)

In [43]:
model_5 = build_model_5()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_5 = model_5.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping])
pd.DataFrame(history_5.history)

Epoch 1/5
   97/12771 [..............................] - ETA: 55:12 - loss: nan - mean_absolute_error: nan - accuracy: 0.1701                   

KeyboardInterrupt: 

# Using TextVectorization Layer 2 

In [None]:
vocab_size = len(vectorize_layer.get_vocabulary())

# Input shape:  (batch_size, input_length)
# Output shape: (batch_size, input_length, output_dim)
embedding_layer = tf.keras.layers.Embedding(
    input_dim = vocab_size,  # size of feature vocabulary
    output_dim = 2,   # embedding dimension
    input_length = max_sequence_length  # number of inputs
    )

first_review_embed_rep = embedding_layer(X_train_v[0])

In [None]:
def build_model(vectorize_layer):
    vocab_size = len(vectorize_layer.get_vocabulary())

    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    model = tf.keras.Sequential()
    model.add(vectorize_layer)
    model.add(tf.keras.layers.Embedding(
        input_dim = vocab_size,  # size of feature vocabulary
        output_dim = 2,  # embedding dimension
        input_length = max_sequence_length  # number of inputs
        ))

    # Average over the sequence dimension, so each review is represented by
    # 1 vector of size embedding_dimension
    model.add(tf.keras.layers.GlobalAveragePooling1D())

    # Alternatively, we could concatenate the embedding representations of
    # all tokens in the movie review
    #model.add(tf.keras.layers.Flatten())

    model.add(tf.keras.layers.Dense(
      units=8,
      activation='relu'))

    model.add(tf.keras.layers.Dense(
      units=1,
      activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

    return model

In [None]:
def get_vectorization_layer(df, column, max_features=10000, sequence_length=250, embedding_dim=16):
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_features,
        output_mode='int',
        output_sequence_length=sequence_length)

    df[column] = df[column].astype(str)
    vectorize_layer.adapt(df[column])

    return vectorize_layer

vectorize_layer = get_vectorization_layer(X_train, 'selftext')

model = build_model(vectorize_layer)

# Display the model layers.
display(model.layers)
display(model.summary())

# Retrieve the embeddings layer, which itself is wrapped in a list.
embeddings = model.layers[1].get_weights()[0]
print('|'*100)
display("Embeddings layer - shape: ", embeddings.shape)
print('|'*100)
display("Embeddings layer - parameter matrix (before training): ", embeddings)

# End of file

Sources:
* https://stackoverflow.com/questions/73878049/how-do-you-convert-the-pandas-dataframe-to-tensorflow-python-data-ops-dataset-op