In [1]:
import numpy as np
import pandas as pd
import jsonlines

import tensorflow as tf
tf.config.run_functions_eagerly(True)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Flatten, Concatenate, ZeroPadding1D, Layer
from tensorflow.keras.losses import binary_crossentropy, categorical_crossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
from keras import ops
from tensorflow import keras
from tensorflow.keras.saving import register_keras_serializable
import tensorflow_probability as tfp

from sklearn.preprocessing import label_binarize

import nltk
nltk.download("stopwords")
import string
from nltk.corpus import stopwords

2024-05-07 15:23:37.270507: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-07 15:23:37.470735: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-07 15:23:38.461727: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cpsc477_sh2482/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocess Data

In [2]:
file = "data/aspect-level-certainty.jsonl"

annotate_science = []

with jsonlines.open(file) as f:
    for line in f.iter():
        annotate_science.append(line)

In [3]:
finance = pd.read_csv("data/10K_sentiment_list.csv")
uncertain_finance = finance[finance["Uncertainty"] != 0]["Word"].values

In [4]:
# Restructure annotated data from scientific set
label_codes = {"NotPresent": 0, "Certain": 1, "Uncertain": 2}
dims = {"Number": None, "Extent": None, "Probability": None, "Condition": None, "Suggestion": None, "Framing": None}
real_text = [x["finding"] for x in annotate_science]
real_labels = [[label_codes[x["aspect-level-certainty"][dim]] for dim in dims] for x in annotate_science]

In [5]:
latent_dim = 100
vocab_size = 10000
embedding_dim = 128
num_label_dims = 6
label_dim = 3
label_values = [0,1,2]
max_len = 64
start_token = "<START>"
stop_token = "<STOP>"

In [6]:
def preprocess(text):
    """
    Preprocesses text string by removing capitalization, punctuation, and stop words.

    Args:
        text: The text string to be preprocessed.

    Returns:
        The preprocessed text string.
    """

    text = text.lower()
    text = "".join(c for c in text if c not in string.punctuation)
    stop_words = stopwords.words("english")
    text = " ".join([word for word in text.split() if word not in stop_words])
    text = start_token + " " + text + " " + stop_token
    return text

In [7]:
def tokenize(text_main, text_finance, max_len):
    """
    Tokenizes preprocessed text string.

    Args:
        text: The preprocessed text string.
        max_len: The maximum length for tokenized sequences

    Returns:
        A list of integers representing the tokenized text sequence.
    """
    all_text = text_main + text_finance
    
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(all_text)
    vocab = tokenizer.word_index
    
    main_sequences = [tokenizer.texts_to_sequences([line])[0] for line in text_main] 
    for i in range(len(main_sequences)):
        if len(main_sequences[i]) > max_len:
            main_sequences[i] = main_sequences[i][:max_len]
        else:
            main_sequences[i] = tf.keras.utils.pad_sequences([main_sequences[i]], max_len, padding="post")[0]

    finance_tokens = [tokenizer.texts_to_sequences([word])[0] for word in text_finance]
    
    return main_sequences, finance_tokens, vocab

## Define Model

In [8]:
domain_weight = 0.3
@keras.saving.register_keras_serializable()
def generator_loss(y_true, y_pred):
    """
    Calculates a custom loss function incorporating uncertainty lexicon.

    Args:
        y_true: The ground truth labels.
        y_pred: The predicted labels.

    Returns:
        The combined loss value.
    """
    text_loss = categorical_crossentropy(y_true, y_pred)

    # Extract uncertainty words embeddings from the embedding layer
    uncertainty_word_embeddings = tf.nn.embedding_lookup(generator.get_layer('embedding').weights[0], finance_token)

    # Compute similarity between predicted embeddings and uncertainty word embeddings
    predicted_word_embeddings = tf.nn.embedding_lookup(generator.get_layer('embedding').weights[0], tf.argmax(y_pred, axis=-1))
    similarity = tf.matmul(predicted_word_embeddings, uncertainty_word_embeddings, transpose_b=True)

    # Calculate uncertainty loss based on similarity
    uncertainty_loss = tf.reduce_mean(tf.reduce_max(similarity, axis=-1))

    # Adjust the weights of the text loss and uncertainty loss
    combined_loss = text_loss + uncertainty_loss * domain_weight

    return combined_loss


In [9]:
@keras.saving.register_keras_serializable()
def discriminator_loss(real_output, fake_output):
    """
    Calculates loss function to distinguish between real and fake text.

    Args:
        real_output: The real text.
        fake_output: The generated tex.

    Returns:
        The loss value.
    """
    real_loss = binary_crossentropy(tf.ones_like(real_output), real_output)
    fake_loss = binary_crossentropy(tf.zeros_like(fake_output), fake_output)
    label_loss = binary_crossentropy(real_output, fake_output)
    return real_loss + fake_loss + label_loss

In [2]:
@keras.saving.register_keras_serializable()
class Generator(keras.Model):
    def __init__(self, vocab_size, embedding_dim, latent_dim, label_dim, **kwargs):
        super(Generator, self).__init__(**kwargs)
        self.embedding = Embedding(vocab_size, embedding_dim, name="embedding")
        self.lstm = LSTM(256, return_sequences=True)
        self.dense = Dense(vocab_size, activation="softmax")
        self.label_embedding = Dense(embedding_dim)
    
    def call(self, inputs, training=None):
        noise, label = inputs
        noise = tf.reshape(noise, (-1, latent_dim)) 
        label_embedding = self.label_embedding(label)
        embedded_noise = self.embedding(noise)
        label_embedding_repeated = tf.tile(tf.expand_dims(label_embedding, 1), [1, tf.shape(embedded_noise)[1], 1])
        combined_input = keras.layers.concatenate([embedded_noise, label_embedding_repeated])
        output = self.lstm(combined_input)
        output = self.dense(output)
        return output

NameError: name 'keras' is not defined

In [15]:
@keras.saving.register_keras_serializable()
class Discriminator(keras.Model):
  def __init__(self, vocab_size, embedding_dim, label_dim, **kwargs):
    super(Discriminator, self).__init__(**kwargs)
    self.embedding = Embedding(vocab_size, embedding_dim)
    self.lstm = LSTM(256)
    self.label_embedding = Dense(embedding_dim)
    self.dense = Dense(1, activation="sigmoid")

  def call(self, inputs, training=None):
    text, label = inputs
    label_embedding = self.label_embedding(label)
    label_embedding_reshaped = tf.expand_dims(label_embedding, axis=1) 
    label_embedding_reshaped = tf.tile(label_embedding_reshaped, [1, max_len, 1]) 
    embedded_text = self.embedding(text)
    combined_input = keras.layers.concatenate([embedded_text, label_embedding_reshaped])
    output = self.lstm(combined_input)
    output = self.dense(output)
    return output


In [16]:
@keras.saving.register_keras_serializable()
def train_step(real_data, labels, noise):
    with tf.GradientTape() as disc_tape, tf.GradientTape() as gen_tape:
        # Generate fake data
        fake_data_logits = generator([noise, labels], training=True)

        # Generate sequences from logits
        fake_data_sequences = tf.TensorArray(tf.int32, size=fake_data_logits.shape[0], dynamic_size=False)
        for i in tf.range(fake_data_logits.shape[0]):
            generated_sequence = []
            for _ in range(max_len):
                logits = fake_data_logits[i:i+1, _, :]
                generated_token = tf.random.categorical(logits, num_samples=1)
                generated_sequence.append(generated_token[0, 0].numpy())
            fake_data_sequences = fake_data_sequences.write(i, generated_sequence)
        fake_data_sequences = fake_data_sequences.stack()

        # Train Discriminator
        real_output = discriminator([real_data, labels], training=True)
        fake_output = discriminator([fake_data_sequences, labels], training=True)
        disc_loss = discriminator_loss(tf.ones_like(real_output), real_output) + discriminator_loss(tf.zeros_like(fake_output), fake_output)

        # Train Generator
        gen_output = discriminator([fake_data_sequences, labels], training=True)
        gen_loss = generator_loss(tf.ones_like(gen_output), gen_output)

    # Compute gradients and apply optimizer updates for Discriminator
    gradients_of_disc = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    discriminator_optimizer.apply_gradients(zip(gradients_of_disc, discriminator.trainable_variables))

    # Compute gradients and apply optimizer updates for Generator
    gradients_of_gen = gen_tape.gradient(gen_loss, generator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_gen, generator.trainable_variables))

    return disc_loss, gen_loss


## Build and Train Model

In [17]:
real_text_process = [preprocess(x) for x in real_text]
finance_text_process = [preprocess(x) for x in uncertain_finance]

real_text_token, finance_token, vocab = tokenize(real_text_process, finance_text_process, max_len)
vocab_search = {value: key for key, value in vocab.items()}

In [18]:
one_hot_labels = [label_binarize(x, classes=[0,1,2]) for x in real_labels]

dims["Number"] = [x[0] for x in one_hot_labels]
dims["Extent"] = [x[1] for x in one_hot_labels]
dims["Probability"] = [x[2] for x in one_hot_labels]
dims["Condition"] = [x[3] for x in one_hot_labels]
dims["Suggestion"] = [x[4] for x in one_hot_labels]
dims["Framing"] = [x[5] for x in one_hot_labels]

In [None]:
num_epochs = 15
batch_size = 64
num_batches = len(real_text_token) // batch_size

all_disc_loss = [[[None for x in range(num_batches)] for y in range(num_epochs)] for z in range(num_label_dims)]
all_gen_loss = [[[None for x in range(num_batches)] for y in range(num_epochs)] for z in range(num_label_dims)]

key_idx = 0
for key in dims.keys():
    print(f"Dimension: {key}")
    labels = dims[key]

    generator = Generator(vocab_size, embedding_dim, latent_dim, label_dim)
    discriminator = Discriminator(vocab_size, embedding_dim, label_dim)
    generator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002)
    discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002)
    generator.compile(loss=generator_loss, optimizer=generator_optimizer)
    discriminator.compile(loss=discriminator_loss, optimizer=discriminator_optimizer)
    
    for epoch in range(num_epochs):
        print(f"Epoch: {epoch+1}/{num_epochs}")
    
        for i in range(num_batches):
            print(f"Batch: {i+1}/{num_batches}")
            start = i * batch_size
            end = (i + 1) * batch_size
            
            real_data = tf.convert_to_tensor(real_text_token[start:end])
            real_data_labels = tf.convert_to_tensor(labels[start:end])
            noise = tf.random.normal(shape=(batch_size, latent_dim))
        
            disc_loss, gen_loss = train_step(real_data, real_data_labels, noise)
            all_disc_loss[key_idx][epoch][i] = disc_loss
            all_gen_loss[key_idx][epoch][i] = gen_loss

    generator.save_weights(f"generator_{key}.weights.h5")
    discriminator.save_weights(f"discriminator_{key}.weights.h5")
    key_idx += 1

Dimension: Number
Epoch: 1/15
Batch: 1/27


  text_loss = categorical_crossentropy(y_true, y_pred)


Batch: 2/27
Batch: 3/27
Batch: 4/27
Batch: 5/27
Batch: 6/27
Batch: 7/27
Batch: 8/27
Batch: 9/27
Batch: 10/27
Batch: 11/27
Batch: 12/27
Batch: 13/27
Batch: 14/27
Batch: 15/27
Batch: 16/27
Batch: 17/27
Batch: 18/27
Batch: 19/27
Batch: 20/27
Batch: 21/27
Batch: 22/27
Batch: 23/27
Batch: 24/27
Batch: 25/27
Batch: 26/27
Batch: 27/27
Epoch: 2/15
Batch: 1/27
Batch: 2/27
Batch: 3/27
Batch: 4/27
Batch: 5/27
Batch: 6/27
Batch: 7/27
Batch: 8/27
Batch: 9/27
Batch: 10/27
Batch: 11/27
Batch: 12/27
Batch: 13/27
Batch: 14/27
Batch: 15/27
Batch: 16/27
Batch: 17/27
Batch: 18/27
Batch: 19/27
Batch: 20/27
Batch: 21/27
Batch: 22/27
Batch: 23/27
Batch: 24/27
Batch: 25/27
Batch: 26/27
Batch: 27/27
Epoch: 3/15
Batch: 1/27
Batch: 2/27
Batch: 3/27
Batch: 4/27
Batch: 5/27
Batch: 6/27
Batch: 7/27
Batch: 8/27
Batch: 9/27
Batch: 10/27
Batch: 11/27
Batch: 12/27
Batch: 13/27
Batch: 14/27
Batch: 15/27
Batch: 16/27
Batch: 17/27
Batch: 18/27
Batch: 19/27
Batch: 20/27
Batch: 21/27
Batch: 22/27
Batch: 23/27
Batch: 24/27
B