In [403]:
import numpy as np
import pandas as pd
import jsonlines

import tensorflow as tf
tf.config.run_functions_eagerly(True)

from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Flatten, Lambda
from tensorflow.keras.losses import binary_crossentropy, categorical_crossentropy
from keras import ops

from sklearn.preprocessing import label_binarize

import nltk
nltk.download("stopwords")
import string
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sh2482\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
file = "data/aspect-level-certainty.jsonl"

annotate_science = []

with jsonlines.open(file) as f:
    for line in f.iter():
        annotate_science.append(line)

In [14]:
finance = pd.read_csv("data/10K_sentiment_list.csv")
uncertain_finance = finance[finance["Uncertainty"] != 0]["Word"].values

In [308]:
# Restructure annotated data from scientific set
label_codes = {"NotPresent": 0, "Certain": 1, "Uncertain": 2}
dims = {"Number": None, "Extent": None, "Probability": None, "Condition": None, "Suggestion": None, "Framing": None}
real_text = [x["finding"] for x in annotate_science]
real_labels = [[label_codes[x["aspect-level-certainty"][dim]] for dim in dim_order] for x in annotate_science]

In [450]:
latent_dim = 100
embedding_dim = 128
num_label_dims = 6
one_hot_len = 3
label_values = [0,1,2]
max_len = 64
lstm_units = 64

In [7]:
def preprocess(text):
    """
    Preprocesses text string by removing capitalization, punctuation, and stop words.

    Args:
        text: The text string to be preprocessed.

    Returns:
        The preprocessed text string.
    """

    text = text.lower()
    text = "".join(c for c in text if c not in string.punctuation)
    stop_words = stopwords.words("english")
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

In [465]:
def tokenize(text, max_len):
    """
    Tokenizes preprocessed text string.

    Args:
        text: The preprocessed text string.
        max_len: The maximum length for tokenized sequences

    Returns:
        A list of integers representing the tokenized text sequence.
    """

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts([text])
    sequences = tokenizer.texts_to_sequences([text])
    if len(sequences) > max_len:
        sequences = sequences[:max_len]
    else:
        sequences = tf.keras.utils.pad_sequences(sequences, max_len, padding="post")
    return sequences

In [401]:
domain_weight = 0.2

def generator_loss(y_true, y_pred):
    """
    Calculates a custom loss function.

    Args:
        y_true: The ground truth labels.
        y_pred: The predicted labels.
        domain_weight: The weight for finance specific uncertainty loss.

    Returns:
        The combined loss value.
    """
    text_loss = categorical_crossentropy(y_true, y_pred)

    # Make sure to use same embeddings as generator model
    embedding_layer = generator.get_layer('embedding') 
    word_embedding_matrix = embedding_layer.get_weights()[0]

    finance_word_embeddings = tf.nn.embedding_lookup(word_embedding_matrix, finance_token)
    uncertainty_scores = tf.reduce_mean(finance_word_embeddings, axis=1)
    uncertainty_loss = tf.reduce_mean(tf.abs(uncertainty_scores - y_true[:, 1]))
    return text_loss + uncertainty_loss * domain_weight

In [341]:
def discriminator_loss(real_output, fake_output):
    """
    Calculates loss function to distinguish between real and fake text.

    Args:
        real_output: The real text.
        fake_output: The generated tex.

    Returns:
        The loss value.
    """
    real_loss = binary_crossentropy(tf.ones_like(real_output), real_output)
    fake_loss = binary_crossentropy(tf.zeros_like(fake_output), fake_output)
    label_loss = binary_crossentropy(real_output, fake_output)
    return real_loss + fake_loss + label_loss

### Build the Model

In [424]:
def build_generator():
    noise_input = Input(shape=(latent_dim,))
    label_input = Input(shape=(one_hot_len,))
    combined = Concatenate()([noise_input, label_input])
    embedded = Embedding(vocab_size, embedding_dim, name="embedding")(combined)
    lstm1 = LSTM(256, return_sequences=True)(embedded)
    lstm2 = LSTM(128)(lstm1)
    output = Dense(max_len, activation='softmax')(lstm2)
    model = Model(inputs=[noise_input, label_input], outputs=output)
    model.compile(loss = generator_loss, optimizer="adam")
    return model

In [492]:
def build_discriminator():
    text_input = Input(shape=(None,))
    label_input = Input(shape=(one_hot_len,))
    combined = Concatenate()([text_input, label_input])
    embedded = Embedding(vocab_size, embedding_dim)(combined)
    lstm1 = LSTM(256, return_sequences=True)(embedded)
    lstm2 = LSTM(128)(lstm1)
    sentiment_output = Dense(1, activation='sigmoid')(lstm2)
    model = Model(inputs=[text_input, label_input], outputs=sentiment_output)
    model.compile(loss = discriminator_loss, optimizer="adam")
    return model

In [443]:
def build_combined():
    noise_input = Input(shape=(latent_dim,))
    sentiment_input = Input(shape=(one_hot_len,))
    generated_text = generator([noise_input, sentiment_input])
    validity = discriminator([generated_text, sentiment_input])
    model = Model(inputs=[noise_input, sentiment_input], outputs=validity)
    model.compile(loss="categorical_crossentropy", optimizer="adam")
    return model

In [498]:
real_text_process = [preprocess(x) for x in real_text]
real_text_token = [tokenize(x, max_len) for x in real_text_process]

finance_text_process = [preprocess(x) for x in uncertain_finance]
finance_token = [tokenize(x, max_len) for x in finance_text_process]

one_hot_labels = [label_binarize(x, classes=[0,1,2]) for x in real_labels]

dims["Number"] = [x[0] for x in one_hot_labels]
dims["Extent"] = [x[1] for x in one_hot_labels]
dims["Probability"] = [x[2] for x in one_hot_labels]
dims["Condition"] = [x[3] for x in one_hot_labels]
dims["Suggestion"] = [x[4] for x in one_hot_labels]
dims["Framing"] = [x[5] for x in one_hot_labels]

In [500]:
num_epochs = 16
batch_size = 64

num_batches = len(real_text_token) // batch_size

generator = build_generator()
discriminator = build_discriminator()
combined = build_combined()

real = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))

d_loss = [[0 for x in range(num_batches)] for y in range(num_epochs)]
gen_loss = [[0 for x in range(num_batches)] for y in range(num_epochs)]

for epoch in range(num_epochs):
    for i in range(num_batches):
        
        start = i * batch_size
        end = (i + 1) * batch_size
        
        real_data = tf.convert_to_tensor(real_text_token[start:end])
        real_data_labels = tf.convert_to_tensor(dims["Number"][start:end])
        
        # Train Discriminator
        noise_data = tf.convert_to_tensor(np.random.normal(size=(batch_size, latent_dim)))
        generate_labels = tf.convert_to_tensor(np.random.randint(0, 2, size=(batch_size, one_hot_len)))

        generated_data = tf.convert_to_tensor(generator.predict([noise_data, generate_labels]))
        real_data = Flatten()(real_data)
        
        real_loss = discriminator.train_on_batch([real_data, generate_labels], real)
        fake_loss = discriminator.train_on_batch([generated_data, generate_labels], fake)
        d_loss = 0.5 * np.add(real_loss, fake_loss)

        d_loss[epoch][i] = d_loss
        
        # Train Generator        
        gen_loss = combined.train_on_batch([noise_data, real_data_labels], real)

        gen_loss[epoch][i] = gen_loss
    
        # Print training progress
    print(f"Epoch: {epoch+1}/{num_epochs}, Discriminator Loss: {d_loss}, Generator Loss: {gen_loss[0]}")

SyntaxError: '[' was never closed (3801645415.py, line 36)