In [53]:
import numpy as np
import pandas as pd
import jsonlines

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

import nltk
nltk.download("stopwords")
import string
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sh2482\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
file = "data/aspect-level-certainty.jsonl"

annotate_science = []

with jsonlines.open(file) as f:
    for line in f.iter():
        annotate_science.append(line)

In [14]:
finance = pd.read_csv("data/10K_sentiment_list.csv")
uncertain_finance = finance[finance["Uncertainty"] != 0]["Word"].values

In [118]:
# Restructure annotated data from scientific set
label_codes = {"NotPresent": 0, "Certain": 1, "Uncertain": 2}
dim_order = ["Number", "Extent", "Probability", "Condition", "Suggestion", "Framing"]
real_text = [x["finding"] for x in annotate_science]
real_labels = [[label_codes[x["aspect-level-certainty"][dim]] for dim in dim_order] for x in annotate_science]

In [26]:
latent_dim = 100
vocab_size = 10000
num_label_dims = 6
label_values = [0,1,2]
max_len = 64
uncertain_finance = uncertain_finance

In [7]:
def preprocess(text):
    """
    Preprocesses text string by removing capitalization, punctuation, and stop words.

    Args:
        text: The text string to be preprocessed.

    Returns:
        The preprocessed text string.
    """

    text = text.lower()
    text = "".join(c for c in text if c not in string.punctuation)
    stop_words = stopwords.words("english")
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

In [28]:
def tokenize(text, max_len):
    """
    Tokenizes preprocessed text string.

    Args:
        text: The preprocessed text string.
        max_len: The maximum length for tokenized sequences

    Returns:
        A list of integers representing the tokenized text sequence.
    """

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts([text])
    sequences = tokenizer.texts_to_sequences([text])
    sequences = tf.keras.utils.pad_sequences(sequences, max_len, padding="post")
    return sequences

In [31]:
def loss_function(y_true, y_pred, domain_weight):
    """
    Calculates a custom loss function.

    Args:
        y_true: The ground truth labels.
        y_pred: The predicted labels.
        domain_weight: The weight for finance specific uncertainty loss.

    Returns:
        The combined loss value.
    """
    text_loss = CategoricalCrossentropy()(y_true[:,0], y_pred[:,0])

    uncertainty_embedding = Embedding(vocab_size, 128)(y_pred[:, 0])
    uncertainty_scores = tf.reduce_sum(tf.nn.embedding_lookup(uncertainty_embedding, finance_uncertainty), axis=1)
    uncertainty_loss = tf.reduce_mean(tf.abs(uncertainty_scores - y_true[:, 1]))

    return text_loss + uncertainty_loss * domain_weight

In [91]:
def build_generator():
    """
    Defines architecture of the generator model in the CGAN.

    Returns:
        A Model object representing the generator.
    """
    noise = Input(shape=(latent_dim,))
    label = Input(shape=(num_label_dims,))
    combined = Concatenate()([noise, label])

    # Embedding layer for text
    x = Embedding(vocab_size, 128)(combined)
    x = LSTM(256, return_sequences=True)(x)
    x = LSTM(128)(x)

    # Dense layer for output
    output = Dense(max_len, activation="softmax")(x)
    
    return Model(inputs=[noise, label], outputs=output)

In [127]:
def build_discriminator():
    """
    Defines architecture of the discriminator model in the CGAN.

    Returns:
        A Model object representing the discriminator.
    """
    text = Input(shape=(max_len,))
    label = Input(shape=(num_label_dims,))

    # Embedding and processing layers for text
    x = Embedding(vocab_size, 128)(text)
    x = Concatenate()([x, label])
    x = Dense(64, activation="relu")(combined)

    # Output layer for each label dimension
    valid = []
    for _ in range(num_label_dims):
        valid.append(Dense(1, activation="sigmoid")(x))
    output = Concatenate()(valid)
    return Model(inputs=[text, label], outputs=output)

In [93]:
def cgan_model(generator, discriminator):
    """
    Creates the full CGAN model.

    Args:
        generator: The generator model.
        discriminator: The discriminator model.

    Returns:
        A Model object representing the CGAN model.
    """
    noise = Input(shape=(latent_dim,))
    label = Input(shape=(num_label_dims,))
    generated_text = generator([noise, label])
    labeled = discriminator([generated_text, label])
    return Model(inputs=[noise, label], outputs=[generated_text, labeled])

In [49]:
lr = 0.001
beta = 0.5

In [125]:
generator = build_generator()
discriminator = build_discriminator()
model = cgan_model(generator, discriminator)
optimizer = Adam(learning_rate = lr, beta_1 = beta)
discriminator.compile(loss=[loss_function], optimizer=optimizer, metrics=['accuracy'])
model.compile(loss=[loss_function], optimizer=optimizer)

In [129]:
num_epochs = 1
batch_size = 16

real_text_process = [preprocess(x) for x in real_text]
real_text_token = [tokenize(x) for x in real_text_process]

num_batches = len(real_text) // batch_size
text_batches = np.array_split(real_text, num_batches)
label_batches = np.array_split(real_labels, num_batches)


for epoch in range(num_epochs):

    for batch_index in range(num_batches):
        real_data = text_batches[batch_index]
        real_data_labels = label_batches[batch_index]
        
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        fake_data = generator.predict([noise, real_data_labels])

        print(real_data.shape)

        real_loss = discriminator.train_on_batch([real_data, real_data_labels], np.ones((batch_size, 1)))
        fake_loss = discriminator.train_on_batch([fake_data, real_data_labels], np.zeros((batch_size, 1)))
    
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        generator_loss = combined_model.train_on_batch([noise, real_data_labels], [real_data, np.ones((batch_size, 1))])
    
        print(f"Epoch {epoch+1}/{num_epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}")

# Generate synthetic text with specific labels
noise = np.random.normal(0, 1, (1, latent_dim))
label = np.array([[1, 0, 2, 1, 2, 0]])  # Generate text with specific labels for each dimension
generated_text_indices = generator.predict([noise, label])
generated_text = [chr(i) for i in np.argmax(generated_text_indices, axis=1)[0]]  # Convert indices back to text
print("Generated Text:", " ".join(generated_text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
(16,)


ValueError: Exception encountered when calling Functional.call().

[1mInvalid input shape for input Tensor("functional_65_1/Cast:0", shape=(16,), dtype=float32). Expected shape (None, 64), but input has incompatible shape (16,)[0m

Arguments received by Functional.call():
  • inputs=['tf.Tensor(shape=(16,), dtype=string)', 'tf.Tensor(shape=(16, 6), dtype=int32)']
  • training=True
  • mask=['None', 'None']

In [121]:
label_batches = np.array_split(real_labels, num_batches)
label_batches

[array([[0, 0, 1, 0, 0, 0],
        [1, 1, 2, 0, 0, 0],
        [0, 0, 2, 0, 1, 1],
        [1, 2, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 2, 0, 1, 0],
        [0, 0, 0, 0, 1, 0],
        [0, 2, 1, 0, 0, 0],
        [0, 0, 2, 0, 0, 0],
        [0, 2, 1, 1, 0, 0],
        [0, 2, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [2, 2, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 1],
        [1, 0, 1, 0, 0, 0],
        [1, 2, 1, 0, 0, 0]]),
 array([[1, 1, 1, 0, 0, 2],
        [1, 0, 1, 0, 0, 0],
        [2, 0, 1, 0, 0, 0],
        [0, 2, 1, 0, 0, 0],
        [0, 0, 2, 0, 2, 0],
        [0, 2, 2, 0, 0, 0],
        [1, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 1, 2, 0, 0, 0],
        [0, 1, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [2, 0, 1, 0, 0, 2],
        [0, 2, 2, 0, 0, 0],
        [0, 1, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 2, 0, 1, 1]]),
 array([[1, 0, 2, 0, 0, 0],
        [0, 0, 1, 0, 0, 2],
        [1, 0, 1, 0, 0, 0],
        [1, 0, 1