In [19]:
import numpy as np
from keras.utils import to_categorical
from keras import models
from keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

import pandas as pd

df = pd.read_parquet("hf://datasets/TimKoornstra/financial-tweets-sentiment/data/train-00000-of-00001.parquet")

print(df.columns)
df.drop(columns='url')

print(f"tweets in dataset: {len(np.unique(df["tweet"]))}")
#print(df.head)

pattern = r'[^\w\s]'

# Use regex substitution to remove special characters from the 'text' column
df['tweet'] = df['tweet'].apply(lambda x: re.sub(pattern, '', x))

# Print the updated DataFrame
#print([(np.unique(df["tweet"]))])

# Use ASCII filtering to remove non-ASCII characters from the 'text' column
df['tweet'] = df['tweet'].apply(lambda x: ''.join(char for char in x if ord(char) < 128))
#print('\n\n\n')
#print(([(np.unique(df["tweet"]))]))
print(len(df['tweet']))


def remove_links(tweet):
    words = tweet.split()  # Split tweet into words
    filtered_words = [word for word in words if not word.startswith('http')]  # Remove words that start with 'http'
    return ' '.join(filtered_words)  # Join words back together into a tweet

# Apply the function to each entry in the 'tweets' column
df['tweet'] = df['tweet'].apply(remove_links)
df=df.drop(columns=['url'])
#print(df.head)

def remove_numerical_words(tweet):
    # Use regex to remove words containing any digits
    return ' '.join(word for word in tweet.split() if not re.search(r'\d', word))

# Apply the function to the 'tweet' column
df['tweet'] = df['tweet'].apply(remove_numerical_words)

# Check the updated DataFrame
print(df.head())
df.to_csv('out.csv', index=False) 

all_tweets = ' '.join(df['tweet'])

# Step 2: Split the combined string into individual words
all_words = all_tweets.split()

# Step 3: Use a set to get unique words
unique_words = set(all_words)

# Convert the set back to a sorted list (optional)
unique_words_list = sorted(list(unique_words))
print(f'unique words in dataset: {len(unique_words_list)}')
#print(unique_words_list)
#"Tokenizing" data

#df['sentiment'] = to_categorical(df['sentiment'].values, num_classes=3)

# Tokenizing and padding sequences (same as before)
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(df['tweet'])

sequences = tokenizer.texts_to_sequences(df['tweet'])
print(sequences[:50])
print(f'sequence length {len(sequences)}')
padded_sequences = pad_sequences(sequences, padding='post')
max_sequence_length = padded_sequences.shape[1]

# Split the data into training and test sets
train_x = padded_sequences[:15000]
train_y = df['sentiment'][:15000]
test_x = padded_sequences[15000:]
test_y = df['sentiment'][15000:]

#turns this into one hot encoding format
train_y = to_categorical(train_y, num_classes=3)
test_y = to_categorical(test_y, num_classes=3)

print(train_y[1])


vocab_size=50000
# Training the model
model = models.Sequential()

# Embedding layer (input length is max_sequence_length after padding)
model.add(layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_length))

# Flatten the output of the Embedding layer
model.add(layers.GlobalMaxPooling1D())

# Hidden Layers
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation="relu"))

# Output Layer (3 units for 3 classes, with softmax activation)
model.add(layers.Dense(3, activation="softmax"))

model.summary()

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
results = model.fit(
    train_x, train_y,
    epochs=5,
    batch_size=32,
    validation_data=(test_x, test_y)
)

train_accuracy = results.history['accuracy']
val_accuracy = results.history['val_accuracy']
train_loss = results.history['loss']
val_loss = results.history['val_loss']



#Testing on new data
new_data = pd.DataFrame({
    'tweet': [
        "JPMorgan price will fall. It's been a bad week for the company", 
        "Stock price soars on great day production increases at the company.", 
        "I am going to be looking at the stock price later today.",
        "There has been a change in management at chipotle.",
        "Profits soar for small startup with a big jump in sales."
    ]
})

print(f"Max sequence length from training data: {max_sequence_length}")

# Step 1: Preprocess the new data
# Clean the new data
pattern = r'[^\w\s]'  # Keep only words and spaces
new_data['tweet'] = new_data['tweet'].apply(lambda x: re.sub(pattern, '', x))

# Step 2: Tokenize the new data
# Use the same tokenizer that was fitted on the training data
sequences_new = tokenizer.texts_to_sequences(new_data['tweet'])

# Step 3: Pad the sequences (using the defined max_sequence_length)
padded_sequences_new = pad_sequences(sequences_new, maxlen=max_sequence_length, padding='post')

# Step 4: Make predictions

predictions = model.predict(padded_sequences_new)
print(predictions)
# Step 5: Convert predictions to class labels
predicted_classes = np.argmax(predictions, axis=1)

# Output the results
for tweet, sentiment in zip(new_data['tweet'], predicted_classes):
    sentiment_labels = ['Neutral', 'Positive', 'Negative']
    print(f"Tweet: '{tweet}' => Predicted Sentiment: {sentiment_labels[sentiment]}")




Index(['tweet', 'sentiment', 'url'], dtype='object')
tweets in dataset: 38089
38091
                                               tweet  sentiment
0  BYND JPMorgan reels in expectations on Beyond ...          2
1  CCL RCL Nomura points to bookings weakness at ...          2
2  CX Cemex cut at Credit Suisse JP Morgan on wea...          2
3                  ESS BTIG Research cuts to Neutral          2
4       FNKO Funko slides after Piper Jaffray PT cut          2
unique words in dataset: 46650
[[2358, 1058, 7042, 5, 744, 9, 1297, 2742], [1090, 4300, 4576, 302, 2, 9656, 1298, 12, 3332, 6, 1825, 3850], [12178, 17168, 318, 12, 405, 1599, 1944, 728, 9, 667, 850, 610], [6328, 4577, 632, 510, 2, 915], [8063, 5763, 2622, 47, 2525, 4899, 621, 318], [7043, 12179, 1136, 12, 9657, 35, 593, 146, 906, 12, 1222, 208], [147, 147, 1600, 3, 444], [147, 1222, 208, 510, 2, 189], [8064, 4301, 510, 2, 45, 2526], [17169, 17170, 51, 12180, 510, 916, 1670, 47, 4900, 2359], [4302, 5292, 1441, 9, 17171], [17172



Epoch 1/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.5456 - loss: 0.9653 - val_accuracy: 0.4313 - val_loss: 1.2367
Epoch 2/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.7975 - loss: 0.5129 - val_accuracy: 0.4583 - val_loss: 1.2674
Epoch 3/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9065 - loss: 0.2637 - val_accuracy: 0.4439 - val_loss: 1.6775
Epoch 4/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.9615 - loss: 0.1237 - val_accuracy: 0.4660 - val_loss: 1.9009
Epoch 5/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9841 - loss: 0.0579 - val_accuracy: 0.4537 - val_loss: 2.4336
Max sequence length from training data: 688
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[[1.12892635e-01 3.22376750e-03 8.83883655e-01]
 [1.62653741e-03 9.96737123e-01

In [18]:
import numpy as np
from keras.utils import to_categorical
from keras import models
from keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

import pandas as pd

new_data = pd.DataFrame({
    'tweet': [
        "JPMorgan price will fall. It's been a bad week for the company", 
        "Stock price soars on great day production increases at the company.", 
        "I am going to be looking at the stock price later today.",
        "There has been a change in management at chipotle.",
        "Profits soar for small startup with a big jump in sales."
    ]
})


#USing a pretrained function:
from transformers import pipeline

# Create a sentiment analysis pipeline
pipe = pipeline(
    "sentiment-analysis",
    model="StephanAkkerman/FinTwitBERT-sentiment",
)

list_results=pipe(new_data['tweet'].to_list())
# Get the predicted sentiment
for i, t in enumerate(new_data['tweet']):
    print(f"Tweet: {t}: {' '*(75-len(t))}{list_results[i]}")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Tweet: JPMorgan price will fall. It's been a bad week for the company:              {'label': 'BEARISH', 'score': 0.9997867941856384}
Tweet: Stock price soars on great day production increases at the company.:         {'label': 'BULLISH', 'score': 0.968792200088501}
Tweet: I am going to be looking at the stock price later today.:                    {'label': 'NEUTRAL', 'score': 0.9666387438774109}
Tweet: There has been a change in management at chipotle.:                          {'label': 'NEUTRAL', 'score': 0.9979617595672607}
Tweet: Profits soar for small startup with a big jump in sales.:                    {'label': 'BULLISH', 'score': 0.9602739214897156}
