<div style="text-align: center;">

  <div style="background: #023E8A;
              padding: 12px 24px;
              border-radius: 6px;
              display: inline-block;">
    <h1 style="margin: 0; color: #fff;">Stock Sentiment</h1>
  </div>

</div>


Name        | Student Number     
---------------- | --------------
Inês Major         | 20240486       
Luís Semedo          |  20240852    
Pedro Santos | 20240295
Rafael Bernardo | 20240510
Rodrigo Miranda | 20240490

# Imports

In [1]:
# Mount Google Drive at /content/drive (Colab only); the data and model paths
# used further down live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install emoji
!pip install -U datasets
!pip install evaluate
!pip install optuna

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer


from wordcloud import WordCloud


import warnings
warnings.filterwarnings("ignore")


# Text preprocessing
import re
import emoji
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import evaluate

# Hugging Face datasets
from datasets import Dataset

# Hugging Face Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)


import optuna
import json
import os

import html

# Data

In [4]:
# Root folder of the project on the mounted Google Drive.
base_path = "/content/drive/MyDrive/Text Mining/Project"


# Labelled training data; the 'text' and 'label' columns are used further down.
train_df = pd.read_csv(f"{base_path}/data/train.csv")

# Corpus Split

In [None]:
# Stratified 80/20 hold-out split of the raw training data with a fixed seed.
# NOTE(review): prepare_hf_datasets performs its own split with the same
# settings on the preprocessed frame, so df_train/df_val appear to be used only
# for the class-balance check in the next cell — confirm before relying on them.
df_train, df_val = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label']
)

In [None]:
# Relative class frequencies in each split; shown side by side to check that
# the stratified split kept the label distribution.
df_train['label'].value_counts(normalize=True), df_val['label'].value_counts(normalize=True)

(label
 2    0.647367
 1    0.201467
 0    0.151166
 Name: proportion, dtype: float64,
 label
 2    0.647459
 1    0.201676
 0    0.150864
 Name: proportion, dtype: float64)


# Data Preprocessing

## Encoders

In [5]:
def prepare_for_bert(text):
    """Normalise a raw tweet string before it is handed to the tokenizer.

    Steps, in order:
      1. decode HTML entities (``&amp;`` -> ``&``),
      2. replace every ``http...`` / ``www...`` token with the ``[URL]`` placeholder,
      3. collapse runs of whitespace into a single space and trim both ends,
      4. lowercase the result.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned, lowercased string.

    NOTE(review): because lowercasing is the last step, the placeholder ends up
    in the output as ``[url]``, not ``[URL]``.
    """
    decoded = html.unescape(text)
    urls_masked = re.sub(r"http\S+|www\S+", "[URL]", decoded)
    single_spaced = re.sub(r"\s+", " ", urls_masked)
    trimmed = single_spaced.strip()
    return trimmed.lower()

def detect_signals(text):
    """Flag which tweet-specific elements occur in ``text``.

    Args:
        text: the tweet as a ``str``.

    Returns:
        A dict of 0/1 integers with the keys ``has_url``, ``has_hashtag``,
        ``has_mention``, ``has_cashtag`` and ``has_emoji`` (in that order).

    NOTE(review): the hashtag pattern is ``#`` followed by word characters, so
    a numeric HTML entity such as ``&#39;`` in text that has not been
    HTML-unescaped also sets ``has_hashtag``. Likewise ``$100`` sets
    ``has_cashtag``.
    """
    regex_signals = (
        ('has_url', r"http\S+|www\S+"),
        ('has_hashtag', r"#\w+"),
        ('has_mention', r"@\w+"),
        ('has_cashtag', r"\$\w+"),
    )
    flags = {
        key: int(re.search(pattern, text) is not None)
        for key, pattern in regex_signals
    }
    # Emoji detection is delegated to the `emoji` package rather than a regex.
    flags['has_emoji'] = int(emoji.emoji_count(text) > 0)
    return flags

def enrich_text(row):
    """Prefix the cleaned tweet with plain-English sentences describing it.

    Args:
        row: a mapping (e.g. a DataFrame row) providing ``bert_text`` and the
            flags ``has_emoji``, ``has_url``, ``has_cashtag`` and ``has_hashtag``.

    Returns:
        One sentence per truthy flag (emoji, URL, cashtag, hashtag — in that
        order), then a word-count sentence, then the tweet text itself, all
        joined by single spaces.

    Note: ``has_mention`` is not read here, so mentions never produce a sentence.
    """
    flag_sentences = (
        ('has_emoji', "This tweet contains emojis."),
        ('has_url', "This tweet contains a URL."),
        ('has_cashtag', "This tweet contains a cashtag."),
        ('has_hashtag', "This tweet contains a hashtag."),
    )
    body = row['bert_text']
    sentences = [sentence for flag, sentence in flag_sentences if row[flag]]
    # The word-count sentence is always present, even when no flag is set.
    sentences.append(f"This tweet has {len(body.split())} words.")
    return " ".join(sentences) + " " + body

In [6]:
def preprocess_dataframe(df):
    """Return a copy of ``df`` with the text columns and signal flags added.

    Added columns:
      * ``bert_text``: ``prepare_for_bert`` applied to ``text``;
      * one 0/1 column per key returned by ``detect_signals`` (computed on the
        raw ``text`` column, not on ``bert_text``);
      * ``bert_text_enriched``: ``enrich_text`` applied row-wise.

    The function is safe to call on a frame it has already processed: stale
    signal columns are replaced instead of being appended a second time
    (duplicate column names would make ``row['has_...']`` a Series and break
    ``enrich_text``).

    Args:
        df: DataFrame with a ``text`` column. It is not modified.

    Returns:
        A new DataFrame carrying the additional columns.
    """
    df = df.copy()
    df['bert_text'] = df['text'].apply(prepare_for_bert)

    # Build the flag frame in one go from a list of dicts; this avoids creating
    # a pandas Series per row as `.apply(pd.Series)` does.
    signal_records = [detect_signals(raw_text) for raw_text in df['text']]
    signal_cols = pd.DataFrame(signal_records, index=df.index)

    # Drop flag columns left over from a previous run so names stay unique.
    stale_cols = df.columns.intersection(signal_cols.columns)
    df = df.drop(columns=stale_cols)

    df = pd.concat([df, signal_cols], axis=1)
    df['bert_text_enriched'] = df.apply(enrich_text, axis=1)
    return df

In [7]:
# Preprocess the full training frame; the train/validation split is done later
# inside prepare_hf_datasets.
df_proc = preprocess_dataframe(train_df)


# Feature Engineering

## Encoders

In [8]:
def prepare_hf_datasets(df, tokenizer, max_length=128, test_size=0.2, random_state=42):
    """Split ``df`` and turn both parts into tokenized Hugging Face datasets.

    Args:
        df: DataFrame with the columns ``bert_text_enriched`` and ``label``.
        tokenizer: callable applied to batches of text with ``padding``,
            ``truncation`` and ``max_length`` keyword arguments.
        max_length: length every sequence is padded / truncated to.
        test_size: fraction of rows held out for validation.
        random_state: seed of the stratified split.

    Returns:
        ``(train_tokenized, val_tokenized)``, both set to the ``"torch"`` format.

    Raises:
        KeyError: if a required column is missing from ``df``.
    """
    required_cols = ['bert_text_enriched', 'label']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise KeyError(f"prepare_hf_datasets: missing required column(s) {missing}")

    df_train, df_val = train_test_split(
        df, test_size=test_size, stratify=df['label'], random_state=random_state
    )

    def tokenize_function(example):
        return tokenizer(example["text"], padding="max_length", truncation=True, max_length=max_length)

    def _to_tokenized(split_df):
        # The enriched text is exposed under the column name "text", which is
        # what tokenize_function reads.
        dataset = Dataset.from_pandas(
            split_df[required_cols].rename(columns={'bert_text_enriched': 'text'})
        )
        tokenized = dataset.map(tokenize_function, batched=True)
        tokenized.set_format("torch")
        return tokenized

    train_tokenized = _to_tokenized(df_train)
    val_tokenized = _to_tokenized(df_val)

    return train_tokenized, val_tokenized


# Classification Models - RoBERTa-Large

In [9]:
# Load the four `evaluate` metrics once, up front, so compute_metrics can reuse
# the same objects on every evaluation pass.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy plus macro- and weighted-averaged precision/recall/F1.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        A dict with ``eval_accuracy`` followed by ``eval_<metric>_<average>``
        for ``metric`` in precision, recall, f1 and ``average`` in macro,
        weighted.
    """
    raw_scores, gold = eval_pred
    predicted = np.asarray(np.argmax(raw_scores, axis=-1))
    gold = np.asarray(gold)

    results = {
        "eval_accuracy": accuracy_metric.compute(predictions=predicted, references=gold)["accuracy"],
    }

    averaged_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for mode in ("macro", "weighted"):
        for key, metric in averaged_metrics:
            scores = metric.compute(predictions=predicted, references=gold, average=mode)
            results[f"eval_{key}_{mode}"] = scores[key]

    return results

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [10]:
def train_encoder_roberta_large(
    df,
    model_checkpoint,
    output_dir="./results_roberta",
    logging_dir="./logs_roberta",
    learning_rate=4e-6,
    weight_decay=0.132473,
    num_train_epochs=20,
    batch_size=8,
    early_stopping_patience=6,
    save_total_limit=2,
):
    """Fine-tune a 3-class sequence classifier on the preprocessed tweets.

    The frame is split and tokenized by ``prepare_hf_datasets``; the model is
    evaluated and checkpointed once per epoch, the checkpoint with the highest
    ``eval_f1_macro`` is reloaded at the end, and training stops early when
    that metric has not improved for ``early_stopping_patience`` evaluations.

    Args:
        df: preprocessed DataFrame accepted by ``prepare_hf_datasets``.
        model_checkpoint: Hugging Face model name or local path. Checkpoints
            whose name contains "bertweet" are loaded with the slow tokenizer.
        output_dir: directory for checkpoints.
        logging_dir: directory for training logs.
        learning_rate: optimizer learning rate.
        weight_decay: weight-decay coefficient.
        num_train_epochs: upper bound on the number of epochs.
        batch_size: per-device batch size for both training and evaluation.
        early_stopping_patience: evaluations without improvement tolerated
            before stopping.
        save_total_limit: maximum number of checkpoints kept on disk. Without
            a limit every epoch leaves a full checkpoint behind, which can
            exhaust disk space over up to 20 epochs; the best checkpoint is
            still retained because ``load_best_model_at_end`` is enabled.
            Pass ``None`` to keep every checkpoint.

    Returns:
        The ``Trainer`` after ``train()`` has completed.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        use_fast="bertweet" not in model_checkpoint
    )

    train_tokenized, val_tokenized = prepare_hf_datasets(df, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=save_total_limit,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        # Must match a key returned by compute_metrics.
        metric_for_best_model="eval_f1_macro",
        greater_is_better=True,
        logging_dir=logging_dir,
        logging_steps=50,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
    )

    trainer.train()
    return trainer

In [11]:
# Full fine-tuning run of roberta-large on the preprocessed training frame.
# This is the expensive step; the cells below reload an already trained model
# from Drive instead of depending on this result.
trainer_roberta_large = train_encoder_roberta_large(df_proc, model_checkpoint="roberta-large")

For computational reasons, we load the model that was trained and saved in the notebook `tm_testes_11.ipynb` and evaluate it here.

We keep the training pipeline above to document the workflow that led us to the best model. The hyperparameters, as explained in the report, were obtained through a Grid Search.

In [16]:
# Reload the fine-tuned model and its tokenizer from Drive, reusing the project
# root defined in the data-loading cell.
model_path = f"{base_path}/Models/roberta_large_v1"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Same split and tokenization as used for training (fixed seed inside
# prepare_hf_datasets), so the validation rows are the same held-out rows.
train_tokenized, val_tokenized = prepare_hf_datasets(df_proc, tokenizer)

# This Trainer is used for prediction only. Logging integrations are switched
# off through `report_to="none"` rather than the deprecated WANDB_DISABLED
# environment variable, which triggers a deprecation warning.
trainer_loaded = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results_roberta_eval",
        report_to="none",
    ),
    tokenizer=tokenizer,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized
)

Map:   0%|          | 0/7634 [00:00<?, ? examples/s]

Map:   0%|          | 0/1909 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Evaluation

In [17]:
def evaluate_and_report(trainer, name="Model"):
    """Print classification reports for the validation and training sets.

    Predictions are obtained with ``trainer.predict`` on ``trainer.eval_dataset``
    first and ``trainer.train_dataset`` second; the predicted class is the
    argmax over axis 1 of the returned predictions. Both reports label the
    classes 0, 1, 2 as Bearish, Bullish, Neutral.

    Args:
        trainer: object exposing ``predict``, ``eval_dataset`` and ``train_dataset``.
        name: label printed in the report header.

    Returns:
        None; the reports are written to stdout.
    """
    class_names = ['Bearish', 'Bullish', 'Neutral']
    sections = (
        ("\nValidation Set:", "eval_dataset"),
        ("\nTraining Set:", "train_dataset"),
    )

    # Run both prediction passes before anything is printed.
    outputs = [
        (heading, trainer.predict(getattr(trainer, dataset_attr)))
        for heading, dataset_attr in sections
    ]

    print(f"\nResults for {name}")
    for heading, output in outputs:
        predicted = np.argmax(output.predictions, axis=1)
        print(heading)
        print(classification_report(output.label_ids, predicted, target_names=class_names))

In [18]:
# Report validation and training performance of the reloaded model.
evaluate_and_report(trainer_loaded, name="RoBERTa-large")


Results for RoBERTa-large

Validation Set:
              precision    recall  f1-score   support

     Bearish       0.84      0.85      0.85       288
     Bullish       0.87      0.90      0.89       385
     Neutral       0.94      0.93      0.94      1236

    accuracy                           0.91      1909
   macro avg       0.89      0.89      0.89      1909
weighted avg       0.91      0.91      0.91      1909


Training Set:
              precision    recall  f1-score   support

     Bearish       1.00      0.99      1.00      1154
     Bullish       0.99      1.00      1.00      1538
     Neutral       1.00      1.00      1.00      4942

    accuracy                           1.00      7634
   macro avg       1.00      1.00      1.00      7634
weighted avg       1.00      1.00      1.00      7634

