## Global Constants

In [None]:
# Kaggle data paths.
# Flip `run_on_kaggle` to switch between the Kaggle runtime and a local
# checkout; only the two base directories differ between environments.
run_on_kaggle = False
if run_on_kaggle:
    input_base_path = "/kaggle/input/financial-tweets-stock-identifier/gpu_version"
    output_base_path = "/kaggle/working"
else:
    input_base_path = "."
    output_base_path = "."

# Establish column names
tweet_col = 'Tweet'
ticker_col = 'Regex Ticker'

# Set the max length of the tweet
max_len = 256

# Input and output data paths
raw_data_path = f"{input_base_path}/data/ticker_tweets.csv"
processed_data_path = raw_data_path.replace(".csv", "_processed.csv")
output_dir = f"{output_base_path}/bert_models"
tokenizer_output_dir = f'{output_dir}/tuned_tokenizer'
# The model gets its own directory. It previously pointed at the tokenizer
# directory (copy-paste), so the two artifacts could not be checked, replaced
# or deleted independently.
model_output_dir = f'{output_dir}/tuned_model'
# NOTE: despite the `_dir` suffix this is the path of a single pickle file.
label_encoder_output_dir = f'{output_dir}/label_encoder.pkl'

# Establish the size of the dataset. Set to -1 to use the entire dataset.
data_size = 50

# Determines whether the data or models should be forcefully reloaded
force_data_reload = False
force_model_reload = False

# Determine training epochs. Set to -1 to run indefinitely.
EPOCHS = 1
BATCH_SIZE = 32

## Kaggle Environment Setup

In [None]:
import sys

if run_on_kaggle:
    # Install necessary dependencies
    !pip install -r {input_base_path}/requirements.txt
    
    # Custom base paths
    input_base_path = "/kaggle/input/financial-tweets-stock-identifier/gpu_version"
    output_base_path = "/kaggle/working"
    
    # Add utils module to the Python path
    sys.path.append('/kaggle/input/financial-tweets-stock-identifier/gpu_version')

## Data Initialization

In [None]:
import pandas as pd
import importlib
from utils import data_cleaning as dc

# Re-import the helper module so edits made to it since the kernel started
# take effect without a restart.
importlib.reload(dc)

# Build the working DataFrame from the configured paths and options.
# NOTE(review): presumably reuses the processed CSV unless
# `force_data_reload` is set -- confirm in utils.data_cleaning.init_df.
df = dc.init_df(
    force_data_reload,
    raw_data_path,
    processed_data_path,
    data_size,
    tweet_col,
)

In [None]:
import os
import pickle

import numpy as np
from sklearn.preprocessing import LabelEncoder

# Create the output directory if it doesn't exist, THEN set its permissions.
# (chmod on a directory that has not been created yet raises
# FileNotFoundError, which broke the very first run.)
os.makedirs(output_dir, exist_ok=True)
os.chmod(output_dir, 0o755)

# Force reload the models if missing a required dependency
required_artifacts = [tokenizer_output_dir, model_output_dir, label_encoder_output_dir]
if not all(os.path.exists(artifact) for artifact in required_artifacts):
    force_model_reload = True

# Define the label encoder
if force_model_reload:
    # Fresh start: learn the ticker -> integer mapping from the current data.
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df[ticker_col])
else:
    # NOTE(security): pickle.load can execute arbitrary code; only load an
    # encoder file that this notebook wrote itself.
    with open(label_encoder_output_dir, 'rb') as encoder_file:
        label_encoder = pickle.load(encoder_file)

    # Keep the indices of the classes the saved model was trained on and
    # append tickers that are new in this dataset at the end. Re-fitting the
    # encoder would reassign indices and silently misalign the labels with
    # the outputs of the saved classifier.
    # NOTE(review): an unsorted `classes_` relies on LabelEncoder encoding
    # string labels through a lookup table; assumes tickers are strings --
    # TODO confirm.
    known_tickers = set(label_encoder.classes_)
    unseen_tickers = [t for t in df[ticker_col].unique() if t not in known_tickers]
    if unseen_tickers:
        label_encoder.classes_ = np.concatenate(
            [label_encoder.classes_, np.asarray(unseen_tickers, dtype=object)]
        )
    df['label'] = label_encoder.transform(df[ticker_col])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a validation set. The fixed seed keeps the
# split identical from run to run.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)

## Model Loading

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

# Define where to load the model from: the pretrained base checkpoint when a
# reload is forced, otherwise the previously fine-tuned artifacts.
base_model = 'roberta-base'
tokenizer_path = base_model if force_model_reload else tokenizer_output_dir
model_path = base_model if force_model_reload else model_output_dir
# Only the base checkpoint needs to be told the label count; a fine-tuned
# checkpoint is loaded with whatever its own saved config specifies.
model_args = { "num_labels": len(df[ticker_col].unique()) } if force_model_reload else {}

# Initialize RoBERTa
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
model = RobertaForSequenceClassification.from_pretrained(model_path, **model_args)

# Adjust for changes in labels
num_labels = len(label_encoder.classes_)
if model.num_labels != num_labels:
    # `model.classifier` is a RobertaClassificationHead (dense -> dropout ->
    # out_proj), not a single Linear layer: it has no `in_features`, and
    # replacing the whole head with a Linear would apply it to every token
    # instead of the pooled <s> representation. Only the final projection
    # has to change size.
    head = model.classifier
    head.out_proj = torch.nn.Linear(
        in_features=head.out_proj.in_features,
        out_features=num_labels,
    )
    # Keep the module attribute and the config in sync so the loss
    # computation and the config written by save_pretrained use the new count.
    model.num_labels = num_labels
    model.config.num_labels = num_labels

In [None]:
from torch.utils.data import DataLoader
from utils import tweet_identification as ti
importlib.reload(ti)

# Wrap each split in a TweetDataset. Both splits are built the same way, so
# they are produced by one expression (train first, then validation).
train_dataset, val_dataset = (
    ti.TweetDataset(
        tweets=split_df[tweet_col].to_numpy(),
        labels=split_df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    for split_df in (train_df, val_df)
)

# Create DataLoaders: training batches are shuffled, validation batches keep
# the dataset order.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=BATCH_SIZE,
)
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

## Model Training & Evaluation

In [None]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Define training parameters
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# Plateau scheduler in 'min' mode: multiplies the learning rate by `factor`
# once the monitored value has stopped decreasing for more than `patience`
# steps. It only acts when `scheduler.step(<metric>)` is called.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=2,
    factor=0.1,
)

In [None]:
import pickle
import time
importlib.reload(ti)

# Only save the model when it surpasses the best validation accuracy so far
max_val_acc = 0

# EPOCHS == -1 means "train until interrupted". The sentinel has to be
# checked on EPOCHS: the old test `epoch == -1` could never be true because
# the counter starts at 0, so EPOCHS = -1 skipped training entirely.
run_indefinitely = EPOCHS == -1
epoch_total = 'inf' if run_indefinitely else EPOCHS

# Run training loop
epoch = 0
while run_indefinitely or epoch < EPOCHS:
    print(f'Epoch {epoch + 1}/{epoch_total}')
    print('-' * 10)
    start_time = time.time()

    # Train the model. The plateau scheduler is deliberately not handed to
    # train_epoch; it is stepped below with the validation loss instead.
    train_acc, train_loss = ti.train_epoch(
        model,
        train_loader,
        optimizer,
        device,
        scheduler=None
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Evaluate the model on the validation set
    val_acc, val_loss = ti.eval_model(
        model,
        val_loader,
        device
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')

    # ReduceLROnPlateau (mode='min') must be stepped once per epoch with the
    # monitored value; previously it was created but never stepped.
    # NOTE(review): assumes val_loss is a scalar (float or 0-d tensor) --
    # confirm against ti.eval_model.
    scheduler.step(val_loss)

    # Save the model, tokenizer and label encoder together so the three
    # artifacts on disk always belong to the same checkpoint.
    if val_acc > max_val_acc:
        print(f'Surpassed previous highest accuracy ({max_val_acc}). Saving model')
        max_val_acc = val_acc

        model.save_pretrained(model_output_dir)
        tokenizer.save_pretrained(tokenizer_output_dir)
        with open(label_encoder_output_dir, 'wb') as f:
            pickle.dump(label_encoder, f)

    print(f'Epoch time: {time.time() - start_time}')
    # The counter always advances; in indefinite mode it is only used for
    # the progress display.
    epoch += 1

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Evaluation: collect predictions and ground truth over the validation set
y_preds = []
y_true = []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit per sample
        _, preds = torch.max(outputs.logits, dim=1)

        y_preds.extend(preds.cpu().numpy())
        y_true.extend(batch_labels.cpu().numpy())

# Align the labels: report only the classes that occur in the validation
# targets, and derive the display names from exactly those label ids.
# Using the names of ALL encoder classes made `target_names` longer than
# `labels` whenever a class was absent from the validation split, which
# classification_report rejects with a ValueError.
report_labels = sorted(set(y_true))
target_names = label_encoder.inverse_transform(report_labels)
print(classification_report(y_true, y_preds, labels=report_labels, target_names=target_names))