## Data Initialization

In [None]:
import pandas as pd
import importlib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from utils import data_cleaning as dc

# Data paths
raw_data_path = "./data/tweets.csv"
processed_data_path = raw_data_path.replace(".csv", "_processed.csv")

# Establish column names
tweet_col = 'Tweet'
ticker_col = 'Stock Name'

# Establish the size of the dataset. Set to -1 to use the entire dataset.
data_size = 500

# Single seed shared by the row sampling and the train/validation split so
# that a fresh "Restart & Run All" sees the same rows, the same label classes
# and the same split every time.
random_seed = 42

# Load the dataset
df = pd.read_csv(raw_data_path)
if data_size > 0:
    # Cap the request at the number of available rows: DataFrame.sample raises
    # when asked for more rows than exist (sampling without replacement).
    df = df.sample(n=min(data_size, len(df)), random_state=random_seed)

# Preprocess the tweet column
df[tweet_col] = df[tweet_col].apply(dc.preprocess_tweet)

# Encode the ticker names as integer class ids (stored in the 'label' column)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df[ticker_col])

# Split the data into train and validation sets (90% / 10%)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=random_seed)

## Model Loading

In [None]:
import os
from transformers import BertTokenizer, BertForSequenceClassification

# Output directories for the tokenizer and the model checkpoint
tokenizer_output_dir = './bert_models/finetuned_bert_tokenizer'
model_output_dir = './bert_models/finetuned_bert_model'

# Number of output classes, taken from the fitted encoder so the classification
# head always lines up with the integer ids stored in df['label'].
num_labels = len(label_encoder.classes_)

# Initialize the tokenizer: reuse the local copy when present, otherwise
# fetch 'bert-base-uncased' and cache it under tokenizer_output_dir.
if not os.path.exists(tokenizer_output_dir):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.save_pretrained(tokenizer_output_dir)
    print(f"Initalized and saved tokenizer to {tokenizer_output_dir}")
else:
    tokenizer = BertTokenizer.from_pretrained(tokenizer_output_dir)
    print(f"Loaded tokenizer from {tokenizer_output_dir}")

# Initialize the model: reuse the local checkpoint when present, otherwise
# start from 'bert-base-uncased' with a fresh classification head.
if not os.path.exists(model_output_dir):
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    model.save_pretrained(model_output_dir)
    print(f"Initalized and saved model to {model_output_dir}")
else:
    model = BertForSequenceClassification.from_pretrained(model_output_dir)
    # A checkpoint saved for a different label set cannot be used as-is.
    # Too few outputs would make training fail on out-of-range label ids,
    # so stop here with an explicit message instead.
    if model.config.num_labels < num_labels:
        raise ValueError(
            f"Checkpoint in {model_output_dir} has {model.config.num_labels} output labels "
            f"but the current data has {num_labels}; delete the directory to re-initialize the model."
        )
    # Extra outputs do not break training, but the label ids may no longer
    # mean the same tickers as when the checkpoint was saved.
    if model.config.num_labels > num_labels:
        print(
            f"Warning: checkpoint has {model.config.num_labels} output labels "
            f"but the current data has only {num_labels}"
        )
    print(f"Loaded model from {model_output_dir}")

In [None]:
from torch.utils.data import DataLoader
from utils import tweet_identification as ti
# Re-import the helper module so the current version of the file is used
importlib.reload(ti)

# Build the train and validation datasets the same way: tweets and integer
# labels as NumPy arrays, the shared tokenizer, and max_len=64.
train_dataset, val_dataset = (
    ti.TweetDataset(
        tweets=split_df[tweet_col].to_numpy(),
        labels=split_df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=64,
    )
    for split_df in (train_df, val_df)
)

# Wrap each dataset in a DataLoader; only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

## Model Training & Evaluation

In [None]:
import torch
# transformers' own AdamW class is deprecated in favour of the PyTorch
# implementation, so import the optimizer from torch directly.
from torch.optim import AdamW

# Select the compute device (GPU when available) and move the model onto it
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# eps and weight_decay are pinned to the defaults of the former
# transformers.AdamW (1e-6 and 0.0) so the optimization behaves as before;
# torch.optim.AdamW would otherwise default to eps=1e-8, weight_decay=0.01.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Number of full passes over the training data
EPOCHS = 5

In [None]:
# Re-import the helper module so the current version of the file is used
importlib.reload(ti)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # Train the model for one pass over the training batches
    train_acc, train_loss = ti.train_epoch(
        model,
        train_loader,
        optimizer,
        device,
        scheduler=None
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Evaluate the model on the validation set
    val_acc, val_loss = ti.eval_model(
        model,
        val_loader,
        device
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')

# Save the fine-tuned model and tokenizer to the same directories the loading
# cell reads from, so the next run picks up these weights instead of the
# untouched checkpoint written at initialization time.
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(tokenizer_output_dir)

In [None]:
from sklearn.metrics import classification_report

# Evaluation: collect predicted and true class ids over the validation set
y_preds = []
y_true = []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        y_preds.extend(preds.cpu().tolist())
        y_true.extend(batch['label'].tolist())

# Align the labels: the report must cover every class id that appears in
# either the ground truth or the predictions. Using only the true labels makes
# classification_report raise when the model predicts a class that is absent
# from the validation split (target_names would then be too short).
report_labels = sorted(set(y_true) | set(y_preds))
target_names = label_encoder.inverse_transform(report_labels)

print(classification_report(y_true, y_preds, labels=report_labels, target_names=target_names))

## Sample Data Evaluation

In [None]:
# Re-import the cleaning helpers so the current version of the file is used
importlib.reload(dc)

sample_tweet = "The iPhone is a great product. I think Apple is a great company."

# Preprocess the tweet the same way the training data was preprocessed
sample_tweet = dc.preprocess_tweet(sample_tweet)

# Tokenize the tweet; truncate to the same 64-token limit used for the
# training datasets so arbitrarily long input cannot exceed the model's window
inputs = tokenizer(sample_tweet, return_tensors='pt', truncation=True, max_length=64)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Switch to inference mode explicitly so this cell does not depend on an
# earlier cell having called model.eval(), and skip gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted label and map the class id back to its ticker name
_, preds = torch.max(outputs.logits, dim=1)
print(label_encoder.inverse_transform(preds.cpu().numpy())[0])