# Environment Variables

In [None]:
import torch

# --- Data locations ---------------------------------------------------------
# The processed file lives next to the raw CSV, with a "_processed" suffix.
raw_data_path = "./data/ticker_tweets.csv"
processed_data_path = raw_data_path.replace(".csv", "_processed.csv")
tweet_col = "Tweet"
data_size = -1

# --- Saved model components -------------------------------------------------
# Everything produced by fine-tuning is stored beneath a single output folder.
# Note: despite its name, label_encoder_output_dir is the path of a pickle
# file, not a directory.
output_dir = "./gpu_output/v2/bert_models"
tokenizer_output_dir = f"{output_dir}/tuned_tokenizer"
model_output_dir = f"{output_dir}/tuned_model"
label_encoder_output_dir = f"{output_dir}/label_encoder.pkl"

# --- Model parameters -------------------------------------------------------
max_len = 256
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Output column names ----------------------------------------------------
regex_output = "Regex Symbol"
model_output = "RoBERTA Symbol"

# Model & Data Loading 

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from utils import data_cleaning as dc
import pickle

# Load the tweets to classify.
# NOTE(review): the meaning of the positional arguments (False, 50) is defined
# by utils.data_cleaning.init_df, which is not visible here — confirm before
# changing them.
raw_tweets = dc.init_df(
    False,
    raw_data_path,
    processed_data_path,
    50,
    tweet_col,
)

# Restore the fine-tuned tokenizer and sequence classifier from disk.
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_output_dir)
model = RobertaForSequenceClassification.from_pretrained(model_output_dir)

# Restore the label encoder that was pickled alongside the model.
# SECURITY: pickle.load can execute arbitrary code, so only load a file that
# this project produced itself.
with open(label_encoder_output_dir, 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

# Data Classification

In [None]:
import importlib
import pandas as pd
from utils import tweet_identification as ti
importlib.reload(ti)

# Model classification: label every tweet with the fine-tuned classifier,
# processing the tweets in batches of `batch_size`.
batch_size = 32
tweets = raw_tweets[tweet_col].tolist()
predicted_labels = ti.classify_tweets(
    tweets,
    model,
    tokenizer,
    label_encoder,
    device,
    max_len,
    batch_size,
)
raw_tweets[model_output] = predicted_labels

# Regex classification: label every tweet with ti.get_ticker_strings, using the
# upper-cased symbols from the NYSE stock list as the known tickers.
nyse_stocks = pd.read_csv("./data/nyse_stock_info.csv")
nyse_tickers = {symbol for symbol in nyse_stocks["Symbol"].str.upper()}
raw_tweets[regex_output] = raw_tweets[tweet_col].apply(
    lambda tweet_text: ti.get_ticker_strings(tweet_text, nyse_tickers)
)

# Model Comparison

In [None]:
from collections import Counter

def are_anagrams(str1, str2):
    """Return True when both inputs contain the same items the same number of times.

    Order is ignored: each input is reduced to a count of its elements
    (characters, for strings) and the two counts are compared.
    """
    first_counts = Counter(str1)
    second_counts = Counter(str2)
    return first_counts == second_counts

In [None]:
# Work on a copy so the comparison never modifies raw_tweets
filtered_df = raw_tweets.copy()

# Replace missing labels with empty strings in both the model and regex columns
filtered_df[model_output] = filtered_df[model_output].fillna("")
filtered_df[regex_output] = filtered_df[regex_output].fillna("")

# Keep only the tweets where the model and regex labels differ. Two labels are
# treated as equal when they are anagrams of each other, i.e. ordering is ignored.
# NOTE(review): anagram matching compares character counts, so two different
# label sets built from the same characters (e.g. "AB-CD" vs "AC-BD") would be
# treated as equal — confirm this is acceptable for the label format in use.
labels_match = pd.Series(
    [
        are_anagrams(model_label, regex_label)
        for model_label, regex_label in zip(filtered_df[model_output], filtered_df[regex_output])
    ],
    index=filtered_df.index,
    dtype=bool,
)
filtered_df = filtered_df[~labels_match]
display_cols = [tweet_col, regex_output, model_output]

# Report the disagreement as a real percentage (the ".2%" format multiplies by
# 100), and avoid dividing by zero when there are no tweets at all.
diff_count = len(filtered_df)
total_count = len(raw_tweets)
diff_share = diff_count / total_count if total_count else 0.0
print(f"Custom Model and Regex differed on {diff_count}/{total_count} tweets ({diff_share:.2%})")

# Display classification differences.
# These options are set globally so long tweets are not truncated; they stay in
# effect for every later cell as well.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
display(filtered_df[display_cols].head(100))

# Custom Examples

In [None]:
# Define class for test cases
class CustomExample:
    """A hand-written message paired with the answer expected for it."""

    def __init__(self, msg, expected_answer):
        # msg: the text to classify; expected_answer: the label we hope to get
        self.msg, self.expected_answer = msg, expected_answer

# Hand-written test cases, given as (message, expected answer) pairs
custom_tweets = [
    CustomExample(text, answer)
    for text, answer in (
        ("I love my iPhone! I think Apple is a fantastic company.",
         "Apple"),
        ("The new iPad is amazing... I cannot wait to see what they do next year!",
         "Apple"),
        ("OpenAI is truly amazing. I use it almost every day for both work and school.",
         "Microsoft"),
        ("Semi-conductor companies are the stocks to watch in 2024. With the AI boom, their value is going through the roof!",
         "NVDA-INTC-AMD"),
    )
]

# Classify the custom messages with the fine-tuned model
custom_preds = ti.classify_tweets(
    [example.msg for example in custom_tweets],
    model,
    tokenizer,
    label_encoder,
    device,
    max_len,
    batch_size,
)