In [None]:
import numpy as np
import pandas as pd
import re 

import transformers

# Basic Data Exploration

In [None]:
df = pd.read_csv("training.csv")

In [None]:
df

In [None]:
# Function to extract source, text, and hashtags
def extract_components(text):
    """Split a raw feed entry into its source handle, cleaned text and hashtags.

    Args:
        text: Raw post string, e.g. "@Handle: body text #tag1 #tag2".

    Returns:
        A ``(source, text_clean, hashtags)`` tuple:
        * source -- the characters between the first ``@`` and the next ``:``,
          or ``None`` when no such pattern is present;
        * text_clean -- the input with every ``@...:`` span and every hashtag
          removed, stripped of surrounding whitespace;
        * hashtags -- all hashtags joined by single spaces, or ``None`` when
          the text contains none.
    """
    handle_pattern = r'@(.*?):'
    tag_pattern = r'#\S+'

    found_handle = re.search(handle_pattern, text)
    source = None if not found_handle else found_handle.group(1)

    tags = re.findall(tag_pattern, text)
    hashtags = None if not tags else ' '.join(tags)

    # Drop the handle prefix first, then the hashtags, then trim whitespace.
    without_handle = re.sub(handle_pattern, '', text)
    text_clean = re.sub(tag_pattern, '', without_handle).strip()

    return source, text_clean, hashtags

# Function to count hashtags in a sentence
def count_hashtags(text):
    """Return how many hashtags (a ``#`` followed by non-whitespace characters) occur in ``text``."""
    return sum(1 for _ in re.finditer(r'#\S+', text))

In [None]:
df[['source', 'text', 'hashtags']] = df['SocialMediaFeed'].apply(lambda x: pd.Series(extract_components(x)))

In [None]:
df["source"].value_counts()

In [None]:
# Join the per-row hashtag strings into one long string, then split it into
# individual hashtag tokens (duplicates are kept at this stage).
hashtag_blob = df['hashtags'].str.cat(sep=' ')
all_hashtags = hashtag_blob.split()

# Distinct hashtags and how many of them there are
unique_hashtags = {tag for tag in all_hashtags}
num_unique_hashtags = len(unique_hashtags)

(num_unique_hashtags, unique_hashtags)

# Fine-tuning the classification model

### Preparing the data

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("training.csv")
df

In [None]:
# convert all non-zero values to 1
# df[['NVDA', 'ING', 'SAN', 'PFE', 'CSCO']] = df[['NVDA', 'ING', 'SAN', 'PFE', 'CSCO']].map(lambda x: 1 if x != 0 else 0)

# create a column that sums all the values in the row
# df["sum"] = df[['NVDA', 'ING', 'SAN', 'PFE', 'CSCO']].apply(lambda x: np.sum(x), axis=1)
# df["sum"].value_counts()

In [None]:
# Create a binary column "relevant": 1 if at least one stock value in the row
# is non-zero, 0 otherwise. The previous version stored the *count* of non-zero
# columns, which yields labels above 1 for rows touching several stocks; those
# labels are then used as class ids for a sequence-classification head
# (presumably 2 classes, the library default -- confirm).
stock_columns = ['NVDA', 'ING', 'SAN', 'PFE', 'CSCO']
df["relevant"] = df[stock_columns].ne(0).any(axis=1).astype(int)
df["relevant"].value_counts()

### Setting up the fine-tuning

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [None]:
class NewsHeadlineDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Each item is a dict with the flattened ``input_ids`` and ``attention_mask``
    produced by ``tokenizer.encode_plus`` (padded/truncated to ``max_len``) and
    a ``labels`` tensor of dtype ``torch.long`` built from the matching target.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        # texts and targets are indexed in parallel by position.
        self.texts, self.targets = texts, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # str() guards against non-string entries (e.g. numbers) in the text list.
        encoded = self.tokenizer.encode_plus(str(self.texts[idx]), **tokenizer_kwargs)

        # The tokenizer output carries a leading batch dimension; flatten it away.
        sample = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return sample

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-tiny')
model = BertForSequenceClassification.from_pretrained('prajjwal1/bert-tiny')

In [None]:
# Split the data
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Define the datasets
train_dataset = NewsHeadlineDataset(df_train['SocialMediaFeed'].to_list(), df_train['relevant'].to_list(), tokenizer, 128)
val_dataset = NewsHeadlineDataset(df_test['SocialMediaFeed'].to_list(), df_test['relevant'].to_list(), tokenizer, 128)

In [None]:
# Training configuration for the binary relevance classifier.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=40,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_strategy="steps",        # log on a step schedule
    logging_steps=10,               # log every 10 update steps; no save interval is set here, so checkpointing follows the library default (NOTE(review): confirm)
    evaluation_strategy="steps",     # evaluate on a step schedule; no eval_steps is given -- presumably falls back to `logging_steps`, TODO confirm
)

# No compute_metrics is passed, so only what the Trainer reports by default is available during evaluation.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
)

# train the model
trainer.train()

In [None]:
train_result = trainer.predict(train_dataset)
val_result = trainer.predict(val_dataset)

In [None]:
probabilities = np.exp(train_result.predictions)/np.sum(np.exp(train_result.predictions), axis=1)[:, None]

In [None]:
train_truth = [ex['labels'].item() for ex in train_dataset]
val_truth = [ex['labels'].item() for ex in val_dataset]

In [None]:
def _stable_softmax(logits):
    """Row-wise softmax over a 2-D array of logits.

    Each row's maximum is subtracted before exponentiating so that np.exp
    cannot overflow on large logits; the result is mathematically unchanged.
    """
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

# compute softmax
probabilities = _stable_softmax(train_result.predictions)
# get predicted labels
train_preds = np.argmax(probabilities, axis=1)

probabilities_val = _stable_softmax(val_result.predictions)
val_preds = np.argmax(probabilities_val, axis=1)

In [None]:
from sklearn.metrics import accuracy_score

# calculate accuracy for training data
train_accuracy = accuracy_score(train_truth, train_preds)
print(f'Train Accuracy: {train_accuracy}')

# calculate accuracy for validation data
val_accuracy = accuracy_score(val_truth, val_preds)
print(f'Validation Accuracy: {val_accuracy}')

# Hyperparameter Tuning

In [None]:
from itertools import product

# Set the hyperparameters to test
num_train_epochs_list = [5, 10, 20, 30, 40, 50, 100]
batch_size_list = [16, 32]
warmup_steps_list = [0, 250, 500]
weight_decay_list = [0.02, 0.01, 0.05]

best_accuracy = 0
best_hparams = {}

# Exhaustive grid search; product() iterates in the same order as nested loops.
for num_train_epochs, batch_size, warmup_steps, weight_decay in product(
    num_train_epochs_list, batch_size_list, warmup_steps_list, weight_decay_list
):
    # Start every configuration from the same pretrained checkpoint. Reusing
    # one model object across configurations would let each run continue
    # training from the previous run's weights, which makes the accuracies
    # of different configurations incomparable.
    candidate_model = BertForSequenceClassification.from_pretrained('prajjwal1/bert-tiny')

    # Adjust the hyperparameters in TrainingArguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,  # Usually the same as train_batch_size
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        logging_dir='./logs',            # directory for storing logs
        load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
        # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
        logging_strategy="steps",        # log on a step schedule
        logging_steps=10,                # log every 10 update steps
        evaluation_strategy="steps",     # evaluate on a step schedule
    )

    trainer = Trainer(
        model=candidate_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Make predictions
    predictions = trainer.predict(val_dataset)

    # Compute the prediction labels
    predicted_labels = np.argmax(predictions.predictions, axis=-1)

    # Compute the accuracy score
    accuracy = accuracy_score(val_truth, predicted_labels)

    print(f'Epochs: {num_train_epochs}, Batch size: {batch_size}, Warmup steps: {warmup_steps}, Weight Decay: {weight_decay}, Accuracy: {accuracy}')

    # Save the best model and hyperparameters (comment out if not needed)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_hparams = {'epochs': num_train_epochs, 'batch_size': batch_size, 'warmup_steps': warmup_steps, 'weight_decay': weight_decay}
        trainer.model.save_pretrained('./best_model')

print(f'Best hyperparameters: {best_hparams}, Best accuracy: {best_accuracy}')

In [None]:
import optuna

def objective(trial):
    """Optuna objective: fine-tune a fresh classifier and return its validation accuracy.

    Args:
        trial: Optuna trial used to sample the number of epochs, the training
            batch size, the warmup steps and the weight decay.

    Returns:
        The ``eval_accuracy`` value reported by ``trainer.evaluate()`` on
        ``val_dataset`` (to be maximized by the study).
    """
    # Define hyperparam ranges
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 50)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64])
    warmup_steps = trial.suggest_int("warmup_steps", 0, 500)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-1, log=True)

    def compute_metrics(eval_pred):
        # Without a compute_metrics callback the evaluation result carries no
        # accuracy entry, so the lookup of "eval_accuracy" below would fail.
        predicted = np.argmax(eval_pred.predictions, axis=-1)
        return {"accuracy": float(np.mean(predicted == eval_pred.label_ids))}

    args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        logging_dir='./logs',            # directory for storing logs
        load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
        # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
        logging_strategy="steps",        # log on a step schedule
        logging_steps=10,                # log every 10 update steps
        evaluation_strategy="steps",     # evaluate on a step schedule
    )

    trainer = Trainer(
        # A fresh pretrained model per trial: sharing one model object across
        # trials would make every trial start from the previous trial's weights.
        model=BertForSequenceClassification.from_pretrained('prajjwal1/bert-tiny'),
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate; the Trainer prefixes metric names with "eval_".
    eval_result = trainer.evaluate()

    return eval_result["eval_accuracy"]

# Note: If you have multiple GPUs, you may want to set n_jobs > 1
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)  # feel free to change n_trials

best_trial = study.best_trial

print("Best trial:")
print(" Value: ", best_trial.value)
print(" Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Fine-tuning the cross classification-regression model

In [2]:
import pandas as pd
import numpy as np
np.random.seed(42)

# prepare the df
df = pd.read_csv("training.csv")

# typical row:
# "@PharmaNews: Pfizer faces backlash over possible closure of regional office. #PharmaNews #RegionalOffice",0.000000,0.000000,0.000000,-0.029512,0.000000


In [3]:
import torch
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-tiny')

# Define the stocks and sentiments
stocks = ['None', 'NVDA', 'ING', 'SAN', 'PFE', 'CSCO']
# None can only have None sentiment
sentiments = ['None', '_pos', '_neg']
# The model uses two classification heads (6 stock classes incl. 'None', 3 sentiment classes incl. 'None') and 1 regression head for the log return

# df has ["SocialMediaFeed", 'NVDA', 'ING', 'SAN', 'PFE', 'CSCO'] columns
X_texts = df['SocialMediaFeed'].tolist()

# Calculate which stock is affected by the news and get the log return value.
# Only the first stock (in `stocks` order) with a non-zero value is used per row.
y_stock = []
y_log_return = []
for _, row in df.iterrows():
    affected_stock = "None"
    log_return_val = 0.0
    for stock in stocks[1:]:
        if row[stock] != 0:
            affected_stock = stock
            log_return_val = row[stock]
            break
    y_stock.append(affected_stock)
    y_log_return.append(log_return_val)

y_stock = [stocks.index(s) for s in y_stock]

# Explicit random_state: the split (and therefore the fitted scaler) no longer
# depends on the global NumPy seed or on the order in which cells were run.
X_train_texts, X_val_texts, y_train_stock, y_val_stock, y_train_val, y_val_val = train_test_split(
    X_texts, y_stock, y_log_return, test_size=.2, random_state=42
)

# Scale regression target (log returns) with StandardScaler.
# The scaler is fitted on the training split only and reused for validation.
scaler = StandardScaler()
y_train_val_scaled = scaler.fit_transform(np.array(y_train_val).reshape(-1, 1)).flatten().astype('float32')
y_val_val_scaled = scaler.transform(np.array(y_val_val).reshape(-1, 1)).flatten().astype('float32')

# Set up labels for sentiment analysis (derived from the unscaled log returns)
train_sentiments = [sentiments.index("_pos") if val > 0.000001 else sentiments.index("_neg") if val < -0.000001 else sentiments.index("None") for val in y_train_val]
val_sentiments = [sentiments.index("_pos") if val > 0.000001 else sentiments.index("_neg") if val < -0.000001 else sentiments.index("None") for val in y_val_val]

# Next, tokenize the texts. An explicit max_length is required for
# truncation=True to take effect; without it the tokenizer warns and does not
# truncate at all.
max_token_length = 128
train_encodings = tokenizer(X_train_texts, truncation=True, padding=True, max_length=max_token_length)
val_encodings = tokenizer(X_val_texts, truncation=True, padding=True, max_length=max_token_length)

# Define the custom dataset
class CustomDataset(torch.utils.data.Dataset):
    """Pre-tokenized dataset carrying three targets per example.

    All encodings and targets are converted to tensors once at construction.
    An item is the dict of encoding tensors for one row plus ``stock_labels``,
    ``sentiment_labels`` and ``values`` for the same row.
    """

    def __init__(self, encodings, stock_labels, sentiment_labels, values):
        self.encodings = {}
        for field, column in encodings.items():
            self.encodings[field] = torch.tensor(column)
        self.stock_labels = torch.tensor(stock_labels)
        self.sentiment_labels = torch.tensor(sentiment_labels)
        self.values = torch.tensor(values)

    def __getitem__(self, idx):
        item = {field: tensor[idx] for field, tensor in self.encodings.items()}
        item.update({
            "stock_labels": self.stock_labels[idx],
            "sentiment_labels": self.sentiment_labels[idx],
            "values": self.values[idx],
        })
        return item

    def __len__(self):
        # The stock labels define the dataset size.
        return len(self.stock_labels)

# Create the custom dataset
train_dataset = CustomDataset(train_encodings, y_train_stock, train_sentiments, y_train_val_scaled)
val_dataset = CustomDataset(val_encodings, y_val_stock, val_sentiments, y_val_val_scaled)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
# export scaler
import pickle

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [6]:
df

Unnamed: 0,SocialMediaFeed,NVDA,ING,SAN,PFE,CSCO
0,@PharmaNews: Pfizer faces backlash over possib...,0.000000,0.0,0.0,-0.029512,0.0
1,@BusinessReport: A recent study found that mos...,0.000000,0.0,0.0,0.000000,0.0
2,@HardwareHubs: NVIDIA's contributions to a maj...,0.026125,0.0,0.0,0.000000,0.0
3,@HealthWatch: Johnson & Johnson faces lawsuits...,0.000000,0.0,0.0,0.000000,0.0
4,@IndustryInsider: Magnificent Honary faces pro...,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...
1177,@ConsumerGuru: Walmart's new sustainable packa...,0.000000,0.0,0.0,0.000000,0.0
1178,"@FinanceFlash: Despite market fluctuation, tod...",0.000000,0.0,0.0,0.000000,0.0
1179,"@EcoWatch: Today, the World Ecology Forum anno...",0.000000,0.0,0.0,0.000000,0.0
1180,@StreamingTimes: Unconfirmed reports hint at a...,0.000000,0.0,0.0,0.000000,0.0


In [5]:
train_dataset

# typical output:
# {'input_ids': tensor([  101,  1030,  6887, 27292,  4887, 17299,  3686,  1024,  1052,  8873,
#           6290,  1005,  1055,  1053,  2475, 16565,  2453,  2991,  2917, 10908,
#           1012,  3422,  2041,  2005,  4518, 28892,  1012,  1001, 16565,  6279,
#          13701,  1001,  6887, 27292,  7231,  9333,   102,     0,     0,     0,
#              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
#              0,     0,     0,     0,     0]),
#  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#          0, 0, 0, 0, 0, 0, 0]),
#  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#          0, 0, 0, 0, 0, 0, 0]),
#  'stock_labels': tensor(0),
#  'sentiment_labels': tensor(0),
#  'values': tensor(-0.0054)}

<__main__.CustomDataset at 0x297bc1090>

In [30]:
# Goal: fine-tune a tinyBERT model with three output heads (6 + 3 + 1 = 10 outputs in total).
# The stock head has 6 outputs: one for each of the 5 stocks plus the None class.
# The sentiment head has 3 outputs, one per sentiment: None, positive, negative. None is only used for the None stock class.
# The train_dataset and val_dataset have 'stock_labels' and 'sentiment_labels' that take values between 0 and 5, and 0 and 2 respectively.
# The last head is a regression head with a single output that predicts the log return value. The train_dataset and val_dataset hold this target as 'values', standardized with the StandardScaler fitted on the training split.

In [31]:
from transformers import BertModel
import torch.nn as nn

class CustomBERTModel(nn.Module):
    """bert-tiny encoder with three linear heads on top of the pooled output.

    Heads:
        classifier_stock     -- 6 logits (stock classification)
        classifier_sentiment -- 3 logits (sentiment classification)
        regression           -- 1 value (regression target)
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('prajjwal1/bert-tiny')
        hidden_size = self.bert.config.hidden_size
        self.classifier_stock = nn.Linear(hidden_size, 6)      # stock classification head
        self.classifier_sentiment = nn.Linear(hidden_size, 3)  # sentiment classification head
        self.regression = nn.Linear(hidden_size, 1)            # regression head

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Run the encoder and all three heads.

        ``token_type_ids`` is optional so that tokenizer output produced
        without segment ids can be passed directly; it is forwarded to the
        encoder unchanged (``None`` included).

        Returns:
            ``(stock_logits, sentiment_logits, regression_values)``.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # Second element of the encoder output -- presumably the pooled
        # sequence representation (confirm against the BertModel docs).
        pooled_output = outputs[1]
        stock_labels = self.classifier_stock(pooled_output)
        sentiment_labels = self.classifier_sentiment(pooled_output)
        regression_values = self.regression(pooled_output)

        return stock_labels, sentiment_labels, regression_values

In [32]:
from torch.nn import CrossEntropyLoss, MSELoss

def train(dataloader, model, optimizer, grad_clip=None):
    """Run one training epoch and return the mean per-batch loss.

    The loss of a batch is the sum of the stock cross-entropy, the sentiment
    cross-entropy and the regression MSE.

    Args:
        dataloader: Iterable of batches (dicts of tensors) that supports
            ``len()``. Each batch holds the model inputs plus the keys
            'stock_labels', 'sentiment_labels' and 'values'.
        model: Module returning ``(stock_logits, sentiment_logits,
            regression_values)`` for the remaining batch entries.
        optimizer: Optimizer over the model's parameters.
        grad_clip: Optional maximum gradient norm. When given, gradients are
            clipped after ``backward()`` and before ``optimizer.step()``, the
            only point at which clipping influences the update.

    Returns:
        float: Sum of the batch losses divided by the number of batches.
    """
    model.train()
    # Use the device the model lives on instead of relying on a notebook-level
    # `device` variable that may not have been defined yet.
    model_device = next(model.parameters()).device

    stock_criterion = CrossEntropyLoss()
    sentiment_criterion = CrossEntropyLoss()
    value_criterion = MSELoss()

    running_loss = 0.0
    for batch in dataloader:
        # clear the gradients
        optimizer.zero_grad()

        # get the inputs and labels
        inputs = {key: val.to(model_device) for key, val in batch.items()}
        stock_labels = inputs.pop('stock_labels')
        sentiment_labels = inputs.pop('sentiment_labels')
        values = inputs.pop('values')

        # forward pass
        stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)

        # compute the loss
        batch_loss = (
            stock_criterion(stock_labels_pred.view(-1, 6), stock_labels.view(-1))
            + sentiment_criterion(sentiment_labels_pred.view(-1, 3), sentiment_labels.view(-1))
            + value_criterion(regression_values_pred.view(-1), values.view(-1))
        )

        # backward pass
        batch_loss.backward()

        if grad_clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # update the weights
        optimizer.step()

        # Accumulate as a Python float: the previous code overwrote the total
        # on every batch, so only the last batch contributed to the result.
        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def validate(dataloader, model):
    """Compute the mean per-batch loss on ``dataloader`` without updating the model.

    The loss of a batch is the sum of the stock cross-entropy, the sentiment
    cross-entropy and the regression MSE.

    Args:
        dataloader: Iterable of batches (dicts of tensors) that supports
            ``len()``; each batch holds the model inputs plus the keys
            'stock_labels', 'sentiment_labels' and 'values'.
        model: Module returning ``(stock_logits, sentiment_logits,
            regression_values)``.

    Returns:
        A 0-dim tensor: the summed batch losses divided by the number of batches.
    """
    model.eval()
    # Move every batch to the device the model lives on (as the training step
    # does); otherwise evaluation fails as soon as the model is not on the CPU.
    model_device = next(model.parameters()).device
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(model_device) for key, val in batch.items()}
            stock_labels = inputs.pop('stock_labels')
            sentiment_labels = inputs.pop('sentiment_labels')
            values = inputs.pop('values')

            stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)

            stock_loss = CrossEntropyLoss()(stock_labels_pred.view(-1, 6), stock_labels.view(-1))
            sentiment_loss = CrossEntropyLoss()(sentiment_labels_pred.view(-1, 3), sentiment_labels.view(-1))
            regression_loss = MSELoss()(regression_values_pred.view(-1), values.view(-1))

            total_loss += stock_loss + sentiment_loss + regression_loss

    return total_loss / len(dataloader)

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from torch.nn.utils import clip_grad_norm_
import copy

device = torch.device("cpu")

model = CustomBERTModel().to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

epochs = 100
grad_clip = 1.0
best_loss = None
best_model = None
patience = 3
no_improve = 0
early_stopping = False

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    train_loss = train(train_loader, model, optimizer)

    # Gradient clipping
    # NOTE(review): at this point train() has already applied all optimizer
    # steps of the epoch, so this call does not influence any update. Clipping
    # only takes effect between backward() and optimizer.step().
    clip_grad_norm_(model.parameters(), grad_clip)

    print(f'Train loss {train_loss}')

    # validate() is defined twice in this notebook: one version returns the
    # loss alone, the other returns (loss, predictions, labels). Accept both
    # and reduce the result to a plain float so that comparisons are
    # well-defined.
    val_result = validate(val_loader, model)
    val_loss = float(val_result[0] if isinstance(val_result, tuple) else val_result)
    print(f'Validation loss {val_loss}')

    # Save the model if validation loss decreases
    if best_loss is None or val_loss < best_loss:
        print(f'Validation loss decreased from {best_loss} to {val_loss}. Saving model...')
        best_loss = val_loss
        best_model = copy.deepcopy(model.state_dict())
        no_improve = 0
    else:
        no_improve += 1
    if no_improve >= patience:
        print(f'No improvement in validation loss for {patience} epochs. Stopping...')
        early_stopping = True
        break

# Restore the best weights whether or not early stopping triggered; the last
# epoch is not necessarily the best one.
if best_model is not None:
    model.load_state_dict(best_model)

print('Training complete')

# Validation

In [33]:
def validate(dataloader, model):
    """Evaluate the model and collect predictions alongside the true targets.

    Args:
        dataloader: Iterable of batches (dicts of tensors) that supports
            ``len()``; each batch holds the model inputs plus the keys
            'stock_labels', 'sentiment_labels' and 'values'.
        model: Module returning ``(stock_logits, sentiment_logits,
            regression_values)``.

    Returns:
        A 3-tuple:
        * mean per-batch loss (stock CE + sentiment CE + regression MSE) as a
          0-dim tensor;
        * ``(stock_outputs, sentiment_outputs, value_outputs)`` -- predicted
          class indices and predicted regression values, as flat lists;
        * ``(stock_labels, sentiment_labels, values)`` -- the matching true
          targets, as flat lists.
    """
    model.eval()
    # Use the device the model lives on rather than a notebook-level variable.
    model_device = next(model.parameters()).device
    total_loss = 0
    stock_outputs, sentiment_outputs, value_outputs = [], [], []
    stock_labels, sentiment_labels, values = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(model_device) for key, val in batch.items()}
            batch_stock_labels = inputs.pop('stock_labels')
            batch_sentiment_labels = inputs.pop('sentiment_labels')
            batch_values = inputs.pop('values')

            stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)

            stock_loss = CrossEntropyLoss()(stock_labels_pred.view(-1, 6), batch_stock_labels.view(-1))
            sentiment_loss = CrossEntropyLoss()(sentiment_labels_pred.view(-1, 3), batch_sentiment_labels.view(-1))
            regression_loss = MSELoss()(regression_values_pred.view(-1), batch_values.view(-1))

            total_loss += stock_loss + sentiment_loss + regression_loss

            # store predictions and true labels for metric computation
            stock_outputs.extend(torch.argmax(stock_labels_pred, dim=1).tolist())
            sentiment_outputs.extend(torch.argmax(sentiment_labels_pred, dim=1).tolist())
            # Flatten before storing: the regression head emits one column per
            # sample, and keeping that extra dimension would make the
            # predictions a list of one-element lists that no longer lines up
            # with the flat list of true values.
            value_outputs.extend(regression_values_pred.view(-1).tolist())

            stock_labels.extend(batch_stock_labels.tolist())
            sentiment_labels.extend(batch_sentiment_labels.tolist())
            values.extend(batch_values.tolist())

    return total_loss / len(dataloader), (stock_outputs, sentiment_outputs, value_outputs), (stock_labels, sentiment_labels, values)

In [34]:
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

val_loss, (stock_outputs, sentiment_outputs, value_outputs), (stock_labels, sentiment_labels, values) = validate(val_loader, model)
print(f'Validation loss {val_loss}')

# compute confusion matrix, F1 score, etc
stock_cm = confusion_matrix(stock_labels, stock_outputs)
sentiment_cm = confusion_matrix(sentiment_labels, sentiment_outputs)

stock_f1 = f1_score(stock_labels, stock_outputs, average="weighted")
sentiment_f1 = f1_score(sentiment_labels, sentiment_outputs, average="weighted")

print('Stock Classification F1: ', stock_f1, '\nConfusion Matrix:\n', stock_cm)
print('Sentiment Classification F1: ', sentiment_f1, '\nConfusion Matrix:\n', sentiment_cm)

# for regression, we use root mean squared error (RMSE) instead of F1, etc.
# Both sides are flattened to 1-D first: if the predictions arrive as one
# single-element list per sample, MSELoss would otherwise broadcast (N,)
# against (N, 1) and average over an N x N grid of mismatched pairs.
# The targets are the scaled 'values', so the RMSE is in scaled units.
value_targets = torch.tensor(values, dtype=torch.float32).view(-1)
value_predictions = torch.tensor(value_outputs, dtype=torch.float32).view(-1)
value_rmse = torch.sqrt(MSELoss()(value_predictions, value_targets))
print('Regression RMSE: ', value_rmse.item())

NameError: name 'val_loader' is not defined

In [None]:
# Inference

def predict(model, dataset_item, scaler):
    """Predict stock class, sentiment class and unscaled regression value for one example.

    Args:
        model: Module returning ``(stock_logits, sentiment_logits,
            regression_values)``.
        dataset_item: Dict of tensors for a single example. Accepts both a
            dataset item (1-D input tensors, optionally with the target keys
            'stock_labels', 'sentiment_labels', 'values') and tokenizer-style
            input that already has a leading batch dimension and no targets.
        scaler: Object whose ``inverse_transform`` maps the model's regression
            output back to the original scale.

    Returns:
        ``(stock_label, sentiment_label, regression_value)`` -- the two argmax
        class indices and the inverse-transformed regression value.
    """
    target_keys = ('stock_labels', 'sentiment_labels', 'values')

    model.eval()
    # Use the device the model lives on rather than a notebook-level variable.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        inputs = {}
        for key, val in dataset_item.items():
            # Targets are not model inputs; unlabeled input simply has none.
            if key in target_keys:
                continue
            # Add the batch dimension only when it is missing; unsqueezing an
            # already batched tensor would produce an invalid 3-D input.
            batched = val.unsqueeze(0) if val.dim() == 1 else val
            inputs[key] = batched.to(model_device)

        stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)

        stock_label = torch.argmax(stock_labels_pred, dim=1).item()
        sentiment_label = torch.argmax(sentiment_labels_pred, dim=1).item()
        regression_value = scaler.inverse_transform(regression_values_pred.cpu().numpy())  # inverse transform of scaling

    return stock_label, sentiment_label, regression_value[0][0]  # return the single value from the 2D array

In [None]:
# reality check

for i in range(100):
    stock_label, sentiment_label, value = predict(model, train_dataset[i], scaler)
    print(f'Example {i}:')
    print(f'Predicted stock_class: {stock_label}, sentiment_class: {sentiment_label}, value: {value}')
    print(f'True stock_class: {train_dataset[i]["stock_labels"].item()}, sentiment_class: {train_dataset[i]["sentiment_labels"].item()}, value: {scaler.inverse_transform([[train_dataset[i]["values"].item()]])[0][0]}')
    print("---")

In [None]:
# validation check

for i in range(100):
    stock_label, sentiment_label, value = predict(model, val_dataset[i], scaler)
    print(f'Example {i}:')
    print(f'Predicted stock_class: {stock_label}, sentiment_class: {sentiment_label}, value: {value}')
    print(f'True stock_class: {val_dataset[i]["stock_labels"].item()}, sentiment_class: {val_dataset[i]["sentiment_labels"].item()}, value: {scaler.inverse_transform([[val_dataset[i]["values"].item()]])[0][0]}')
    print("---")

## Saving the best Model

In [25]:
train_dataset[0]

{'input_ids': tensor([  101,  1030,  6887, 27292,  4887, 17299,  3686,  1024,  1052,  8873,
          6290,  1005,  1055,  1053,  2475, 16565,  2453,  2991,  2917, 10908,
          1012,  3422,  2041,  2005,  4518, 28892,  1012,  1001, 16565,  6279,
         13701,  1001,  6887, 27292,  7231,  9333,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]),
 'stock_labels': tensor(0),
 'sentiment_labels': tensor(0),
 'values': tensor(-0.0054)}

In [23]:
# Inference

def predict(model, dataset_item, scaler):
    """Predict stock class, sentiment class and unscaled regression value for one example.

    Args:
        model: Module returning ``(stock_logits, sentiment_logits,
            regression_values)``.
        dataset_item: Dict of tensors for a single example. Accepts both a
            dataset item (1-D input tensors, optionally with the target keys
            'stock_labels', 'sentiment_labels', 'values') and tokenizer-style
            input that already has a leading batch dimension and no targets.
        scaler: Object whose ``inverse_transform`` maps the model's regression
            output back to the original scale.

    Returns:
        ``(stock_label, sentiment_label, regression_value)`` -- the two argmax
        class indices and the inverse-transformed regression value.
    """
    target_keys = ('stock_labels', 'sentiment_labels', 'values')

    model.eval()
    # Use the device the model lives on rather than a notebook-level variable.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        inputs = {}
        for key, val in dataset_item.items():
            # Targets are not model inputs; unlabeled input simply has none.
            if key in target_keys:
                continue
            # Add the batch dimension only when it is missing; unsqueezing an
            # already batched tensor would produce an invalid 3-D input.
            batched = val.unsqueeze(0) if val.dim() == 1 else val
            inputs[key] = batched.to(model_device)

        stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)

        stock_label = torch.argmax(stock_labels_pred, dim=1).item()
        sentiment_label = torch.argmax(sentiment_labels_pred, dim=1).item()
        regression_value = scaler.inverse_transform(regression_values_pred.cpu().numpy())  # inverse transform of scaling

    return stock_label, sentiment_label, regression_value[0][0]  # return the single value from the 2D array

In [None]:
def prepare_input(text, tokenizer):
    """
    Tokenize one raw string and package the result as the input dictionary for the model.

    Arguments:
    text -- string, raw text to encode
    tokenizer -- initialized tokenizer exposing `encode_plus`

    Returns:
    input_dict -- dictionary with 'input_ids' and 'attention_mask' taken from the
                  tokenizer output, and 'token_type_ids' set to an all-zero long
                  tensor of the same shape as 'input_ids'
    """
    encoded = tokenizer.encode_plus(text, truncation=True, padding=True, return_tensors='pt')
    token_ids = encoded['input_ids']
    mask = encoded['attention_mask']

    # Segment ids are built here as zeros rather than read from the tokenizer output.
    segment_ids = torch.zeros(token_ids.shape, dtype=torch.long)

    return {
        'input_ids': token_ids,
        'token_type_ids': segment_ids,
        'attention_mask': mask,
    }

In [None]:
deployed_tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-tiny')
res = prepare_input("This is a new tweet from @PharmaNews", deployed_tokenizer)

In [None]:
torch.save(model.state_dict(), "best_model.pt")

In [None]:
dummy_model = CustomBERTModel()
dummy_model.load_state_dict(torch.load("best_model.pt"))
dummy_model = dummy_model.to("cpu")

In [None]:
# predict res using dummy model
device = "cpu"
stock_label, sentiment_label, value = predict(dummy_model, res, scaler)

In [24]:
train_dataset[0]

{'input_ids': tensor([  101,  1030,  6887, 27292,  4887, 17299,  3686,  1024,  1052,  8873,
          6290,  1005,  1055,  1053,  2475, 16565,  2453,  2991,  2917, 10908,
          1012,  3422,  2041,  2005,  4518, 28892,  1012,  1001, 16565,  6279,
         13701,  1001,  6887, 27292,  7231,  9333,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]),
 'stock_labels': tensor(0),
 'sentiment_labels': tensor(0),
 'values': tensor(-0.0054)}

In [None]:
for i in range(100):
    stock_label, sentiment_label, value = predict(dummy_model, train_dataset[i], scaler)
    print(f'Example {i}:')
    print(f'Predicted stock_class: {stock_label}, sentiment_class: {sentiment_label}, value: {value}')
    print(f'True stock_class: {train_dataset[i]["stock_labels"].item()}, sentiment_class: {train_dataset[i]["sentiment_labels"].item()}, value: {scaler.inverse_transform([[train_dataset[i]["values"].item()]])[0][0]}')
    print("---")

In [None]:
# confusion matrix of the dummy model
val_loss, (stock_outputs, sentiment_outputs, value_outputs), (stock_labels, sentiment_labels, values) = validate(val_loader, dummy_model)

stock_cm = confusion_matrix(stock_labels, stock_outputs)
sentiment_cm = confusion_matrix(sentiment_labels, sentiment_outputs)

stock_f1 = f1_score(stock_labels, stock_outputs, average="weighted")
sentiment_f1 = f1_score(sentiment_labels, sentiment_outputs, average="weighted")

print('Stock Classification F1: ', stock_f1, '\nConfusion Matrix:\n', stock_cm)
print('Sentiment Classification F1: ', sentiment_f1, '\nConfusion Matrix:\n', sentiment_cm)

In [1]:
train_dataset[0]

NameError: name 'train_dataset' is not defined

# Old Inference Testing

In [None]:
# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-tiny')

# Use the first row of the data as the example headline
headline = df['SocialMediaFeed'].iloc[0]

# Tokenize the headline. token_type_ids are kept because the model's forward
# pass takes them as an argument.
inputs = tokenizer(headline, truncation=True, padding=True, return_tensors="pt").to(device)

# Pass through the model. It returns three outputs (stock logits, sentiment
# logits, regression value), so all three must be unpacked.
model.eval()
with torch.no_grad():
    outputs_cls, outputs_sentiment, outputs_reg = model(**inputs)

# Get the predicted stock
predicted_stock = stocks[torch.argmax(outputs_cls).item()]
predicted_stock_probability = nn.functional.softmax(outputs_cls, dim=1).max().item()  # Softmax to get probabilities

# Get the predicted sentiment
predicted_sentiment = sentiments[torch.argmax(outputs_sentiment).item()]

# Get the predicted log return value. The regression head is trained on
# scaled targets, so map the output back to the original scale first.
predicted_log_return = scaler.inverse_transform(outputs_reg.cpu().numpy())[0][0]

print(f"Predicted Stock: {predicted_stock}")
print(f"Predicted Stock Probability: {predicted_stock_probability}")
print(f"Predicted Sentiment: {predicted_sentiment}")
print(f"Predicted Log Return Value: {predicted_log_return}")

# True Values
# If the column for the given row is non-zero, then it is the true stock
# If all columns are zero, then the true stock is 'None'
true_stock = stocks[0]
true_log_return = 0.0
for stock in stocks[1:]:
    if df[stock].iloc[0] != 0:
        true_stock = stock
        true_log_return = df[stock].iloc[0]
        break

print(f"True Stock: {true_stock}")
print(f"True Log Return Value: {true_log_return}")

In [None]:
df

In [None]:
df['SocialMediaFeed'].iloc[0]