<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/modelling/Hierarchical_BERT_model_RB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

If you don't have a trained model, you need to train the Hierarchical BERT model from scratch on your dataset.

# **Step 1: Prepare the Dataset**

We assume you have a dataset with long texts and labels (e.g., a CSV file with text and label columns).

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader

class LongTextDataset(Dataset):
    """Map-style dataset that serves each long document as padded chunks.

    The ``text`` and ``label`` columns of ``df`` are copied into Python lists
    at construction time. Chunking and padding are delegated to the
    ``chunk_text`` and ``pad_chunks`` helpers, which are defined outside this
    class.

    Args:
        df: Table exposing ``text`` and ``label`` columns.
        tokenizer: Tokenizer forwarded unchanged to ``chunk_text``.
        max_length: Forwarded to both ``chunk_text`` and ``pad_chunks``.
        stride: Forwarded to ``chunk_text``.
    """

    def __init__(self, df, tokenizer, max_length=512, stride=256):
        text_column, label_column = df["text"], df["label"]
        self.texts = text_column.tolist()
        self.labels = label_column.tolist()
        self.tokenizer = tokenizer
        self.max_length, self.stride = max_length, stride

    def __len__(self):
        """Return the number of documents held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the chunked inputs and the label for the document at ``idx``.

        The result is a dict with keys ``input_ids`` and ``attention_mask``
        (the two values returned by ``pad_chunks``) and ``label`` (a
        ``torch.long`` tensor built from the stored label).
        """
        document, target = self.texts[idx], self.labels[idx]

        # Split the document into pieces, then pad them out to max_length.
        pieces = chunk_text(document, self.tokenizer, self.max_length, self.stride)
        chunk_ids, chunk_masks = pad_chunks(pieces, self.max_length)

        sample = {"input_ids": chunk_ids, "attention_mask": chunk_masks}
        sample["label"] = torch.tensor(target, dtype=torch.long)
        return sample

# Load dataset
DATASET_PATH = "your_dataset.csv"  # CSV with one document per row
REQUIRED_COLUMNS = ("text", "label")

df = pd.read_csv(DATASET_PATH)

# Fail fast with a readable message; otherwise a missing column only surfaces
# as a bare KeyError inside LongTextDataset.__init__.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{DATASET_PATH} is missing required column(s): {missing_columns}"
    )

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_dataset = LongTextDataset(df, tokenizer)
# NOTE(review): DataLoader's default collation stacks the per-item values, which
# requires equal shapes within a batch. If pad_chunks yields a different number
# of chunks per document, batch_size > 1 will need a custom collate_fn - confirm.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)


# **Step 2: Define Training Functions**

In [None]:
import torch.optim as optim

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Initialize model
model = HierarchicalBERT(num_labels=3)  # Adjust `num_labels` based on your dataset
model = model.to(device)

# AdamW over all model parameters, with cross-entropy as the training loss.
trainable_params = model.parameters()
optimizer = optim.AdamW(trainable_params, lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Training loop
def train_model(model, train_loader, epochs=3, *, opt=None, criterion=None, run_device=None):
    """Train ``model`` in place and print the mean loss after every epoch.

    Args:
        model: Callable as ``model(input_ids, attention_mask)``; its output is
            passed to the loss together with the batch labels.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys ``input_ids``, ``attention_mask`` and ``label`` whose values
            support ``.to(device)``.
        epochs: Number of full passes over ``train_loader``.
        opt: Optimizer to step. Defaults to the module-level ``optimizer``.
        criterion: Loss callable ``criterion(outputs, labels)``. Defaults to
            the module-level ``loss_fn``.
        run_device: Device the batch tensors are moved to. Defaults to the
            module-level ``device``.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    opt = optimizer if opt is None else opt
    criterion = loss_fn if criterion is None else criterion
    run_device = device if run_device is None else run_device

    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches instead of calling len(): this also works for loaders
        # without __len__ and lets an empty loader be detected.
        num_batches = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(run_device)
            attention_mask = batch["attention_mask"].to(run_device)
            labels = batch["label"].to(run_device)

            opt.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            opt.step()

            total_loss += loss.item()
            num_batches += 1

        # An empty loader used to raise ZeroDivisionError; report nan instead.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

# Train the model
# Runs three epochs over train_loader, updating `model` in place and printing
# the average loss after each epoch.
train_model(model, train_loader, epochs=3)

# **Step 3: Save and Load the Trained Model**

In [None]:
# Save trained model
checkpoint_path = "hierarchical_bert.pth"  # single source for the save and the load
torch.save(model.state_dict(), checkpoint_path)

# Load model later
model = HierarchicalBERT(num_labels=3)  # must match the num_labels used for training
# The freshly built model lives on the CPU, so remap the saved tensors there.
# Without map_location, a checkpoint saved from a GPU fails to load on a
# CPU-only machine. Move the model with .to(device) afterwards if needed.
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
model.eval()

Now, you have a trained Hierarchical BERT model that can classify topics for long texts!