In [1]:
import pandas as pd

import torch 
from transformers import LongformerTokenizer
from transformers import LongformerForTokenClassification

from torch.utils.data import Dataset,DataLoader

from tqdm import tqdm

In [2]:
tokenized_df = pd.read_pickle('../data/preprocess_step2.pkl')

In [3]:
longest_list = max(tokenized_df['tokens'], key=len)
print("Number of token:", len(longest_list))


all_labels = [item for sublist in tokenized_df['aligned_tags'] for item in sublist]
unique_labels = set(all_labels)  
print(unique_labels)

Number of token: 1764
{'Claim', 'Rebuttal', 'Counterclaim', 'Evidence', 'Position', 'O', 'Concluding Statement', 'Lead'}


In [4]:
# Encoding Text and Labels

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

labels = ['O', 'Claim', 'Evidence', 'Concluding Statement', 'Rebuttal', 'Position','Counterclaim', 'Lead', 'Padding']

input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_df['tokens']]
input_labels = [[labels.index(l) for l in tags] for tags in tokenized_df['aligned_tags']]


In [5]:
MAX_LENGTH = len(longest_list) + 5

def pad_sequences(sequences, max_len, padding_value=0):
    return [seq + [padding_value] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in sequences]

padded_input_ids = pad_sequences(input_ids, MAX_LENGTH, padding_value=tokenizer.pad_token_id)

padded_input_labels = pad_sequences(input_ids, MAX_LENGTH, padding_value=labels.index('Padding'))

attention_masks = [[float(token_id != tokenizer.pad_token_id) for token_id in seq] for seq in padded_input_ids]


In [6]:
class EssayDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.input_ids)
    

ds = EssayDataset(padded_input_ids[:100], attention_masks[:100], padded_input_labels[:100])

In [7]:
batch_size = 16

data_loader = DataLoader(ds, batch_size=batch_size, shuffle=True)

In [8]:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    device = torch.device("cpu")

else:
    device = torch.device("mps")

In [9]:
from transformers import LongformerForTokenClassification

num_labels = len(unique_labels)

model = LongformerForTokenClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=num_labels
)

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
torch.cuda.empty_cache()

In [11]:
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Learning rate scheduler (optional, you can also define your own scheduler)
scheduler = StepLR(optimizer, step_size=1000, gamma=0.1)

# Move the model to GPU if available
model.to(device)

# Training loop
for epoch in tqdm(range(10)):
    model.train()
    total_loss = 0

    for batch in data_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    print(f"Epoch {epoch}, Loss: {total_loss / len(data_loader)}")


  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
  0%|          | 0/10 [00:08<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 13.74 GB, other allocations: 3.64 GB, max allowed: 18.13 GB). Tried to allocate 1.32 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).