In [14]:
import pandas as pd

import torch 
from transformers import LongformerTokenizer

from torch.utils.data import Dataset,DataLoader

In [4]:
# Load the data frame produced by the previous preprocessing step; later cells
# read its 'tokens' and 'aligned_tags' columns.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
PREPROCESSED_PICKLE = '../data/preprocess_step2.pkl'
tokenized_df = pd.read_pickle(PREPROCESSED_PICKLE)

In [11]:
# Longest token list in the corpus; its length is reused later to size padding.
longest_list = max(tokenized_df['tokens'], key=len)
print("Number of token:", len(longest_list))

# Flatten the per-essay tag lists into one list, then reduce it to the
# distinct tag names that occur in the data.
all_labels = []
for essay_tags in tokenized_df['aligned_tags']:
    all_labels.extend(essay_tags)
unique_labels = set(all_labels)
print(unique_labels)

Number of token: 1764
{'Claim', 'Evidence', 'Concluding Statement', 'Rebuttal', 'Position', 'O', 'Counterclaim', 'Lead'}


In [21]:
# Encoding Text and Labels

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

# Tag vocabulary: a tag's position in this list is its integer class id.
# 'Padding' is an extra class reserved for padded positions.
labels = ['O', 'Claim', 'Evidence', 'Concluding Statement', 'Rebuttal', 'Position','Counterclaim', 'Lead', 'Padding']

# Map every essay's token strings to vocabulary ids.
# NOTE(review): this converts the stored tokens as-is; presumably any special
# tokens the model needs are already part of 'tokens' - confirm upstream.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_df['tokens']))

# Map every essay's tag strings to their class ids (ValueError on unknown tags).
input_labels = [list(map(labels.index, tags)) for tags in tokenized_df['aligned_tags']]


In [22]:
# Common length every sequence is padded/truncated to: the longest essay plus
# a small fixed margin.
# NOTE(review): the reason for the 5 extra positions is not visible here.
LENGTH_MARGIN = 5
MAX_LENGTH = len(longest_list) + LENGTH_MARGIN

def pad_sequences(sequences, max_len, padding_value=0):
    """Bring every sequence to exactly ``max_len`` items.

    Sequences shorter than ``max_len`` are right-padded with
    ``padding_value``; all others are cut down to their first ``max_len``
    items. The inputs are not modified: each result is a new object.

    Args:
        sequences: iterable of lists (each must support ``+`` with a list
            and slicing).
        max_len: target length of every returned sequence.
        padding_value: filler appended to short sequences (default 0).

    Returns:
        A list with one padded or truncated sequence per input sequence.
    """
    padded = []
    for seq in sequences:
        shortfall = max_len - len(seq)
        if shortfall > 0:
            # Too short: append as many fillers as are missing.
            padded.append(seq + [padding_value] * shortfall)
        else:
            # Long enough: keep only the leading max_len items.
            padded.append(seq[:max_len])
    return padded

# Pad (or truncate) every essay's token ids to MAX_LENGTH so they can be batched.
padded_input_ids = pad_sequences(input_ids, MAX_LENGTH, padding_value=tokenizer.pad_token_id)

# Pad the per-token label ids. The source must be `input_labels` (the tag ids),
# not the token ids; padded positions get the dedicated 'Padding' class id so
# each label sequence stays aligned with its padded token sequence.
padded_input_labels = pad_sequences(input_labels, MAX_LENGTH, padding_value=labels.index('Padding'))

# 1.0 where the position holds a real token, 0.0 where it holds the pad token id.
attention_masks = [[float(token_id != tokenizer.pad_token_id) for token_id in seq] for seq in padded_input_ids]


In [None]:
class EssayDataset(Dataset):
    """Map-style dataset serving one essay's model inputs per index.

    Holds three parallel, index-aligned collections and converts the selected
    rows to ``torch.long`` tensors on access.
    """

    def __init__(self, input_ids, attention_masks, labels):
        """Store the parallel collections without copying them.

        Args:
            input_ids: per-essay sequences of token ids.
            attention_masks: per-essay sequences of mask values.
            labels: per-essay sequences of label ids.
        """
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        """Return the number of essays (taken from ``input_ids``)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return a dict of long tensors for the essay at position ``idx``.

        Keys are 'input_ids', 'attention_mask' and 'labels'.
        """
        # Output key -> backing collection; insertion order fixes the key order.
        sources = {
            'input_ids': self.input_ids,
            'attention_mask': self.attention_masks,
            'labels': self.labels,
        }
        return {
            key: torch.tensor(rows[idx], dtype=torch.long)
            for key, rows in sources.items()
        }
    

# Wrap the padded token ids, attention masks and padded labels in a dataset.
ds = EssayDataset(
    input_ids=padded_input_ids,
    attention_masks=attention_masks,
    labels=padded_input_labels,
)

In [None]:
# Number of essays per mini-batch.
# NOTE(review): every item is MAX_LENGTH tokens long, so 256 per batch may be
# heavy on memory - confirm this fits the target device.
batch_size = 256

# Iterate over the dataset in shuffled mini-batches.
data_loader = DataLoader(dataset=ds, batch_size=batch_size, shuffle=True)

In [None]:
# Choose the compute device: use Apple's MPS backend when it is available,
# otherwise explain why it is not and fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    if torch.backends.mps.is_built():
        # PyTorch supports MPS, but the OS or hardware does not.
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    else:
        # This PyTorch build was compiled without MPS support.
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    device = torch.device("cpu")