In [2]:
from datasets import Dataset
from pathlib import Path
import os
import numpy as np
import re
from tqdm import tqdm
import pickle
import nltk
from nltk.data import load as nltk_load

  from .autonotebook import tqdm as notebook_tqdm


Create a huggingface Dataset from the stored sentence and pos tag data.

In [3]:
class TQDMBytesReader(object):
    # For showing the progress bar while reading the stored pickle file.
    def __init__(self, fd, **kwargs):
        self.fd = fd
        from tqdm import tqdm
        self.tqdm = tqdm(**kwargs)

    def read(self, size=-1):
        bytes = self.fd.read(size)
        self.tqdm.update(len(bytes))
        return bytes

    def readline(self):
        bytes = self.fd.readline()
        self.tqdm.update(len(bytes))
        return bytes

    def __enter__(self):
        self.tqdm.__enter__()
        return self

    def __exit__(self, *args, **kwargs):
        return self.tqdm.__exit__(*args, **kwargs)


In [4]:
def load_data(file_name):
    # if os.path.exists(f"/home/rsaha/projects/babylm/src/taggers/processed_tagger_data/processed_pos_training_data_{file_name}.pkl"):
    if os.path.exists(f"/home/rsaha/projects/babylm/src/taggers/data/{file_name}.pkl"):
        print("Loading data from file ...")
        with open(f"/home/rsaha/projects/babylm/src/taggers/processed_tagger_data/processed_pos_training_data_{file_name}.pkl", "rb") as f:
            total = os.path.getsize(f"/home/rsaha/projects/babylm/src/taggers/processed_tagger_data/processed_pos_training_data_{file_name}.pkl")
            with TQDMBytesReader(f, total=total) as pbfd:
                up = pickle.Unpickler(pbfd)
                X_data, y_data = up.load()
            return X_data, y_data

In [5]:
def extract_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'is_capitalized':sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
  }

In [6]:
data_dir = Path("/home/rsaha/projects/babylm/src/taggers/data/")
paths = [str(f) for f in data_dir.glob("*") if f.is_file() and not f.name.endswith(".DS_Store") and f.suffix in [".pkl"]]
print("Paths: ", paths)

file_names = []
# Only select the cc_3m and local_narr files and store it in filtered_paths.
# filtered_paths = paths #[]
# # for path in paths:
# #     if "cc_3M" in path or "local_narr" in path:
# #         filtered_paths.append(path)

for path in tqdm(paths, desc="Paths"):

    file_name = Path(path).name
    # Drop the .train extension
    file_name = file_name.split(".")[0]
    file_names.append(file_name)

Paths:  ['/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_switchboard.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_open_subtitles.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_childes.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_simple_wiki.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_bnc_spoken.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_local_narr_captions.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_gutenberg.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_cc_3M_captions_reduced.pkl']


Paths: 100%|██████████| 8/8 [00:00<00:00, 39429.41it/s]


In [8]:
# Load the data from each file_name in file_names.
all_X_data = []
all_y_data = []
for file_name in tqdm(file_names):
    print(file_name)
    if file_name == "pos_tagging_dataset_all_sentences_open_subtitles":
        data = pickle.load(open(f"/home/rsaha/projects/babylm/src/taggers/data/{file_name}.pkl", "rb"))
        # all_X_data.extend(X_data)
        # all_y_data.extend(y_data)


  0%|          | 0/8 [00:00<?, ?it/s]

pos_tagging_dataset_all_sentences_switchboard
pos_tagging_dataset_all_sentences_open_subtitles


100%|██████████| 8/8 [00:13<00:00,  1.67s/it]

pos_tagging_dataset_all_sentences_childes
pos_tagging_dataset_all_sentences_simple_wiki
pos_tagging_dataset_all_sentences_bnc_spoken
pos_tagging_dataset_all_sentences_local_narr_captions
pos_tagging_dataset_all_sentences_gutenberg
pos_tagging_dataset_all_sentences_cc_3M_captions_reduced





In [13]:
# Create a dictionary of numbers where each tag (the second element in the tuple) is assigned a unique number. This will be the class labels.
tagdict = nltk_load('help/tagsets/upenn_tagset.pickle')
label_names = {t: i for i, t in enumerate(tagdict.keys())}
label_names['#'] = len(label_names)

In [14]:
# Remove all the empty lists from data.
data = [d for d in data if d [0]!= []]

In [99]:
# Now remove all the hastags from each example of the data.
data_no_hash = []
data_no_hash = [d for d in data_no_hash if d [0]!= []]

In [None]:
data_no_hash

In [101]:
# Data has many tuples and each tuple has two lists. The first list is the list of words and the second list is the list of tags.
# Create a third separate list of lists where each list contains the number from the label_names dictionary based on the key tag.
# This will be the class labels.
tag_to_class_mapping_for_data_no_hash = []
for i in tqdm(range(len(data))):
    tag_to_class_mapping_for_data_no_hash.append([label_names[tag] for tag in data_no_hash[i][1]])

100%|██████████| 1732818/1732818 [00:02<00:00, 670467.92it/s]


In [18]:
# Data has many tuples and each tuple has two lists. The first list is the list of words and the second list is the list of tags.
# Create a third separate list of lists where each list contains the number from the label_names dictionary based on the key tag.
# This will be the class labels.
tag_to_class_mapping_for_data = []
for i in tqdm(range(len(data))):
    tag_to_class_mapping_for_data.append([label_names[tag] for tag in data[i][1]])

100%|██████████| 1732818/1732818 [00:07<00:00, 230371.83it/s]


In [102]:
# First create a dataframe from the sentence, tags, and class labels.
# NOTE: Each example in the data variable has two lists. The first list is the list of words and the second list is the list of tags.

import pandas as pd
df_no_hash = pd.DataFrame(data_no_hash, columns=["sentence", "tags"])
df_no_hash["class_labels"] = tag_to_class_mapping_for_data_no_hash


In [103]:
# Create train and validation splits using train_test_split.
from sklearn.model_selection import train_test_split
train_df_no_hash, val_df_no_hash = train_test_split(df_no_hash, test_size=0.2, random_state=42)

In [106]:
from datasets import Dataset
df_dataset_train_no_hash = Dataset.from_pandas(train_df_no_hash)
df_dataset_val_no_hash = Dataset.from_pandas(val_df_no_hash)


In [107]:
test_df_no_hash = df_dataset_train_no_hash.select(range(1319,1321))
test_df_tokenized_train_no_hash = test_df_no_hash.map(tokenize_and_align_labels, batched=True, remove_columns=test_df_no_hash.column_names)
print("Test DF Tokenized Train: ", test_df_tokenized_train_no_hash['labels'])

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Example sentence:   [['Then', 'her', 'arm', '.', '.', '.'], ['#', 'Good', '-', 'bye', ',', 'I', 'showed', 'you', 'a', 'real', 'good', 'time', '#']]
Tokens:  ['▁then', '▁her', '▁arm', '▁', '.', '▁', '.', '▁', '.', '</s>']
Tokenized Inputs:  {'input_ids': [[98, 67, 903, 4, 5, 4, 5, 4, 5, 1], [4, 899, 123, 4, 26, 735, 4, 8, 4, 13, 1387, 15, 4, 12, 490, 123, 135, 4, 899, 1]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
Examples:  {'sentence': [['Then', 'her', 'arm', '.', '.', '.'], ['#', 'Good', '-', 'bye', ',', 'I', 'showed', 'you', 'a', 'real', 'good', 'time', '#']], 'tags': [['RB', 'PRP$', 'NN', '.', '.', '.'], ['NNP', ':', 'NN', ',', 'PRP', 'VBD', 'PRP', 'DT', 'JJ', 'JJ', 'NN']], 'class_labels': [[25, 17, 11, 21, 21, 21], [38, 14, 11, 22, 13, 28, 13, 12, 7, 7, 11]], '__index_level_0__': [100967, 811154]}





IndexError: list index out of range

In [97]:
# Load the tokenizer.
from transformers import BertTokenizer, PreTrainedTokenizerFast
model_checkpoint = "bert-base-uncased"
tokenizer = PreTrainedTokenizerFast.from_pretrained('/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_git/')
# tokenizer = PreTrainedTokenizerFast.from_pretrained(model_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [98]:
tokenizer

PreTrainedTokenizerFast(name_or_path='/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_git/', vocab_size=32768, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'pad_token': '<pad>', 'additional_special_tokens': ['<image>', '<PERSON>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32768: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32769: AddedToken("<PERSON>", rstrip=False, lstrip=False, single_word=False,

In [82]:
# First create a dataframe from the sentence, tags, and class labels.
# NOTE: Each example in the data variable has two lists. The first list is the list of words and the second list is the list of tags.

import pandas as pd
df = pd.DataFrame(data, columns=["sentence", "tags"])
df["class_labels"] = tag_to_class_mapping_for_data


In [83]:
# Create train and validation splits using train_test_split.
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [84]:
from datasets import Dataset
df_dataset_train = Dataset.from_pandas(train_df)
df_dataset_val = Dataset.from_pandas(val_df)


In [85]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        print("Word ID: ", word_id)
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            print("Inside else")
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

In [95]:
def tokenize_and_align_labels(examples):
    print("Example sentence:  ", examples["sentence"])
    tokenized_inputs = tokenizer(
        examples["sentence"], truncation=True, is_split_into_words=True, max_length=50
    )
    print("Tokens: ", tokenized_inputs.tokens())
    print("Tokenized Inputs: ", tokenized_inputs)
    print("Examples: ", examples)
    all_labels = examples["class_labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        print("Word IDs: ", word_ids)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [96]:
test_df = df_dataset_train.select(range(1319,1321))
test_df_tokenized_train = test_df.map(tokenize_and_align_labels, batched=True, remove_columns=test_df.column_names)
print("Test DF Tokenized Train: ", test_df_tokenized_train['labels'])

Map: 100%|██████████| 2/2 [00:00<00:00, 226.38 examples/s]

Example sentence:   [['Then', 'her', 'arm', '.', '.', '.'], ['#', 'Good', '-', 'bye', ',', 'I', 'showed', 'you', 'a', 'real', 'good', 'time', '#']]
Tokens:  ['[CLS]', 'then', 'her', 'arm', '.', '.', '.', '[SEP]']
Tokenized Inputs:  {'input_ids': [[101, 2059, 2014, 2849, 1012, 1012, 1012, 102], [101, 1001, 2204, 1011, 9061, 1010, 1045, 3662, 2017, 1037, 2613, 2204, 2051, 1001, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
Examples:  {'sentence': [['Then', 'her', 'arm', '.', '.', '.'], ['#', 'Good', '-', 'bye', ',', 'I', 'showed', 'you', 'a', 'real', 'good', 'time', '#']], 'tags': [['RB', 'PRP$', 'NN', '.', '.', '.'], ['#', 'NNP', ':', 'NN', ',', 'PRP', 'VBD', 'PRP', 'DT', 'JJ', 'JJ', 'NN', '#']], 'class_labels': [[25, 17, 11, 21, 21, 21], [45, 38, 14, 11, 22, 13, 28, 13, 12, 7, 7, 11, 45]], '__index_level_0__': [100967, 811154]}
Word IDs:  [N




In [77]:
test_df_tokenized_train[1]

{'input_ids': [4, 899, 1],
 'token_type_ids': [0, 0, 0],
 'attention_mask': [1, 1, 1],
 'labels': [45, 46, -100]}

In [41]:
df_dataset_tokenized_train = df_dataset_train.map(tokenize_and_align_labels, batched=True,
                                      remove_columns=df_dataset_train.column_names, num_proc=20)
# df_dataset_tokenized_eval = df_dataset_val.map(tokenize_and_align_labels, batched=True,
#                                       remove_columns=df_dataset_train.column_names, num_proc=20)

Map (num_proc=20):   0%|          | 0/1386254 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map (num_proc=20):   1%|          | 10000/1386254 [00:04<05:59, 3828.47 examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map (num_proc=20):   3%|▎         | 45000/1386254 [00:06<02:07, 10547.18 examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map (num_proc=20):   7%|▋         | 100000/1386254 [00:08<00:54, 23777.44 examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map (num_proc=20):  11%|█▏        | 157313/1386254 [00:10<00:29, 41087.02 examples/s]Asking to truncate to max_leng

In [78]:
# Figure out which of the labels array in each element of df_dataset_tokenized_train contains 46.
for i, example in enumerate(df_dataset_tokenized_train):
    if 46 in example["labels"]:
        print("example: ", i)

example:  613
example:  1320
example:  3197
example:  5002
example:  5318
example:  6776
example:  7066
example:  7599
example:  7799
example:  7892
example:  8361
example:  8429
example:  9412
example:  10666
example:  11025
example:  11231
example:  11313
example:  11617
example:  12186
example:  12415
example:  12498
example:  13241
example:  13394
example:  13574
example:  13588
example:  14141
example:  15207
example:  16213
example:  16364
example:  17249
example:  18495
example:  18945
example:  19589
example:  19601
example:  20517
example:  21093
example:  21831
example:  23790
example:  23950
example:  25847
example:  27215
example:  27644
example:  27937
example:  28030
example:  28534
example:  29128
example:  29994
example:  30590
example:  30895
example:  31036
example:  32047
example:  32857
example:  33436
example:  33506
example:  34248
example:  34570
example:  35504
example:  35925
example:  36385
example:  37959
example:  37973
example:  40828
example:  40830
exampl

KeyboardInterrupt: 

In [44]:
print(df_dataset_tokenized_train[613])
print(train_df.iloc[613])


{'input_ids': [4, 899, 1], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1], 'labels': [45, 46, -100]}
sentence         [#]
tags             [#]
class_labels    [45]
Name: 1584082, dtype: object


In [77]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [78]:
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_git/', vocab_size=32768, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'pad_token': '<pad>', 'additional_special_tokens': ['<image>', '<PERSON>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32768: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32769: AddedToken("<PERSON>", r

In [79]:
batch = data_collator([df_dataset_tokenized_train[i] for i in range(4)])
print(batch["labels"])
for i in range(4):
    print(df_dataset_tokenized_train[i]["labels"])

tensor([[  13,   14,   28,   11,    1,   36,   13,   42,   13,   14,   10,   13,
            1,   36,   38,   22,   22,   12,   11,   21,   22, -100],
        [  38,   38,   38,   38,   25,    2,   29,   12,   12,   11,   21,   22,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [  13,   14,   10,    7,   11,   29,   40,   40,   21,   22, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [  38,   38,   38,   25,   28,   13,   29,   21,   22, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])
[13, 14, 28, 11, 1, 36, 13, 42, 13, 14, 10, 13, 1, 36, 38, 22, 22, 12, 11, 21, 22, -100]
[38, 38, 38, 38, 25, 2, 29, 12, 12, 11, 21, 22, -100]
[13, 14, 10, 7, 11, 29, 40, 40, 21, 22, -100]
[38, 38, 38, 25, 28, 13, 29, 21, 22, -100]


In [80]:
id2label = {i: label for i, label in enumerate(label_names.keys())}
label2id = {v: k for k, v in id2label.items()}  # This is nothing but the label_names dictionary. But keeping it like this for consistency with the Kaggle notebook.

In [81]:
label2id

{'LS': 0,
 'TO': 1,
 'VBN': 2,
 "''": 3,
 'WP': 4,
 'UH': 5,
 'VBG': 6,
 'JJ': 7,
 'VBZ': 8,
 '--': 9,
 'VBP': 10,
 'NN': 11,
 'DT': 12,
 'PRP': 13,
 ':': 14,
 'WP$': 15,
 'NNPS': 16,
 'PRP$': 17,
 'WDT': 18,
 '(': 19,
 ')': 20,
 '.': 21,
 ',': 22,
 '``': 23,
 '$': 24,
 'RB': 25,
 'RBR': 26,
 'RBS': 27,
 'VBD': 28,
 'IN': 29,
 'FW': 30,
 'RP': 31,
 'JJR': 32,
 'JJS': 33,
 'PDT': 34,
 'MD': 35,
 'VB': 36,
 'WRB': 37,
 'NNP': 38,
 'EX': 39,
 'NNS': 40,
 'SYM': 41,
 'CC': 42,
 'CD': 43,
 'POS': 44,
 '#': 45}

In [82]:
from transformers import AutoModelForTokenClassification, BertConfig, BertForTokenClassification
bert_config = BertConfig()
model_checkpoint = "bert-base-uncased"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)
# model = AutoModelForTokenClassification.from_config(
#     teacher_model.config,
# )

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
import evaluate

metric = evaluate.load("seqeval")

In [84]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [85]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    df_dataset_tokenized_train,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    df_dataset_tokenized_eval, collate_fn=data_collator, batch_size=8
)

### Test the following code

In [86]:
from torch.optim import AdamW, Adam

# optimizer = Adam(model.parameters(), lr=2e-5)
optimizer = Adam(model.parameters(), lr=2e-5)

In [87]:
from accelerate import Accelerator
import os
os.environ["PYTORCH_USE_CUDA_DSA"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# accelerator = Accelerator(mixed_precision="fp16")
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#     model, optimizer, train_dataloader, eval_dataloader
# )
import torch
scaler = torch.cuda.amp.GradScaler(enabled=True)

In [88]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

In [89]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [93]:
# Put the device to the GPU.
model.to("cpu")

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [70]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in tqdm(range(num_train_epochs)):
    # Training
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to("cuda:5")
        attention_mask = batch['attention_mask'].to("cuda:5")
        labels = batch['labels'].to("cuda:5")
        # with torch.autocast(device_type='cuda', dtype=torch.float16):
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        print("batch: ", batch)
        loss = outputs.loss
            
        loss.backward()
            # # accelerator.backward(loss)

        optimizer.step()
        # scaler.scale(loss).backward()
        # scaler.step(optimizer)
        # # lr_scheduler.step()
        # scaler.update()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

  0%|          | 0/520422 [04:38<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [53]:
batch['labels']

tensor([[  11,   12,    6,   17,    7,   11,    1,   12,   40,   40,   29,   12,
           11,   22,   22,   12,   11,   22,   22, -100],
        [  38,   22,   22,   40,   22,   22,   11,   12,   12,   13,   21,   22,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [   4,   21,   22, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [  25,   21,   22, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [  13,   35,   36,    4,   28,   13,    1,   36,   21,   22, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [  13,   10,   10,   10,   36,   13,   21,   22, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [  12, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [  29,   38,   38, 

# Code below is for testing and not part of the pipeline.

In [21]:
# Now I have three lists: data contains two lists of words and tags, tag_to_class_mapping contains the corresponding class labels.
labels = tag_to_class_mapping[0]


In [40]:
inputs.word_ids()

[0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, None]

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
print(batch["labels"])
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

In [None]:
data = datasets.Dataset.from_dict(data_dict)