In [1]:
from datasets import Dataset
from pathlib import Path
import os
import numpy as np
import re
from tqdm import tqdm
import pickle
import nltk
from nltk.data import load as nltk_load

  from .autonotebook import tqdm as notebook_tqdm


Create a Hugging Face `Dataset` from the stored sentence and POS-tag data.

In [2]:
class TQDMBytesReader(object):
    """File-object wrapper that advances a tqdm progress bar as data is read.

    Used to show progress while reading a stored pickle file: it exposes
    ``read`` and ``readline`` so it can be handed to ``pickle.Unpickler``,
    and reports the length of every chunk it returns to the bar.
    """

    def __init__(self, fd, **kwargs):
        """Wrap ``fd``; ``kwargs`` are forwarded to ``tqdm`` (e.g. ``total``)."""
        self.fd = fd
        from tqdm import tqdm
        self.tqdm = tqdm(**kwargs)

    def _report(self, chunk):
        """Advance the bar by ``len(chunk)`` and hand the chunk back."""
        self.tqdm.update(len(chunk))
        return chunk

    def read(self, size=-1):
        """Read up to ``size`` bytes from the wrapped file, updating the bar."""
        return self._report(self.fd.read(size))

    def readline(self):
        """Read one line from the wrapped file, updating the bar."""
        return self._report(self.fd.readline())

    def __enter__(self):
        # Enter the bar's context but return the reader itself.
        self.tqdm.__enter__()
        return self

    def __exit__(self, *args, **kwargs):
        # Delegate to the bar's own exit handler.
        return self.tqdm.__exit__(*args, **kwargs)


In [3]:
def load_data(file_name, processed_dir="/home/rsaha/projects/babylm/src/taggers/processed_tagger_data"):
    """Load a processed ``(X_data, y_data)`` pair from its pickle file.

    The file read is
    ``<processed_dir>/processed_pos_training_data_<file_name>.pkl``.
    The existence check is made on that same path (previously a different
    path was checked than the one that was opened, so a passing check could
    still end in ``FileNotFoundError``).

    Args:
        file_name: Dataset name used to build the pickle file name.
        processed_dir: Directory holding the processed pickle files.

    Returns:
        ``(X_data, y_data)`` as stored in the pickle, or ``None`` when the
        file does not exist.
    """
    pickle_path = os.path.join(processed_dir, f"processed_pos_training_data_{file_name}.pkl")
    if not os.path.exists(pickle_path):
        return None

    print("Loading data from file ...")
    with open(pickle_path, "rb") as f:
        total = os.path.getsize(pickle_path)
        # Route reads through the progress-bar wrapper so unpickling shows progress.
        with TQDMBytesReader(f, total=total) as pbfd:
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            X_data, y_data = pickle.Unpickler(pbfd).load()
    return X_data, y_data

In [4]:
def extract_features(sentence, index):
    """Build the hand-crafted feature dict for the token at ``sentence[index]``.

    Fixes relative to the previous version:
      * the 4-character prefix/suffix are stored under ``'prefix-4'`` /
        ``'suffix-4'`` instead of overwriting ``'prefix-3'`` / ``'suffix-3'``
        through duplicate dict keys;
      * ``'next_word'`` is the following token (it was always ``''``);
      * an empty token no longer raises ``IndexError`` (slices replace ``[0]``
        and ``[-1]``).

    Args:
        sentence: Sequence of token strings.
        index: Position of the token to describe.

    Returns:
        dict mapping feature name to value.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        'is_capitalized': word[:1].upper() == word[:1],
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        # NOTE(review): the `$` sits inside the first lookahead, so this matches
        # only tokens that END in a digit and contain a letter — confirm intent.
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])', word)))),
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-4': word[:4],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-4': word[-4:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:],
    }

In [5]:
data_dir = Path("/home/rsaha/projects/babylm/src/taggers/data/")

# Collect every regular ``*.pkl`` file directly inside data_dir as a string path.
paths = []
for entry in data_dir.glob("*"):
    if not entry.is_file():
        continue
    if entry.name.endswith(".DS_Store") or entry.suffix not in [".pkl"]:
        continue
    paths.append(str(entry))
print("Paths: ", paths)

# The dataset name is the file name up to its first "." (drops the extension).
file_names = []
for path in tqdm(paths, desc="Paths"):
    file_name = Path(path).name.split(".")[0]
    file_names.append(file_name)

Paths:  ['/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_switchboard.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_open_subtitles.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_childes.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_simple_wiki.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_bnc_spoken.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_local_narr_captions.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_gutenberg.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_cc_3M_captions_reduced.pkl']


Paths: 100%|██████████| 8/8 [00:00<00:00, 33354.31it/s]


In [6]:
# Load the data from each file_name in file_names.
all_X_data = []
all_y_data = []
for file_name in tqdm(file_names):
    print(file_name)
    if file_name == "pos_tagging_dataset_all_sentences_open_subtitles":
        data = pickle.load(open(f"/home/rsaha/projects/babylm/src/taggers/data/{file_name}.pkl", "rb"))
        # all_X_data.extend(X_data)
        # all_y_data.extend(y_data)


  0%|          | 0/8 [00:00<?, ?it/s]

pos_tagging_dataset_all_sentences_switchboard
pos_tagging_dataset_all_sentences_open_subtitles


100%|██████████| 8/8 [00:12<00:00,  1.56s/it]

pos_tagging_dataset_all_sentences_childes
pos_tagging_dataset_all_sentences_simple_wiki
pos_tagging_dataset_all_sentences_bnc_spoken
pos_tagging_dataset_all_sentences_local_narr_captions
pos_tagging_dataset_all_sentences_gutenberg
pos_tagging_dataset_all_sentences_cc_3M_captions_reduced





In [7]:
# Assign each tag in NLTK's `upenn_tagset` resource a unique integer class id
# (ids follow the tagset's key order), then add '#' as one extra class at the end.
tagdict = nltk_load('help/tagsets/upenn_tagset.pickle')
label_names = dict((tag, class_id) for class_id, tag in enumerate(tagdict.keys()))
label_names['#'] = len(label_names)

In [10]:
# Despite the variable name, this holds the sorted KEYS of label_names (the tag strings).
label_names_values_sorted = sorted(label_names)

In [11]:
# Show the sorted tag strings.
print(label_names_values_sorted)

['#', '$', "''", '(', ')', ',', '--', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']


In [8]:
# Keep only the examples whose first element (the word list) is not an empty list.
data = [example for example in data if example[0] != []]

In [9]:
# Now remove all the hashtags from each example of the data.
# data_no_hash = []
# data_no_hash = [d for d in data_no_hash if d [0]!= []]

In [10]:
# Data has many tuples and each tuple has two lists. The first list is the list of words and the second list is the list of tags.
# Create a third separate list of lists where each list contains the number from the label_names dictionary based on the key tag.
# This will be the class labels.
# tag_to_class_mapping_for_data_no_hash = []
# for i in tqdm(range(len(data))):
#     tag_to_class_mapping_for_data_no_hash.append([label_names[tag] for tag in data_no_hash[i][1]])

In [11]:
# Each example in `data` is (words, tags). Build a parallel list in which every
# tag is replaced by its integer id from `label_names`; these are the class labels.
tag_to_class_mapping_for_data = [
    [label_names[tag] for tag in example[1]]
    for example in tqdm(data)
]

100%|██████████| 1732818/1732818 [00:06<00:00, 283395.41it/s]


In [12]:
# First create a dataframe from the sentence, tags, and class labels.
# NOTE: Each example in the data variable has two lists. The first list is the list of words and the second list is the list of tags.

# import pandas as pd
# df_no_hash = pd.DataFrame(data_no_hash, columns=["sentence", "tags"])
# df_no_hash["class_labels"] = tag_to_class_mapping_for_data_no_hash


In [13]:
# Create train and validation splits using train_test_split.
# from sklearn.model_selection import train_test_split
# train_df_no_hash, val_df_no_hash = train_test_split(df_no_hash, test_size=0.2, random_state=42)

In [14]:
# from datasets import Dataset
# df_dataset_train_no_hash = Dataset.from_pandas(train_df_no_hash)
# df_dataset_val_no_hash = Dataset.from_pandas(val_df_no_hash)


In [15]:
# test_df_no_hash = df_dataset_train_no_hash.select(range(1319,1321))
# test_df_tokenized_train_no_hash = test_df_no_hash.map(tokenize_and_align_labels, batched=True, remove_columns=test_df_no_hash.column_names)
# print("Test DF Tokenized Train: ", test_df_tokenized_train_no_hash['labels'])

In [18]:
# Load the tokenizer.
# NOTE(review): `BertTokenizer` is imported but not used in this cell.
from transformers import BertTokenizer, PreTrainedTokenizerFast
model_checkpoint = "bert-base-uncased"
# tokenizer = PreTrainedTokenizerFast.from_pretrained('/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_git/')
# Loading through PreTrainedTokenizerFast emits a warning that the checkpoint was
# saved as 'BertTokenizer'. NOTE(review): presumably a fast tokenizer is wanted
# because `word_ids()` is called during label alignment — confirm.
tokenizer = PreTrainedTokenizerFast.from_pretrained('/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_bert-base-uncased/')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [19]:
# Display the tokenizer's repr (vocab size, max length, special tokens).
tokenizer

PreTrainedTokenizerFast(name_or_path='/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_bert-base-uncased/', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [21]:
# Build one DataFrame row per example: the word list ("sentence"), the tag list
# ("tags"), and the matching integer class ids ("class_labels").
import pandas as pd
df = pd.DataFrame(data, columns=["sentence", "tags"]).assign(
    class_labels=tag_to_class_mapping_for_data
)


In [22]:
# Create train and validation splits using train_test_split.
# 20% of the rows go to validation; random_state=42 fixes the shuffle so the
# split is the same on every run.
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [23]:
from datasets import Dataset

# Convert both pandas splits into Hugging Face Datasets.
df_dataset_train, df_dataset_val = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df)
)


In [24]:
def align_labels_with_tokens(labels, word_ids, label_all_tokens=True):
    """Expand word-level class ids to one label per tokenizer token.

    The previous version bumped odd labels by one on continuation sub-tokens.
    That is the B-XXX -> I-XXX trick for IOB-encoded NER label sets and is
    wrong for plain POS class ids: it silently changed the tag of continuation
    pieces and could produce an id one past the last class (e.g. 45 -> 46).
    Continuation pieces now keep the label of their word.

    Args:
        labels: Class id per word of the sentence.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for special tokens.
        label_all_tokens: When True (default) every sub-token of a word gets
            the word's label; when False only the first sub-token is labelled
            and the remaining ones get -100.

    Returns:
        List with one label per token; -100 marks positions to ignore.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: never contributes to the loss.
            aligned.append(-100)
        elif word_id != previous_word or label_all_tokens:
            # First piece of a word, or a continuation piece that we label too.
            aligned.append(labels[word_id])
        else:
            aligned.append(-100)
        previous_word = word_id
    return aligned

In [25]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch with ``"sentence"`` (one word list per example) and
            ``"class_labels"`` (one class-id list per example).

    Returns:
        The tokenizer output (truncated to 50 tokens) with an added
        ``"labels"`` entry holding one aligned label list per example.
    """
    encoded = tokenizer(
        examples["sentence"],
        is_split_into_words=True,
        truncation=True,
        max_length=50,
    )
    # Map each example's word-level labels onto its tokens via word_ids.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(batch_idx))
        for batch_idx, word_labels in enumerate(examples["class_labels"])
    ]
    return encoded

In [27]:
# test_df = df_dataset_train.select(range(1319,1321))
# test_df_tokenized_train = test_df.map(tokenize_and_align_labels, batched=True, remove_columns=test_df.column_names)
# print("Test DF Tokenized Train: ", test_df_tokenized_train['labels'])

In [28]:
# test_df_tokenized_train[1]

In [29]:
# Tokenize both splits in parallel and drop the original columns, keeping only
# what tokenize_and_align_labels returns.
df_dataset_tokenized_train = df_dataset_train.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=df_dataset_train.column_names,
    num_proc=20,
)
# Remove the eval split's OWN columns (previously the train split's column
# names were passed here, which only works while both splits share a schema).
df_dataset_tokenized_eval = df_dataset_val.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=df_dataset_val.column_names,
    num_proc=20,
)

Map (num_proc=20): 100%|██████████| 1386254/1386254 [00:46<00:00, 29762.85 examples/s]
Map (num_proc=20): 100%|██████████| 346564/346564 [00:14<00:00, 23653.52 examples/s]


In [31]:
# Print the index of every tokenized training example whose labels contain 46,
# i.e. an id one past the largest class id ('#' is mapped to 45).
for i, example in tqdm(enumerate(df_dataset_tokenized_train)):
    if any(label == 46 for label in example["labels"]):
        print("example: ", i)

1386254it [01:16, 18100.07it/s]


In [32]:
# Show the tokenized example at position 613 next to its source DataFrame row.
# NOTE(review): 613 is hard-coded; presumably taken from an earlier inspection run — confirm.
print(df_dataset_tokenized_train[613])
print(train_df.iloc[613])


{'input_ids': [101, 1001, 102], 'attention_mask': [1, 1, 1], 'labels': [-100, 45, -100]}
sentence         [#]
tags             [#]
class_labels    [45]
Name: 1584082, dtype: object


In [33]:
from transformers import DataCollatorForTokenClassification

# Collator used as `collate_fn` for batching the tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

2024-08-09 18:44:43.128684: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-09 18:44:43.215958: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [34]:
# Display the collator's repr (includes the tokenizer it wraps).
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_bert-base-uncased/', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padd

In [35]:
# Sanity check: collate the first two examples and compare the batched labels
# with the per-example labels.
sample_size = 2
batch = data_collator([df_dataset_tokenized_train[i] for i in range(sample_size)])
print(batch["labels"])
for i in range(sample_size):
    print(df_dataset_tokenized_train[i]["labels"])

tensor([[-100,    5,   22,   11,   21, -100, -100, -100],
        [-100,   36,   31,   17,    6,   40,   21, -100]])
[-100, 5, 22, 11, 21, -100]
[-100, 36, 31, 17, 6, 40, 21, -100]


In [36]:
# id -> tag, numbered in the key order of label_names.
id2label = dict(enumerate(label_names.keys()))
# tag -> id; the inverse of id2label, kept as a separate name because the model
# config takes both mappings.
label2id = {label: class_id for class_id, label in id2label.items()}

In [37]:
# Display the tag -> id mapping.
label2id

{'LS': 0,
 'TO': 1,
 'VBN': 2,
 "''": 3,
 'WP': 4,
 'UH': 5,
 'VBG': 6,
 'JJ': 7,
 'VBZ': 8,
 '--': 9,
 'VBP': 10,
 'NN': 11,
 'DT': 12,
 'PRP': 13,
 ':': 14,
 'WP$': 15,
 'NNPS': 16,
 'PRP$': 17,
 'WDT': 18,
 '(': 19,
 ')': 20,
 '.': 21,
 ',': 22,
 '``': 23,
 '$': 24,
 'RB': 25,
 'RBR': 26,
 'RBS': 27,
 'VBD': 28,
 'IN': 29,
 'FW': 30,
 'RP': 31,
 'JJR': 32,
 'JJS': 33,
 'PDT': 34,
 'MD': 35,
 'VB': 36,
 'WRB': 37,
 'NNP': 38,
 'EX': 39,
 'NNS': 40,
 'SYM': 41,
 'CC': 42,
 'CD': 43,
 'POS': 44,
 '#': 45}

In [39]:
from transformers import AutoModelForTokenClassification
model_checkpoint = "bert-base-uncased"
# Load the checkpoint with a token-classification head sized by the label maps.
teacher_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)
# Build `model` from the teacher's config only.
# NOTE(review): from_config creates the architecture without loading the
# checkpoint weights, so `model` starts untrained — confirm this is intended.
model = AutoModelForTokenClassification.from_config(
    teacher_model.config,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
import evaluate

# Metric object used by compute_metrics and by the evaluation loop.
# NOTE(review): seqeval is designed for IOB-style chunk labels; confirm it gives
# meaningful scores for plain POS tags.
metric = evaluate.load("seqeval")

In [41]:
def compute_metrics(eval_preds):
    """Compute precision / recall / F1 / accuracy for a batch of predictions.

    ``label_names`` maps tag -> id, so it cannot be indexed with the integer
    ids found in ``labels`` and ``predictions`` (that raised ``KeyError``).
    The inverse id -> tag mapping is built here and used for the lookup.

    Args:
        eval_preds: ``(logits, labels)``; the class axis of ``logits`` is last
            and ``labels`` uses -100 for positions to ignore.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        the metric's ``overall_*`` values.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    id_to_tag = {class_id: tag for tag, class_id in label_names.items()}

    # Remove ignored index (special tokens) and convert ids to tag strings.
    true_labels = [[id_to_tag[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id_to_tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [43]:
from torch.utils.data import DataLoader

# Training batches: shuffled, 128 examples each, collated with data_collator.
train_dataloader = DataLoader(
    df_dataset_tokenized_train,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=128,
)
# Evaluation batches: same batch size and collator, no shuffling requested.
eval_dataloader = DataLoader(
    df_dataset_tokenized_eval, collate_fn=data_collator, batch_size=128
)

### Test the following code

In [44]:
# NOTE(review): AdamW is imported but only Adam is used below.
from torch.optim import AdamW, Adam

# optimizer = Adam(model.parameters(), lr=2e-5)
# Adam over all model parameters with learning rate 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

In [45]:
from accelerate import Accelerator
import os
# os.environ["PYTORCH_USE_CUDA_DSA"] = "1"
# os.environ["TORCH_USE_CUDA_DSA"] = "1"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Accelerator configured for fp16 mixed precision; prepare() returns wrapped
# versions of the model, optimizer and both dataloaders, which replace the originals.
accelerator = Accelerator(mixed_precision="fp16")
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
import torch
# NOTE(review): `scaler` is not referenced by the training loop in this notebook
# (the loop calls accelerator.backward) — confirm whether it is still needed.
scaler = torch.cuda.amp.GradScaler(enabled=True)

In [46]:
from transformers import get_scheduler

num_train_epochs = 3
# One update per batch of the (prepared) training dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup steps, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [47]:
def postprocess(predictions, labels):
    """Convert batched prediction/label id tensors into lists of tag strings.

    ``label_names`` maps tag -> id, so indexing it with an integer id raised
    ``KeyError``; the inverse id -> tag mapping is built here and used instead.

    Args:
        predictions: Tensor of predicted class ids, one row per sequence.
        labels: Tensor of gold class ids with -100 at positions to ignore.

    Returns:
        ``(true_labels, true_predictions)`` — note the order — each a list of
        per-sequence tag lists with the ignored positions removed.
    """
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    id_to_tag = {class_id: tag for tag, class_id in label_names.items()}

    # Remove ignored index (special tokens) and convert ids to tag strings.
    true_labels = [[id_to_tag[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id_to_tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [48]:
# Put the device to the GPU.
# model.to("cpu")

In [49]:
from tqdm.auto import tqdm
import torch

# One tick per optimisation step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in tqdm(range(num_train_epochs)):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (true_labels, true_predictions). Unpacking in the
        # opposite order fed the gold labels to the metric as predictions.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

  0%|          | 0/3 [03:10<?, ?it/s]46:02,  4.95it/s]


KeyboardInterrupt: 