In [1]:
from datasets import Dataset
from pathlib import Path
import os
import numpy as np
import re
from tqdm import tqdm
import pickle
import nltk
from nltk.data import load as nltk_load

  from .autonotebook import tqdm as notebook_tqdm


Create a huggingface Dataset from the stored sentence and pos tag data.

In [2]:
class TQDMBytesReader(object):
    # For showing the progress bar while reading the stored pickle file.
    def __init__(self, fd, **kwargs):
        self.fd = fd
        from tqdm import tqdm
        self.tqdm = tqdm(**kwargs)

    def read(self, size=-1):
        bytes = self.fd.read(size)
        self.tqdm.update(len(bytes))
        return bytes

    def readline(self):
        bytes = self.fd.readline()
        self.tqdm.update(len(bytes))
        return bytes

    def __enter__(self):
        self.tqdm.__enter__()
        return self

    def __exit__(self, *args, **kwargs):
        return self.tqdm.__exit__(*args, **kwargs)


In [3]:
def load_data(file_name):
    # if os.path.exists(f"/home/rsaha/projects/babylm/src/taggers/processed_tagger_data/processed_pos_training_data_{file_name}.pkl"):
    if os.path.exists(f"/home/rsaha/projects/babylm/src/taggers/data/{file_name}.pkl"):
        print("Loading data from file ...")
        with open(f"/home/rsaha/projects/babylm/src/taggers/processed_tagger_data/processed_pos_training_data_{file_name}.pkl", "rb") as f:
            total = os.path.getsize(f"/home/rsaha/projects/babylm/src/taggers/processed_tagger_data/processed_pos_training_data_{file_name}.pkl")
            with TQDMBytesReader(f, total=total) as pbfd:
                up = pickle.Unpickler(pbfd)
                X_data, y_data = up.load()
            return X_data, y_data

In [4]:
def extract_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'is_capitalized':sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
  }

In [5]:
data_dir = Path("/home/rsaha/projects/babylm/src/taggers/data/")
paths = [str(f) for f in data_dir.glob("*") if f.is_file() and not f.name.endswith(".DS_Store") and f.suffix in [".pkl"]]
print("Paths: ", paths)

file_names = []
# Only select the cc_3m and local_narr files and store it in filtered_paths.
# filtered_paths = paths #[]
# # for path in paths:
# #     if "cc_3M" in path or "local_narr" in path:
# #         filtered_paths.append(path)

for path in tqdm(paths, desc="Paths"):

    file_name = Path(path).name
    # Drop the .train extension
    file_name = file_name.split(".")[0]
    file_names.append(file_name)

Paths:  ['/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_switchboard.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_open_subtitles.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_childes.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_simple_wiki.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_bnc_spoken.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_local_narr_captions.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_gutenberg.pkl', '/home/rsaha/projects/babylm/src/taggers/data/pos_tagging_dataset_all_sentences_cc_3M_captions_reduced.pkl']


Paths: 100%|██████████| 8/8 [00:00<00:00, 18724.57it/s]


In [6]:
file_names

['pos_tagging_dataset_all_sentences_switchboard',
 'pos_tagging_dataset_all_sentences_open_subtitles',
 'pos_tagging_dataset_all_sentences_childes',
 'pos_tagging_dataset_all_sentences_simple_wiki',
 'pos_tagging_dataset_all_sentences_bnc_spoken',
 'pos_tagging_dataset_all_sentences_local_narr_captions',
 'pos_tagging_dataset_all_sentences_gutenberg',
 'pos_tagging_dataset_all_sentences_cc_3M_captions_reduced']

In [7]:
# Load the data from each file_name in file_names.
all_X_data = []
all_y_data = []
for file_name in tqdm(file_names):
    print(file_name)
    if file_name == "pos_tagging_dataset_all_sentences_open_subtitles":
        data = pickle.load(open(f"/home/rsaha/projects/babylm/src/taggers/data/{file_name}.pkl", "rb"))
        # all_X_data.extend(X_data)
        # all_y_data.extend(y_data)


  0%|          | 0/8 [00:00<?, ?it/s]

pos_tagging_dataset_all_sentences_switchboard
pos_tagging_dataset_all_sentences_open_subtitles


100%|██████████| 8/8 [00:13<00:00,  1.69s/it]

pos_tagging_dataset_all_sentences_childes
pos_tagging_dataset_all_sentences_simple_wiki
pos_tagging_dataset_all_sentences_bnc_spoken
pos_tagging_dataset_all_sentences_local_narr_captions
pos_tagging_dataset_all_sentences_gutenberg
pos_tagging_dataset_all_sentences_cc_3M_captions_reduced





In [8]:
data[0]


(['I', 'cry', 'as', 'I', 'look', 'up', 'to', 'the', 'sky'],
 ['PRP', 'VBP', 'IN', 'PRP', 'VBP', 'RB', 'TO', 'DT', 'NN'])

In [9]:
# Create a dictionary of numbers where each tag (the second element in the tuple) is assigned a unique number. This will be the class labels.
tagdict = nltk_load('help/tagsets/upenn_tagset.pickle')
label_names = {t: i for i, t in enumerate(tagdict.keys())}
label_names['#'] = len(label_names)

In [10]:
label_names

{'LS': 0,
 'TO': 1,
 'VBN': 2,
 "''": 3,
 'WP': 4,
 'UH': 5,
 'VBG': 6,
 'JJ': 7,
 'VBZ': 8,
 '--': 9,
 'VBP': 10,
 'NN': 11,
 'DT': 12,
 'PRP': 13,
 ':': 14,
 'WP$': 15,
 'NNPS': 16,
 'PRP$': 17,
 'WDT': 18,
 '(': 19,
 ')': 20,
 '.': 21,
 ',': 22,
 '``': 23,
 '$': 24,
 'RB': 25,
 'RBR': 26,
 'RBS': 27,
 'VBD': 28,
 'IN': 29,
 'FW': 30,
 'RP': 31,
 'JJR': 32,
 'JJS': 33,
 'PDT': 34,
 'MD': 35,
 'VB': 36,
 'WRB': 37,
 'NNP': 38,
 'EX': 39,
 'NNS': 40,
 'SYM': 41,
 'CC': 42,
 'CD': 43,
 'POS': 44,
 '#': 45}

In [11]:
# Data has many tuples and each tuple has two lists. The first list is the list of words and the second list is the list of tags.
# Create a third separate list of lists where each list contains the number from the label_names dictionary based on the key tag.
# This will be the class labels.
tag_to_class_mapping = []
for i in tqdm(range(len(data))):
    tag_to_class_mapping.append([label_names[tag] for tag in data[i][1]])

100%|██████████| 1734740/1734740 [00:05<00:00, 308749.40it/s]


In [12]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

In [13]:
# Load the tokenizer.
from transformers import BertTokenizer, PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained('/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_git/')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
labels = tag_to_class_mapping[0]
inputs = tokenizer(data[0][0], is_split_into_words=True)
word_ids = inputs.word_ids()


In [15]:
print(labels)
print(data[0][1])
print(word_ids)
print(align_labels_with_tokens(labels, word_ids))

[13, 10, 29, 13, 10, 25, 1, 12, 11]
['PRP', 'VBP', 'IN', 'PRP', 'VBP', 'RB', 'TO', 'DT', 'NN']
[0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, None]
[13, 14, 10, 29, 13, 14, 10, 25, 1, 12, 11, -100]


In [61]:
# First create a dataframe from the sentence, tags, and class labels.
# NOTE: Each example in the data variable has two lists. The first list is the list of words and the second list is the list of tags.

import pandas as pd
df = pd.DataFrame(data, columns=["sentence", "tags"])
df["class_labels"] = tag_to_class_mapping


In [62]:
# Create train and validation splits using train_test_split.
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [63]:
from datasets import Dataset
df_dataset_train = Dataset.from_pandas(train_df)
df_dataset_val = Dataset.from_pandas(val_df)


In [66]:
df_dataset_train

Dataset({
    features: ['sentence', 'tags', 'class_labels', '__index_level_0__'],
    num_rows: 1387792
})

In [32]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["sentence"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["class_labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [68]:
df_dataset_tokenized_train = df_dataset_train.map(tokenize_and_align_labels, batched=True,
                                      remove_columns=df_dataset_train.column_names, num_proc=20)
df_dataset_tokenized_eval = df_dataset_val.map(tokenize_and_align_labels, batched=True,
                                      remove_columns=df_dataset_train.column_names, num_proc=20)

Map (num_proc=20): 100%|██████████| 1387792/1387792 [00:51<00:00, 27206.27 examples/s]
Map (num_proc=20): 100%|██████████| 346948/346948 [00:15<00:00, 22963.90 examples/s]


In [69]:
df_dataset_tokenized_train[0]

{'input_ids': [4,
  13,
  107,
  183,
  18,
  176,
  15,
  16,
  4,
  13,
  199,
  15,
  18,
  176,
  2083,
  4,
  8,
  71,
  94,
  4,
  17,
  1],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [13,
  14,
  28,
  11,
  1,
  36,
  13,
  42,
  13,
  14,
  10,
  13,
  1,
  36,
  38,
  22,
  22,
  12,
  11,
  21,
  22,
  -100]}

In [70]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [71]:
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='/home/rsaha/projects/babylm/src/tokenizer/hf_wordpiece_tokenizer_from_git/', vocab_size=32768, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'pad_token': '<pad>', 'additional_special_tokens': ['<image>', '<PERSON>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32768: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32769: AddedToken("<PERSON>", r

In [72]:
batch = data_collator([df_dataset_tokenized_train[i] for i in range(2)])
print(batch["labels"])
for i in range(2):
    print(df_dataset_tokenized_train[i]["labels"])

tensor([[  13,   14,   28,   11,    1,   36,   13,   42,   13,   14,   10,   13,
            1,   36,   38,   22,   22,   12,   11,   21,   22, -100],
        [  38,   38,   38,   38,   25,    2,   29,   12,   12,   11,   21,   22,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])
[13, 14, 28, 11, 1, 36, 13, 42, 13, 14, 10, 13, 1, 36, 38, 22, 22, 12, 11, 21, 22, -100]
[38, 38, 38, 38, 25, 2, 29, 12, 12, 11, 21, 22, -100]


In [73]:
id2label = {i: label for i, label in enumerate(label_names.keys())}
label2id = {v: k for k, v in id2label.items()}  # This is nothing but the label_names dictionary. But keeping it like this for consistency with the Kaggle notebook.

In [74]:
label2id

{'LS': 0,
 'TO': 1,
 'VBN': 2,
 "''": 3,
 'WP': 4,
 'UH': 5,
 'VBG': 6,
 'JJ': 7,
 'VBZ': 8,
 '--': 9,
 'VBP': 10,
 'NN': 11,
 'DT': 12,
 'PRP': 13,
 ':': 14,
 'WP$': 15,
 'NNPS': 16,
 'PRP$': 17,
 'WDT': 18,
 '(': 19,
 ')': 20,
 '.': 21,
 ',': 22,
 '``': 23,
 '$': 24,
 'RB': 25,
 'RBR': 26,
 'RBS': 27,
 'VBD': 28,
 'IN': 29,
 'FW': 30,
 'RP': 31,
 'JJR': 32,
 'JJS': 33,
 'PDT': 34,
 'MD': 35,
 'VB': 36,
 'WRB': 37,
 'NNP': 38,
 'EX': 39,
 'NNS': 40,
 'SYM': 41,
 'CC': 42,
 'CD': 43,
 'POS': 44,
 '#': 45}

In [75]:
from transformers import AutoModelForTokenClassification, BertConfig, BertForTokenClassification
bert_config = BertConfig()
model_checkpoint = "bert-base-uncased"
teacher_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)
model = AutoModelForTokenClassification.from_config(
    teacher_model.config,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [76]:
import evaluate

metric = evaluate.load("seqeval")

In [77]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [79]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    df_dataset_tokenized_train,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    df_dataset_tokenized_eval, collate_fn=data_collator, batch_size=8
)

### Test the following code

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [None]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in tqdm(range(num_train_epochs)):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

# Code below is for testing and not part of the pipeline.

In [21]:
# Now I have three lists: data contains two lists of words and tags, tag_to_class_mapping contains the corresponding class labels.
labels = tag_to_class_mapping[0]


In [40]:
inputs.word_ids()

[0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, None]

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
print(batch["labels"])
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

In [None]:
data = datasets.Dataset.from_dict(data_dict)