In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m94.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
! wget http://noisy-text.github.io/2017/files/wnut17train.conll

--2023-10-25 06:15:18--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.108.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll’


2023-10-25 06:15:18 (8.63 MB/s) - ‘wnut17train.conll’ saved [493781/493781]



In [64]:
import re
from pathlib import Path
import random
import sklearn
import transformers
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast

In [65]:
def split_into_tokens(raw_text) :
  raw_docs = re.split(r'\n\t?\n' , raw_text)
  token_docs = []
  tag_docs = []
  for doc in raw_docs :
    tokens = []
    tags = []
    for line in doc.split('\n') :
      row = line.split('\t')
      if len(row) == 1 :
        token = row[0]
        tag = None
      else :
        token , tag = row
      tokens.append(token)
      tags.append(tag)
    token_docs.append(tokens)
    tag_docs.append(tags)

  return token_docs , tag_docs

In [None]:
def read_wnut(file_path) :
  file_path = Path(file_path)
  raw_text = file_path.read_text().strip()
  token_docs , tag_docs = split_into_tokens(raw_text)
  return token_docs , tag_docs

In [67]:
texts , tags = read_wnut("/content/wnut17train.conll")

In [68]:
len(texts) , len(tags)

(3394, 3394)

In [69]:
rand = random.choice(range(len(texts)))
print(texts[rand])
print(tags[rand])

['@nikoleeeng', 'That', "'s", 'indeed', 'correct', '.', 'For', 'your', 'reference', ',', 'you', 'may', 'visit', 'this', 'link', ':', 'http://t.co/UFShOtiDse', '.', 'Thank', 'you', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


 **NOTE : Our data has
split tokens rather than full sentence strings, thus we will set is_split_into_words
to True. We pass padding as True and truncation as True to pad the sequences to
be the same length.**

In [70]:
train_texts , val_texts , train_tags , val_tags = train_test_split(texts , tags , test_size = 0.2)

In [71]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")


In [72]:
train_encodings = tokenizer(
train_texts ,
is_split_into_words = True ,
return_offsets_mapping = True ,
padding = True ,
truncation = True
)

val_encodings = tokenizer(
val_texts ,
is_split_into_words = True ,
return_offsets_mapping = True ,
padding = True ,
truncation = True
)


In [73]:
len(train_encodings)

3

## Understanding WordPiece Tokenization and Label Handling

In natural language processing (NLP), we often tokenize text into smaller units called tokens. WordPiece tokenization is a common method used for this purpose. It can split words into multiple sub-tokens to ensure they are part of the model's vocabulary. However, this token splitting can lead to a mismatch between tokens and their associated labels.

- **WordPiece Tokenization**: In this process, words are split into smaller units. For example, "unhappiness" might become "un" and "##happiness."

- **Token and Label Mismatch**: When a token is split into sub-tokens, there is a problem with matching labels to tokens.

- **Resolving Mismatch**: To resolve this, we can consider the label of the first sub-token as the label for the entire token. We set labels for sub-tokens to a special value, like -100, to indicate that they should be ignored during training.

**Example**:

Let's take the sentence "I can't run" as an example:

- Without WordPiece tokenization:
  - Tokens: ["I", "can't", "run", "."]
  - Labels: ["pronoun", "verb", "verb", "punctuation"]

- With WordPiece tokenization:
  - Tokens: ["I", "can", "##'t", "run", "."]
  - Labels: ["pronoun", "verb", -100, "verb", "punctuation"]

By setting the label for "##'t" to -100, we ensure that the model doesn't consider it during training, maintaining consistency in label assignment.

This approach helps handle token and label mismatches caused by WordPiece tokenization in NLP tasks.


In [83]:
import numpy as np

# Create a set of unique tags
unique_tags = set(tag for doc in tags for tag in doc)

# Create a mapping from tag to ID and vice versa
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# Define a function to encode tags
def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []

    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # Create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # Set labels whose 1st offset position is 0 and the 2nd is not 0
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

# Encode training and validation tags
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
print(f"There are total {len(unique_tags)} entity tags in the data : {unique_tags}")

There are total 13 entity tags in the data : {'B-location', 'I-corporation', 'I-product', 'B-group', 'B-corporation', 'O', 'B-creative-work', 'I-location', 'I-group', 'I-person', 'B-product', 'I-creative-work', 'B-person'}


In [84]:
train_encodings.pop("offset_mapping")
val_encodings.pop("offset_mapping")
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels)
)


In [92]:
from transformers import TFDistilBertForTokenClassification

model = TFDistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=len(unique_tags)
)


Downloading model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [93]:
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(optimizer=optimizer, loss=model.compute_loss)

model.fit(
    train_dataset.shuffle(1000).batch(16),
    epochs=3,
    batch_size=16
)


Epoch 1/3


AttributeError: ignored