<a href="https://colab.research.google.com/github/skywalker0803r/NLP/blob/main/Token_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget http://noisy-text.github.io/2017/files/wnut17train.conll
!pip install transformers
!pip install datasets
!pip install seqeval

--2021-07-05 00:34:17--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.108.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll’


2021-07-05 00:34:17 (53.5 MB/s) - ‘wnut17train.conll’ saved [493781/493781]

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 33.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.m

In [2]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

texts, tags = read_wnut('wnut17train.conll')

In [3]:
texts[0],tags[0]

(['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-location',
  'I-location',
  'I-location',
  'O',
  'B-location',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'])

In [4]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

In [5]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [6]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




In [7]:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [8]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        #item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        #item['labels'] = torch.tensor(self.labels[idx])
        item = {key:val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [9]:
from transformers import DistilBertForTokenClassification
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

In [10]:
from transformers import TrainingArguments, Trainer ,DataCollatorForTokenClassification

In [11]:
task = 'ner'
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [12]:
from datasets import load_dataset, load_metric
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2482.0, style=ProgressStyle(description…




In [13]:
import warnings 
warnings.filterwarnings('ignore')
labels = [i for i in tokenizer.convert_ids_to_tokens(train_dataset[4]['input_ids'])]
metric.compute(predictions=[labels], references=[labels])

{'#15': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '#8': {'f1': 1.0, 'number': 2, 'precision': 1.0, 'recall': 1.0},
 '#A': {'f1': 1.0, 'number': 4, 'precision': 1.0, 'recall': 1.0},
 '#C': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '#J': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '#an': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '#ed': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '#f': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '0': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '013': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '015': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 '2': {'f1': 1.0, 'number': 3, 'precision': 1.0, 'recall': 1.0},
 'CLS]': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'PAD]': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'R': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'SE

In [21]:
import numpy as np

label_list = list(unique_tags)
print(label_list)
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

['B-person', 'I-product', 'I-creative-work', 'B-group', 'B-corporation', 'I-corporation', 'I-person', 'B-product', 'I-location', 'O', 'I-group', 'B-location', 'B-creative-work']


In [22]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

cuda


In [23]:
trainer.train()

***** Running training *****
  Num examples = 2715
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 510


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.162061,0.43595,0.513382,0.471508,0.961769
2,No log,0.145993,0.485588,0.532847,0.508121,0.965208
3,0.061000,0.155204,0.47807,0.530414,0.502884,0.964968


***** Running Evaluation *****
  Num examples = 679
  Batch size = 16
***** Running Evaluation *****
  Num examples = 679
  Batch size = 16
Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json
Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 679
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=510, training_loss=0.060752599087415954, metrics={'train_runtime': 81.4237, 'train_samples_per_second': 100.032, 'train_steps_per_second': 6.264, 'total_flos': 286773158054700.0, 'train_loss': 0.060752599087415954, 'epoch': 3.0})

In [24]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 679
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.9649684075821803,
 'eval_f1': 0.5028835063437139,
 'eval_loss': 0.15520355105400085,
 'eval_precision': 0.4780701754385965,
 'eval_recall': 0.5304136253041363,
 'eval_runtime': 2.3418,
 'eval_samples_per_second': 289.947,
 'eval_steps_per_second': 18.362}

In [49]:
from transformers import pipeline
trained_model = DistilBertForTokenClassification.from_pretrained('/content/test-ner/checkpoint-500',num_labels=len(unique_tags))
nlp = pipeline('ner', model=trained_model, tokenizer=tokenizer)

loading configuration file /content/test-ner/checkpoint-500/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "outpu

# test

In [92]:
import pandas as pd

def test(nlp):
  idx = np.random.randint(0,100)
  print(idx)
  map_dict = {'LABEL_9':'O'}
  text = texts[idx]
  table = pd.DataFrame()
  table['predict'] = tags[idx]
  table['labels'] = tags[idx]
  lst = []
  for j,i in enumerate(nlp(text,grouped_entities=False)):
    try:
      o = map_dict[i[0]['entity']]
    except:
      o = i[0]['entity']
    print(o)
    lst.append(o)
  table['predict'] = lst
  display(table)

# train過

In [93]:
test(nlp)

82
O
O
O
O
O
O
O
O
O
O


Unnamed: 0,predict,labels
0,O,O
1,O,O
2,O,O
3,O,O
4,O,O
5,O,O
6,O,O
7,O,O
8,O,O
9,O,O


# 沒train過

In [94]:
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased',num_labels=len(unique_tags))
nlp = pipeline('ner', model=model, tokenizer=tokenizer)
test(nlp)

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "max_p

86
LABEL_4
LABEL_1
LABEL_6
LABEL_12
LABEL_12
LABEL_1
LABEL_1
LABEL_1
LABEL_12
LABEL_1


Unnamed: 0,predict,labels
0,LABEL_4,O
1,LABEL_1,O
2,LABEL_6,O
3,LABEL_12,O
4,LABEL_12,O
5,LABEL_1,O
6,LABEL_1,O
7,LABEL_1,O
8,LABEL_12,O
9,LABEL_1,O
