In [46]:
import evaluate
import numpy as np
import torch

from datasets import Dataset, DatasetDict, ClassLabel, Features, Sequence, Value
from transformers import AutoTokenizer, pipeline


from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer


I will use the same function used in the demo to collect the sentences from the Conllu files

In [47]:
def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = []# collection of words in a sentence
        collection_labels = [] # collection of labels in a sentence
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split() # split the line into columns
            #print(words)
            if columns == []: 
                sentences.append((''.join(collection_words), collection_labels))
                collection_words = [] # reset collection_words because we are starting a new sentence 
                collection_labels = []
                continue
            collection_words.append(columns[1]) #
            collection_labels += [1] + ([0] * (len(columns[1]) - 1))
            
    return sentences

In [48]:
train_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')

In [49]:
test_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')

In [50]:
train_sentences[:10]

[('看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究底，最後決定的還是自己。',
  [1,
   0,
   1,
   0,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   1,
   0,
   0,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1]),
 ('其便當都是買來的，就算加熱也是由媽媽負責（後來揭曉其實是避免帶來厄運），父親則在電視台上班。',
  [1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   1]),
 ('這次遊行最大的特色，在於越來越多年輕人上街遊行，而且當中不乏行動激烈的躁少年。',
  [1,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   1,
   1,
   0,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
 

## Preprocess

Load the bert-base-chinese tokenizer to preprocess the tokens

In [51]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

the next function <span style="color:green">*to_tokenizer*</span> is going to create the dataset which can be then passed to the pretrained model's tokenizer

In [52]:
def to_tokenizer(train_sents, test_sents):
    sents = (train_sentences,test_sentences)
    glue_dict={}
    for i, ds in enumerate(sents):
        data = {
        'id': [str(i) for i in range(len(ds))],
        'tokens': [sent[0] for sent in ds],
        'ner_tags': [sent[1] for sent in ds],
        }
        features = Features({
            'id': Value('string'),
            'tokens': Value('string'),
            'ner_tags': Sequence(ClassLabel(num_classes=2, names=['Continue_Word', 'Start_Word']))
        })
        dataset = Dataset.from_dict(data, features=features)
        if i == 0:
            glue_dict['train'] = dataset
        else:
            glue_dict['test'] = dataset
        final_ds = DatasetDict(glue_dict)
    return final_ds

We create our dataset and extract our labels from it to be used later

In [53]:
zh_dataset = to_tokenizer(train_sentences, test_sentences)
label_list = zh_dataset['train'].features['ner_tags'].feature.names
zh_dataset['train'][0]


{'id': '0',
 'tokens': '看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究底，最後決定的還是自己。',
 'ner_tags': [1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1]}

Inspect the labels

In [54]:
label_list

['Continue_Word', 'Start_Word']

Test and ispect the tokenizer by passing one item from the dataset. We will tokenize the words by characters and not by words, *is_split_into_words=*<span style="color:blue">*False*</span>.

In [55]:
example = zh_dataset['train'][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=False)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 '看',
 '似',
 '簡',
 '單',
 '，',
 '只',
 '是',
 '二',
 '選',
 '一',
 '做',
 '決',
 '擇',
 '，',
 '但',
 '其',
 '實',
 '他',
 '們',
 '代',
 '表',
 '的',
 '是',
 '你',
 '周',
 '遭',
 '的',
 '親',
 '朋',
 '好',
 '友',
 '，',
 '試',
 '著',
 '給',
 '你',
 '不',
 '同',
 '的',
 '意',
 '見',
 '，',
 '但',
 '追',
 '根',
 '究',
 '底',
 '，',
 '最',
 '後',
 '決',
 '定',
 '的',
 '還',
 '是',
 '自',
 '己',
 '。',
 '[SEP]']

In [56]:
# printout input_ids
tokenized_input["input_ids"]

[101,
 4692,
 849,
 5080,
 1606,
 8024,
 1372,
 3221,
 753,
 6908,
 671,
 976,
 3748,
 3079,
 8024,
 852,
 1071,
 2179,
 800,
 947,
 807,
 6134,
 4638,
 3221,
 872,
 1453,
 6901,
 4638,
 6217,
 3301,
 1962,
 1351,
 8024,
 6275,
 5865,
 5183,
 872,
 679,
 1398,
 4638,
 2692,
 6210,
 8024,
 852,
 6841,
 3418,
 4955,
 2419,
 8024,
 3297,
 2527,
 3748,
 2137,
 4638,
 6917,
 3221,
 5632,
 2346,
 511,
 102]

Just like in the tutorial [CLS] and [SEP] and any unknown token will be labeled with *-100* to then be ignored by pytorch loss function.

In [57]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=False)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]): #examples[f"ner_tags"] f" here is a string literal. it is used to format the string. if we write examples["ner_tags"] it will be the same thing
        word_ids = tokenized_inputs.word_ids(batch_index=i)  
        #print(word_ids)
        input_ids_i = tokenized_inputs["input_ids"][i]
        adv_tokens = tokenizer.convert_ids_to_tokens(input_ids_i)
        token_tuple = zip(word_ids, adv_tokens)
        # print(adv_tokens)
        
        label_ids = []
        for idx, token in token_tuple:  # Set the special tokens to -100.
            if token not in examples["tokens"][i]:
                label_ids.append(-100)
            else:  # Only label the first token of a given word.
                label_ids.append(label[idx])
            
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

Using Datasets map function to preprocess and tokenize the previously created dataset

In [58]:
tokenized_zhds = zh_dataset.map(tokenize_and_align_labels, batched=True) 

Map: 100%|██████████| 3997/3997 [00:00<00:00, 7180.83 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 7781.73 examples/s]


In [59]:
tokenized_zhds['train'][0]

{'id': '0',
 'tokens': '看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究底，最後決定的還是自己。',
 'ner_tags': [1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1],
 'input_ids': [101,
  4692,
  849,
  5080,
  1606,
  8024,
  1372,
  3221,
  753,
  6908,
  671,
  976,
  3748,
  3079,
  8024,
  852,
  1071,
  2179,
  800,
  947,
  807,
  6134,
  4638,
  3221,
  872,
  1453,
  6901,
  4638,
  6217,
  3301,
  1962,
  1351,
  8024,
  6275,
  5865,
  5183,
  872,
  679,
  1398,
  4638,
  2692,
  6210,
  8024,
  852,
  6841,
  3418,
  4955,
  2419,
  8024,
  3297,
  2527,
  3748,
  2137,
  4638,
  6917,
  3221,
  5632,
  2346,
  511,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


Creating a batch of examples using DataCollatorWithPadding.

Enabling padding and assigning the label *-100* for the padding tokens

In [60]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

## Evaluate

Just like in the tutorial we evaluate using *"seqeval"*

In [61]:
seqeval = evaluate.load("seqeval")

Getting the NER labels first, and then creating a function that passes true predictions and true labels to compute to calculate the scores

In [62]:
labels = [label_list[i] for i in example[f"ner_tags"]]

In [63]:
labels

['Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Continue_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word']

In [64]:
def compute_metrics(p): # p is the prediction from the model
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Train

Creating a function that could generate both label2id and id2label dictionaries even if we changed the labels later, then pass these dicts to call the pretrained model.

In [65]:
def label_id(l):
    id2label = {}
    label2id = {}
    for i, label in enumerate(l):
        print(i, label)
        id2label[i] = label
        label2id[label] = i
    return id2label, label2id
    
id2label, label2id = label_id(label_list)

0 Continue_Word
1 Start_Word


In [66]:
label_list

['Continue_Word', 'Start_Word']

In [67]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-chinese", num_labels=len(label_list), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define your training hyperparameters in TrainingArguments. At the end of each epoch, the Trainer will evaluate the seqeval scores and save the training checkpoint.



In [68]:
training_args = TrainingArguments(
    output_dir="chinese_WordSeg", 
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

Passing the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function.

In [69]:
# %env CUDA_VISIBLE_DEVICES=0
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_zhds["train"],
    eval_dataset=tokenized_zhds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

env: CUDA_VISIBLE_DEVICES=0


Calling train() to finetune the model.

In [87]:
#del model
#del trainer

#torch.cuda.empty_cache()

In [91]:
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.150962,0.919536,0.903919,0.911661,0.921456
2,No log,0.148372,0.921689,0.912767,0.917206,0.92556




TrainOutput(global_step=126, training_loss=0.13274088360014416, metrics={'train_runtime': 301.5298, 'train_samples_per_second': 26.511, 'train_steps_per_second': 0.418, 'total_flos': 412208392381740.0, 'train_loss': 0.13274088360014416, 'epoch': 2.0})

In [92]:
# save the model
trainer.save_model("chinese_WordSeg")


## Inference

We will use *segmented_text* later to compare results

In [93]:
text = "他今天晚上不来参加宴会了，对吗" # A sentence I fetched from the internet
segmented_text = ['他','今天', '晚上', '不来', '参加', '宴会', '了', '，', '对', '吗']

In [103]:
classifier = pipeline("ner", model="chinese_WordSeg")
classifier(text)

[{'entity': 'Start_Word',
  'score': 0.9999336,
  'index': 1,
  'word': '他',
  'start': 0,
  'end': 1},
 {'entity': 'Start_Word',
  'score': 0.9999572,
  'index': 2,
  'word': '今',
  'start': 1,
  'end': 2},
 {'entity': 'Continue_Word',
  'score': 0.99995434,
  'index': 3,
  'word': '天',
  'start': 2,
  'end': 3},
 {'entity': 'Start_Word',
  'score': 0.9999131,
  'index': 4,
  'word': '晚',
  'start': 3,
  'end': 4},
 {'entity': 'Continue_Word',
  'score': 0.99988747,
  'index': 5,
  'word': '上',
  'start': 4,
  'end': 5},
 {'entity': 'Start_Word',
  'score': 0.9999032,
  'index': 6,
  'word': '不',
  'start': 5,
  'end': 6},
 {'entity': 'Start_Word',
  'score': 0.99364626,
  'index': 7,
  'word': '来',
  'start': 6,
  'end': 7},
 {'entity': 'Start_Word',
  'score': 0.99995756,
  'index': 8,
  'word': '参',
  'start': 7,
  'end': 8},
 {'entity': 'Continue_Word',
  'score': 0.9999838,
  'index': 9,
  'word': '加',
  'start': 8,
  'end': 9},
 {'entity': 'Start_Word',
  'score': 0.9999298,
  '

In [95]:
# collect all entities in classifier:
classes=classifier(text)
char_preds = {}
for char in classes:
    char_preds[char['word']]=char['entity']
preds = list(char_preds.values())
preds

['Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word']

#### Or tokenize the text and return PyTorch tensors:

In [104]:
tokenizer = AutoTokenizer.from_pretrained("chinese_WordSeg")
inputs = tokenizer(text, return_tensors="pt")

Pass inputs to the model and return the logits:

In [97]:
model = AutoModelForTokenClassification.from_pretrained("chinese_WordSeg")
with torch.no_grad():
    logits = model(**inputs).logits


In [105]:
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
predicted_token_class




['Start_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word']

In [106]:
len(predicted_token_class)
# length here is 17 because the tokenizer adds special tokens to the input (just a guess
# I tried to retrive the tokens but couldn't using this approach, in the first one, the pipeline, it is easy to do that).

17

In [100]:
# Ground truth values
gt=[]
for word in segmented_text:
    gt += ['Start_Word']+ ['Continue_Word']*(len(word)-1)
    
gt

['Start_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Continue_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word',
 'Start_Word']

In [101]:
len(gt)

15

# Demo and transformer version Comparison

### Quantitative analysis and performance comparison

|     Criteria      | LSTM-Demo |  zh_Bert  |
|-------------------|-----------|-----------|
| Precision         |  0.9446   |  0.9217   |
| Recall            |  0.9385   |  0.9128   |
| F1                |  0.9415   |  0.9172   |
| Accuracy          |  0.9266   |  0.9256   |

LSTM-Demo = Demo 2.1 - Chinese word segmentation - LSTM.ipynb
zh_Bert = Chinese_WordSeg

P.S These metric-numbers are from when I ran the demo notebook myself

We notice that both are good models so to speak, as they both are exceeding 90% of all accuracy metrics:

1. **Precision**: precision = tp / (tp + fp)
Precision is the ratio of correctly predicted positives to the total predicted positives (here it is start of word), whether they were false or true. Higher precision means less false positives. LSTM model has a higher precision (0.9446) compared to Chinese_WordSeg (0.9217), indicating that LSTM-Demo has fewer false positives.

2. **Recall**: recall = tp / (tp + fn)
Recall is the ratio of correctly predicted positive observations to the both false and true predictions for the positive cases. LSTM-Demo has higher recall (0.9385) compared to Chinese_WordSeg (0.9128), indicating that LSTM-Demo is better at identifying the positive cases.

3. **F1 Score**: f1 = (2 * recall * precision) / (recall + precision)
The F1 Score is the weighted average of Precision and Recall. Therefore, it takes both false positives and false negatives into account. LSTM-Demo has a higher F1 score (0.9415) than Chinese_WordSeg (0.9172), indicating that it has a better balance of precision and recall.

4. **Accuracy**: (tp + tn) / (tp + fp + tn + fn)
Accuracy is the ratio of correctly predicted observation to the total observations. The accuracy of LSTM-Demo (0.9266) is slightly higher than that of Chinese_WordSeg (0.9256).

Overall, according to these metrics, LSTM-Demo performs better than Chinese_WordSeg, but we should not forget that the LSTM model was built specifically to do the task of Chinese word segmentation while the other one was finetuned on the data. In the demo we trained the model using 30 epochs while here we used only two! the learning rate as weel went from 0.005 to 0.00002 which is a lot lower than in the demo we also use two different loss functions. All of this could affect the accuracy of our predictions.

For finetuning it is better to use a seemingly lower learning rate than what we usually choose for training a model, as this will help the model to converge and avoid any divergence.

The number of epochs also plays an important role in determining the quality of our classification, I noticed that with more epochs it is easy for the model to overfit although still doing well according to the metrics provided during finetuning. with slightly more epochs (less than 10 epochs) the model has somewhat arrived to the level of performance observed by the Demo model.

### Qualitative analysis and performance comparison:
P.S I don't really know Chinese but I'll do my best to analyse the results from the inference part.

The model was able to correctly identify 4 out of five Continue_Word token (failed to correctly specify the 7th token in the given sentence) and all of the Start_Word tokens, which is a very good result.