In [1]:
from datasets import load_dataset, get_dataset_config_names, DatasetDict
from collections import defaultdict, Counter
import pandas as pd

## Data

* Create instance of PAN-X dataset
* Find config files for subsets of the above
* Create a dataset with user defined proportions  
* Add a column for human readable NER tags
* Plot proportion of different tags

In [2]:
dataset_name = "xtreme"
languages = ["de", "fr", "it", "en"]
proportions = [0.629, 0.229, 0.084, 0.059]
configs = [config for config in get_dataset_config_names(dataset_name) if config.startswith("PAN-X") and config.split('.')[1] in languages] # PAN-X.{2-letter ISO language code}
seed = 42

In [3]:
data_dict = defaultdict(DatasetDict)

for config, lang, prop in zip(configs, languages, proportions):
	data_dict[lang] = load_dataset(dataset_name, name=config)
	for split, ds in data_dict[lang].items():
		data_dict[lang][split] = ds.shuffle(seed).select(range(int(ds.num_rows * prop)))

Found cached dataset xtreme (/home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-50c6130fc2dbe6ad.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-3d878c38ca830baa.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-c8fed2b7e6d59cbc.arrow
Found cached dataset xtreme (/home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-e14b50505509ca06.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-529925d4984531e4.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-ef60137063549caf.arrow
Found cached dataset xtreme (/home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-1d0cd9eb0adc8933.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-e0dc2d749c429e9e.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-ca0a6f6c69069001.arrow
Found cached dataset xtreme (/home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-02142ffc6dacef09.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-79992a3f46d42fd5.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-0e5a91e037304e5e.arrow


In [4]:
mapping = data_dict['de']['train'].features['ner_tags'].feature # ClassLabel object

In [5]:
def create_ner_names(x, mapping=mapping):
	x['ner_names'] = [mapping.int2str(idx) for idx in x['ner_tags']]
	return x

In [6]:
for lang in data_dict.keys():
	for split in data_dict[lang].keys():
		data_dict[lang][split] = data_dict[lang][split].map(create_ner_names)

Loading cached processed dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-a4714dc4be797803.arrow
Loading cached processed dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-86d4a1d8fd82ddbd.arrow
Loading cached processed dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-c6112e841ec8c9b1.arrow
Loading cached processed dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-0ef085b80e9e8e08.arrow
Loading cached processed dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-bc3b8df0625b3ce3.arrow
Loadi

In [9]:
counts_dict = defaultdict(Counter)
df_dict = defaultdict(list)

for split in data_dict['de'].keys():
	df_dict['split'].append(split)
	ner_list = []
	for lang in data_dict.keys():
		ds = data_dict[lang][split]

		for item in ds:
			ner_list.extend([tag for tag in item['ner_names'] if tag.startswith('B-')])
	counts_dict[split] = Counter(ner_list)

In [10]:
df_dict = {'split' : [], 'B-ORG' : [], 'B-PER' : [], 'B-LOC' : []}

for split in counts_dict:
	df_dict['split'].append(split)

	for key in counts_dict[split].keys():
		df_dict[key].append(counts_dict[split][key])

pd.DataFrame.from_dict(df_dict)

Unnamed: 0,split,B-ORG,B-PER,B-LOC
0,train,8686,9241,9725
1,validation,4333,4623,4875
2,test,4317,4756,4893


## Multi-Lingual Transformers and Tokenization

* mBERT is eclipsed by XLM-RoBERTa. Differences are the following -
	* Trained on orders of magnitude larger dataset
	* NSP not used in pre-training objective
	* SentencePiece used as tokenizer instead of WordPiece
* Explore using SP and WP

In [11]:
from transformers import AutoTokenizer

xlmr_model_name = 'xlm-roberta-base'

word_piece = AutoTokenizer.from_pretrained('bert-base-uncased')
sentence_piece = AutoTokenizer.from_pretrained(xlmr_model_name)

### Try the tokenizers

In [12]:
sentence = "Jack Sparrow loves New York"
print (f"WordPiece : {word_piece(sentence).tokens()}")
print (f"SentencePiece : {sentence_piece(sentence).tokens()}")

WordPiece : ['[CLS]', 'jack', 'sparrow', 'loves', 'new', 'york', '[SEP]']
SentencePiece : ['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '</s>']


In [13]:
sp_tokenized = sentence_piece(sentence).tokens()
"".join([tok.replace(u"\u2581", " ") for tok in sp_tokenized])

'<s> Jack Sparrow loves New York</s>'

## Multilingual NER model, Tokenize datasets, Performance metrics
* Implement XLM-R for token classification using pretrained XLM-R
* Write a helper function that tokenizes -> model.forward
* Function to tokenize entire dataset and take into consideration attention masks
* Play around with seqeval for evaluating token classification task

In [14]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

In [17]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
	config_class = XLMRobertaConfig
	def __init__(self, config):
		super().__init__(config)

		self.num_labels = config.num_labels

		# Rename to roberta 
		self.roberta = RobertaModel(config, add_pooling_layer=True)

		self.dropout = nn.Dropout(config.hidden_dropout_prob)
		self.classifier = nn.Linear(config.hidden_size, config.num_labels)

		self.init_weights()

	def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
		outputs = self.roberta(input_ids, attention_mask, token_type_ids, **kwargs)

		sequence_op = self.dropout(outputs[0])
		logits = self.classifier(sequence_op)

		loss = None

		if labels is not None:
			loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1))

		return TokenClassifierOutput(loss, logits, outputs.hidden_states, outputs.attentions)


In [15]:
from transformers import AutoConfig

index2tag = {idx: tag for idx, tag in enumerate(mapping.names)}
tag2index = {tag: idx for idx, tag in enumerate(mapping.names)}

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, num_labels=mapping.num_classes, id2label=index2tag, label2id=tag2index)

In [18]:
import torch

device = 'cuda'

xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.embeddings.position_ids']
You should pro

In [19]:
op = sentence_piece(sentence, return_tensors='pt')
xlmr_tokens = op.tokens()
input_ids = op.input_ids
pd.DataFrame([xlmr_tokens, input_ids.numpy().tolist()[0]], index=['Tokens', 'Ids'])

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,</s>
Ids,0,21763,37456,15555,5161,7,2356,5753,2


In [20]:
outputs = xlmr_model(input_ids.to(device))
preds = torch.argmax(outputs.logits, dim=-1)

In [30]:
preds_str = [mapping.names[idx] for idx in preds[0]]
pd.DataFrame([xlmr_tokens, preds_str], index=['Tokens', 'Preds'])

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,</s>
Preds,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER


### TODO - wrap above in one function

## Tokenizing Texts
* We want to tokenize all the samples in the dataset and also assign labels to each word in each sample
* Need to define a function with the signature - func(x : Dict[str, List]) -> Dict[str, List]
* Need to keep in mind the convention that only the first token of a word is assigned the NER tag, subsequent tokens are ignored

In [32]:
example = data_dict['de']['train'][0] 

In [33]:
tokenized = sentence_piece(example['tokens'], is_split_into_words=True)

In [34]:
pd.DataFrame([tokenized.tokens(), tokenized.word_ids()], index=['Tokens', 'Word Ids'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
Tokens,<s>,▁Olymp,ique,▁N,îm,es,▁,",",▁Au,xer,...,▁die,▁Haupt,runde,▁quali,fi,zieren,▁können,▁,.,</s>
Word Ids,,0,0,1,1,1,2,2,3,3,...,14,15,15,16,16,16,17,18,18,


In [36]:
previous_word_idx = None
label_ids = []
word_ids = tokenized.word_ids()

for idx, word_idx in enumerate(word_ids):
	if word_idx is None:
		label_ids.append(-100)
	elif word_idx != previous_word_idx:
		label_ids.append(example['ner_tags'][word_idx])
	else:
		label_ids.append(-100)

	previous_word_idx = word_idx
labels = [mapping.names[idx] if idx != -100 else None for idx in label_ids]

In [37]:
pd.DataFrame([tokenized.tokens(), labels, label_ids, word_ids], index=['Tokens', 'Labels', 'Label Ids', 'Word Ids'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
Tokens,<s>,▁Olymp,ique,▁N,îm,es,▁,",",▁Au,xer,...,▁die,▁Haupt,runde,▁quali,fi,zieren,▁können,▁,.,</s>
Labels,,B-ORG,,I-ORG,,,O,,O,,...,O,O,,O,,,O,O,,
Label Ids,-100,3,-100,4,-100,-100,0,-100,0,-100,...,0,0,-100,0,-100,-100,0,0,-100,-100
Word Ids,,0,0,1,1,1,2,2,3,3,...,14,15,15,16,16,16,17,18,18,


In [53]:
def encode_dataset(examples):
	tokenized = sentence_piece(examples['tokens'], is_split_into_words=True, truncation=True)

	labels = []
	subword_tokens = []

	for idx, label in enumerate(examples["ner_tags"]):
		label_ids = []
		previous_word_idx = None

		for word_id in tokenized.word_ids(batch_index=idx):
			if word_id is None:
				label_ids.append(-100)
			elif word_id != previous_word_idx:
				label_ids.append(label[word_id])
			else:
				label_ids.append(-100)

			previous_word_idx = word_id
		labels.append(label_ids)
		subword_tokens.append(tokenized.tokens(batch_index=idx))

	tokenized['labels'] = labels
	tokenized['subword_tokens'] = subword_tokens
	return tokenized

In [54]:
new_ds = data_dict['de']['train'].map(encode_dataset, batched=True, remove_columns=['tokens', 'ner_tags', 'langs', 'ner_names'])

  0%|          | 0/13 [00:00<?, ?ba/s]

In [56]:
pd.DataFrame([new_ds[0]['subword_tokens'], new_ds[0]['input_ids'], new_ds[0]['labels']], index=['Tokens', 'Token Ids', 'Label Ids'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
Tokens,<s>,▁Olymp,ique,▁N,îm,es,▁,",",▁Au,xer,...,▁die,▁Haupt,runde,▁quali,fi,zieren,▁können,▁,.,</s>
Token Ids,0,68237,11503,541,52644,90,6,4,5655,34058,...,68,47582,95611,14768,1029,117484,2556,6,5,2
Label Ids,-100,3,-100,4,-100,-100,0,-100,0,-100,...,0,0,-100,0,-100,-100,0,0,-100,-100


## Performance Metrics
* We use Precision, Recall and F1-score
* For an entity to be classified correctly all the words corresponding to that entity should be correctly classified
* seqeval package is used to compute the metrics

In [78]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2
labels = [['O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER']]
preds = [['O', 'O', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER']]

In [79]:
print(classification_report(labels, preds))#, mode='strict', scheme=IOB2))

              precision    recall  f1-score   support

        MISC       1.00      1.00      1.00         1
         PER       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [75]:
y_true = [['B-NP', 'I-NP', 'O' ]]
preds = [['I-NP', 'I-NP', 'O' ]]
print(classification_report(y_true, preds, mode='strict', scheme=IOB2))

              precision    recall  f1-score   support

          NP       0.00      0.00      0.00         1

   micro avg       0.00      0.00      0.00         1
   macro avg       0.00      0.00      0.00         1
weighted avg       0.00      0.00      0.00         1



## TODO - Understand strict vs not strict better