In [1]:
from datasets import load_dataset, get_dataset_config_names, DatasetDict
from collections import defaultdict, Counter
import pandas as pd

## Data

* Create instance of PAN-X dataset
* Find config files for subsets of the above
* Create a dataset with user defined proportions  
* Add a column for human readable NER tags
* Plot proportion of different tags

In [2]:
dataset_name = "xtreme"
languages = ["de", "fr", "it", "en"]
proportions = [0.629, 0.229, 0.084, 0.059]
configs = [config for config in get_dataset_config_names(dataset_name) if config.startswith("PAN-X") and config.split('.')[1] in languages] # PAN-X.{2-letter ISO language code}
seed = 42

In [3]:
data_dict = defaultdict(DatasetDict)

for config, lang, prop in zip(configs, languages, proportions):
	data_dict[lang] = load_dataset(dataset_name, name=config)
	for split, ds in data_dict[lang].items():
		data_dict[lang][split] = ds.shuffle(seed).select(range(int(ds.num_rows * prop)))

Found cached dataset xtreme (/home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-50c6130fc2dbe6ad.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-3d878c38ca830baa.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-c8fed2b7e6d59cbc.arrow
Found cached dataset xtreme (/home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-e14b50505509ca06.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-529925d4984531e4.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-ef60137063549caf.arrow
Found cached dataset xtreme (/home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-1d0cd9eb0adc8933.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-e0dc2d749c429e9e.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-ca0a6f6c69069001.arrow
Found cached dataset xtreme (/home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-02142ffc6dacef09.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-79992a3f46d42fd5.arrow
Loading cached shuffled indices for dataset at /home/siddhesh1793/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-0e5a91e037304e5e.arrow


In [13]:
mapping = data_dict['de']['train'].features['ner_tags'].feature # ClassLabel object

In [14]:
def create_ner_names(x, mapping=mapping):
	x['ner_names'] = [mapping.int2str(idx) for idx in x['ner_tags']]
	return x

In [None]:
for lang in data_dict.keys():
	for split in data_dict[lang].keys():
		data_dict[lang][split] = data_dict[lang][split].map(create_ner_names)

In [23]:
counts_dict = defaultdict(Counter)
df_dict = defaultdict(list)

for split in data_dict['de'].keys():
	df_dict['split'].append(split)
	ner_list = []
	for lang in data_dict.keys():
		ds = data_dict[lang][split]

		for item in ds:
			ner_list.extend([tag for tag in item['ner_names'] if tag.startswith('B-')])
	counts_dict[split] = Counter(ner_list)

	for key, value in counts_dict[split].items():
		df_dict[key] = value

In [24]:
df_dict = {'split' : [], 'B-ORG' : [], 'B-PER' : [], 'B-LOC' : []}

for split in counts_dict:
	df_dict['split'].append(split)

	for key in counts_dict[split].keys():
		df_dict[key].append(counts_dict[split][key])

pd.DataFrame.from_dict(df_dict)

Unnamed: 0,split,B-ORG,B-PER,B-LOC
0,train,8686,9241,9725
1,validation,4333,4623,4875
2,test,4317,4756,4893


## Multi-Lingual Transformers and Tokenization

* mBERT is eclipsed by XLM-RoBERTa. Differences are the following -
	* Trained on orders of magnitude larger dataset
	* NSP not used in pre-training objective
	* SentencePiece used as tokenizer instead of WordPiece
* Explore using SP and WP

In [2]:
from transformers import AutoTokenizer

xlmr_model_name = 'xlm-roberta-base'

word_piece = AutoTokenizer.from_pretrained('bert-base-uncased')
sentence_piece = AutoTokenizer.from_pretrained(xlmr_model_name)

### Try the tokenizers

In [30]:
sentence = "Jack Sparrow loves New York"
print (f"WordPiece : {word_piece(sentence).tokens()}")
print (f"SentencePiece : {sentence_piece(sentence).tokens()}")

WordPiece : ['[CLS]', 'jack', 'sparrow', 'loves', 'new', 'york', '[SEP]']
SentencePiece : ['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '</s>']


In [40]:
sp_tokenized = sentence_piece(sentence).tokens()
"".join([tok.replace(u"\u2581", " ") for tok in sp_tokenized])

'<s> Jack Sparrow loves New York</s>'

## Multilingual NER model, Tokenize datasets, Performance metrics
* Implement XLM-R for token classification using pretrained XLM-R
* Write a helper function that tokenizes -> model.forward
* Function to tokenize entire dataset and take into consideration attention masks
* Play around with seqeval for evaluating token classification task

In [47]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

In [80]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
	config_class = XLMRobertaConfig
	def __init__(self, config):
		super().__init__(config)

		self.num_labels = config.num_labels

		# Rename to roberta 
		self.roberta = RobertaModel(config, add_pooling_layer=True)

		self.dropout = nn.Dropout(config.hidden_dropout_prob)
		self.classifier = nn.Linear(config.hidden_size, config.num_labels)

		self.init_weights()

	def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
		outputs = self.roberta(input_ids, attention_mask, token_type_ids, **kwargs)

		sequence_op = self.dropout(outputs[0])
		logits = self.classifier(sequence_op)

		loss = None

		if labels is not None:
			loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1))

		return TokenClassifierOutput(loss, logits, outputs.hidden_states, outputs.attentions)


In [81]:
from transformers import AutoConfig

index2tag = {idx: tag for idx, tag in enumerate(mapping.names)}
tag2index = {tag: idx for idx, tag in enumerate(mapping.names)}

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, num_labels=mapping.num_classes, id2label=index2tag, label2id=tag2index)

loading configuration file config.json from cache at /home/siddhesh1793/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/42f548f32366559214515ec137cdd16002968bf6/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad

In [82]:
import torch

device = 'cuda'

xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device)

loading weights file pytorch_model.bin from cache at /home/siddhesh1793/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/42f548f32366559214515ec137cdd16002968bf6/pytorch_model.bin
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassifica

In [83]:
input_ids = sentence_piece.encode(sentence, return_tensors="pt")
xlmr_tokens = sentence_piece(sentence).tokens()
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=['Tokens', 'Ids'])

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,</s>
Ids,0,21763,37456,15555,5161,7,2356,5753,2


In [88]:
outputs = xlmr_model(input_ids.to(device))
preds = torch.argmax(outputs.logits, dim=-1)

In [93]:
preds_str = [mapping.names[idx] for idx in preds[0]]
pd.DataFrame([xlmr_tokens, preds_str], index=['Tokens', 'Preds'])

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,</s>
Preds,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER


In [4]:
sentence = "Jack Sparrow loves New York"

In [36]:
sp_backend_tokenizer = sentence_piece.backend_tokenizer	
normalized_str = sp_backend_tokenizer.normalizer.normalize_str(sentence)
print(normalized_str)
pretokenized_str = sp_backend_tokenizer.pre_tokenizer.pre_tokenize_str(normalized_str)
print(pretokenized_str)
tokenized_str = sp_backend_tokenizer.model.tokenize(normalized_str)
print(tokenized_str)
#
#sentence_piece(sentence).tokens()

Jack Sparrow loves New York
[('▁Jack', (0, 4)), ('▁Sparrow', (5, 12)), ('▁loves', (13, 18)), ('▁New', (19, 22)), ('▁York', (23, 27))]
[<tokenizers.Token object at 0x7fbc59ca7da0>, <tokenizers.Token object at 0x7fbc59cae080>, <tokenizers.Token object at 0x7fbc59cae670>, <tokenizers.Token object at 0x7fbc59cae6c0>, <tokenizers.Token object at 0x7fbc59cae710>, <tokenizers.Token object at 0x7fbc59cae760>, <tokenizers.Token object at 0x7fbc59cae7b0>, <tokenizers.Token object at 0x7fbc59cae800>, <tokenizers.Token object at 0x7fbc59cae850>, <tokenizers.Token object at 0x7fbc59cae8a0>, <tokenizers.Token object at 0x7fbc59cae8f0>]


In [40]:
sp_backend_tokenizer = word_piece.backend_tokenizer	
normalized_str = sp_backend_tokenizer.normalizer.normalize_str(sentence)
print(normalized_str)
pretokenized_str = sp_backend_tokenizer.pre_tokenizer.pre_tokenize_str(normalized_str)
print(pretokenized_str)
tokenized_str = sp_backend_tokenizer.model.tokenize(normalized_str)
print(tokenized_str)

word_piece(sentence).tokens()

jack sparrow loves new york
[('jack', (0, 4)), ('sparrow', (5, 12)), ('loves', (13, 18)), ('new', (19, 22)), ('york', (23, 27))]
[<tokenizers.Token object at 0x7fbc59c14ad0>]


['[CLS]', 'jack', 'sparrow', 'loves', 'new', 'york', '[SEP]']