### NER (Named Entity Recognition)

- CoNLL-2003
- XTREME
- CrossNER

In [3]:
import pandas as pd

# Toy sentence for illustrating NER tagging: every whitespace-separated token
# gets exactly one B-/I-/O tag, so `tokens` and `labels` have the same length.
tokens = "Sandeep Reddy is a computer scientist at Google Aravind facebook Snapchat in Texas".split()
labels = [
    "B-PER", "I-PER",
    "O", "O", "O", "O", "O",
    "B-ORG",
    "B-PER",
    "B-ORG", "I-ORG",
    "O",
    "B-LOC",
]

# One row per sequence (tokens / labels), one column per token position.
df = pd.DataFrame.from_dict({"tokens": tokens, "labels": labels}, orient="index")
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
tokens,Sandeep,Reddy,is,a,computer,scientist,at,Google,Aravind,facebook,Snapchat,in,Texas
labels,B-PER,I-PER,O,O,O,O,O,B-ORG,B-PER,B-ORG,I-ORG,O,B-LOC


## Dataset
**Cross-lingual TRansfer Evaluation of Multilingual Encoders (XTREME)**

In [4]:
from datasets import get_dataset_config_names
from datasets import load_dataset

# Names of every configuration (subset) available under the "xtreme" dataset.
xtreme_subsets = get_dataset_config_names("xtreme")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 131k/131k [00:00<00:00, 1.86MB/s]


In [5]:
# Report how many configurations the lookup returned.
print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [6]:
# Keep only the configurations whose name starts with "PAN" (the PAN-X
# subsets), then preview the first three.
panx_subsets = [
    config_name
    for config_name in xtreme_subsets
    if config_name.startswith("PAN")
]
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [7]:
# Number of configurations that matched the "PAN" prefix filter.
len(panx_subsets)

40

In [8]:
# Load the English PAN-X configuration; the result is only displayed here,
# not stored in a variable.
load_dataset("xtreme", name="PAN-X.en")

Downloading data: 100%|██████████| 942k/942k [00:00<00:00, 1.32MB/s]
Downloading data: 100%|██████████| 472k/472k [00:00<00:00, 1.40MB/s]
Downloading data: 100%|██████████| 472k/472k [00:00<00:00, 1.22MB/s]
Generating train split: 100%|██████████| 20000/20000 [00:00<00:00, 1139772.01 examples/s]
Generating validation split: 100%|██████████| 10000/10000 [00:00<00:00, 1146830.72 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 1142581.93 examples/s]


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [9]:
# Load the English PAN-X configuration again, this time keeping the result in
# `en` so the cells below can reuse it.
en = load_dataset("xtreme", name='PAN-X.en')

In [10]:
# List the split names held by the loaded dataset.
en.keys()

dict_keys(['train', 'validation', 'test'])

In [21]:
# Pull out the training split for closer inspection.
en_train = en['train']

In [12]:
# Display the training split's summary.
en_train

Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 20000
})

In [13]:
# Look at a single training example (row 42).
en_train[42]

{'tokens': ['File', ':', 'CCStaClara.JPG|Through', 'Santa', 'Clara'],
 'ner_tags': [0, 0, 0, 5, 6],
 'langs': ['en', 'en', 'en', 'en', 'en']}

### Create a dictionary

In [27]:
# Show the first training example as a table: one row per field, one column
# per token position.
pd.DataFrame(en["train"][0]).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
tokens,R.H.,Saunders,(,St.,Lawrence,River,),(,968,MW,)
ner_tags,3,4,0,3,4,4,0,0,0,0,0
langs,en,en,en,en,en,en,en,en,en,en,en


In [15]:
# Print the declared feature type of every column in the training split.
train_features = en["train"].features
for key, value in train_features.items():
    print(f"{key}: {value} \n")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None) 

ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None) 

langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None) 



In [16]:
# Inner feature of the `ner_tags` column; its `int2str` turns a tag id into
# the corresponding tag name.
tags = en["train"].features["ner_tags"].feature


def create_tag_names(batch):
    """Build a ``ner_tags_str`` entry with the string name of each NER tag id.

    Args:
        batch: Mapping handed over by ``Dataset.map``; it must provide an
            ``ner_tags`` sequence of integer tag ids.

    Returns:
        A dict with the single key ``ner_tags_str`` holding the tag names in
        the same order as ``batch["ner_tags"]``.
    """
    return {"ner_tags_str": list(map(tags.int2str, batch["ner_tags"]))}

# Add the string-tag column to every split, then show the first training
# example's tokens next to their tag names.
panx_en = en.map(create_tag_names)
de_example = panx_en["train"][0]
pd.DataFrame(
    data=[de_example["tokens"], de_example["ner_tags_str"]],
    index=["Tokens", "Tags"],
)

Map: 100%|██████████| 20000/20000 [00:00<00:00, 28714.47 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 27360.98 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 31148.91 examples/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Tokens,R.H.,Saunders,(,St.,Lawrence,River,),(,968,MW,)
Tags,B-ORG,I-ORG,O,B-ORG,I-ORG,I-ORG,O,O,O,O,O


In [17]:
from collections import Counter
from collections import defaultdict
from datasets import DatasetDict
# Count entities per split and per entity type. Only tags starting with "B"
# are counted, so each entity is counted once no matter how many tokens it has.
split2freqs = defaultdict(Counter)
for split, dataset in panx_en.items():
    entity_types = (
        tag.split("-")[1]
        for sentence_tags in dataset["ner_tags_str"]
        for tag in sentence_tags
        if tag.startswith("B")
    )
    for entity_type in entity_types:
        split2freqs[split][entity_type] += 1

# Rows are splits, columns are entity types.
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,ORG,PER,LOC
train,9422,9164,9345
validation,4677,4635,4834
test,4745,4556,4657


### Model we will be using

In [18]:
## Load the BERT and XLM-RoBERTa tokenizers
from transformers import AutoTokenizer

# BERT (cased) checkpoint and its tokenizer.
bert_model_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# XLM-RoBERTa checkpoint and its tokenizer.
xlmr_model_name = "xlm-roberta-base"
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [19]:
# Tokenize the same sentence with both tokenizers so their outputs can be
# compared side by side.
text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [20]:
# One row per tokenizer; the shorter token list is padded with missing values
# so both rows fit in the same table.
df = pd.DataFrame.from_dict(
    {"BERT": bert_tokens, "XLM-R": xlmr_tokens},
    orient="index",
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
BERT,[CLS],Jack,Spa,##rrow,loves,New,York,!,[SEP],
XLM-R,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>


## SentencePiece Tokenizer

![chapter04_clf-architecture.png](attachment:chapter04_clf-architecture.png)
