#### Importing the Hugging Face `datasets` library and pandas

In [13]:
from datasets import load_dataset
import pandas as pd

#### Loading CoNLL-2003 Dataset

In [10]:
# Load CoNLL-2003 by dataset name; the printed result below shows it holds
# train / validation / test splits.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script execute on this machine — confirm the source is trusted before running.
conll_dataset = load_dataset("conll2003", trust_remote_code=True)

In [11]:
# Show each split with its feature names and row count.
print(conll_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


Train Samples: 14041  
Validation Samples: 3250  
Test Samples: 3453  

#### Exploring first sample from the train dataset

In [42]:
# Peek at the first training record: its id, the token list, and the
# integer-encoded POS, chunk and NER tags (one code per token).
conll_dataset["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [43]:
# Pull out the label definition behind the per-token `ner_tags` feature; its
# repr lists the tag names, and it is used later to turn numeric tags into
# their string labels.
ner_labels = conll_dataset["train"].features["ner_tags"].feature
ner_labels

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

| Numeric Tag | NER Label | Description                          |
|-------------|-----------|--------------------------------------|
| 0           | O         | Outside any named entity             |
| 1           | B-PER     | Beginning of a Person entity         |
| 2           | I-PER     | Inside a Person entity               |
| 3           | B-ORG     | Beginning of an Organization entity  |
| 4           | I-ORG     | Inside an Organization entity        |
| 5           | B-LOC     | Beginning of a Location entity       |
| 6           | I-LOC     | Inside a Location entity             |
| 7           | B-MISC    | Beginning of a Miscellaneous entity  |
| 8           | I-MISC    | Inside a Miscellaneous entity        |


In [44]:
# Lay out the first training sentence as a table (one row per token) and
# swap the integer NER ids for their string labels in a single chained step.
df = pd.DataFrame(conll_dataset["train"][0]).assign(
    ner_tags=lambda frame: frame["ner_tags"].apply(ner_labels.int2str)
)
df

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,EU,22,11,B-ORG
1,0,rejects,42,21,O
2,0,German,16,11,B-MISC
3,0,call,21,12,O
4,0,to,35,21,O
5,0,boycott,37,22,O
6,0,British,16,11,B-MISC
7,0,lamb,21,12,O
8,0,.,7,0,O
