In [6]:
!pip install -qU transformers[torch] datasets accelerate tokenizers seqeval evaluate

In [53]:
!pip install -qU datasets

In [7]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

In [40]:
import os
from google.colab import userdata
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [8]:
import os
os.environ["WANDB_DISABLED"] = "true"

CoNLL-2003 Dataset Kya Hai?

CoNLL-2003 ek NER (Named Entity Recognition) dataset hai, jo 2003 ke CoNLL (Conference on Computational Natural Language Learning) me introduce kiya gaya tha. Yeh dataset news articles ka collection hai aur 4 entity types detect karne ke liye use hota hai:

PER → Person (e.g., "Elon Musk")

LOC → Location (e.g., "India", "New York")

ORG → Organization (e.g., "Google", "NASA")

MISC → Miscellaneous (e.g., "Olympics", "iPhone")


In [9]:
conll2003 = datasets.load_dataset("conll2003")

In [12]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [13]:
conll2003["train"]

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})

In [14]:
conll2003["train"].description

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

In [15]:
conll2003["train"]. features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [16]:
conll2003["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [17]:
conll2003["train"]. features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [18]:
conll2003["train"]. features['chunk_tags']

Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None)

In [19]:
conll2003["train"]. features['pos_tags']

Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None)

The (DT)        → Determiner

quick (JJ)      → Adjective

brown (JJ)      → Adjective

fox (NN)        → Noun (Singular)

jumps (VBZ)     → Verb (3rd person singular)

over (IN)       → Preposition

the (DT)        → Determiner

lazy (JJ)       → Adjective

dog (NN)        → Noun (Singular)


In [20]:
example_text=conll2003['train'][0]

In [21]:
example_text

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [22]:
example_text["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [23]:
#model = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [24]:
tokenized_id = tokenizer(example_text["tokens"],is_split_into_words=True)

In [25]:
tokenized_id

{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [26]:
tokenized_id["input_ids"]

[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]

In [27]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_id["input_ids"])

In [28]:
tokens

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

In [29]:
example_text['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [30]:
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
[0,1,2,3,4,5,6,7,8]

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [31]:
def tokenize_and_align_labels(examples, label_all_tokens=True):

    #tokeinze ids
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []


    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.

        previous_word_idx = None
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)

            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [32]:
conll2003["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [33]:
q=tokenize_and_align_labels(conll2003["train"][0:1])

In [34]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
eu______________________________________ 3
rejects_________________________________ 0
german__________________________________ 7
call____________________________________ 0
to______________________________________ 0
boycott_________________________________ 0
british_________________________________ 7
lamb____________________________________ 0
._______________________________________ 0
[SEP]___________________________________ -100


In [35]:
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

In [36]:
tokenized_datasets["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

In [37]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased",num_labels=9)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [39]:
label_list=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [40]:
import evaluate
metric = evaluate.load("seqeval")

In [41]:
example=conll2003["train"][0]

In [42]:
example

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [43]:
conll2003["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [44]:
conll2003["train"].features["ner_tags"].feature. names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [45]:
example

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [46]:
label_list=conll2003["train"].features["ner_tags"].feature. names

In [47]:
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [48]:
for i in example["ner_tags"]:
  print(i)

3
0
7
0
0
0
7
0
0


In [49]:
labels = [label_list[i] for i in example["ner_tags"]]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [50]:
label=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

Assusming example

In [51]:
input=['EU','rejects','German','call','to','boycott','British','lamb','.']

actual_lable=['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

pred_lable=['B-ORG', 'O', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [52]:
metric.compute(predictions=[pred_lable],references=[actual_lable])

  _warn_prf(average, modifier, msg_start, len(result))


{'LOC': {'precision': np.float64(0.0),
  'recall': np.float64(0.0),
  'f1': np.float64(0.0),
  'number': np.int64(0)},
 'MISC': {'precision': np.float64(0.0),
  'recall': np.float64(0.0),
  'f1': np.float64(0.0),
  'number': np.int64(2)},
 'ORG': {'precision': np.float64(0.5),
  'recall': np.float64(1.0),
  'f1': np.float64(0.6666666666666666),
  'number': np.int64(1)},
 'PER': {'precision': np.float64(0.0),
  'recall': np.float64(0.0),
  'f1': np.float64(0.0),
  'number': np.int64(0)},
 'overall_precision': np.float64(0.2),
 'overall_recall': np.float64(0.3333333333333333),
 'overall_f1': np.float64(0.25),
 'overall_accuracy': 0.2222222222222222}

In [53]:
results=metric.compute(predictions=[labels],references=[labels])

In [54]:
print(list(results.keys()))

['MISC', 'ORG', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']


In [55]:
print(results["overall_f1"])

1.0


In [56]:
from transformers import TrainingArguments, Trainer

In [57]:
#these are hyperparameter
args=TrainingArguments(
    "test-ner",
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    report_to="none"  # Disable wandb logging
)

In [58]:
data_collator=DataCollatorForTokenClassification(tokenizer)

In [59]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
       for prediction, label in zip(pred_logits, labels)
   ]
    results = metric.compute(predictions=predictions, references=true_labels)

    return {
          "precision": results["overall_precision"],
          "recall": results["overall_recall"],
          "f1": results["overall_f1"],
          "accuracy": results["overall_accuracy"],
  }

In [60]:
trainer=Trainer(
   model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)

  trainer=Trainer(


In [61]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2231,0.070292,0.899759,0.91979,0.909664,0.980047


TrainOutput(global_step=878, training_loss=0.16350296037886844, metrics={'train_runtime': 200.1257, 'train_samples_per_second': 70.161, 'train_steps_per_second': 4.387, 'total_flos': 341387954498718.0, 'train_loss': 0.16350296037886844, 'epoch': 1.0})

In [62]:
model.save_pretrained("ner_model")

In [63]:
# import shutil

# # Model folder ka naam jo tu save kar chuka hai
# model_folder = "/content/ner_model"

# # Model ko zip me convert karo
# shutil.make_archive("bert_model", 'zip', model_folder)

# print("Model successfully zipped as bert_model.zip")


In [64]:
# from google.colab import files

# # ZIP file ko download karne ke liye
# files.download("bert_model.zip")


In [65]:
# import zipfile

# # ZIP file ka path
# zip_path = "bert_model.zip"

# # Extract karne ke liye
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall("./extracted_model")

# print("Model successfully extracted!")

In [66]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [67]:
import json

In [68]:
config=json.load(open("/content/ner_model/config.json"))

In [69]:
config

{'architectures': ['BertForTokenClassification'],
 'attention_probs_dropout_prob': 0.1,
 'classifier_dropout': None,
 'gradient_checkpointing': False,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'id2label': {'0': 'LABEL_0',
  '1': 'LABEL_1',
  '2': 'LABEL_2',
  '3': 'LABEL_3',
  '4': 'LABEL_4',
  '5': 'LABEL_5',
  '6': 'LABEL_6',
  '7': 'LABEL_7',
  '8': 'LABEL_8'},
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'label2id': {'LABEL_0': 0,
  'LABEL_1': 1,
  'LABEL_2': 2,
  'LABEL_3': 3,
  'LABEL_4': 4,
  'LABEL_5': 5,
  'LABEL_6': 6,
  'LABEL_7': 7,
  'LABEL_8': 8},
 'layer_norm_eps': 1e-12,
 'max_position_embeddings': 512,
 'model_type': 'bert',
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'pad_token_id': 0,
 'position_embedding_type': 'absolute',
 'torch_dtype': 'float32',
 'transformers_version': '4.52.4',
 'type_vocab_size': 2,
 'use_cache': True,
 'vocab_size': 30522}

In [70]:
conll2003["train"].features["ner_tags"].feature. names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [71]:
id2label = {str(i): label for i,label in enumerate(label_list)}

In [72]:
label2id = {label: str(i) for i,label in enumerate(label_list)}

In [73]:
config["id2label"]=id2label

In [74]:
config["label2id"] = label2id

In [75]:
json.dump(config,open("/content/ner_model/config.json","w"))

In [76]:
config=json.load(open("/content/ner_model/config.json"))

In [77]:
config

{'architectures': ['BertForTokenClassification'],
 'attention_probs_dropout_prob': 0.1,
 'classifier_dropout': None,
 'gradient_checkpointing': False,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'id2label': {'0': 'O',
  '1': 'B-PER',
  '2': 'I-PER',
  '3': 'B-ORG',
  '4': 'I-ORG',
  '5': 'B-LOC',
  '6': 'I-LOC',
  '7': 'B-MISC',
  '8': 'I-MISC'},
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'label2id': {'O': '0',
  'B-PER': '1',
  'I-PER': '2',
  'B-ORG': '3',
  'I-ORG': '4',
  'B-LOC': '5',
  'I-LOC': '6',
  'B-MISC': '7',
  'I-MISC': '8'},
 'layer_norm_eps': 1e-12,
 'max_position_embeddings': 512,
 'model_type': 'bert',
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'pad_token_id': 0,
 'position_embedding_type': 'absolute',
 'torch_dtype': 'float32',
 'transformers_version': '4.52.4',
 'type_vocab_size': 2,
 'use_cache': True,
 'vocab_size': 30522}

# Tansformer Pipeline

In [8]:
from transformers import pipeline

In [9]:
model_name = "bert-base-uncased"

In [10]:
#model_name="rahuls37/NER-Model-Fine-Tuned"

In [11]:
tokenizer = BertTokenizerFast.from_pretrained("/content/tokenizer")

In [12]:
model_fine_tuned=AutoModelForTokenClassification.from_pretrained("/content/ner_model")

In [83]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write

In [13]:
model_fine_tuned=AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
nlp_pipeline=pipeline("ner",model=model_fine_tuned,tokenizer=tokenizer, device='cuda')

Device set to use cuda


In [None]:
example="Rahul is Data Scientist and Generative AI Engineer"

In [16]:
nlp_pipeline(example)

[{'entity': 'LABEL_0',
  'score': np.float32(0.62513596),
  'index': 1,
  'word': 'sunny',
  'start': 0,
  'end': 5},
 {'entity': 'LABEL_0',
  'score': np.float32(0.55061716),
  'index': 2,
  'word': 'is',
  'start': 6,
  'end': 8},
 {'entity': 'LABEL_0',
  'score': np.float32(0.57001513),
  'index': 3,
  'word': 'data',
  'start': 9,
  'end': 13},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5448726),
  'index': 4,
  'word': 'scientist',
  'start': 14,
  'end': 23},
 {'entity': 'LABEL_1',
  'score': np.float32(0.5516194),
  'index': 5,
  'word': 'and',
  'start': 24,
  'end': 27},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5708128),
  'index': 6,
  'word': 'genera',
  'start': 28,
  'end': 34},
 {'entity': 'LABEL_0',
  'score': np.float32(0.58616346),
  'index': 7,
  'word': '##tive',
  'start': 34,
  'end': 38},
 {'entity': 'LABEL_0',
  'score': np.float32(0.6936058),
  'index': 8,
  'word': 'ai',
  'start': 39,
  'end': 41},
 {'entity': 'LABEL_0',
  'score': np.float32(0.63436

In [17]:
example2="apple launch mobile while eating apple which taste like orange"

In [18]:
nlp_pipeline(example2)

[{'entity': 'LABEL_0',
  'score': np.float32(0.5643003),
  'index': 1,
  'word': 'apple',
  'start': 0,
  'end': 5},
 {'entity': 'LABEL_0',
  'score': np.float32(0.50278264),
  'index': 2,
  'word': 'launch',
  'start': 6,
  'end': 12},
 {'entity': 'LABEL_1',
  'score': np.float32(0.54607844),
  'index': 3,
  'word': 'mobile',
  'start': 13,
  'end': 19},
 {'entity': 'LABEL_1',
  'score': np.float32(0.52345985),
  'index': 4,
  'word': 'while',
  'start': 20,
  'end': 25},
 {'entity': 'LABEL_1',
  'score': np.float32(0.53127474),
  'index': 5,
  'word': 'eating',
  'start': 26,
  'end': 32},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5823803),
  'index': 6,
  'word': 'apple',
  'start': 33,
  'end': 38},
 {'entity': 'LABEL_1',
  'score': np.float32(0.56778514),
  'index': 7,
  'word': 'which',
  'start': 39,
  'end': 44},
 {'entity': 'LABEL_1',
  'score': np.float32(0.52626485),
  'index': 8,
  'word': 'taste',
  'start': 45,
  'end': 50},
 {'entity': 'LABEL_1',
  'score': np.float

In [None]:
example="my name is Rahul Samant"

In [20]:
nlp_pipeline(example)

[{'entity': 'LABEL_0',
  'score': np.float32(0.65440047),
  'index': 1,
  'word': 'my',
  'start': 0,
  'end': 2},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5329803),
  'index': 2,
  'word': 'name',
  'start': 3,
  'end': 7},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5358475),
  'index': 3,
  'word': 'is',
  'start': 8,
  'end': 10},
 {'entity': 'LABEL_0',
  'score': np.float32(0.69338924),
  'index': 4,
  'word': 'su',
  'start': 11,
  'end': 13},
 {'entity': 'LABEL_0',
  'score': np.float32(0.62256986),
  'index': 5,
  'word': '##un',
  'start': 13,
  'end': 15},
 {'entity': 'LABEL_0',
  'score': np.float32(0.53070694),
  'index': 6,
  'word': '##y',
  'start': 15,
  'end': 16},
 {'entity': 'LABEL_0',
  'score': np.float32(0.6166061),
  'index': 7,
  'word': 'sa',
  'start': 17,
  'end': 19},
 {'entity': 'LABEL_0',
  'score': np.float32(0.6299165),
  'index': 8,
  'word': '##vita',
  'start': 19,
  'end': 23}]

In [21]:
example2="apple launch mobile while"

In [22]:
nlp_pipeline(example2)

[{'entity': 'LABEL_0',
  'score': np.float32(0.61157155),
  'index': 1,
  'word': 'apple',
  'start': 0,
  'end': 5},
 {'entity': 'LABEL_1',
  'score': np.float32(0.50158113),
  'index': 2,
  'word': 'launch',
  'start': 6,
  'end': 12},
 {'entity': 'LABEL_1',
  'score': np.float32(0.55365235),
  'index': 3,
  'word': 'mobile',
  'start': 13,
  'end': 19},
 {'entity': 'LABEL_0',
  'score': np.float32(0.533301),
  'index': 4,
  'word': 'while',
  'start': 20,
  'end': 25}]

In [23]:
example="apple founder loves eating apple"

In [24]:
nlp_pipeline(example)

[{'entity': 'LABEL_0',
  'score': np.float32(0.69223094),
  'index': 1,
  'word': 'apple',
  'start': 0,
  'end': 5},
 {'entity': 'LABEL_1',
  'score': np.float32(0.52663136),
  'index': 2,
  'word': 'founder',
  'start': 6,
  'end': 13},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5877507),
  'index': 3,
  'word': 'loves',
  'start': 14,
  'end': 19},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5124518),
  'index': 4,
  'word': 'eating',
  'start': 20,
  'end': 26},
 {'entity': 'LABEL_1',
  'score': np.float32(0.5292877),
  'index': 5,
  'word': 'apple',
  'start': 27,
  'end': 32}]

In [25]:
example="Microsoft Windows created their software by idea that came from the window of the house"

In [26]:
nlp_pipeline(example)

[{'entity': 'LABEL_0',
  'score': np.float32(0.8010843),
  'index': 1,
  'word': 'microsoft',
  'start': 0,
  'end': 9},
 {'entity': 'LABEL_0',
  'score': np.float32(0.75660294),
  'index': 2,
  'word': 'windows',
  'start': 10,
  'end': 17},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5212051),
  'index': 3,
  'word': 'created',
  'start': 18,
  'end': 25},
 {'entity': 'LABEL_0',
  'score': np.float32(0.59358436),
  'index': 4,
  'word': 'their',
  'start': 26,
  'end': 31},
 {'entity': 'LABEL_0',
  'score': np.float32(0.7306076),
  'index': 5,
  'word': 'software',
  'start': 32,
  'end': 40},
 {'entity': 'LABEL_0',
  'score': np.float32(0.52446055),
  'index': 6,
  'word': 'by',
  'start': 41,
  'end': 43},
 {'entity': 'LABEL_0',
  'score': np.float32(0.557764),
  'index': 7,
  'word': 'idea',
  'start': 44,
  'end': 48},
 {'entity': 'LABEL_1',
  'score': np.float32(0.50097317),
  'index': 8,
  'word': 'that',
  'start': 49,
  'end': 53},
 {'entity': 'LABEL_1',
  'score': np.floa

In [27]:
example= "sunny is a founder of facebook and microsoft"

In [28]:
nlp_pipeline(example)

[{'entity': 'LABEL_0',
  'score': np.float32(0.6400294),
  'index': 1,
  'word': 'sunny',
  'start': 0,
  'end': 5},
 {'entity': 'LABEL_0',
  'score': np.float32(0.63691145),
  'index': 2,
  'word': 'is',
  'start': 6,
  'end': 8},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5735639),
  'index': 3,
  'word': 'a',
  'start': 9,
  'end': 10},
 {'entity': 'LABEL_0',
  'score': np.float32(0.59646946),
  'index': 4,
  'word': 'founder',
  'start': 11,
  'end': 18},
 {'entity': 'LABEL_0',
  'score': np.float32(0.68693227),
  'index': 5,
  'word': 'of',
  'start': 19,
  'end': 21},
 {'entity': 'LABEL_0',
  'score': np.float32(0.6014315),
  'index': 6,
  'word': 'facebook',
  'start': 22,
  'end': 30},
 {'entity': 'LABEL_0',
  'score': np.float32(0.6331626),
  'index': 7,
  'word': 'and',
  'start': 31,
  'end': 34},
 {'entity': 'LABEL_0',
  'score': np.float32(0.6610162),
  'index': 8,
  'word': 'microsoft',
  'start': 35,
  'end': 44}]

In [106]:
# 1. pretrain model
# 2. supervise fiunetuing(NER data)
# 3. ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [47]:
!pip install -U "huggingface_hub[cli]"

Collecting InquirerPy==0.3.4 (from huggingface_hub[cli])
  Downloading InquirerPy-0.3.4-py3-none-any.whl.metadata (8.1 kB)
Collecting pfzy<0.4.0,>=0.3.1 (from InquirerPy==0.3.4->huggingface_hub[cli])
  Downloading pfzy-0.3.4-py3-none-any.whl.metadata (4.9 kB)
Downloading InquirerPy-0.3.4-py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pfzy-0.3.4-py3-none-any.whl (8.5 kB)
Installing collected packages: pfzy, InquirerPy
Successfully installed InquirerPy-0.3.4 pfzy-0.3.4


In [48]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write

In [49]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
from huggingface_hub import HfApi

In [50]:
# Model directory (jisme fine-tuned model hai)
model_dir = "/content/ner_model"

In [51]:
!huggingface-cli upload rahuls37/NER-Model-Fine-Tuned /content/ner_model

Start hashing 2 files.
Finished hashing 2 files.
Uploading...: 100% 436M/436M [00:11<00:00, 39.0MB/s]
https://huggingface.co/rahuls37/NER-Model-Fine-Tuned/tree/main/.


In [52]:

#tokenizer.push_to_hub("your_username/your_model_name")

print("✅ Model pushed successfully!")

✅ Model pushed successfully!


In [None]:
# For Finetuning bert on Summarization dataset
# Consider the below data: https://huggingface.co/datasets/alexfabbri/multi_news

# follow the same notebook and make required changes

# For Finetuning bert on text classification dataset
# consider the below data: https://huggingface.co/datasets/fancyzhx/ag_news

# follow the same notebook and make required changes