In [1]:
# Install the transformers library
!pip install transformers torch -q

# Import the 'pipeline' function from the transformers library
from transformers import pipeline

In [2]:
# Build the token-classification (NER) pipeline around the dslim/bert-base-NER checkpoint.
# The first run downloads the model and tokenizer files from the Hugging Face Hub.
# aggregation_strategy="simple" groups the individual tokens that belong to one entity,
# so a name such as "Rishi Sunak" is returned as a single entity rather than token by token.
ner_pipeline = pipeline(
    task="ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

print("NER pipeline loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


NER pipeline loaded successfully!


In [5]:
# --- Example 1 ---
# Run one sample sentence through the pipeline and list every entity it returns.
sentence1 = "Apple's CEO, Tim Cook, unveiled the new MacBook Pro at their annual conference in Cupertino, California."
ner_results1 = ner_pipeline(sentence1)

# Echo the input sentence, then print one line per entity:
# its text, its entity group, and its score rounded to four decimals.
print(f"Sentence: '{sentence1}'")
print("\nPredicted Entities:")
entity_lines = [
    f"  - Entity: '{entity['word']}', Type: {entity['entity_group']}, Confidence: {entity['score']:.4f}"
    for entity in ner_results1
]
for line in entity_lines:
    print(line)



Sentence: 'Apple's CEO, Tim Cook, unveiled the new MacBook Pro at their annual conference in Cupertino, California.'

Predicted Entities:
  - Entity: 'Apple', Type: ORG, Confidence: 0.9985
  - Entity: 'Tim Cook', Type: PER, Confidence: 0.9997
  - Entity: 'MacBook Pro', Type: MISC, Confidence: 0.9978
  - Entity: 'Cupertino', Type: LOC, Confidence: 0.9977
  - Entity: 'California', Type: LOC, Confidence: 0.9993


In [7]:
# Save the model to disk with save_pretrained.
# NOTE: save_pretrained writes a *directory* of files (config + weights), not a single
# HDF5 file; the ".h5" suffix is kept only so the existing output location does not change.
# The tokenizer is written to the same directory so the saved copy can be reloaded on its
# own, e.g. pipeline("ner", model=model_path, tokenizer=model_path).
model_path = "ner_model.h5"
ner_pipeline.model.save_pretrained(model_path)
ner_pipeline.tokenizer.save_pretrained(model_path)

print(f"Model saved successfully to {model_path}")

Model saved successfully to ner_model.h5


In [8]:
# Install the necessary libraries: transformers for the model, torch as the backend, and gradio for the UI
!pip install gradio transformers torch -q

In [9]:
import gradio as gr
from transformers import pipeline

# Create the NER pipeline once at module level; the prediction function below reuses it
# for every request instead of reloading the model.
# aggregation_strategy="simple" groups the individual tokens that belong to one entity,
# so a name such as "Rishi Sunak" is returned as a single entity rather than token by token.
print("Loading NER model...")
ner_pipeline = pipeline(
    task="ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple",
)
print("Model loaded successfully!")

Loading NER model...


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Model loaded successfully!


In [10]:
def ner_prediction(text):
    """
    Run `text` through the NER pipeline and format the result for
    Gradio's HighlightedText component.

    Args:
        text: The input string to analyse.

    Returns:
        A list of (segment, label) tuples. When the pipeline reports character
        offsets, the segments cover the whole input in order: entity spans carry
        their entity group as label and the text between them carries None, so
        the component shows the full sentence with the entities highlighted.
        When offsets are unavailable, only the entity words are returned.
        When no entity is found, the whole text is returned with the
        placeholder label "No entities found".
    """
    # Get the predictions from the pipeline
    ner_results = ner_pipeline(text)

    # Check if any entities were found
    if not ner_results:
        # If no entities, return the original text with a placeholder label
        return [(text, "No entities found")]

    # Without character offsets the original text cannot be rebuilt around the
    # entities, so fall back to listing just the entity words.
    if any(entity.get('start') is None or entity.get('end') is None for entity in ner_results):
        return [(entity['word'], entity['entity_group']) for entity in ner_results]

    # Walk the entities left to right, emitting the unlabelled text between them
    # so the full input is preserved in the highlighted output.
    highlighted_output = []
    cursor = 0
    for entity in sorted(ner_results, key=lambda item: item['start']):
        start, end = entity['start'], entity['end']
        if start > cursor:
            highlighted_output.append((text[cursor:start], None))
        # Slice the original text rather than using entity['word'], so the
        # displayed span matches the input exactly.
        highlighted_output.append((text[start:end], entity['entity_group']))
        cursor = max(cursor, end)

    # Trailing text after the last entity
    if cursor < len(text):
        highlighted_output.append((text[cursor:], None))

    return highlighted_output

In [11]:
# Create the Gradio Interface
app = gr.Interface(
    fn=ner_prediction,
    inputs=gr.Textbox(
        lines=5,
        label="Input Text",
        placeholder="Enter a sentence here..."
    ),
    outputs=gr.HighlightedText(
        label="Named Entities",
        color_map={"PER": "red", "ORG": "blue", "LOC": "green", "MISC": "yellow"}
    ),
    title="Named Entity Recognition (NER) with BERT",
    description="""
    This app uses a pre-trained BERT model (dslim/bert-base-NER) to identify and highlight named entities in your text.
    Enter a sentence and see the model identify Persons (PER), Organizations (ORG), Locations (LOC), and Miscellaneous (MISC) entities.
    """,
    examples=[
        ["The British Prime Minister Rishi Sunak will visit Paris to meet with Emmanuel Macron next Tuesday."],
        ["Apple, based in Cupertino, California, was co-founded by Steve Jobs."],
        ["The NASA rover Perseverance successfully collected its first rock sample on Mars on September 6th, 2021."]
    ]
)

# Launch the app. A public URL will be generated.
app.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://73ddfeae1cd9fd4e14.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


