# Resume NER — Step 1
Environment setup and dataset check


In [1]:
!pip install -q transformers datasets seqeval evaluate accelerate gradio


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
import torch, sys
import transformers, datasets

print("Python:", sys.version.splitlines()[0])
print("PyTorch:", torch.__version__, "CUDA available:", torch.cuda.is_available())
print("Transformers:", transformers.__version__)
print("Datasets:", datasets.__version__)


Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
PyTorch: 2.8.0+cu126 CUDA available: True
Transformers: 4.56.1
Datasets: 4.0.0


In [12]:
!pip install -U datasets==2.19.1




In [16]:
from datasets import load_dataset


try:
    dataset = load_dataset("conll2003", revision="refs/convert/parquet")
except Exception as e:
    print(f"Could not load dataset as parquet: {e}")

    dataset = load_dataset("conll2003")

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [17]:
from transformers import AutoTokenizer

MODEL_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [18]:
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

print("Number of labels:", num_labels)
print("Labels:", label_list)

label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for i, l in enumerate(label_list)}


Number of labels: 9
Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [19]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs


In [20]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
tokenized_datasets



Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [21]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id
)


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import evaluate

metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = [
        [id_to_label[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


Downloading builder script: 0.00B [00:00, ?B/s]

In [23]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [24]:
from transformers import TrainingArguments

batch_size = 16

args = TrainingArguments(
    output_dir="bert-finetuned-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    report_to="none"
)


In [25]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [26]:
trainer.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0476,0.041304,0.927689,0.93487,0.931266,0.98863
2,0.0176,0.040059,0.939308,0.945473,0.94238,0.990382
3,0.0125,0.037143,0.944565,0.952036,0.948286,0.991024


TrainOutput(global_step=2634, training_loss=0.05107785048544543, metrics={'train_runtime': 457.2087, 'train_samples_per_second': 92.131, 'train_steps_per_second': 5.761, 'total_flos': 1050534559887048.0, 'train_loss': 0.05107785048544543, 'epoch': 3.0})

In [27]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.03714258596301079, 'eval_precision': 0.9445650358991484, 'eval_recall': 0.9520363513968361, 'eval_f1': 0.9482859777051378, 'eval_accuracy': 0.9910244928157004, 'eval_runtime': 9.166, 'eval_samples_per_second': 354.57, 'eval_steps_per_second': 22.256, 'epoch': 3.0}


In [28]:
trainer.save_model("bert-ner-model")
tokenizer.save_pretrained("bert-ner-model")


('bert-ner-model/tokenizer_config.json',
 'bert-ner-model/special_tokens_map.json',
 'bert-ner-model/vocab.txt',
 'bert-ner-model/added_tokens.json',
 'bert-ner-model/tokenizer.json')

In [29]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
trainer.push_to_hub("bert-finetuned-ner")


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ert-finetuned-ner/training_args.bin: 100%|##########| 5.71kB / 5.71kB            

  ...ert-finetuned-ner/model.safetensors:   1%|          | 3.23MB /  431MB            

CommitInfo(commit_url='https://huggingface.co/soh7/bert-finetuned-ner/commit/a7c7b6b38c213d70d6520796135c7e76193c350a', commit_message='bert-finetuned-ner', commit_description='', oid='a7c7b6b38c213d70d6520796135c7e76193c350a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/soh7/bert-finetuned-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='soh7/bert-finetuned-ner'), pr_revision=None, pr_num=None)

In [32]:
import gradio as gr
import torch


from transformers import pipeline

ner_pipeline = pipeline(
    "token-classification",
    model="bert-ner-model",
    tokenizer="bert-ner-model",
    aggregation_strategy="simple"
)

def ner_inference(text):
    results = ner_pipeline(text)
    return results

demo = gr.Interface(
    fn=ner_inference,
    inputs=gr.Textbox(lines=5, placeholder="Paste resume text here..."),
    outputs="json",
    title="Resume NER Demo"
)

demo.launch(share=True)


Device set to use cuda:0


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://20e282b3efd4722edf.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [33]:
!rm -rf bert-finetuned-ner && git clone https://huggingface.co/spaces/soh7/bert-finetuned-ner
%cd bert-finetuned-ner



Cloning into 'bert-finetuned-ner'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 8 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (8/8), 1.98 KiB | 1.98 MiB/s, done.
/content/bert-finetuned-ner


In [34]:
%%writefile app.py
import gradio as gr
from transformers import pipeline

# Load fine-tuned model from Hugging Face Hub
# Make sure you push your model with trainer.push_to_hub() first
ner_pipeline = pipeline(
    "token-classification",
    model="soh7/bert-finetuned-ner",   # change if your model repo is named differently
    tokenizer="soh7/bert-finetuned-ner",
    aggregation_strategy="simple"
)

def ner_inference(text):
    return ner_pipeline(text)

demo = gr.Interface(
    fn=ner_inference,
    inputs=gr.Textbox(lines=5, placeholder="Paste resume text here..."),
    outputs="json",
    title="Resume NER Demo"
)

demo.launch()


Overwriting app.py


In [37]:
%%writefile requirements.txt
transformers
torch
gradio


Overwriting requirements.txt


In [36]:
!pip freeze > requirements.txt

In [38]:
%%writefile README.md
# Resume Named Entity Recognition (NER) with Hugging Face

## 📌 Overview
This project fine-tunes **BERT (bert-base-cased)** on the CoNLL-2003 dataset for **Named Entity Recognition (NER)**.
The same pipeline can be applied to **resume parsing** (extracting skills, degrees, companies, job titles).

## 🚀 Features
- Fine-tuned Transformer (BERT) for token classification
- Achieves high F1 on validation set
- Interactive Gradio demo to test NER on custom text
- Ready for deployment to Hugging Face Spaces

## 🛠️ Tech Stack
- Python, PyTorch
- Hugging Face Transformers & Datasets
- Gradio (demo UI)
- Google Colab (training)

## 📊 Example
**Input**

```
John Doe is a Software Engineer at Google, who graduated from Stanford University.
```

**Output**
```json
[
  {"entity": "PER", "word": "John Doe"},
  {"entity": "ORG", "word": "Google"},
  {"entity": "JOB", "word": "Software Engineer"},
  {"entity": "LOC", "word": "Stanford University"}
]
```
Model:https://huggingface.co/soh7/bert-finetuned-ner

Demo: https://huggingface.co/spaces/soh7/bert-finetuned-ner

Code: https://github.com/sohamgupta779-art/bert-finetuned-ner.git


Overwriting README.md


In [40]:
!cat README.md


# Resume Named Entity Recognition (NER) with Hugging Face

## 📌 Overview
This project fine-tunes **BERT (bert-base-cased)** on the CoNLL-2003 dataset for **Named Entity Recognition (NER)**.
The same pipeline can be applied to **resume parsing** (extracting skills, degrees, companies, job titles).

## 🚀 Features
- Fine-tuned Transformer (BERT) for token classification
- Achieves high F1 on validation set
- Interactive Gradio demo to test NER on custom text
- Ready for deployment to Hugging Face Spaces

## 🛠️ Tech Stack
- Python, PyTorch
- Hugging Face Transformers & Datasets
- Gradio (demo UI)
- Google Colab (training)

## 📊 Example
**Input**

```
John Doe is a Software Engineer at Google, who graduated from Stanford University.
```

**Output**
```json
[
  {"entity": "PER", "word": "John Doe"},
  {"entity": "ORG", "word": "Google"},
  {"entity": "JOB", "word": "Software Engineer"},
  {"entity": "LOC", "word": "Stanford University"}
]
```
Model:https://huggingface.co/soh7/bert-finetuned-

In [41]:
import os

os.environ["GITHUB_USER"] = "sohamgupta779-art"
os.environ["GITHUB_TOKEN"] = "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"

print("GITHUB_USER set to:", os.environ.get("GITHUB_USER"))
print("GITHUB_TOKEN set (not printed for security).")

GITHUB_USER set to: sohamgupta779-art
GITHUB_TOKEN set (not printed for security).


In [42]:
!git clone https://$GITHUB_USER:$GITHUB_TOKEN@github.com/sohamgupta779-art/bert-finetuned-ner.git
%cd bert-finetuned-ner


Cloning into 'bert-finetuned-ner'...
/content/bert-finetuned-ner/bert-finetuned-ner


In [45]:
%%writefile README.md
# Resume Named Entity Recognition (NER) with Hugging Face

## 📌 Overview
This project fine-tunes **BERT (bert-base-cased)** on the CoNLL-2003 dataset for **Named Entity Recognition (NER)**.
The same pipeline can be applied to **resume parsing** (extracting skills, degrees, companies, job titles).

## 🚀 Features
- Fine-tuned Transformer (BERT) for token classification
- Interactive Gradio demo
- Ready for Hugging Face Spaces

## 🛠️ Tech Stack
- Python, PyTorch
- Hugging Face Transformers
- Gradio
- Google Colab

## 📊 Example
**Input**
John Doe worked at Google as a Software Engineer after studying at Stanford University.

**Output**
```json
[
  {"entity": "PER", "word": "John Doe"},
  {"entity": "ORG", "word": "Google"},
  {"entity": "JOB", "word": "Software Engineer"},
  {"entity": "LOC", "word": "Stanford University"}
]
Model:https://huggingface.co/soh7/bert-finetuned-ner

Demo: https://huggingface.co/spaces/soh7/bert-finetuned-ner

Code: https://github.com/sohamgupta779-art/bert-finetuned-ner.git


Writing README.md


In [49]:
%%writefile requirements.txt
transformers
torch
gradio



Writing requirements.txt


In [52]:
!git config --global user.name "sohamgupta779-art"
!git config --global user.email "sohamgupta779@gmail.com"


In [53]:
!git add .
!git commit -m "Add README and requirements"


[main (root-commit) 3bcc20e] Add README and requirements
 2 files changed, 38 insertions(+)
 create mode 100644 README.md
 create mode 100644 requirements.txt


In [60]:
import os

os.environ["GITHUB_USER"] = "sohamgupta779-art"
os.environ["GITHUB_TOKEN"] = "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"

print("GitHub user:", os.environ.get("GITHUB_USER"))
print("Token set:", "GITHUB_TOKEN" in os.environ)




GitHub user: sohamgupta779-art
Token set: True


In [61]:
!git remote set-url origin https://$GITHUB_USER:$GITHUB_TOKEN@github.com/sohamgupta779-art/bert-finetuned-ner.git


In [62]:
!git push -u origin main


Enumerating objects: 4, done.
Counting objects:  25% (1/4)Counting objects:  50% (2/4)Counting objects:  75% (3/4)Counting objects: 100% (4/4)Counting objects: 100% (4/4), done.
Delta compression using up to 2 threads
Compressing objects:  33% (1/3)Compressing objects:  66% (2/3)Compressing objects: 100% (3/3)Compressing objects: 100% (3/3), done.
Writing objects:  25% (1/4)Writing objects:  50% (2/4)Writing objects:  75% (3/4)Writing objects: 100% (4/4)Writing objects: 100% (4/4), 873 bytes | 873.00 KiB/s, done.
Total 4 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/sohamgupta779-art/bert-finetuned-ner.git
 * [new branch]      main -> main
Branch 'main' set up to track remote branch 'main' from 'origin'.
