# Resume NER — Step 1
Environment setup and dataset check


In [None]:
# Install the libraries used below: transformers/datasets/accelerate for
# training, seqeval + evaluate for the metric, and gradio for the demo UI.
!pip install -q transformers datasets seqeval evaluate accelerate gradio


In [None]:
import torch, sys
import transformers, datasets

# Report the interpreter and library versions, and whether PyTorch sees CUDA.
# Only the first line of sys.version is printed.
print("Python:", sys.version.splitlines()[0])
print("PyTorch:", torch.__version__, "CUDA available:", torch.cuda.is_available())
print("Transformers:", transformers.__version__)
print("Datasets:", datasets.__version__)


In [None]:
# Pin datasets to 2.19.1.
# NOTE(review): if the version-check cell above has already run, `datasets` is
# already imported in this kernel; restart the runtime after this install so
# the pinned version is the one actually loaded.
!pip install -U datasets==2.19.1


In [None]:
from datasets import load_dataset


# Try the "refs/convert/parquet" revision of the dataset repo first; on any
# failure, print the error and fall back to the default revision.
try:
    dataset = load_dataset("conll2003", revision="refs/convert/parquet")
except Exception as e:
    print(f"Could not load dataset as parquet: {e}")

    dataset = load_dataset("conll2003")

# Bare expression so the notebook displays the loaded object.
dataset

In [None]:
from transformers import AutoTokenizer

# Base checkpoint name; used here for the tokenizer and reused later when the
# token-classification model is created.
MODEL_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


In [None]:
# Tag names are read from the "ner_tags" feature of the training split.
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

print("Number of labels:", num_labels)
print("Labels:", label_list)

# Two-way lookup tables between tag names and their integer ids.
id_to_label = dict(enumerate(label_list))
label_to_id = {name: idx for idx, name in id_to_label.items()}


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and build token-level labels for a batch.

    Args:
        examples: Batch with a "tokens" entry (lists of words) and a
            "ner_tags" entry (one tag id per word).

    Returns:
        The tokenizer output with an added "labels" entry. Only the first
        sub-token of each word carries the word's tag; tokens that map to no
        word and continuation sub-tokens are set to -100 (the value the
        metric code in this notebook skips).
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False,
    )

    def align(word_tags, word_ids):
        # A token is labelled only when it maps to a word and that word
        # differs from the one the previous token mapped to.
        preceding = [None, *word_ids[:-1]]
        return [
            word_tags[current]
            if current is not None and current != before
            else -100
            for current, before in zip(word_ids, preceding)
        ]

    encoded["labels"] = [
        align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded


In [None]:
# Run the tokenize/align function over the dataset in batches.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
# Bare expression so the notebook displays the result.
tokenized_datasets



In [None]:
from transformers import AutoModelForTokenClassification

# Token-classification model built from the base checkpoint, sized to the tag
# set and given the id<->label maps so predictions can be reported by name.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id
)


In [None]:
import evaluate

metric = evaluate.load("seqeval")


def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        p: Pair that unpacks into (predictions, labels). Predictions hold
            per-class scores on the last axis; labels hold tag ids, with
            -100 marking positions to skip.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(axis=-1)

    # Gold tag names, with the -100 positions dropped.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id_to_label[tag] for tag in gold_row if tag != -100])

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        true_predictions.append(
            [
                id_to_label[guess]
                for guess, tag in zip(predicted_row, gold_row)
                if tag != -100
            ]
        )

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Map the reported metric names onto seqeval's "overall_*" result keys.
    reported = {
        "precision": "overall_precision",
        "recall": "overall_recall",
        "f1": "overall_f1",
        "accuracy": "overall_accuracy",
    }
    return {name: results[source] for name, source in reported.items()}


In [None]:
from transformers import DataCollatorForTokenClassification

# Tokenization above used padding=False, so batching/padding is left to this
# collator, which is handed to the Trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [None]:
from transformers import TrainingArguments

# Same per-device batch size for training and evaluation.
batch_size = 16

# Training configuration: evaluate and checkpoint once per epoch, log every
# 50 steps, and send no reports to external trackers (report_to="none").
# Checkpoints are written under the "bert-finetuned-ner" directory.
args = TrainingArguments(
    output_dir="bert-finetuned-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    report_to="none"
)


In [None]:
from transformers import Trainer

# Wire model, arguments, tokenized train/validation splits, collator and
# metric function into a Trainer.
# NOTE(review): recent transformers releases appear to deprecate the
# `tokenizer=` argument in favour of `processing_class=` -- confirm against
# the installed version (transformers is installed unpinned above).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()


In [None]:
# Evaluate on the Trainer's eval_dataset (the validation split) and print the
# returned metrics.
metrics = trainer.evaluate()
print(metrics)


In [None]:
# Save model and tokenizer into the same "bert-ner-model" directory; the local
# demo pipeline below loads both from that path.
trainer.save_model("bert-ner-model")
tokenizer.save_pretrained("bert-ner-model")


In [None]:
# Authenticate with the Hugging Face Hub before pushing the model.
from huggingface_hub import notebook_login
notebook_login()


In [None]:
# Upload the trained model to the Hub. The first positional parameter of
# Trainer.push_to_hub is the commit message, not a repository name, so it is
# passed by keyword here. The target repo comes from the TrainingArguments
# (hub_model_id, or the output_dir name "bert-finetuned-ner" when unset).
trainer.push_to_hub(commit_message="End of training")


In [None]:
import gradio as gr
import torch


from transformers import pipeline

# Build an inference pipeline from the locally saved "bert-ner-model"
# directory. aggregation_strategy="simple" makes the pipeline return grouped
# entities instead of one entry per token.
ner_pipeline = pipeline(
    "token-classification",
    model="bert-ner-model",
    tokenizer="bert-ner-model",
    aggregation_strategy="simple"
)

def ner_inference(text):
    """Run the NER pipeline on `text` and return its output unchanged."""
    return ner_pipeline(text)

# Gradio UI: a 5-line text box in, the pipeline's result rendered as JSON out.
demo = gr.Interface(
    fn=ner_inference,
    inputs=gr.Textbox(lines=5, placeholder="Paste resume text here..."),
    outputs="json",
    title="Resume NER Demo"
)

# share=True requests a public share link in addition to the local server.
demo.launch(share=True)


In [None]:
# Clone the Hugging Face Space repo and switch into it.
# NOTE(review): "bert-finetuned-ner" is also the Trainer's output_dir, so this
# rm -rf deletes the local training checkpoints before cloning. The final
# model was saved separately to "bert-ner-model" in an earlier cell.
!rm -rf bert-finetuned-ner && git clone https://huggingface.co/spaces/soh7/bert-finetuned-ner
%cd bert-finetuned-ner



In [None]:
%%writefile app.py
import gradio as gr
from transformers import pipeline

# Load the fine-tuned model from the Hugging Face Hub.
# The model must already have been pushed (e.g. with trainer.push_to_hub()).
# aggregation_strategy="simple" makes the pipeline return grouped entities
# instead of one entry per token.
ner_pipeline = pipeline(
    "token-classification",
    model="soh7/bert-finetuned-ner",   # change if your model repo is named differently
    tokenizer="soh7/bert-finetuned-ner",
    aggregation_strategy="simple"
)

def ner_inference(text):
    """Return whatever the NER pipeline produces for `text`."""
    entities = ner_pipeline(text)
    return entities

# Gradio UI: a 5-line text box in, the pipeline's result rendered as JSON out.
demo = gr.Interface(
    fn=ner_inference,
    inputs=gr.Textbox(lines=5, placeholder="Paste resume text here..."),
    outputs="json",
    title="Resume NER Demo"
)

# Start the app with default launch settings.
demo.launch()


In [None]:
%%writefile requirements.txt
transformers
torch
gradio


In [None]:
# NOTE(review): this overwrites the three-line requirements.txt written by the
# previous cell with every package installed in the current environment.
# Confirm that is intended before committing the file to the Space.
!pip freeze > requirements.txt

In [None]:
%%writefile README.md
# Resume Named Entity Recognition (NER) with Hugging Face

## 📌 Overview
This project fine-tunes **BERT (bert-base-cased)** on the CoNLL-2003 dataset for **Named Entity Recognition (NER)**.
The same pipeline can be applied to **resume parsing** (extracting skills, degrees, companies, job titles) by fine-tuning it on a dataset annotated with those entity types.

## 🚀 Features
- Fine-tuned Transformer (BERT) for token classification
- Achieves high F1 on validation set
- Interactive Gradio demo to test NER on custom text
- Ready for deployment to Hugging Face Spaces

## 🛠️ Tech Stack
- Python, PyTorch
- Hugging Face Transformers & Datasets
- Gradio (demo UI)
- Google Colab (training)

## 📊 Example
**Input**

```
John Doe is a Software Engineer at Google, who graduated from Stanford University.
```

**Output**
```json
[
  {"entity": "PER", "word": "John Doe"},
  {"entity": "ORG", "word": "Google"},
  {"entity": "JOB", "word": "Software Engineer"},
  {"entity": "LOC", "word": "Stanford University"}
]
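```

> **Note:** the output above is simplified and illustrative. CoNLL-2003 only defines the entity types PER, ORG, LOC and MISC, so resume-specific labels such as `JOB` require fine-tuning on a resume-annotated dataset.

Model: https://huggingface.co/soh7/bert-finetuned-ner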

Demo: https://huggingface.co/spaces/soh7/bert-finetuned-ner

Code: https://github.com/sohamgupta779-art/bert-finetuned-ner.git


In [None]:
# Print README.md to check what was written.
!cat README.md


In [None]:
# Clone the GitHub repo using credentials taken from GITHUB_USER / GITHUB_TOKEN.
# NOTE(review): a token embedded in the clone URL is typically kept in the
# clone's remote URL (.git/config) -- confirm it is not leaked via saved
# notebook output or the pushed repo.
# NOTE(review): if the cells run in order, the working directory is already
# the Space clone named "bert-finetuned-ner", so this creates a nested
# directory of the same name and the %cd below enters that nested clone.
!git clone https://$GITHUB_USER:$GITHUB_TOKEN@github.com/sohamgupta779-art/bert-finetuned-ner.git
%cd bert-finetuned-ner


In [None]:
%%writefile README.md
# Resume Named Entity Recognition (NER) with Hugging Face

## 📌 Overview
This project fine-tunes **BERT (bert-base-cased)** on the CoNLL-2003 dataset for **Named Entity Recognition (NER)**.
The same pipeline can be applied to **resume parsing** (extracting skills, degrees, companies, job titles) by fine-tuning it on a dataset annotated with those entity types.

## 🚀 Features
- Fine-tuned Transformer (BERT) for token classification
- Interactive Gradio demo
- Ready for Hugging Face Spaces

## 🛠️ Tech Stack
- Python, PyTorch
- Hugging Face Transformers
- Gradio
- Google Colab

## 📊 Example
**Input**

```
John Doe worked at Google as a Software Engineer after studying at Stanford University.
```

**Output**
```json
[
  {"entity": "PER", "word": "John Doe"},
  {"entity": "ORG", "word": "Google"},
  {"entity": "JOB", "word": "Software Engineer"},
  {"entity": "LOC", "word": "Stanford University"}
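]
```

> **Note:** the output above is simplified and illustrative. CoNLL-2003 only defines the entity types PER, ORG, LOC and MISC, so resume-specific labels such as `JOB` require fine-tuning on a resume-annotated dataset.

Model: https://huggingface.co/soh7/bert-finetuned-ner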

Demo: https://huggingface.co/spaces/soh7/bert-finetuned-ner

Code: https://github.com/sohamgupta779-art/bert-finetuned-ner.git


Writing README.md


In [None]:
%%writefile requirements.txt
transformers
torch
gradio

