Step 1: Install Required Libraries

In [1]:

!pip install spacy pdfminer.six pymupdf
!python -m spacy download en_core_web_sm


Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, pdfminer.six
Successfully installed pdfminer.six-20250506 pymupdf-1.26.3
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5

Step 2: Import Required Libraries

In [2]:

import spacy
import fitz  # PyMuPDF
from pdfminer.high_level import extract_text as extract_text_pdfminer
import io
from google.colab import files

# Load the small English spaCy pipeline once. `nlp` is a notebook-level
# global: extract_entities() calls it to obtain the document's entities.
nlp = spacy.load("en_core_web_sm")


Step 3: Define Text Extraction Functions

In [12]:


def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Either a binary file-like object whose ``read()`` yields the
            PDF bytes, or a filesystem path to a PDF. Paths are accepted so
            this helper keeps working for callers written against the
            path-based variant defined further down the notebook, regardless
            of which definition was executed last.

    Returns:
        str: The page texts joined without a separator ("" for a PDF with
        no pages).
    """
    if hasattr(file, "read"):
        # No filename is available for an in-memory stream, so the type has
        # to be stated explicitly.
        document = fitz.open(stream=file.read(), filetype="pdf")
    else:
        document = fitz.open(file)

    with document as doc:
        return "".join(page.get_text() for page in doc)

def extract_text_from_txt(file):
    """Return the contents of a UTF-8 text resume as a string.

    Args:
        file: A file-like object (binary or text mode) or a filesystem path.
            Paths are accepted so this helper keeps working for callers
            written against the path-based variant defined further down the
            notebook.

    Returns:
        str: The decoded text. A leading UTF-8 byte-order mark is dropped
        when decoding bytes so it cannot end up glued to the first word.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    if hasattr(file, "read"):
        data = file.read()
    else:
        with open(file, "rb") as handle:
            data = handle.read()

    if isinstance(data, bytes):
        # "utf-8-sig" decodes plain UTF-8 identically but strips a leading BOM.
        return data.decode("utf-8-sig")
    return data


Step 4: Define Entity Extraction Function


In [None]:

def extract_entities(text):
    """Run the global spaCy pipeline over resume text and bucket its entities.

    Args:
        text: Plain resume text.

    Returns:
        dict with the keys:
            "Name": text of the first PERSON entity, or "" if there is none.
            "Skills": always an empty list; no rule in this function fills
                it, the key is kept so the output schema stays stable.
            "Degree": text of the last EDUCATION / DEGREE / FAC entity, or "".
            "Institutions": ORG entity texts in first-seen order, without
                duplicates.
            "Work Experience": WORK_OF_ART / NORP entity texts joined by
                single spaces (no trailing space).
    """
    entities = {
        "Name": "",
        "Skills": [],
        "Degree": "",
        "Institutions": [],
        "Work Experience": ""
    }
    experience_parts = []

    for ent in nlp(text).ents:
        # Extracted PDF text keeps hard line breaks, which can end up inside
        # an entity span; collapse any whitespace run to a single space.
        value = " ".join(ent.text.split())
        if not value:
            continue

        label = ent.label_
        if label == "PERSON" and not entities["Name"]:
            entities["Name"] = value
        elif label == "ORG":
            # The same organisation is often mentioned several times.
            if value not in entities["Institutions"]:
                entities["Institutions"].append(value)
        elif label in ("WORK_OF_ART", "NORP"):
            experience_parts.append(value)
        elif label in ("EDUCATION", "DEGREE", "FAC"):
            # NOTE(review): EDUCATION and DEGREE do not look like labels the
            # stock en_core_web_sm model emits, so presumably only FAC can
            # match here -- confirm against the model's label scheme.
            entities["Degree"] = value

    entities["Work Experience"] = " ".join(experience_parts)
    return entities


Step 5: Upload Resume

In [11]:

# Open Colab's upload dialog; the loop below re-opens each uploaded file
# from disk by its name.
uploaded = files.upload()

for filename in uploaded:
    # Match the extension case-insensitively so "Resume.PDF" / "cv.TXT"
    # are accepted as well.
    lowered_name = filename.lower()
    with open(filename, 'rb') as f:
        if lowered_name.endswith(".pdf"):
            raw_text = extract_text_from_pdf(f)
        elif lowered_name.endswith(".txt"):
            raw_text = extract_text_from_txt(f)
        else:
            raise ValueError("Unsupported file format!")

    # The preview below prints resume content, which may include personal
    # data; clear the cell output before sharing the notebook.
    print("\n📄 Extracted Resume Text:\n")
    print(raw_text[:1000])  # Print first 1000 chars
    print("\n🔍 Extracted Entities:\n")
    result = extract_entities(raw_text)
    print(result)


Saving Sowmithra-R-FlowCV-Resume-20250718 (2).pdf to Sowmithra-R-FlowCV-Resume-20250718 (2) (2).pdf

📄 Extracted Resume Text:

Sowmithra R
sowmithraramesh6259@gmail.com
Chennai.India
linkedin.com/in/sowmithra-ramesh-148594272
+91 6379251663
github.com/sowmithraramesh
OBJECTIVE
To contribute to the development of intelligent, data-driven solutions by leveraging my expertise in Artificial Intelligence 
and Machine Learning. I aim to design and deploy scalable models that enhance operational efficiency and decision-
making.
EDUCATION
B.E IN ELECTRONICS AND COMMUNICATION ENGINEERING  
(Hons)
Saveetha Engineering College, Chennai
2022 – 2026
CGPA-9.2
12th STANDARD
Everwin Vidhyashram, Chennai
2021 – 2022
PERCENTAGE-87%
10th STANDARD
Everwin Vidhyashram, Chennai
2020 – 2021
PERCENTAGE-88%
TECHNICAL SKILLS
PROGRAMMING LANGUAGES
Java, Programming in C,Python
DATABASE
MySQL
DIGITAL IMAGE PROCESSING
MACHINE LEARNING
Scikit-learn, Pandas,Numpy,Matplotlib
DATA SCIENCE
Matplotlib,TensorFlow,Seaborn

In [10]:
import gradio as gr
import spacy
import fitz  # PyMuPDF
import tempfile

# Load the small English spaCy pipeline for this cell's extract_entities().
# NOTE(review): the same model is already loaded into `nlp` by an earlier
# cell; this reload makes the cell self-contained but repeats the work.
nlp = spacy.load("en_core_web_sm")

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Filesystem path to a PDF. A binary file-like object is
            accepted too, so callers written against the stream-based
            variant defined earlier in the notebook keep working after this
            cell redefines the name.

    Returns:
        str: The page texts joined without a separator ("" for a PDF with
        no pages).
    """
    if hasattr(file_path, "read"):
        # No filename is available for an in-memory stream, so the type has
        # to be stated explicitly.
        document = fitz.open(stream=file_path.read(), filetype="pdf")
    else:
        document = fitz.open(file_path)

    with document as doc:
        return "".join(page.get_text() for page in doc)

# Function to extract text from TXT
def extract_text_from_txt(file_path):
    """Return the contents of a UTF-8 text resume as a string.

    Args:
        file_path: Filesystem path to the text file. A file-like object
            (binary or text mode) is accepted too, so callers written
            against the stream-based variant defined earlier in the
            notebook keep working after this cell redefines the name.

    Returns:
        str: The decoded text, with a leading UTF-8 byte-order mark removed
        when the content is decoded here.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if hasattr(file_path, "read"):
        data = file_path.read()
        return data.decode("utf-8-sig") if isinstance(data, bytes) else data

    # "utf-8-sig" reads plain UTF-8 identically but strips a leading BOM.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()

# NER Extraction Function
def extract_entities(text):
    """Run the global spaCy pipeline over resume text and bucket its entities.

    Args:
        text: Plain resume text.

    Returns:
        dict with the keys:
            "Name": text of the first PERSON entity, or "" if there is none.
            "Skills": always an empty list; no rule in this function fills
                it, the key is kept so the output schema stays stable.
            "Degree": text of the last EDUCATION / DEGREE / FAC entity, or "".
            "Institutions": ORG entity texts in first-seen order, without
                duplicates.
            "Work Experience": WORK_OF_ART / NORP entity texts joined by
                single spaces (no trailing space).
    """
    entities = {
        "Name": "",
        "Skills": [],
        "Degree": "",
        "Institutions": [],
        "Work Experience": ""
    }
    experience_parts = []

    for ent in nlp(text).ents:
        # Extracted PDF text keeps hard line breaks, which can end up inside
        # an entity span; collapse any whitespace run to a single space.
        value = " ".join(ent.text.split())
        if not value:
            continue

        label = ent.label_
        if label == "PERSON" and not entities["Name"]:
            entities["Name"] = value
        elif label == "ORG":
            # The same organisation is often mentioned several times.
            if value not in entities["Institutions"]:
                entities["Institutions"].append(value)
        elif label in ("WORK_OF_ART", "NORP"):
            experience_parts.append(value)
        elif label in ("EDUCATION", "DEGREE", "FAC"):
            # NOTE(review): EDUCATION and DEGREE do not look like labels the
            # stock en_core_web_sm model emits, so presumably only FAC can
            # match here -- confirm against the model's label scheme.
            entities["Degree"] = value

    entities["Work Experience"] = " ".join(experience_parts)
    return entities

# Main function to process file
def parse_resume(file):
    """Gradio callback: extract resume entities from an uploaded file.

    Args:
        file: The value of the file input -- None when nothing is uploaded,
            otherwise an object exposing the upload's path as ``.name`` or
            the path itself (which of the two presumably depends on the
            Gradio version / File ``type`` setting -- confirm).

    Returns:
        dict: The mapping produced by extract_entities() on success, or
        ``{"error": <message>}`` when no file is given or the extension is
        not supported. Messages are wrapped in a dict because the result is
        rendered by a JSON output component, for which a bare sentence is
        not a valid JSON document.
    """
    if file is None:
        return {"error": "Please upload a file."}

    file_path = getattr(file, "name", file)
    # Compare case-insensitively so "Resume.PDF" / "cv.TXT" are accepted.
    lowered_path = str(file_path).lower()
    if lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith('.txt'):
        text = extract_text_from_txt(file_path)
    else:
        return {"error": "Unsupported file format. Please upload PDF or TXT."}

    return extract_entities(text)

# Gradio UI: a single file input wired to parse_resume, result shown as JSON.
resume_upload = gr.File(label="Upload Resume (.pdf or .txt)")
parsed_view = gr.JSON(label="Parsed Resume Information")

iface = gr.Interface(
    fn=parse_resume,
    inputs=resume_upload,
    outputs=parsed_view,
    title="📄 NLP Resume Parser",
    description="Upload a resume in PDF or TXT format. The system will extract Name, Degree, Institution, Work Experience, etc. using SpaCy NER.",
    theme=gr.themes.Soft(),
    live=True,  # re-run the callback whenever the input changes
)

# Start the app server for the interface defined above.
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d7dc98b6733c42aeec.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [8]:
# NOTE(review): this cell was executed as In [8], i.e. before the Gradio
# cell (In [10]), but it sits after it in the notebook. On a fresh
# "Restart & Run All" it would only run after iface.launch() -- presumably
# it belongs above the launch cell; confirm whether it is still needed.
import nest_asyncio
nest_asyncio.apply()
