In [7]:
import gradio as gr
import PyPDF2
from transformers import pipeline
import re

# Shared text2text-generation pipeline backed by the google/flan-t5-small checkpoint;
# created once at module level so every request reuses the same loaded model.
llm_pipeline = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
)

# Function to clean extracted text from PDF
def clean_text(text):
    """Collapse each run of whitespace in `text` into one space and trim both ends."""
    # Splitting on whitespace runs and re-joining with a single space is the
    # same as substituting every run with ' '.
    pieces = re.split(r'\s+', text)
    return ' '.join(pieces).strip()

# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the whitespace-normalised text of every page in a PDF.

    Args:
        file: Passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        str: The text of all pages, joined with a space and run through
        ``clean_text``. Empty when no page yields any text.
    """
    reader = PyPDF2.PdfReader(file)
    # `or ""` tolerates a page that yields no text (empty or None).
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a space so the last word of one page is not fused with the
    # first word of the next; clean_text collapses any doubled whitespace.
    return clean_text(" ".join(page_texts))

# Function to extract resume information using LLM
def extract_resume_info(text):
    """Ask the LLM for name, email, skills and experience found in resume text.

    Args:
        text: Plain resume text that is embedded into the prompt.

    Returns:
        str: The model's generated text. For every label among ``Name:``,
        ``Email:``, ``Skills:`` and ``Experience:`` that the model output
        lacks, a ``"<Label>: Not found"`` line is placed in front of it, in
        that order. When ``text`` is empty or whitespace-only, the model is
        not called and all four "Not found" lines are returned.
    """
    required_fields = ("Name", "Email", "Skills", "Experience")

    # Nothing to analyse (e.g. a PDF with no extractable text): skip the model
    # rather than letting it generate output from an empty resume.
    if not text or not text.strip():
        return "\n".join(f"{field}: Not found" for field in required_fields)

    # Define a detailed prompt for the LLM
    # NOTE(review): the whole resume is embedded with no length limit; long
    # resumes may exceed what the model can take in — confirm and truncate if needed.
    prompt = f"""
    Extract the following information from the resume text below:
    - Name: Extract the full name of the person.
    - Email: Extract the email address.
    - Skills: Extract a list of technical skills (e.g., Python, Java, Machine Learning).
    - Experience: Summarize the work experience in 2-3 sentences.

    Resume Text:
    {text}

    Provide the output in the following format:
    Name: [Full Name]
    Email: [Email Address]
    Skills: [Skill 1, Skill 2, Skill 3]
    Experience: [Summary of work experience]
    """

    # Use the LLM to extract information
    result = llm_pipeline(prompt, max_length=500)
    extracted_info = result[0]['generated_text']

    # Collect placeholders for missing labels in the order the prompt asks for.
    # Prepending them one at a time would reverse that order.
    missing_lines = [
        f"{field}: Not found\n"
        for field in required_fields
        if f"{field}:" not in extracted_info
    ]
    return "".join(missing_lines) + extracted_info

# Gradio interface
def process_resume(file):
    """Gradio callback: turn an uploaded PDF resume into extracted information.

    Args:
        file: The value supplied by the upload component; ``None`` when
            nothing was uploaded. Otherwise it is handed to
            ``extract_text_from_pdf``.

    Returns:
        str: The text produced by ``extract_resume_info``, or a short message
        asking for an upload when ``file`` is ``None``.
    """
    # Submitting the form without a file would otherwise crash in the PDF reader.
    if file is None:
        return "No file uploaded. Please upload a resume in PDF format."

    # Step 1: Extract text from the uploaded PDF
    text = extract_text_from_pdf(file)

    # Step 2: Extract information using the LLM
    return extract_resume_info(text)

# Build the web UI: one file-upload input wired through process_resume to one text box
interface = gr.Interface(
    title="Resume Tracking System",
    description="Upload a resume (PDF) to extract key information using an LLM.",
    fn=process_resume,
    inputs=gr.File(label="Upload Resume (PDF)"),
    outputs=gr.Textbox(label="Extracted Information"),
)

# Start serving the app
interface.launch()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://42082222b21a351747.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [6]:
pip install gradio transformers PyPDF2 torch

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86