In [None]:
import os
import re
import fitz  # PyMuPDF
import docx  # python-docx
import requests  # To call the AI API
from pathlib import Path
import json

from vertexai.preview.generative_models import (
    GenerationConfig,
    GenerativeModel,
)


# Decoding settings for field extraction: temperature=0 asks for the least
# random output, and max_output_tokens bounds the length of the reply.
generation_config = GenerationConfig(
    temperature=0,
    max_output_tokens=8192,
)
# Attach the settings to the model so they are the default for every
# generate_content() call made through `model`.
model = GenerativeModel("gemini-pro", generation_config=generation_config)



# Function to parse PDF files
def parse_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    The document is opened with ``fitz.open`` and closed again when the
    ``with`` block exits. Each page's ``get_text()`` result is concatenated
    in iteration order with no separator added between pages.
    """
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

# Function to parse DOCX files
def parse_docx(file_path):
    """Return the paragraph text of the Word document at ``file_path``.

    The file is loaded with ``docx.Document``; every entry of its
    ``paragraphs`` list contributes its ``text`` followed by a newline, so
    a non-empty result always ends with ``"\\n"``.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return "".join(para.text + "\n" for para in paragraphs)

def _extract_json_payload(raw):
    """Return the JSON portion of a model reply as a string.

    Handles a reply wrapped in a Markdown code fence (with or without a
    language tag such as ``json``) and a reply with text before or after
    the JSON object. The JSON content itself is never modified.
    """
    fenced = re.search(r"```[A-Za-z]*\s*(.*?)\s*```", raw, re.DOTALL)
    payload = (fenced.group(1) if fenced else raw).strip()

    # No usable fence (e.g. leading prose, a bare "json" tag, or an
    # unterminated fence): fall back to the outermost {...} span.
    if not payload.startswith(("{", "[")):
        start, end = payload.find("{"), payload.rfind("}")
        if start != -1 and end > start:
            payload = payload[start:end + 1]
    return payload


# Function to extract fields from text using Gen AI API
def extract_fields(text):
    """Ask the generative model to pull structured fields out of resume text.

    Args:
        text: Plain text of a resume; it is embedded verbatim in the prompt.

    Returns:
        The object produced by ``json.loads`` on the model's reply. The
        prompt requests the keys 'name', 'contact', 'address' and
        'education', but the reply is not validated against them.

    Raises:
        json.JSONDecodeError: If the reply does not contain valid JSON.
        Any exception raised by ``model.generate_content`` propagates.
    """
    # Prompt for consistent output format
    prompt = f"""
    Extract the following fields from the resume text:
    1. Full Name
    2. Contact Information: Phone number, Email address, LinkedIn profile (if available)
    3. Address: City and state
    4. Education: Degrees obtained, Institutions attended, Graduation years

    Format the output as a JSON text that can be used to convert to object using json.load() with the keys: 'name', 'contact', 'address', 'education'.
    Resume text:
    {text}
    """

    raw_reply = model.generate_content(prompt).text

    # Only the wrapper around the JSON is removed; stripping the substring
    # "json" from the whole reply would also corrupt field values that
    # happen to contain it.
    return json.loads(_extract_json_payload(raw_reply))

# Function to rename a file after the person's full name
def rename_file(file_path, full_name):
    """Rename ``file_path`` to ``<full_name><original suffix>`` in its folder.

    Args:
        file_path: Path (``str`` or ``Path``) of the file to rename.
        full_name: Name to use as the new file stem. Path separators and
            characters that are invalid in common filesystems are replaced
            by spaces, runs of whitespace are collapsed, and leading or
            trailing spaces and dots are removed.

    Returns:
        pathlib.Path: Location of the file after the call. When
        ``full_name`` is empty, ``None`` or contains no usable characters,
        or the file already has the target name, nothing is renamed and the
        original path is returned.

    Raises:
        OSError: Propagated from ``os.rename`` (e.g. the source is missing).
    """
    source = Path(file_path)

    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', " ", str(full_name or ""))
    cleaned = " ".join(cleaned.split()).strip(" .")
    if not cleaned:
        return source

    target = source.parent / f"{cleaned}{source.suffix}"
    if target == source:
        return source

    # Never overwrite a different file that already carries this name:
    # append " (1)", " (2)", ... until a free name is found.
    counter = 1
    while target.exists() and not target.samefile(source):
        target = source.parent / f"{cleaned} ({counter}){source.suffix}"
        counter += 1

    os.rename(source, target)
    return target

# Main function to process resumes
def process_resumes(directory, output_dir="json"):
    """Extract fields from every resume under ``directory`` and rename the files.

    For each ``.pdf`` or ``.docx`` file found recursively (suffix matched
    case-insensitively) the text is parsed, sent to ``extract_fields``, the
    file is renamed via ``rename_file`` using the extracted 'name', and the
    extracted fields are written to ``<output_dir>/<renamed stem>.json``.
    ``.doc`` files are reported and skipped.

    A failure on one file is printed together with the file path and does
    not stop the remaining files from being processed.

    Args:
        directory: Root folder that is searched recursively.
        output_dir: Folder for the JSON results; created if missing.
    """
    json_dir = Path(output_dir)
    json_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot the tree before touching it: files are renamed inside this
    # same tree, and a lazily evaluated rglob could hand a renamed file
    # back a second time.
    candidates = [path for path in Path(directory).rglob("*") if path.is_file()]

    for file in candidates:
        suffix = file.suffix.lower()
        if suffix not in (".pdf", ".docx", ".doc"):
            continue
        try:
            # `text` is always assigned from the current file, so a
            # previous resume's text can never be reused by accident.
            if suffix == ".pdf":
                text = parse_pdf(file)
            elif suffix == ".docx":
                text = parse_docx(file)
            else:
                print(f"Skipping {file}: .doc files are not supported, convert to .docx")
                continue

            # Extract fields
            fields = extract_fields(text)

            # Rename the file
            renamed_file_path = rename_file(file, fields.get('name'))
            print(f"Renamed file to: {renamed_file_path}")

            # Name the JSON after the renamed file so both always match and
            # the JSON name is a valid file name whenever the rename was.
            json_path = json_dir / f"{Path(renamed_file_path).stem}.json"
            with open(json_path, 'w', encoding="utf-8") as f:
                json.dump(fields, f, indent=4)
        except Exception as e:
            # Best effort per file: report which file failed and move on.
            print(f"Failed to process {file}: {e}")
            continue

# Example usage: process every resume found under the folder named below
# (resolved relative to the current working directory).
RESUME_DIR = "extracted_files"
process_resumes(RESUME_DIR)

extracted_files/Engineering CVs/Christopher McComb.pdf
Renamed file to: extracted_files/Engineering CVs/Christopher McComb.pdf
extracted_files/Engineering CVs/PANAYIOTIS (PANOS) MOUTIS.pdf
Renamed file to: extracted_files/Engineering CVs/PANAYIOTIS (PANOS) MOUTIS.pdf
400 Unable to submit request because the input token count is 63063 but model only supports up to 32766. Reduce the input token count and try again. You can also use the CountTokens API to calculate prompt token count and billable characters. Learn more: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models
400 Unable to submit request because the input token count is 35429 but model only supports up to 32766. Reduce the input token count and try again. You can also use the CountTokens API to calculate prompt token count and billable characters. Learn more: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models
extracted_files/Engineering CVs/Matteo Pozzi .pdf
Renamed file to: extracted_files/Engin

In [None]:
import os
import re
import fitz  # PyMuPDF
import docx  # python-docx
import requests  # To call the AI API
from pathlib import Path
