In [None]:
import os
import re
import fitz  # PyMuPDF
import docx  # python-docx
import requests  # To call the AI API
from pathlib import Path
import json

from vertexai.preview.generative_models import (
    GenerationConfig,
    GenerativeModel,
)


generation_config = GenerationConfig(
    temperature=0,
    max_output_tokens=8192,
)
model = GenerativeModel("gemini-pro")



# Function to parse PDF files
def parse_pdf(file_path):
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

# Function to parse DOCX files
def parse_docx(file_path):
    text = ""
    doc = docx.Document(file_path)
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

# Function to extract fields from text using Gen AI API
def extract_fields(text):
    # Replace this URL with your Gen AI API endpoint
    
   
    # Prompt for consistent output format
    prompt = f"""
    Extract the following fields from the resume text:
    1. Full Name
    2. Contact Information: Phone number, Email address, LinkedIn profile (if available)
    3. Address: City and state
    4. Education: Degrees obtained, Institutions attended, Graduation years

    Format the output as a JSON text that can be used to convert to object using json.load() with the keys: 'name', 'contact', 'address', 'education'.
    Resume text:
    {text}
    """
   
    # Make API request
    response = model.generate_content(prompt).text.replace('json','').replace("```",'')
    
    
    response=json.loads(response)
    return response

# Function to rename file based on last name and first name
def rename_file(file_path, full_name):
    # Split full name into last name and first name
    
    new_file_name = f"{full_name}{Path(file_path).suffix}"

    new_file_path = Path(file_path).parent / new_file_name
    print(new_file_path)
    os.rename(file_path, new_file_path)
    return new_file_path
# Return original path if name is not in expected format

# Main function to process resumes
def process_resumes(directory):
    for file in Path(directory).rglob("*"):
        try:
            if file.suffix in [".pdf", ".docx", ".doc"]:
                if file.suffix == ".pdf":
                    text = parse_pdf(file)
                else:
                    pass
                    # For .docx and .doc
                    #text = parse_docx(file)

                # Extract fields
                fields = extract_fields(text)

                # Print or save the extracted fields
                  # Or save to a database/file
                
                with open(f"json/{fields.get('name')}.json", 'w') as f:
                   json.dump(fields, f, indent=4)
                # # Rename the file
                renamed_file_path = rename_file(file, fields.get('name'))
                #renamed_file_path = rename_file(file, fields.get('name'))
                #renamed_file_path=file
                print(f"Renamed file to: {renamed_file_path}")
        except Exception as e:
            print(e)
            continue

# Example usage
process_resumes("extracted_files")

In [28]:
import os
import re
import fitz  # PyMuPDF
import docx  # python-docx
import requests  # To call the AI API
from pathlib import Path
import json
from tqdm.notebook import tqdm

from vertexai.preview.generative_models import (
    GenerationConfig,
    GenerativeModel,
)


generation_config = GenerationConfig(
    temperature=0,
    max_output_tokens=8192,
)
model = GenerativeModel("gemini-1.5-pro")



# Function to parse PDF files
def parse_pdf(file_path):
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

# Function to parse DOCX files
def parse_docx(file_path):
    text = ""
    doc = docx.Document(file_path)
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

# Function to extract fields from text using Gen AI API
def extract_fields(text):
    # Replace this URL with your Gen AI API endpoint
    
   
    # Prompt for consistent output format
    prompt = f"""
    Extract the following fields from the resume text:
    1. Full Name
    2. Contact Information: Phone number, Email address, LinkedIn profile (if available)
    3. Address: City and state
    4. Education: Degrees obtained, Institutions attended, Graduation years

    Format the output as a JSON text that can be used to convert to object using json.load() with the keys: 'name', 'contact', 'address', 'education'.
    Resume text:
    {text}
    """
   
    # Make API request
    response = model.generate_content(prompt).text.replace('json','').replace("```",'')
    
    
    response=json.loads(response)
    return response

# Function to rename file based on last name and first name
def rename_file(file_path, full_name):
    # Split full name into last name and first name
    
    new_file_name = f"{full_name}{Path(file_path).suffix}"

    new_file_path = Path(file_path).parent / new_file_name
    #print(new_file_path)
    os.rename(file_path, new_file_path)
    return new_file_path
# Return original path if name is not in expected format

# Main function to process resumes
def process_resumes(directory):
    lst_name=[]
    lst_error=[]
    for file in  tqdm(Path(directory).rglob("*")):
        try:
            if file.suffix in [".pdf", ".docx", ".doc"]:
                if file.suffix == ".pdf":
                    text = parse_pdf(file)
                if file.suffix == ".doc":
                    text = parse_doc(file)
                if file.suffix == ".docx":
                    text = parse_docx(file)
                else:
                    pass
                    # For .docx and .doc
                    #text = parse_docx(file)

                # Extract fields
                fields = extract_fields(text[:1000])

                # Print or save the extracted fields
                  # Or save to a database/file
                
                # with open(f"json/{fields.get('name')}.json", 'w') as f:
                #    json.dump(fields, f, indent=4)
                # # Rename the file
                renamed_file_path = rename_file(file, fields.get('name'))
                #renamed_file_path = rename_file(file, fields.get('name'))
                #renamed_file_path=file
                print(f"Renamed file to: {renamed_file_path}")
                lst_name.append([fields.get('name'),renamed_file_path,file])
        except Exception as e:
            lst_error.append(file)
            print(e)
            continue
    return lst_name ,lst_error
# Example usage
lst_name, lst_error =process_resumes("extracted_files")

0it [00:00, ?it/s]

Renamed file to: extracted_files/Engineering CVs/Low, Tze Meng.pdf
name 'parse_doc' is not defined
name 'parse_doc' is not defined
Renamed file to: extracted_files/Engineering CVs/Sheng Shen.pdf
Renamed file to: extracted_files/Engineering CVs/Qing Li.pdf
Renamed file to: extracted_files/Engineering CVs/Christopher McComb.pdf
Renamed file to: extracted_files/Engineering CVs/PANAYIOTIS (PANOS) MOUTIS.pdf
Renamed file to: extracted_files/Engineering CVs/Philip R. LeDuc.pdf
Renamed file to: extracted_files/Engineering CVs/M. GRANGER MORGAN.pdf
Renamed file to: extracted_files/Engineering CVs/Matteo Pozzi.pdf
Renamed file to: extracted_files/Engineering CVs/DAVID R. ROUNCE, Ph.D..pdf
Renamed file to: extracted_files/Engineering CVs/A. Ahmed Biyabani.docx
Renamed file to: extracted_files/Engineering CVs/Joanne C. Peca.pdf
Renamed file to: extracted_files/Engineering CVs/Elias Towe.pdf
Renamed file to: extracted_files/Engineering CVs/KAI YU.pdf
Renamed file to: extracted_files/Engineering CV

In [32]:
print(lst_name)

[['Low, Tze Meng', PosixPath('extracted_files/Engineering CVs/Low, Tze Meng.pdf'), PosixPath('extracted_files/Engineering CVs/Low, Tze Meng.pdf')], ['Sheng Shen', PosixPath('extracted_files/Engineering CVs/Sheng Shen.pdf'), PosixPath('extracted_files/Engineering CVs/CV_Sheng Shen_ 2022.pdf')], ['Qing Li', PosixPath('extracted_files/Engineering CVs/Qing Li.pdf'), PosixPath('extracted_files/Engineering CVs/Li_CV_updated2022.pdf')], ['Christopher McComb', PosixPath('extracted_files/Engineering CVs/Christopher McComb.pdf'), PosixPath('extracted_files/Engineering CVs/McComb_CV.pdf')], ['PANAYIOTIS (PANOS) MOUTIS', PosixPath('extracted_files/Engineering CVs/PANAYIOTIS (PANOS) MOUTIS.pdf'), PosixPath('extracted_files/Engineering CVs/PM CV 2022 long.pdf')], ['Philip R. LeDuc', PosixPath('extracted_files/Engineering CVs/Philip R. LeDuc.pdf'), PosixPath('extracted_files/Engineering CVs/CV-leduc-0123.pdf')], ['M. GRANGER MORGAN', PosixPath('extracted_files/Engineering CVs/M. GRANGER MORGAN.pdf'),

In [33]:
print(lst_error)

[PosixPath('extracted_files/Engineering CVs/Anna, Shelly Lynn CV January 26 2023.doc'), PosixPath('extracted_files/Engineering CVs/Rebecca Taylor PT CV Feb_8_2023.doc'), PosixPath('extracted_files/Engineering CVs/Tony_Wasserman_CV2022_Full.doc'), PosixPath('extracted_files/Engineering CVs/McHenry.C.V.-2023.pdf'), PosixPath('extracted_files/Engineering CVs/CV Chris Pistorius.doc'), PosixPath('extracted_files/Engineering CVs/YuliWangCV2022.pdf'), PosixPath('extracted_files/Engineering CVs/LPorter vitae.docx'), PosixPath('extracted_files/Engineering CVs/SChristianCV 20230207.pdf'), PosixPath('extracted_files/Engineering CVs/Sokalski_CV_2-8-23.pdf'), PosixPath('extracted_files/Engineering CVs/DavidVarodayan.docx'), PosixPath('extracted_files/Engineering CVs/Budnick, Mark 2 - 2023_01_01.docx'), PosixPath('extracted_files/Engineering CVs/Whitehead, Kathryn.doc'), PosixPath('extracted_files/Engineering CVs/Smailagic_Vitae_Feb2023.doc'), PosixPath('extracted_files/Engineering CVs/VanBriesen Fu

In [24]:
from tqdm.notebook import tqdm