In [11]:
# Prerequisite (run once in this kernel's environment): %pip install PyMuPDF

In [10]:
import fitz  # PyMuPDF
import base64
import json
from typing import Dict, List
import os


In [13]:
# Step 1: PDF Processing
def process_pdf(pdf_path: str) -> Dict[str, str]:
    """Extract the text of a PDF and capture its raw bytes as base64.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with two keys:
        ``"text"`` - the text of every page, concatenated in page order
        (intended for storage in the database);
        ``"pdf_data"`` - the file's bytes, base64-encoded and returned as a
        ``str`` so the original PDF can be rebuilt later.
    """
    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)

    # Read the raw bytes through a managed handle so the file is not leaked.
    with open(pdf_path, "rb") as pdf_file:
        pdf_data = base64.b64encode(pdf_file.read()).decode("utf-8")

    return {"text": text, "pdf_data": pdf_data}

In [14]:
# Step 2: Querying and Analysis
# This step would involve your vector database and bot analysis
# We'll simulate it with a simple function
def analyze_resume(resume_text: str, job_description: str) -> str:
    """Return a placeholder verdict for a resume against a job description.

    This is a simulated analysis: ``resume_text`` is accepted but never
    read, so the result depends only on ``job_description``.
    """
    verdict = f"This candidate is a good match for {job_description}"
    return verdict

In [15]:
# Step 3: PDF Recreation
def recreate_pdf(pdf_data: str) -> str:
    """Hand back ``pdf_data`` unchanged.

    The payload is passed through untouched: the caller receives exactly
    the string it supplied.
    """
    recreated = pdf_data
    return recreated

In [16]:
# Main workflow
def resume_workflow(cv_folder: str, job_description: str) -> List[Dict[str, str]]:
    """Process and analyse every PDF file directly inside ``cv_folder``.

    Args:
        cv_folder: Directory to scan (not recursive).
        job_description: Passed through to ``analyze_resume`` for each PDF.

    Returns:
        One dict per PDF, ordered by filename, with the keys
        ``"filename"``, ``"analysis"`` and ``"pdf_data"``.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(cv_folder)):
        pdf_path = os.path.join(cv_folder, filename)

        # Match the extension case-insensitively (".PDF" is still a PDF) and
        # skip directories that merely happen to be named like one.
        if not filename.lower().endswith('.pdf') or not os.path.isfile(pdf_path):
            continue

        # Process PDF
        processed_data = process_pdf(pdf_path)

        # Analyze resume
        analysis = analyze_resume(processed_data["text"], job_description)

        # Recreate PDF
        recreated_pdf = recreate_pdf(processed_data["pdf_data"])

        results.append({
            "filename": filename,
            "analysis": analysis,
            "pdf_data": recreated_pdf
        })

    return results

In [26]:
# Example usage: run the workflow over every CV PDF in a folder
cv_folder, job_description = "./cv_folder", "Software Engineer"  # folder holding the CV PDFs, target role
results = resume_workflow(cv_folder=cv_folder, job_description=job_description)

In [27]:
# Display the full result list; each entry carries its complete base64 "pdf_data" payload
results

[{'filename': 'test.pdf',
  'analysis': 'This candidate is a good match for Software Engineer',
  'pdf_data': 'JVBERi0xLjMKMyAwIG9iago8PC9UeXBlIC9QYWdlCi9QYXJlbnQgMSAwIFIKL1Jlc291cmNlcyAyIDAgUgovQ29udGVudHMgNCAwIFI+PgplbmRvYmoKNCAwIG9iago8PC9GaWx0ZXIgL0ZsYXRlRGVjb2RlIC9MZW5ndGggMTM2Pj4Kc3RyZWFtCnicNYxBDoIwFAX3nOItdVNbsP2yNYqJKxJ7gSbUUiPFlG+Itzdo2E5mpsS1kEIT5uJosWsUlBZSwt5xtgtStRJkQPVeHAi2w+bimWMKuLHL7DvMkXu0p2ZCTGg/3I9pC/tYezKiqkDGCF3++r+COMEhZO8YrzyG7IZhuT5dCm8X/Lr4Ajm2LYQKZW5kc3RyZWFtCmVuZG9iagoxIDAgb2JqCjw8L1R5cGUgL1BhZ2VzCi9LaWRzIFszIDAgUiBdCi9Db3VudCAxCi9NZWRpYUJveCBbMCAwIDU5NS4yOCA4NDEuODldCj4+CmVuZG9iago1IDAgb2JqCjw8L1R5cGUgL0ZvbnQKL0Jhc2VGb250IC9IZWx2ZXRpY2EKL1N1YnR5cGUgL1R5cGUxCi9FbmNvZGluZyAvV2luQW5zaUVuY29kaW5nCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9jU2V0IFsvUERGIC9UZXh0IC9JbWFnZUIgL0ltYWdlQyAvSW1hZ2VJXQovRm9udCA8PAovRjEgNSAwIFIKPj4KL1hPYmplY3QgPDwKPj4KPj4KZW5kb2JqCjYgMCBvYmoKPDwKL1Byb2R1Y2VyIChQeUZQREYgMS43LjIgaHR0cDovL3B5ZnBkZi5nb29nbGVjb2RlLmNvbS8pCi9DcmVhdGlvbkRhdGUgKEQ6MjAyN

In [28]:
# Inspect the base64 payload of the first result (raises IndexError if `results` is empty)
results[0]['pdf_data']

'JVBERi0xLjMKMyAwIG9iago8PC9UeXBlIC9QYWdlCi9QYXJlbnQgMSAwIFIKL1Jlc291cmNlcyAyIDAgUgovQ29udGVudHMgNCAwIFI+PgplbmRvYmoKNCAwIG9iago8PC9GaWx0ZXIgL0ZsYXRlRGVjb2RlIC9MZW5ndGggMTM2Pj4Kc3RyZWFtCnicNYxBDoIwFAX3nOItdVNbsP2yNYqJKxJ7gSbUUiPFlG+Itzdo2E5mpsS1kEIT5uJosWsUlBZSwt5xtgtStRJkQPVeHAi2w+bimWMKuLHL7DvMkXu0p2ZCTGg/3I9pC/tYezKiqkDGCF3++r+COMEhZO8YrzyG7IZhuT5dCm8X/Lr4Ajm2LYQKZW5kc3RyZWFtCmVuZG9iagoxIDAgb2JqCjw8L1R5cGUgL1BhZ2VzCi9LaWRzIFszIDAgUiBdCi9Db3VudCAxCi9NZWRpYUJveCBbMCAwIDU5NS4yOCA4NDEuODldCj4+CmVuZG9iago1IDAgb2JqCjw8L1R5cGUgL0ZvbnQKL0Jhc2VGb250IC9IZWx2ZXRpY2EKL1N1YnR5cGUgL1R5cGUxCi9FbmNvZGluZyAvV2luQW5zaUVuY29kaW5nCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9jU2V0IFsvUERGIC9UZXh0IC9JbWFnZUIgL0ltYWdlQyAvSW1hZ2VJXQovRm9udCA8PAovRjEgNSAwIFIKPj4KL1hPYmplY3QgPDwKPj4KPj4KZW5kb2JqCjYgMCBvYmoKPDwKL1Byb2R1Y2VyIChQeUZQREYgMS43LjIgaHR0cDovL3B5ZnBkZi5nb29nbGVjb2RlLmNvbS8pCi9DcmVhdGlvbkRhdGUgKEQ6MjAyNDA4MTUxMTEyMjcpCj4+CmVuZG9iago3IDAgb2JqCjw8Ci9UeXBlIC9DYXRhbG9nCi9QYWdlcyAxIDAgUgovT3BlbkFjdGlvbiBbMyAwIFIgL0Z

In [21]:
# Serialize the result list to a JSON string
api_response = json.dumps(results)

In [30]:
# Display the serialized JSON string
api_response

'[{"filename": "test.pdf", "analysis": "This candidate is a good match for Software Engineer", "pdf_data": "JVBERi0xLjMKMyAwIG9iago8PC9UeXBlIC9QYWdlCi9QYXJlbnQgMSAwIFIKL1Jlc291cmNlcyAyIDAgUgovQ29udGVudHMgNCAwIFI+PgplbmRvYmoKNCAwIG9iago8PC9GaWx0ZXIgL0ZsYXRlRGVjb2RlIC9MZW5ndGggMTM2Pj4Kc3RyZWFtCnicNYxBDoIwFAX3nOItdVNbsP2yNYqJKxJ7gSbUUiPFlG+Itzdo2E5mpsS1kEIT5uJosWsUlBZSwt5xtgtStRJkQPVeHAi2w+bimWMKuLHL7DvMkXu0p2ZCTGg/3I9pC/tYezKiqkDGCF3++r+COMEhZO8YrzyG7IZhuT5dCm8X/Lr4Ajm2LYQKZW5kc3RyZWFtCmVuZG9iagoxIDAgb2JqCjw8L1R5cGUgL1BhZ2VzCi9LaWRzIFszIDAgUiBdCi9Db3VudCAxCi9NZWRpYUJveCBbMCAwIDU5NS4yOCA4NDEuODldCj4+CmVuZG9iago1IDAgb2JqCjw8L1R5cGUgL0ZvbnQKL0Jhc2VGb250IC9IZWx2ZXRpY2EKL1N1YnR5cGUgL1R5cGUxCi9FbmNvZGluZyAvV2luQW5zaUVuY29kaW5nCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9jU2V0IFsvUERGIC9UZXh0IC9JbWFnZUIgL0ltYWdlQyAvSW1hZ2VJXQovRm9udCA8PAovRjEgNSAwIFIKPj4KL1hPYmplY3QgPDwKPj4KPj4KZW5kb2JqCjYgMCBvYmoKPDwKL1Byb2R1Y2VyIChQeUZQREYgMS43LjIgaHR0cDovL3B5ZnBkZi5nb29nbGVjb2RlLmNvbS8pCi9DcmVhdGlvbkRhdGUgKEQ6MjAyNDA4

In [29]:
def generate_pdf_from_base64(json_data, output_folder):
    """Write each base64-encoded PDF in a JSON payload into ``output_folder``.

    Args:
        json_data: JSON string holding a list of objects, each with a
            ``"filename"`` entry and a base64 ``"pdf_data"`` entry.
        output_folder: Directory to write into; created if it does not exist.

    Raises:
        ValueError: If an entry's filename has no usable final component
            (empty, ``.`` or ``..``).

    The path of every written file is printed; nothing is returned.
    """
    # Parse the JSON string
    parsed_data = json.loads(json_data)

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for item in parsed_data:
        # The filename comes from the payload, so keep only its final
        # component: "../x.pdf" or an absolute path must not be able to
        # write outside output_folder. Backslashes are treated as separators
        # too, so Windows-style paths are reduced the same way.
        safe_name = os.path.basename(item['filename'].replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            raise ValueError(f"Invalid filename in payload: {item['filename']!r}")

        # Decode the base64 string back to the raw PDF bytes
        decoded_pdf = base64.b64decode(item['pdf_data'])

        output_path = os.path.join(output_folder, safe_name)

        # Write the decoded data to a PDF file
        with open(output_path, 'wb') as f:
            f.write(decoded_pdf)

        print(f"Generated PDF: {output_path}")

In [31]:
# Decode every entry of the JSON payload back into a PDF file on disk
output_folder = "generated_pdfs"
generate_pdf_from_base64(json_data=api_response, output_folder=output_folder)

Generated PDF: generated_pdfs/test.pdf
