In [1]:
import os
import requests
import json
import fitz  # PyMuPDF
from PyPDF2 import PdfReader
from IPython.display import clear_output

In [2]:
# System prompts for the three chat-completion calls made further down:
# resume summarizer, summary-to-json parser, and json checker.
#
# They are triple-quoted so each instruction sits on its own line in the text
# that is sent. (A backslash at the end of a line inside a plain "..." string
# removes the newline and fuses the bullet points into one long line.) The
# backslash right after the opening quotes only suppresses a leading blank line.
system_prompt = """\
[Important instructions are written in BOLD]
- You are a resume summarizer.
- You will be provided with texts taken from the pdf file of a resume.
- You will summary the content of the pdf file into the following information:
    + Personal information: full name, phone number and the email. IF THE EMAIL IS INVALID, JUST WRITE THE TEXT YOU GOT.
    + Past experiences: highest position, duration, company name (must have), location and summary of their related skills. THERE MUST BE A COMPANY NAME FOR IT TO BE A VALID EXPERIENCE.
    + Projects: project name and related techstacks (e.g. if they worked in IT). IF THERE ARE NO PROJECTS, DO NOT MENTION ABOUT PROJECTS
    + Highest education: type (degree/diploma/bachelor/graduate...), major, year graduated and location. ONLY SUMMARIZE THE HIGHEST EDUCATION
- TRANSLATE ALL THE INFORMATION TO ENGLISH."""

system_prompt_json = """\
[Important instructions are written in BOLD]
- You are a summary-to-json parser.
- You will be presented with a summary from a resume.
- You will parse this summary into a json object, while maintaining the structure based on the json schema provided.
- DO NOT COMMENT IN THE JSON STRING.
- ONLY OUTPUT A VALID JSON STRING."""

system_prompt_doublecheck = """\
[Important instructions are written in BOLD]
- You are a json checker.
- You will be presented with a json object and a summary of a resume. You will check if the json object is correctly interpreted from this summary.
- If there are any discrepancies, fix the json object and return it back to the user as a json object.
- YOU MUST MAINTAIN THE FORMAT FOLLOWING THE JSON SCHEMA
- IF THE EMAIL IS INVALID, DON'T FIX IT.
- DO NOT COMMENT IN THE JSON STRING.
- ONLY OUTPUT A VALID JSON STRING."""

# Pseudo-schema pasted into the prompts below. STRING / INT / "..." are
# placeholders for the model, so this text is not valid JSON itself.
# No trailing comma is left after the last key of an object, so the template
# does not demonstrate syntax that json.loads would reject.
json_schema = """
{
    "personal_info": 
    {
        "full_name": STRING,
        "phone_number":STRING,
        "email": STRING
    },
    "past_experiences": [{
            "highest_position": STRING,
            "duration": [INT (YEAR_FROM), INT (YEAR_TO)],
            "location": STRING,
            "company": STRING,
            "related_skills": [STRING, STRING, ...]
        }
    ],
    "projects": [{
            "project_name": STRING,
            "techstacks": [STRING, STRING, ...]
        }
    ],
    "highest_education": {
        "type":STRING,
        "major": STRING,
        "graduated_year": INT,
        "location": STRING
    }
}
"""

# User message for the summary-to-json call. {json_schema} is an f-string
# placeholder: the schema text is substituted in and no extra braces are added.
summary_to_json_prompt = f"""
- Please parse this summary into the following json schema:
{json_schema}
- IF THERE ARE NO EXACT YEAR PROVIDED OR IT IS NOW/PRESENT, YOU MUST LEAVE THE INT VALUE AS NULL."""

# Prefix of the user message for the double-check call; the JSON to verify is
# appended to it by json_double_check.
json_doublecheck_prompt = f"""Given the following json schema:
{json_schema}
Please double check if the following json object is correctly interpreted from this summary while maintain the json schema.
"""

In [3]:
def extract_text_from_pdf(file_path, type='pypdf2'):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF; passed unchanged to ``fitz.open`` or
            ``PdfReader``.
        type: Backend selector. ``'pymupdf'`` reads each page with
            ``get_text("text", sort=True)`` through ``fitz``; any other value
            (default ``'pypdf2'``) reads each page with ``extract_text()``
            through ``PdfReader``. The name shadows the ``type`` builtin inside
            this function; it is kept because callers may pass it by keyword.

    Returns:
        The page texts joined with no separator between pages.
    """
    if type != 'pymupdf':
        return "".join(page.extract_text() for page in PdfReader(file_path).pages)

    with fitz.open(file_path) as document:
        return "".join(
            document[index].get_text("text", sort=True)
            for index in range(document.page_count)
        )


In [4]:
def doc_to_text_catdoc(filename):
    """Run ``catdoc -w`` on a file and return what the command printed.

    ``os.popen3`` does not exist in Python 3, so the command is started with
    ``subprocess.run``. The filename is passed as its own argument instead of
    being spliced into a shell string, so quotes or shell metacharacters in the
    name cannot alter the command.

    Args:
        filename: Path of the document handed to ``catdoc``.

    Returns:
        The command's standard output as text.

    Raises:
        OSError: If the command wrote anything to standard error. A missing
            ``catdoc`` executable surfaces as ``FileNotFoundError``, which is
            an ``OSError`` subclass.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import subprocess

    completed = subprocess.run(
        ["catdoc", "-w", filename],
        stdin=subprocess.DEVNULL,   # the command is given no input
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,    # decode both streams to str
    )
    if completed.stderr:
        raise OSError("Executing the command caused an error: %s" % completed.stderr)
    return completed.stdout

In [5]:
### function to get cv summary
def cv_summary(raw_text:str):
    """Ask the chat-completions endpoint to summarize raw resume text.

    Sends ``system_prompt`` as the system message and ``raw_text`` as the user
    message to the ``qwen2.5-14b-instruct`` model.

    Args:
        raw_text: Text extracted from the resume.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        requests.HTTPError: If the server answers with an error status. Without
            this check the failure would show up as a ``KeyError`` on
            ``'choices'``.
    """
    url = "http://127.0.0.1:1234/v1/chat/completions"  # Replace with your actual API URL if different
    headers = {"Content-Type": "application/json"}
    data = {
        "model":"qwen2.5-14b-instruct",
        "messages": [
            {"role":"system", "content":system_prompt},
            {"role":"user","content":raw_text}
        ]
    }

    response = requests.post(url, headers=headers, json=data)
    response.raise_for_status()  # fail loudly on 4xx/5xx instead of on a missing key
    return response.json()['choices'][0]['message']['content']

### summary to json
def cv_summary_to_json(cv_summary:str):
    """Ask the chat-completions endpoint to turn a resume summary into JSON text.

    Sends ``system_prompt_json`` as the system message, followed by two user
    messages: the summary itself and ``summary_to_json_prompt``.

    Args:
        cv_summary: Summary text to convert. Inside this function the
            parameter shadows the module-level ``cv_summary`` function; the
            name is kept because callers may pass it by keyword.

    Returns:
        The ``content`` string of the first choice in the response. It is
        returned as received and is not parsed or validated here.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = "http://127.0.0.1:1234/v1/chat/completions"  # Replace with your actual API URL if different
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "qwen2.5-14b-instruct",
        "messages": [
            {"role":"system", "content":system_prompt_json},
            {
                "role": "user",
                "content" : cv_summary
            },
            {
                "role": "user",
                "content": summary_to_json_prompt
            }
        ]
    }

    response = requests.post(url, headers=headers, json=data)
    response.raise_for_status()  # fail loudly on 4xx/5xx instead of on a missing key
    return response.json()['choices'][0]['message']['content']

### double check json
def _extract_json_block(text):
    """Return the body of the first ```json fenced block, or the whole text if there is none."""
    fence = "```json"
    fence_start = text.find(fence)
    if fence_start == -1:
        # No fence: the reply is taken to be the JSON itself. Slicing from -1
        # here would keep only the last character.
        return text.strip()
    body = text[fence_start + len(fence):]
    fence_end = body.find("```")
    if fence_end != -1:
        body = body[:fence_end]
    return body.strip()


def json_double_check(cv_summary:str, json_parse_response:str):
    """Ask the chat-completions endpoint to verify JSON against the resume summary.

    The JSON is taken from ``json_parse_response`` (the content of its first
    ```json fenced block, or the whole reply when no fence is present) and
    appended to ``json_doublecheck_prompt``. The request carries
    ``system_prompt_doublecheck`` as the system message, followed by two user
    messages: the summary and the assembled double-check prompt.

    Args:
        cv_summary: Summary text the JSON was derived from. Inside this
            function the parameter shadows the module-level ``cv_summary``
            function; the name is kept for keyword callers.
        json_parse_response: Raw reply of the summary-to-json step.

    Returns:
        The ``content`` string of the first choice in the response, as
        received.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    json_content_string = _extract_json_block(json_parse_response)

    double_check_prompt = json_doublecheck_prompt + json_content_string

    url = "http://127.0.0.1:1234/v1/chat/completions"  # Replace with your actual API URL if different
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "qwen2.5-14b-instruct",
        "messages": [
            {"role":"system", "content":system_prompt_doublecheck},
            {
                "role": "user",
                "content" : cv_summary
            },
            {
                "role": "user",
                "content": double_check_prompt
            }
        ]
    }

    response = requests.post(url, headers=headers, json=data)
    response.raise_for_status()  # fail loudly on 4xx/5xx instead of on a missing key
    return response.json()['choices'][0]['message']['content']

In [12]:
# Summarize the IT resumes: for each PDF that has no result yet, run
# extract -> summarize -> parse to JSON -> double-check, then write the parsed
# object to results_json/<name>.json. Stops after `it_max_new_files` new results.
it_cv_dir = os.path.join("CV for HBU", "IT")
it_results_dir = os.path.join(it_cv_dir, "results_json")
it_max_new_files = 4   # new results written per run of this cell
it_max_attempts = 3    # model round-trips per resume before it is skipped

# open(..., "x") below fails if the folder is missing.
os.makedirs(it_results_dir, exist_ok=True)

count = 0
for filename_ex in os.listdir(it_cv_dir):
    if count == it_max_new_files:
        break
    filepath = os.path.join(it_cv_dir, filename_ex)
    # splitext cuts at the last dot only, so "john.doe.pdf" and "john.smith.pdf"
    # no longer map to the same result file, and dot-less names keep every
    # character.
    file_name, file_extension = os.path.splitext(filename_ex)
    # Only PDFs are handled; check the extension rather than looking for the
    # substring "pdf" anywhere in the path.
    if file_extension.lower() != ".pdf" or not os.path.isfile(filepath):
        continue
    result_path = os.path.join(it_results_dir, file_name + ".json")
    if os.path.isfile(result_path):
        continue
    print(f"At file: {file_name}")
    raw_text = extract_text_from_pdf(filepath)
    print("- Done parsing text from pdf")

    # Bounded retries: an unbounded loop never ends when the model keeps
    # returning text that is not JSON.
    for attempt in range(1, it_max_attempts + 1):
        summary_response = cv_summary(raw_text)
        print("- Done summarization")
        json_response = cv_summary_to_json(summary_response)
        print("- Done parsing summary to json")
        json_response = json_double_check(summary_response, json_response)
        print("- Done double checking json")
        print(json_response)

        # The reply may wrap the object in a ```json fence or add remarks
        # around it; keep only the outermost {...} span before parsing.
        object_start = json_response.find("{")
        object_end = json_response.rfind("}")
        if object_start != -1 and object_end > object_start:
            json_candidate = json_response[object_start:object_end + 1]
        else:
            json_candidate = json_response

        try:
            # Despite its name, json_string holds the parsed Python object.
            json_string = json.loads(json_candidate)
            break
        except json.JSONDecodeError as error:
            print(f"- Attempt {attempt}/{it_max_attempts}: reply is not valid JSON ({error})")
    else:
        print(f"- Skipping {file_name}: no valid JSON after {it_max_attempts} attempts")
        continue

    with open(result_path, "x", encoding='utf8') as f:
        json.dump(json_string, f, indent=4, ensure_ascii=False)
    clear_output()

    count += 1
    

incorrect startxref pointer(1)


At file: 19cedb5dfd4d3bc121e002e02aa115fd
- Done parsing text from pdf


In [6]:
# Summarize the HR resumes: for each PDF that has no result yet, run
# extract -> summarize -> parse to JSON -> double-check, then write the parsed
# object to results_json/<name>.json. Stops after `hr_max_new_files` new results.
hr_cv_dir = os.path.join("CV for HBU", "HR")
hr_results_dir = os.path.join(hr_cv_dir, "results_json")
hr_max_new_files = 4   # new results written per run of this cell
hr_max_attempts = 3    # model round-trips per resume before it is skipped

# open(..., "x") below fails if the folder is missing.
os.makedirs(hr_results_dir, exist_ok=True)

count = 0
for filename_ex in os.listdir(hr_cv_dir):
    if count == hr_max_new_files:
        break
    filepath = os.path.join(hr_cv_dir, filename_ex)
    # splitext cuts at the last dot only, so "john.doe.pdf" and "john.smith.pdf"
    # no longer map to the same result file, and dot-less names keep every
    # character.
    file_name, file_extension = os.path.splitext(filename_ex)
    # Only PDFs are handled; check the extension rather than looking for the
    # substring "pdf" anywhere in the path.
    if file_extension.lower() != ".pdf" or not os.path.isfile(filepath):
        continue
    result_path = os.path.join(hr_results_dir, file_name + ".json")
    if os.path.isfile(result_path):
        continue
    print(f"At file: {file_name}")
    raw_text = extract_text_from_pdf(filepath)
    print("- Done parsing text from pdf")

    # Bounded retries: an unbounded loop never ends when the model keeps
    # returning text that is not JSON.
    for attempt in range(1, hr_max_attempts + 1):
        summary_response = cv_summary(raw_text)
        print("- Done summarization")
        json_response = cv_summary_to_json(summary_response)
        print("- Done parsing summary to json")
        json_response = json_double_check(summary_response, json_response)
        print("- Done double checking json")
        print(json_response)

        # The reply may wrap the object in a ```json fence or add remarks
        # around it; keep only the outermost {...} span before parsing.
        object_start = json_response.find("{")
        object_end = json_response.rfind("}")
        if object_start != -1 and object_end > object_start:
            json_candidate = json_response[object_start:object_end + 1]
        else:
            json_candidate = json_response

        try:
            # Despite its name, json_string holds the parsed Python object.
            json_string = json.loads(json_candidate)
            break
        except json.JSONDecodeError as error:
            print(f"- Attempt {attempt}/{hr_max_attempts}: reply is not valid JSON ({error})")
    else:
        print(f"- Skipping {file_name}: no valid JSON after {hr_max_attempts} attempts")
        continue

    with open(result_path, "x", encoding='utf8') as f:
        json.dump(json_string, f, indent=4, ensure_ascii=False)
    clear_output()

    count += 1
    