In [1]:
import os
import pandas as pd
import numpy as np
from pdf2image import convert_from_path
import pytesseract
from llmsinferer import LlmsInferer

In [2]:
def process_cvs(path):
    
	def extract_text_from_pdf(path):
		images = convert_from_path(path)
		text = ""
		for img in images:
			text += pytesseract.image_to_string(img, lang='eng')
		return text
	
	inferer = LlmsInferer(model="gpt-4o", run_local=False)
	inferer.system_prompts.load_prompts_definitions_file("prompts_definitions.yaml")
	
	system_prompt = inferer.system_prompts.get_prompt(module="analyze_cv", prompt_id=1)

	results = []

	# base_folder = "/home/ruben/recruitment_model/data/CVs"  # Replace with the base folder containing subfolders
	for root, dirs, files in os.walk(path):
		for file in files:
			if file.endswith(".pdf"):
				full_path = os.path.join(root, file)
				text = extract_text_from_pdf(full_path)
				analysis = inferer.get_response(system_prompt=system_prompt, user_prompt=text, temperature=0.3)
				results.append({
					"id": file.replace(".pdf", ""),
					"analysis": analysis
				})

	df = pd.DataFrame(results)

	return df

In [3]:
cvs_train_processed = process_cvs("data/CVs_train")
cvs_train_processed

Loaded prompts definitions.


Unnamed: 0,id,analysis
0,CVANON0002,"{""professionality"": 4, ""visual"": 3, ""grammar"":..."
1,CVANON0247,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."
2,CVANON0105,"{""professionality"": 7, ""visual"": 5, ""grammar"":..."
3,CVANON0009,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."
4,CVANON0033,"{\n ""professionality"": 7,\n ""visual"": 6,\n ..."
...,...,...
322,CVANON0206,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."
323,CVANON0185,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."
324,CVANON0119,"{""professionality"": 6, ""visual"": 5, ""grammar"":..."
325,CVANON0059,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."


In [None]:
cvs_predict_processed = process_cvs("data/CVs_predict")
cvs_predict_processed

In [None]:
cvs_train_processed.to_parquet("data/cvs_train_processed.parquet")
cvs_predict_processed.to_parquet("data/cvs_predict_processed.parquet")