# EY AI challenge 2025 - CV analyzer bot

## Imports

In [31]:
import pandas as pd
from PyPDF2 import PdfReader
import os
import re

## Read dataset

In [32]:
from PyPDF2 import PdfReader
reader = PdfReader("../CVs/cv_1.pdf")
text = "".join([page.extract_text() for page in reader.pages])

In [33]:
print(text)

cv_1.md 2025-05-02
1 / 1Carla Nogueira
Title:  Tax Consultant
Location:  Lisbon, P ortugal
Email:  carla.nogueira@example.com
Phone:  +351 938 444 555
Summary
Tax Consultant specializing in corporate taxation and cross-border transactions. Experienced in managing tax
compliance for multinational clients and advising on the tax implications of mergers, restructurings, and
intercompany transactions.
Experience
EY Portugal – Lisbon
Senior Tax Consultant
March 2018 – Present
Provided tax planning and compliance services to Fortune 500 clients.
Advised on transfer pricing documentation and tax audits.
Monitored BEPS developments and supported implementation of D AC6 reporting.
Education
Catholic Univ ersity o f Portugal – Lisbon
LLM in International Tax Law
2015 – 2017
Univ ersity o f Lisbon – Lisbon
LLB in Law
2011 – 2015
Skills
Corporate income tax, V AT, transfer pricing
SAP FI, Oracle T ax
OECD guidelines, EU directives
Fluent in English and P ortuguese


In [None]:
folder = "../CVs/"
all_data = []

for filename in os.listdir(folder):
    if filename.endswith(".pdf"):
        # Extract index from filename
        match = re.search(r'cv_(\d+)', filename)
        if not match:
            continue
        cv_index = int(match.group(1))

        # Extract text from PDF
        reader = PdfReader(os.path.join(folder, filename))
        text = "".join([page.extract_text() or "" for page in reader.pages])

        # Normalize spacing
        text = text.replace("\n", " ").replace("  ", " ").strip()

        # Extract fields using regex
        try:
            name = re.search(r'\d+\s*/\s*\d+(.*?)Title:', text, re.IGNORECASE).group(1).strip()
            title = re.search(r'Title:\s*(.*?)Location:', text, re.IGNORECASE).group(1).strip()
            location = re.search(r'Location:\s*(.*?)Email:', text, re.IGNORECASE).group(1).strip()
            summary = re.search(r'Summary(.*?)Experience', text, re.IGNORECASE).group(1).strip()
            experience = re.search(r'Experience(.*?)Education', text, re.IGNORECASE).group(1).strip()
            education = re.search(r'Education(.*?)Skills', text, re.IGNORECASE).group(1).strip()
            skills = re.search(r'Skills(.*)', text, re.IGNORECASE).group(1).strip()

            all_data.append({
                "cv_index": cv_index,
                "name": name,
                "title": title,
                "location": location,
                "summary": summary,
                "experience": experience,
                "education": education,
                "skills": skills
            })
        except AttributeError:
            print(f"Could not extract all fields from {filename}. Skipping.")

# Create DataFrame
CVs = pd.DataFrame(all_data)
CVs.set_index("cv_index", inplace=True)

# Preview
print(CVs.shape)
CVs.head()

Could not extract all fields from cv_87.pdf. Skipping.
Could not extract all fields from cv_90.pdf. Skipping.
Could not extract all fields from cv_2.pdf. Skipping.
Could not extract all fields from cv_75.pdf. Skipping.
Could not extract all fields from cv_45.pdf. Skipping.
Could not extract all fields from cv_21.pdf. Skipping.


Unnamed: 0_level_0,name,title,location,summary,experience,education,skills
cv_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
47,Helena Santos,Healthcare Consultant,"Porto, P ortugal",Healthcare professional with background in hos...,in healthcare operations improvement. Expertis...,Porto Business School – P orto MBA in Healthca...,Healthcare operations managementcv_47.md 2025-...
103,Tiago Ramos,Audit Manager,"Porto, P ortugal",CPA-certified Audit Manager with 8+ years of,"auditing clients in financial services, insura...",Univ ersidade Cat ólica P ortuguesa – P orto M...,"IFRS, GAAP, SAP Caseware, Audit Command Langua..."
69,Ricardo Mendes,Forensic Accountant,"Lisbon, P ortugal",Detail-oriented forensic accountant with 5 yea...,in fraud investigation and litigation support....,ISEG - Lisbon School o f Economics & Managemen...,in a Big4 forensic services team. Experience P...
94,Ana Ferreira,Tax Consultant,"Lisbon, P ortugal",Tax professional with 5 years of,advising multinational clients on cross-border...,Católica Lisbon School o f Business & Economic...,Portuguese and international tax lawcv_94.md 2...
79,Catarina Neves,Human Capital Consultant,"Lisbon, P ortugal",HR professional with expertise in organization...,leading HR transformation projects and impleme...,ISCTE Business School – Lisbon MSc in Human Re...,Organizational developmentcv_79.md 2025-05-02 ...


In [39]:
folder = "../JobDescriptions"
all_jobs = []

for filename in os.listdir(folder):
    if filename.endswith(".pdf"):
        filepath = os.path.join(folder, filename)
        reader = PdfReader(filepath)
        text = "".join([page.extract_text() or "" for page in reader.pages])

        # Normalize whitespace
        text = text.replace("\n", " ").replace("  ", " ").strip()

        try:
            # Job title = first line (before "Location")
            job_title_match = re.match(r'^(.*?)\sLocation:', text, re.IGNORECASE)
            job_title = job_title_match.group(1).strip() if job_title_match else None

            location = re.search(r'Location:\s*(.*?)\sDepartment:', text, re.IGNORECASE).group(1).strip()
            experience_level = re.search(r'Experience Level:\s*(.*?)\sAbout the Role:', text, re.IGNORECASE).group(1).strip()

            about_role = re.search(r'About the Role:\s*(.*?)\sKey Responsibilities:', text, re.IGNORECASE).group(1).strip()

            key_resp = re.search(r'Key Responsibilities:\s*(.*?)\sRequirements:', text, re.IGNORECASE).group(1).strip()

            requirements = re.search(r'Requirements:\s*(.*)', text, re.IGNORECASE).group(1).strip()

            all_jobs.append({
                "job_title": job_title,
                "location": location,
                "experience_level": experience_level,
                "about_role": about_role,
                "key_responsibilities": key_resp,
                "requirements": requirements
            })

        except AttributeError:
            print(f"Could not extract all fields from {filename}. Skipping.")

df_jobs = pd.DataFrame(all_jobs)
print(df_jobs.shape)
df_jobs.head()

(5, 6)


Unnamed: 0,job_title,location,experience_level,about_role,key_responsibilities,requirements
0,Financial Analyst,"Lisbon, Portugal",Staff,Join our Financial Services team as a Financia...,Assist in the preparation of financial reports...,"Bachelor's degree in Finance, Accounting, or r..."
1,"Directors, Data & AI Strategy","Lisbon, Portugal",Director (15+ years),EY Portugal is seeking two experienced Directo...,Lead enterprise -wide data transformation and ...,"15+ years of experience in data strategy, busi..."
2,Cybersecurity Team Recruitment,"Lisbon, Portugal",,EY Portugal is embarking on an exciting journe...,Develop and implement a comprehensive cybersec...,Proven experience in building and leading cybe...
3,"Senior Consultant, Technology Risk","Lisbon, Portugal",Senior,"As a Senior Consultant in Technology Risk, you...",Conduct assessments of technology controls and...,4-6 years of experience in technology risk or ...
4,"Consultant, Forensic & Integrity Services","Lisbon, Portugal",Manager,EY Portugal is seeking a Consultant to join ou...,Conduct forensic investigations and data analy...,7+ years of experience in forensic accounting ...


## Save datasets