# Resume Data Extraction (Beginner)

This notebook demonstrates how to extract structured data from resume PDFs using **PyMuPDF**, regex, and pandas.


In [None]:
!pip install pymupdf pandas


In [None]:
import fitz  # PyMuPDF
import re
import glob
import pandas as pd


## 1. Extract Text from PDF


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF at ``pdf_path`` as one string.

    The document is opened with PyMuPDF (``fitz``) inside a ``with`` block so
    it is closed again once all pages have been read. Page texts are
    concatenated in page order with no separator added between them.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)


## 2. Normalize Text


In [None]:
def normalize_text(text):
    """Collapse whitespace in ``text`` and trim both ends.

    Every run of whitespace characters (spaces, tabs, and also line breaks)
    is replaced by a single space, so the result is always a single line.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()


## 3. Extract Fields with Regex


In [None]:
def _first_match(pattern, text, group=0, flags=0):
    """Return the stripped ``group`` of the first ``pattern`` match, or None.

    None is returned both when the pattern does not match and when the
    captured text is empty after stripping surrounding whitespace.
    """
    match = re.search(pattern, text, flags)
    if match is None:
        return None
    return match.group(group).strip() or None


def extract_fields(text):
    """Pull the e-mail, name, skills and education fields out of resume text.

    Args:
        text: Resume text. It may be multi-line (raw) or already collapsed
            to a single line.

    Returns:
        A dict with the keys ``'Email'``, ``'Name'``, ``'Skills'`` and
        ``'Education'`` (in that order). A field that cannot be found is
        ``None``.

        * ``Email``: first ``local@domain``-looking token, with trailing
          ``.``/``-`` removed so sentence punctuation is not kept.
        * ``Name``: the first non-blank line of ``text``, stripped. When
          ``text`` contains no line breaks this is the whole text.
        * ``Skills``: letters, commas and spaces following ``Skill``/``Skills``.
        * ``Education``: everything between ``Education`` and the next
          ``Experience``.
    """
    fields = {}

    email = _first_match(r'[\w\.-]+@[\w\.-]+', text)
    if email is not None:
        # The character class allows '.' and '-', so an address that ends a
        # sentence ("mail me at a@b.com.") would otherwise keep the full stop.
        email = email.rstrip('.-') or None
    fields['Email'] = email

    # Skip leading blank lines instead of blindly taking whatever precedes
    # the first '\n' (which is '' when the text starts with a line break).
    fields['Name'] = next(
        (line.strip() for line in text.splitlines() if line.strip()),
        None,
    )

    fields['Skills'] = _first_match(r'Skills?[:\s]+([A-Za-z, ]+)', text, group=1)

    # DOTALL lets '.' cross line breaks, so an education section that spans
    # several lines is still captured when the text has not been flattened.
    fields['Education'] = _first_match(
        r'Education[:\s]+(.+?)Experience', text, group=1, flags=re.DOTALL
    )
    return fields


## 4. Process Multiple Resumes and Export CSV


In [None]:
def process_resumes(folder_path):
    """Extract fields from every ``*.pdf`` directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A pandas DataFrame with one row per PDF, holding the fields returned
        by ``extract_fields`` plus a ``'File'`` column with the PDF path.
        Rows are ordered by path. If no PDF is found, an empty DataFrame
        that still carries the expected column names is returned.
    """
    results = []
    # glob returns files in a filesystem-dependent order; sort so repeated
    # runs produce the same row order in the exported CSV.
    for pdf in sorted(glob.glob(folder_path + '/*.pdf')):
        raw = extract_text_from_pdf(pdf)
        clean = normalize_text(raw)
        data = extract_fields(clean)

        # The normalized text has no line breaks left, so "first line" taken
        # from it would be the entire resume. Read the candidate's name from
        # the first non-blank line of the raw text instead.
        first_line = next(
            (line for line in raw.splitlines() if line.strip()),
            None,
        )
        data['Name'] = normalize_text(first_line) if first_line is not None else None

        data['File'] = pdf
        results.append(data)

    if not results:
        # Keep the schema stable so callers (and the exported CSV header)
        # do not break when the folder contains no PDFs.
        return pd.DataFrame(columns=['Email', 'Name', 'Skills', 'Education', 'File'])
    return pd.DataFrame(results)

# Example usage:
# df = process_resumes('resumes')
# df.to_csv('extracted_resumes.csv', index=False)
