In [1]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF, using PyPDF2.

    Written against the PyPDF2 3.x reader API (``PdfReader``, ``.pages``,
    ``extract_text()``). The legacy names ``PdfFileReader``, ``numPages``,
    ``getPage`` and ``extractText`` were removed in PyPDF2 3.0.0 and raise
    an error there, which is the version this notebook installs.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: Page texts joined in page order with no separator; ``""`` for a
        document that has no pages.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Build the result inside the ``with`` block so the file handle is
        # still open while the reader parses each page.
        # ``or ""`` keeps the join safe if a page yields no text object.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

if __name__ == "__main__":
    # Demo run for this cell: extract all text from "sample.pdf" (resolved
    # relative to the current working directory) and print it in full.
    pdf_path = "sample.pdf"
    # Uses whichever extract_text_from_pdf is currently bound in the namespace.
    extracted_text = extract_text_from_pdf(pdf_path)
    print(extracted_text)


In [1]:
pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.22.5-cp39-cp39-win_amd64.whl (11.8 MB)
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.22.5
Note: you may need to restart the kernel to use updated packages.


In [None]:
import fitz

def extract_text_from_pdf(pdf_path):
    """Read a PDF with PyMuPDF (``fitz``) and return all of its page text.

    Args:
        pdf_path: Path of the PDF document to open.

    Returns:
        str: The ``get_text()`` output of every page, concatenated in page
        order with no separator. Empty string when the document has no pages.
    """
    with fitz.open(pdf_path) as doc:
        page_total = doc.page_count
        # Pages are fetched by index, first to last.
        chunks = [doc[index].get_text() for index in range(page_total)]
    return "".join(chunks)

if __name__ == "__main__":
    # Demo run for this cell: same "sample.pdf" input as the PyPDF2 cell,
    # so the two extractors' printed output can be compared.
    pdf_path = "sample.pdf"
    extracted_text = extract_text_from_pdf(pdf_path)
    print(extracted_text)

In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF, using PyPDF2.

    Pages are handled one at a time so a per-page processing step can be
    slotted into the loop. Page texts are collected in a list and joined
    once at the end; the previous version cleared its accumulator on every
    iteration and therefore always returned an empty string.

    Written against the PyPDF2 3.x reader API (``PdfReader``, ``.pages``,
    ``extract_text()``); the legacy ``PdfFileReader`` / ``numPages`` /
    ``getPage`` / ``extractText`` names were removed in PyPDF2 3.0.0.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: Page texts joined in page order with no separator; ``""`` for a
        document that has no pages.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.
    """
    page_texts = []
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for page in pdf_reader.pages:
            page_text = page.extract_text() or ""

            # Optional per-page processing belongs here and should operate
            # on ``page_text`` only, e.g. process_text(page_text).

            page_texts.append(page_text)

    return "".join(page_texts)

if __name__ == "__main__":
    # Demo run for this cell against "large_sample.pdf"; the whole extracted
    # string is printed, so expect a long output for a big document.
    pdf_path = "large_sample.pdf"
    extracted_text = extract_text_from_pdf(pdf_path)
    print(extracted_text)


In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF, using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        str: Page texts joined in page order with no separator. Pages for
        which pdfplumber reports no text contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf_document:
        # Depending on the pdfplumber release, extract_text() can return None
        # for a page without a text layer (e.g. a scanned image); ``or ""``
        # keeps such a page from raising TypeError during concatenation.
        return "".join(page.extract_text() or "" for page in pdf_document.pages)

if __name__ == "__main__":
    # Demo run for this cell: extract "large_sample.pdf" with the extractor
    # currently bound to extract_text_from_pdf and print the full result.
    pdf_path = "large_sample.pdf"
    extracted_text = extract_text_from_pdf(pdf_path)
    print(extracted_text)


In [None]:
import re

def parse_technical_experience(text):
    """Pull the first bullet under each "TECHNICAL EXPERIENCE" heading.

    A match needs the literal heading line, then a line starting with "• ",
    then two further lines, each terminated by a newline. The three captured
    lines map to ``"key"``, ``"start_date"`` and ``"description"``; the last
    two are whitespace-stripped, the key is kept exactly as captured.

    Args:
        text: Plain text to search.

    Returns:
        list[dict]: One dict per match, in order of appearance; empty list
        when nothing matches.
    """
    heading_re = re.compile(
        r"TECHNICAL EXPERIENCE\n• (.*?)\n(.*?)\n(.*?)\n", re.DOTALL
    )
    return [
        {
            "key": bullet,
            "start_date": second_line.strip(),
            "description": third_line.strip(),
        }
        for bullet, second_line, third_line in heading_re.findall(text)
    ]

if __name__ == "__main__":
    # Imported here so the cell does not depend on an earlier cell having
    # imported PyMuPDF already.
    import fitz

    pdf_path = "your_large_pdf.pdf"

    # A PDF is a binary container, so reading it with open(path, "r") yields
    # raw PDF syntax (or a decode error) rather than the document's text.
    # Extract the text layer with PyMuPDF before running the regex parser.
    with fitz.open(pdf_path) as pdf_document:
        pdf_text = "".join(page.get_text() for page in pdf_document)

    technical_experience_list = parse_technical_experience(pdf_text)
    print(technical_experience_list)


In [None]:
import spacy

def extract_technical_experience(text):
    """Split text into entries delimited by "technical experience" headers.

    The text is tokenized with the spaCy ``en_core_web_sm`` pipeline. Each
    case-insensitive occurrence of the token pair "technical" + "experience"
    closes the entry being built and starts a new one; neither header word
    is stored. Within an entry the first token becomes ``"key"`` and the
    remaining tokens are joined with single spaces into ``"description"``.
    Tokens that precede the first header form an entry of their own.

    Args:
        text: Plain text to scan.

    Returns:
        list[dict]: One dict per non-empty entry, in document order. Every
        dict has a ``"key"``; ``"description"`` is present only when the
        entry contains more than one token.
    """
    nlp = spacy.load("en_core_web_sm")
    tokens = list(nlp(text))

    experience_list = []
    current_experience = {}
    skip_next = False  # set right after the "technical" half of a header

    for index, token in enumerate(tokens):
        if skip_next:
            # This is the "experience" half of the header; dropping it keeps
            # it from becoming the key of the entry that follows.
            skip_next = False
            continue

        # Look ahead by index instead of token.nbor(): the final token has
        # no right-hand neighbour and nbor() raises IndexError there.
        has_next = index + 1 < len(tokens)
        if (
            has_next
            and token.text.lower() == "technical"
            and tokens[index + 1].text.lower() == "experience"
        ):
            if current_experience:
                experience_list.append(current_experience)
                current_experience = {}
            skip_next = True
            continue

        if not current_experience.get("key"):
            current_experience["key"] = token.text
        elif not current_experience.get("description"):
            current_experience["description"] = token.text
        else:
            current_experience["description"] += " " + token.text

    if current_experience:
        experience_list.append(current_experience)

    return experience_list

if __name__ == "__main__":
    # Imported here so the cell does not depend on an earlier cell having
    # imported PyMuPDF already.
    import fitz

    pdf_path = "your_large_pdf.pdf"

    # A PDF is a binary container, so reading it with open(path, "r") yields
    # raw PDF syntax (or a decode error) rather than the document's text.
    # Extract the text layer with PyMuPDF before handing it to spaCy.
    with fitz.open(pdf_path) as pdf_document:
        pdf_text = "".join(page.get_text() for page in pdf_document)

    technical_experience_list = extract_technical_experience(pdf_text)
    print(technical_experience_list)


In [None]:
import re

def parse_technical_experience(text):
    """Find numbered entries that carry a decimal number, plus the next line.

    Each match consists of: digits followed by a dot (``"title"``), one
    whitespace character, the shortest following span that contains a
    ``<digits>.<digits>`` number and runs to a newline (``"key"``; because of
    ``re.DOTALL`` this span may cross line breaks), and then the next line
    (``"description"``). All three captures are whitespace-stripped.

    Args:
        text: Plain text to search.

    Returns:
        list[dict]: One dict per match, in order of appearance; empty list
        when nothing matches.
    """
    entry_re = re.compile(r"(\d+\.)\s(.*?\d+\.\d+.*?)\n(.*?)\n", re.DOTALL)
    return [
        {
            "title": number.strip(),
            "key": heading.strip(),
            "description": following_line.strip(),
        }
        for number, heading, following_line in entry_re.findall(text)
    ]

if __name__ == "__main__":
    # Imported here so the cell does not depend on an earlier cell having
    # imported PyMuPDF already.
    import fitz

    pdf_path = "your_large_pdf.pdf"

    # A PDF is a binary container, so reading it with open(path, "r") yields
    # raw PDF syntax (or a decode error) rather than the document's text.
    # Extract the text layer with PyMuPDF before running the regex parser.
    with fitz.open(pdf_path) as pdf_document:
        pdf_text = "".join(page.get_text() for page in pdf_document)

    technical_experience_list = parse_technical_experience(pdf_text)
    print(technical_experience_list)


In [None]:
import fitz

def extract_text_from_pdf(pdf_path, start_page=0):
    """Return the text of a PDF from ``start_page`` onward, using PyMuPDF.

    Args:
        pdf_path: Path of the PDF document to open.
        start_page: Index of the first page to include; pages are addressed
            through ``range(start_page, page_count)``. Defaults to 0, i.e.
            the whole document.

    Returns:
        str: The ``get_text()`` output of the selected pages, concatenated in
        order with no separator. Empty string when the range is empty.
    """
    with fitz.open(pdf_path) as doc:
        wanted = range(start_page, doc.page_count)
        pieces = [doc[number].get_text() for number in wanted]
    return "".join(pieces)

def parse_technical_experience(text):
    """Collect numbered entries containing a decimal number, with the next line.

    The pattern captures three groups per match: a ``<digits>.`` marker, the
    shortest span after one whitespace character that includes a
    ``<digits>.<digits>`` number and ends at a newline (it may cross line
    breaks, since ``re.DOTALL`` is set), and the line that follows. They are
    stored, whitespace-stripped, under ``"title"``, ``"key"`` and
    ``"description"``.

    Args:
        text: Plain text to search.

    Returns:
        list[dict]: One dict per match, in order of appearance; empty list
        when nothing matches.
    """
    field_names = ("title", "key", "description")
    experience_list = []

    for found in re.finditer(r"(\d+\.)\s(.*?\d+\.\d+.*?)\n(.*?)\n", text, re.DOTALL):
        cleaned = (captured.strip() for captured in found.groups())
        experience_list.append(dict(zip(field_names, cleaned)))

    return experience_list

if __name__ == "__main__":
    # End-to-end run: extract text from the PDF with PyMuPDF, skipping the
    # leading pages, then parse the numbered entries and print the result.
    pdf_path = "your_large_pdf.pdf"

    # Set the starting page number to exclude the index pages (e.g., 3)
    # The value is passed as start_page, which the extractor uses as the
    # first page index, so pages 0-2 are left out.
    starting_page = 3

    pdf_text = extract_text_from_pdf(pdf_path, starting_page)
    technical_experience_list = parse_technical_experience(pdf_text)
    print(technical_experience_list)