# PDF Number Extractor

In [5]:
import PyPDF2
import spacy
import re
from numwords_to_nums.numwords_to_nums import NumWordsToNum

In [6]:
# Shared number-word converter; parse_number() calls its
# numerical_words_to_numbers() method on each candidate string.
num = NumWordsToNum()
# Shared spaCy pipeline; extract_numbers_from_text() uses it to tokenize text
# and read each token's `like_num` flag.
nlp = spacy.load("en_core_web_sm")

In [47]:
def add_spaces_to_textual_number(number):
    """Insert a space wherever a digit is immediately followed by a letter.

    For example ``"5million"`` becomes ``"5 million"``. A letter followed by
    a digit (``"first30"``) is left untouched, as is text containing no
    digit-then-letter boundary.

    Args:
        number: The string to rewrite.

    Returns:
        The string with a single space inserted at every digit/letter boundary.
    """
    digit_then_letter = re.compile(r'(\d)([a-zA-Z])')
    return digit_then_letter.sub(r'\1 \2', number)
    
def is_numeric(value):
    """Report whether ``value`` can be converted with ``float()``.

    Anything ``float()`` accepts counts as numeric, including integers,
    decimals, exponent notation and whitespace-padded digit strings.

    Args:
        value: The object to test, normally a token string.

    Returns:
        True if ``float(value)`` succeeds, False otherwise.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number ("ten", "").
        # TypeError: an object float() cannot take at all (None, a list);
        # such values are simply "not numeric" rather than a crash.
        return False
    return True


def parse_number(number):
    """Turn one extracted number string into a float.

    The string is first run through ``add_spaces_to_textual_number`` and then
    through ``num.numerical_words_to_numbers`` (with ``convert_operator=True``);
    the resulting text is handed to ``float()``.

    Args:
        number: The candidate number as text (digits and/or number words).

    Returns:
        The parsed float, or ``0`` when the converted text is not a valid
        float literal. In that case a "Failed parsing" line is printed.
    """
    spaced = add_spaces_to_textual_number(number)
    converted = num.numerical_words_to_numbers(spaced, convert_operator=True)
    try:
        value = float(converted)
    except ValueError:
        # Note: production code should handle this properly; for a toy
        # exercise the few failures are printed so they can be reviewed and
        # discarded manually.
        print("Failed parsing: " + converted)
        return 0
    return value

def extract_numbers_from_text(text):
    """Extract every number mentioned in ``text`` and parse it to a float.

    The text is tokenized with the module-level spaCy pipeline ``nlp``.
    Consecutive tokens whose ``like_num`` flag is set are accumulated into one
    candidate string, which is closed off by the next non-numeric token or by
    the end of the text.

    Args:
        text: The text to scan.

    Returns:
        A list with one ``parse_number`` result per candidate found. The order
        follows the order in which candidates were closed, which may differ
        from reading order when comma-grouped numbers are involved.
    """
    doc = nlp(text)

    string_numbers = []
    pending = ""  # candidate currently being assembled from adjacent tokens

    for token in doc:
        fragment = token.text

        if not token.like_num:
            # A non-numeric token ends the current candidate, if any.
            if pending:
                string_numbers.append(pending)
                pending = ""
            continue

        if "," in fragment:
            # Comma-grouped tokens (e.g. "1,000") are stored on their own with
            # the commas stripped; the pending candidate is left untouched.
            string_numbers.append(fragment.replace(",", ""))
        elif "." in fragment and "." in pending:
            # Two decimals in a row cannot be one number: close the first and
            # start a new candidate with the second.
            string_numbers.append(pending)
            pending = fragment
        else:
            # Word fragments are joined with a space ("two hundred"); numeric
            # fragments are appended directly to what is already pending.
            if not is_numeric(fragment) and pending:
                pending += " "
            pending += fragment

    # Flush a candidate still open when the text ends on a numeric token;
    # without this the last number of such a text would be lost.
    if pending:
        string_numbers.append(pending)

    return [parse_number(candidate) for candidate in string_numbers]

def extract_numbers_from_pdf(pdf_path):
    """Return the largest number found in the text of a PDF file.

    Each page's text is extracted with PyPDF2 and scanned with
    ``extract_numbers_from_text``; only the running maximum is kept.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The largest value found across all pages, or ``None`` when no number
        was found at all. ``parse_number`` returns 0 for strings it cannot
        parse, so those failures take part in the comparison as 0.
    """
    # Running maximum; None until the first number is seen.
    largest_number = None

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Pages are read while the file is still open.
        for page in reader.pages:
            text = page.extract_text()
            for nb in extract_numbers_from_text(text):
                if largest_number is None or nb > largest_number:
                    largest_number = nb

    return largest_number

# Run the extractor on the sample document and report the largest value found.
pdf_path = 'file.pdf'
biggest = extract_numbers_from_pdf(pdf_path)
print(f"Biggest: {biggest}")


Failed parsing: 1st 5
Failed parsing: 1st 5
Failed parsing: 1st
Failed parsing: 2nd
Failed parsing: 1st
Failed parsing: first30
Failed parsing: 1 3rd
Failed parsing: 1.5.14.9
Failed parsing: 3rd
Failed parsing: 1st
Biggest: 9600000000.0
