In [2]:
from unstructured.partition.pdf import partition_pdf
import PyPDF2
import anthropic
import nltk
from nltk.stem import WordNetLemmatizer
import shutil
import base64
import pickle
import os
import re

In [3]:
# Never commit a real key in this cell. Export ANTHROPIC_API_KEY in the shell
# that launches Jupyter instead. setdefault leaves an already-exported key
# untouched; a plain assignment of "" here would overwrite it with the empty
# placeholder and make every API call below fail.
os.environ.setdefault("ANTHROPIC_API_KEY", "")

# Built with no arguments, so the client takes its credentials from the
# environment configured above.
client = anthropic.Anthropic()

# Model identifiers used by get_response_haiku / get_response_sonnet.
haiku = "claude-3-haiku-20240307"
sonnet = "claude-3-sonnet-20240229"

In [4]:
# Crop padding settings exported through the environment before any table
# images are extracted (presumably read by `unstructured` when it crops image
# blocks; units look like pixels - confirm against the library docs).
os.environ.update({
    "EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD": '20',
    "EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD": '80',
})

In [5]:
# What to look up: one metric, for one reporting year, across one sector.
input_sector = "Airline"
input_metric = "Fuel consumed in gallons"
input_year = "2023"

# Each company is written next to its annual-report PDF so the two lists
# cannot drift out of step; companyList[k] always belongs to pdfList[k].
companyList, pdfList = map(list, zip(
    ("American Airlines Group Inc.", "AmericanAirlinesAnnualReport.pdf"),
    ("Delta Air Lines, Inc.", "DeltaAirlinesAnnualReport.pdf"),
    ("Southwest Airlines Co.", "SouthWestAirlinesAnnualReport.pdf"),
))

In [6]:
def categorize_elements(raw_pdf_elements):
    """Split partitioned PDF elements into narrative text and tables.

    An element is classified by searching the string form of its type:
    'CompositeElement' marks narrative text, 'Table' marks a table. The text
    check runs first, and elements matching neither are dropped.

    Args:
        raw_pdf_elements: iterable of elements to classify.

    Returns:
        A 3-tuple ``(text_elements, table_elements, table_data)``:
        the ``str()`` of every text element, the table element objects
        themselves, and the ``str()`` of every table element. The last two
        lists are index-aligned.
    """
    narrative_chunks, table_objects, table_strings = [], [], []

    for item in raw_pdf_elements:
        type_label = str(type(item))

        if 'CompositeElement' in type_label:
            narrative_chunks.append(str(item))
            continue

        if 'Table' in type_label:
            table_objects.append(item)
            table_strings.append(str(item))

    return narrative_chunks, table_objects, table_strings

In [7]:
def get_response_haiku(message):
    """Send `message` to the model named by `haiku` and return the reply text.

    Args:
        message: list of message dicts passed through as the `messages`
            argument of `client.messages.create`.

    Returns:
        The `.text` of the first content block in the response.
    """
    request = {
        "model": haiku,
        "max_tokens": 1024,
        "messages": message,
    }
    reply = client.messages.create(**request)
    first_block = reply.content[0]
    return first_block.text

In [8]:
def get_response_sonnet(message):
    """Send `message` to the model named by `sonnet` and return the reply text.

    Args:
        message: list of message dicts passed through as the `messages`
            argument of `client.messages.create`.

    Returns:
        The `.text` of the first content block in the response.
    """
    request = {
        "model": sonnet,
        "max_tokens": 1024,
        "messages": message,
    }
    reply = client.messages.create(**request)
    first_block = reply.content[0]
    return first_block.text

In [9]:
def preprocess_text(text):
    """Normalise text so keywords and report passages can be compared.

    The text is lowercased, every character that is not a letter or
    whitespace is deleted (so hyphenated words are fused together), the
    remainder is tokenised with `nltk.word_tokenize`, and each token is
    lemmatised with a `WordNetLemmatizer`.

    Args:
        text: the raw string to normalise.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = nltk.word_tokenize(letters_only)

    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, words))

In [10]:
def extract_page(page_number, input_pdf, output_pdf="page.pdf"):
    """Copy a single page of `input_pdf` into a new one-page PDF.

    Args:
        page_number: 1-based number of the page to extract (1 is the first
            page of the document).
        input_pdf: path of the PDF to read.
        output_pdf: path of the one-page PDF to write; overwritten if it
            already exists.

    Raises:
        IndexError: if `page_number` is outside ``1..number_of_pages``.
            Without this check, 0 or a negative value would be turned into a
            negative list index and silently return a page counted from the
            end of the document.
    """
    with open(input_pdf, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        total_pages = len(reader.pages)

        # Validate before touching the output file so a bad request never
        # leaves a truncated or stale page.pdf behind.
        if not 1 <= page_number <= total_pages:
            raise IndexError(
                f"page_number {page_number} is out of range; "
                f"{input_pdf} has {total_pages} page(s)"
            )

        writer = PyPDF2.PdfWriter()
        # reader.pages is 0-indexed, page_number is 1-based.
        writer.add_page(reader.pages[page_number - 1])

        with open(output_pdf, 'wb') as output_file:
            writer.write(output_file)

def encode_image(image_path):
    """Read the file at `image_path` and return its bytes as base64 text.

    Args:
        image_path: path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file contents, decoded to a `str`.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [41]:
# Scratch folder that receives one sub-folder of cropped table images per
# candidate table. It is wiped at the start and end of every company.
TABLE_IMAGE_DIR = "tableImages"

# Phrases the model uses when a table image says nothing about the metric.
IRRELEVANT_TABLE_PHRASES = ("no information", "table does not", "image does not")

# Phrases the model uses when it cannot read the metric's value off an image.
NO_ANSWER_PHRASES = (
    "no information",
    "no numerical information",
    "no direct information",
    "is not directly",
    "does not explicitly",
)


def _is_table_image(filename):
    """Return True for the JPEG/PNG files produced by table image extraction."""
    return filename.lower().endswith(('.jpg', '.jpeg', '.png'))


def _image_media_type(image_path):
    """Return the media type that matches the image file's extension."""
    return "image/png" if image_path.lower().endswith('.png') else "image/jpeg"


def _contains_any(text, phrases):
    """Return True if any of `phrases` occurs in `text`, ignoring case."""
    lowered = text.lower()
    return any(phrase in lowered for phrase in phrases)


def _image_prompt(image_path, prompt):
    """Build a single user message pairing one image file with a text prompt."""
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": _image_media_type(image_path),
                        "data": encode_image(image_path),
                    },
                },
                {
                    "type": "text",
                    "text": prompt,
                },
            ],
        }
    ]


# One answer string per company, in pdfList order.
results=[]

if len(companyList) != len(pdfList):
    raise ValueError("companyList and pdfList must have the same number of entries")

for input_company, input_pdf in zip(companyList, pdfList):

    # Start from an empty scratch folder: an interrupted earlier run would
    # otherwise leave another company's table images behind, and they would be
    # read as context for this company.
    if os.path.exists(TABLE_IMAGE_DIR):
        shutil.rmtree(TABLE_IMAGE_DIR)
    os.makedirs(TABLE_IMAGE_DIR)

    # ---- 1. Partition the annual report (cached on disk per company) ----
    pickle_name=f'{input_company}_pdf_elements.pkl'

    if os.path.exists(pickle_name):
        # NOTE(review): pickle.load can execute arbitrary code. Only load
        # cache files that this notebook wrote itself.
        with open(pickle_name, "rb") as f:
            pdf_elements = pickle.load(f)
    else:
        pdf_elements = partition_pdf(
            input_pdf,
            chunking_strategy="by_title",
            extract_images_in_pdf=False,
            infer_table_structure=True,
            max_characters=3000,
            new_after_n_chars=2800,
            combine_text_under_n_chars=2000
            )

        with open(pickle_name, "wb") as f:
            pickle.dump(pdf_elements, f)

    texts, tables, tables_text = categorize_elements(pdf_elements)

    # ---- 2. Ask the model for search keywords for this metric ----
    messages=[
    {
        "role": "user",
        "content": [
        {"type": "text", "text": f'''You are provided with the following information about a specific metric used by a publicly listed company:

        Metric: {input_metric}
        Company Name: {input_company}
        Sector of the Company: {input_sector}
        
        This metric or its variations are reported in the company's annual report. Please generate a list of 5 specific keywords to search for in the annual report of {input_company}. These phrases or keywords should capture potential syntactic and semantic variations of the metric name, based on what you know about the company. The keyword can be a service provided by {input_company} that matches with {input_metric} The goal is to find the most relevant sections of the annual report with minimal irrelevant matches. Do not provide me business keywords like sales, units, shipment etc. Provide each keyword on a new line and do not include any special symbols or bullet points in your answer. All keywords should only be one single word and not be more than that. A keyword shouldn't be a concatenation of two words like PhoneSales, TransportRevenue etc. In case the metric itself has a service of the company mentioned, do include it in keywords.'''}
        ],
    }
    ]

    response=get_response_haiku(messages)

    print(response)

    # Drop blank lines: an empty keyword is a substring of every passage and
    # would make the keyword filter below match the whole report.
    keywords = list({line.strip() for line in response.split('\n') if line.strip()})

    print(keywords)

    # Normalise each keyword once, the same way the passages are normalised.
    # A keyword made only of digits/symbols normalises to '' and is dropped
    # for the same reason as blank lines.
    search_terms = [term for term in (preprocess_text(keyword) for keyword in keywords) if term]

    # ---- 3. Keep only the text chunks and tables that mention a keyword ----
    presentText=[]

    for text in texts:
        text = preprocess_text(text)
        if any(term in text for term in search_terms):
            presentText.append(text)

    presentTableElement=[]
    presentTableText=[]

    for table_element, text in zip(tables, tables_text):
        text = preprocess_text(text)
        if any(term in text for term in search_terms):
            presentTableElement.append(table_element)
            presentTableText.append(text)

    print(len(presentTableElement))

    # ---- 4. Crop each candidate table to an image and ask if it is relevant ----
    # The survivors go into fresh lists. Removing items from the list being
    # iterated would skip tables and shift the indices used for deletion.
    finalPresentTableElement=[]
    finalPresentTableText=[]

    for index, (table_element, table_text) in enumerate(zip(presentTableElement, presentTableText)):
        filepath=f"{TABLE_IMAGE_DIR}/{index}"
        # Create the folder up front so listing it below cannot fail when the
        # extraction step finds no table image on the page.
        os.makedirs(filepath, exist_ok=True)

        page_number=table_element.metadata.orig_elements[0].metadata.page_number
        extract_page(page_number, input_pdf)

        # Called for its side effect: table crops are written into `filepath`.
        partition_pdf(
            "page.pdf",
            strategy="hi_res",
            extract_images_in_pdf=True,
            extract_image_block_types=["Table"],
            extract_image_block_to_payload=False,
            extract_image_block_output_dir=filepath
            )

        for filename in os.listdir(filepath):
            if not _is_table_image(filename):
                continue

            image_path = os.path.join(filepath, filename)

            relevance_prompt = f'''Does the following table give me any specific numeric information about {input_metric}?

                                    This table and surrounding context has been extracted from the annual report of {input_company}.
                                    
                                    If no, then return the text 'no information' in your output.
                                    
                                    Else, explain the table to me in very brief in a 50 word paragraph, make sure to use mention all the values mentioned in the table along with what they are, their corresponding years, and their units to explain.
                                    
                                    Make sure to not go over the word limit and answer is 'no information' if table doesn't tell us about {input_metric}.'''

            response=get_response_sonnet(_image_prompt(image_path, relevance_prompt))

            if _contains_any(response, IRRELEVANT_TABLE_PHRASES):
                os.remove(image_path)
            else:
                print(response)

        # A table survives only if at least one of its images was kept.
        if os.listdir(filepath):
            finalPresentTableElement.append(table_element)
            finalPresentTableText.append(table_text)

    # ---- 5. Read the metric off every surviving table image ----
    tableContext=[]

    for folder_name in sorted(os.listdir(TABLE_IMAGE_DIR)):
        folderPath=os.path.join(TABLE_IMAGE_DIR, folder_name)
        for image_name in sorted(os.listdir(folderPath)):
            if not _is_table_image(image_name):
                continue

            imagePath=os.path.join(folderPath, image_name)

            # The prompt must ask for the exact phrase 'no information',
            # because that is what NO_ANSWER_PHRASES filters on.
            metric_prompt = f'''Give me information about the metric mentioned below for the company {input_company} in the year {input_year} on the basis of the following image provided consisting of a table and a context for the table above it.

                        Metric: {input_metric}

                        Do not assume the values in the table to be information about the metric before first reading what the table is about and what the values are from the context and headers for the table mentioned in the image. (e.g., don't assume all values with a currency symbol to be revenue or sales without it being explicitly mentioned in the table). Return 'no information' instead of assuming any values.
                        
                        If the data for what is asked for is not present, just simply say 'no information'.

                        If the context given about the table is not related to Metric and something else that is similar, make no assumptions and return 'no information'.
                        
                        If the information is present, then only return what is asked for along with the numerical value and why you think it is the metric we asked.'''

            response=get_response_sonnet(_image_prompt(imagePath, metric_prompt))

            if _contains_any(response, NO_ANSWER_PHRASES):
                continue

            print(response)

            tableContext.append(response)

    # ---- 6. Combine table findings with matching text and ask for the value ----
    context=tableContext+presentText
    context=" ".join(context)

    messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text":  f'''Give me numerical information about {input_metric} for the company {input_company} in the year {input_year} on the basis of the following context from their annual report:-
                        
                        Context: {context}
                        
                        Only give me the numeric information about what is asked and do not return anything extra. Make sure your answer is a value that is mentioned in the above context and also includes the unit of the value. In case their are two values that give us the required information, give me the one that also matches the unit I have mentioned.'''
                    }
                ],
            }
        ]

    operatingMetric=get_response_sonnet(messages)

    print(operatingMetric)
    results.append(operatingMetric)

    # Remove this company's table images so they cannot be reused later.
    if os.path.exists(TABLE_IMAGE_DIR):
        shutil.rmtree(TABLE_IMAGE_DIR)

Gallons
Fuel
Aviation
Propulsion
Emissions
['Emissions', 'Fuel', 'Aviation', 'Propulsion', 'Gallons']
10


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table provides information about fuel consumption in gallons for American Airlines' mainline and regional operations in 2022 and 2021. In 2022, they consumed 3,901 million gallons of fuel at an average price of $3.54 per gallon, resulting in a fuel expense of $13,791 million. In 2021, they consumed 3,324 million gallons at $2.04 per gallon, with a fuel expense of $6,792 million.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table provides information about the fuel consumption in gallons. In 2022, the fuel consumption was 3,901 million gallons, an increase of 17.4% from 3,324 million gallons in 2021. The average aircraft fuel price including related taxes also increased from $2.04 per gallon in 2021 to $3.54 per gallon in 2022, a 73.0% increase.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


Hi
Hi
Based on the context, the Fuel consumed in gallons for the company American Airlines Group Inc. in the year 2023 is not explicitly stated. However, it provides the following related information:

"In 2022, we estimate that a one cent per gallon increase in the price of aircraft fuel would increase our annual fuel expense by approximately $84 million."

This implies that American Airlines Group Inc. consumed approximately 8.4 billion gallons of fuel in 2022, but does not provide the value for 2023.
Jetfuel
Aviation
Gallons
Aviation-fuel
Kerosene
['Jetfuel', 'Kerosene', 'Aviation-fuel', 'Aviation', 'Gallons']
5


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


KeyboardInterrupt: 

In [None]:
# Show the collected answers: `results` holds one model response string per
# company, appended in pdfList order by the extraction loop.
print(results)

['The context does not provide any numeric information about Fuel consumed in gallons for the company American Airlines Group Inc. in the year 2023.', 'The fuel consumed for the company Delta Air Lines, Inc. in the year 2023 was 4 billion gallons.', 'According to the context, the company expects to consume approximately 3.8 billion gallons of jet fuel in 2023.']
