In [1]:
from unstructured.partition.pdf import partition_pdf
import PyPDF2
import anthropic
import nltk
from nltk.stem import WordNetLemmatizer
import base64
import pickle
import os
import re

In [2]:
# Credentials: do not paste a real key into this cell. Export ANTHROPIC_API_KEY
# in the environment that launches Jupyter instead. setdefault only fills in the
# empty placeholder when the variable is missing, so a key that is already
# exported is no longer overwritten with an empty string.
os.environ.setdefault("ANTHROPIC_API_KEY", "")

# No key is passed explicitly; presumably the SDK reads ANTHROPIC_API_KEY from
# the environment -- confirm against the anthropic SDK documentation.
client = anthropic.Anthropic()

# Model identifiers used by get_response_haiku / get_response_sonnet.
# NOTE(review): confirm these model IDs are still served before a fresh run.
haiku = "claude-3-haiku-20240307"
sonnet = "claude-3-sonnet-20240229"

In [3]:
# Crop padding exported as environment variables (string values).
# Presumably read by `unstructured` when it crops extracted image blocks -- confirm.
os.environ.update({
    "EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD": '20',
    "EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD": '80',
})

In [4]:
# Run parameters: the report to read and the metric/year to look up in it.
input_company = "NIKE, Inc."          # interpolated into every prompt
input_sector = "Footwear"             # used by the keyword-generation prompt
input_pdf = "NikeAnnualReport.pdf"    # path of the annual report PDF
input_metric = "Money from greater china"  # free-text metric description
input_year = "2023"                   # year the metric is requested for

In [5]:
# Partition the whole report into title-based chunks with table structure
# inference enabled and image extraction disabled.
pdf_elements = partition_pdf(
    input_pdf,
    chunking_strategy="by_title",
    infer_table_structure=True,
    extract_images_in_pdf=False,
    # Chunk sizing, in characters.
    max_characters=3000,
    new_after_n_chars=2800,
    combine_text_under_n_chars=2000,
)

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

In [6]:
# Cache the partitioned elements on disk so they can be reloaded from the pickle file.
with open("nike_pdf_elements.pkl", "wb") as cache_file:
    pickle.dump(pdf_elements, cache_file)

In [7]:
# Reload the cached elements.
# SECURITY: pickle.load can execute arbitrary code from the file; only load a
# cache file that this notebook wrote itself.
with open("nike_pdf_elements.pkl", "rb") as cache_file:
    pdf_elements = pickle.load(cache_file)

In [8]:
def categorize_elements(raw_pdf_elements):
    """Split partitioned elements into text chunks and tables.

    An element is classified by substring match on ``str(type(element))``:
    a type whose repr contains 'CompositeElement' is text; otherwise a type
    whose repr contains 'Table' is a table. Anything else is ignored.

    Args:
        raw_pdf_elements: Iterable of partitioned elements.

    Returns:
        A 3-tuple ``(text_elements, table_elements, table_data)``:
        the text elements as strings, the table elements as objects, and the
        string form of each table (same order as ``table_elements``).
    """
    text_elements, table_elements, table_data = [], [], []

    for item in raw_pdf_elements:
        type_repr = str(type(item))

        if 'CompositeElement' in type_repr:
            text_elements.append(str(item))
            continue

        if 'Table' in type_repr:
            table_elements.append(item)
            table_data.append(str(item))

    return text_elements, table_elements, table_data

# Split the partitioned elements into text chunks (as strings), table element
# objects, and the string form of each table (index-aligned with `tables`).
texts, tables, tables_text = categorize_elements(pdf_elements)

In [9]:
def get_response_haiku(message):
    """Send ``message`` to the model named by ``haiku`` and return the reply text.

    Args:
        message: Value passed as ``messages`` to ``client.messages.create``.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    reply = client.messages.create(model=haiku, max_tokens=1024, messages=message)
    first_block = reply.content[0]
    return first_block.text

In [10]:
def get_response_sonnet(message):
    """Send ``message`` to the model named by ``sonnet`` and return the reply text.

    Args:
        message: Value passed as ``messages`` to ``client.messages.create``.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    reply = client.messages.create(model=sonnet, max_tokens=1024, messages=message)
    first_block = reply.content[0]
    return first_block.text

In [11]:
# Ask the model for single-word search keywords for the requested metric.
messages=[
{
    "role": "user",
    "content": [
    {"type": "text", "text": f'''You are provided with the following information about a specific metric used by a publicly listed company:

    Metric: {input_metric}
    Company Name: {input_company}
    Sector of the Company: {input_sector}
    
    This metric or its variations are reported in the company's annual report. Please generate a list of 5 specific keywords to search for in the annual report of {input_company}. These phrases or keywords should capture potential syntactic and semantic variations of the metric name, based on what you know about the company. The keyword can be a service provided by {input_company} that matches with {input_metric} The goal is to find the most relevant sections of the annual report with minimal irrelevant matches. Do not provide me business keywords like sales, units, shipment etc. Provide each keyword on a new line and do not include any special symbols or bullet points in your answer. All keywords should only be one single word and not be more than that. A keyword shouldn't be a concatenation of two words like PhoneSales, TransportRevenue etc. In case the metric itself has a service of the company mentioned, do include it in keywords.'''}
    ],
}
]

response=get_response_haiku(messages)

print(response)

# One candidate keyword per line of the reply.
response = response.split('\n')

# Blank lines must be dropped: an empty keyword is a substring of every string,
# so it would make the later `keyword in text` filter accept every chunk.
# dict.fromkeys de-duplicates while keeping the reply's order, so the list is
# identical on every run (iteration order of a set of strings is not).
stripped_lines = (line.strip() for line in response)
keywords = list(dict.fromkeys(word for word in stripped_lines if word))

print(keywords)

Greater
China
Revenue
Retail
Wholesale
['Greater', 'Revenue', 'China', 'Wholesale', 'Retail']


In [12]:
def preprocess_text(text):
    """Normalise ``text`` for keyword matching.

    Steps, in order: lowercase; delete every character that is not an ASCII
    letter or whitespace (digits and punctuation are removed, not replaced);
    tokenize with ``nltk.word_tokenize``; lemmatize each token with
    ``WordNetLemmatizer``; join the tokens with single spaces.

    Args:
        text: Input string.

    Returns:
        The normalised string.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = nltk.word_tokenize(letters_only)

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

In [13]:
# Keep only the text chunks and tables that mention at least one keyword.

# Normalise the keywords once instead of once per chunk, and drop any keyword
# that normalises to an empty string (e.g. one made only of digits or symbols):
# '' is a substring of every text and would let every chunk through.
normalized_keywords = [preprocess_text(keyword) for keyword in keywords]
normalized_keywords = [keyword for keyword in normalized_keywords if keyword]


def _mentions_keyword(normalized_text):
    """Return True when any normalised keyword occurs as a substring of the text."""
    return any(keyword in normalized_text for keyword in normalized_keywords)


# NOTE(review): the *normalised* text is stored, and preprocess_text removes
# digits; presentText is later joined into the final prompt context -- confirm
# that passing digit-free text there is intended.
presentText=[]

for text in texts:
    text = preprocess_text(text)
    if _mentions_keyword(text):
        presentText.append(text)


# presentTableElement[k] and presentTableText[k] always describe the same table.
presentTableElement=[]
presentTableText=[]

for index, text in enumerate(tables_text):
    text = preprocess_text(text)
    if _mentions_keyword(text):
        presentTableElement.append(tables[index])
        presentTableText.append(text)

In [14]:
def extract_page(page_number, input_pdf=input_pdf, output_pdf="page.pdf"):
    """Copy one page of a PDF into a new single-page PDF file.

    Args:
        page_number: 1-based number of the page to extract.
        input_pdf: Path of the source PDF. The default is the value the
            module-level ``input_pdf`` had when this function was defined.
        output_pdf: Path of the PDF to write; opened with mode 'wb', so an
            existing file is overwritten.

    Raises:
        IndexError: If ``page_number`` is not between 1 and the page count.
    """
    with open(input_pdf, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_count = len(reader.pages)

        # Without this check, page_number 0 (or a negative value) would become
        # a negative index and silently select a page counted from the end.
        if not 1 <= page_number <= page_count:
            raise IndexError(
                f"page_number {page_number} is out of range for a PDF with {page_count} page(s)"
            )

        writer = PyPDF2.PdfWriter()
        # reader.pages is indexed from 0, page_number from 1.
        writer.add_page(reader.pages[page_number - 1])

        # Written while the source file is still open, as in the original flow.
        with open(output_pdf, 'wb') as output_file:
            writer.write(output_file)

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [15]:
# Second-pass filter: for every keyword-matched table, re-partition the page it
# came from to get table images, ask the vision model whether each image holds
# numbers for the metric, delete the images it rejects, and keep the table only
# if at least one image survives.
#
# The results are collected in NEW lists. Aliasing presentTableElement and
# removing from it while iterating skipped the element after every removal, and
# deleting from the text list by the enumerate index hit the wrong entry once
# anything had been removed.
finalPresentTableElement=[]
finalPresentTableText=[]

print(len(presentTableElement))

# The question does not depend on the image, so it is built once.
table_check_prompt = f'''Does the following table give me any specific numeric information about {input_metric}?

                                This table and surrounding context has been extracted from the annual report of {input_company}.
                                
                                If no, then return the text 'no information' in your output.
                                
                                Else, explain the table to me in very brief in a 50 word paragraph, make sure to use mention all the values mentioned in the table along with what they are, their corresponding years, and their units to explain.
                                
                                Make sure to not go over the word limit and answer is 'no information' if table doesn't tell us about {input_metric}.'''

for index, (table_element, table_text) in enumerate(zip(presentTableElement, presentTableText)):
    filepath=f"tableImages/{index}"
    # Guarantee the folder exists even if no image is written into it, so the
    # directory listings below cannot raise FileNotFoundError.
    os.makedirs(filepath, exist_ok=True)

    page_number=table_element.metadata.orig_elements[0].metadata.page_number
    extract_page(page_number)  # writes the page to "page.pdf"

    pdf_tables = partition_pdf(
        "page.pdf",
        strategy="hi_res",
        extract_images_in_pdf=True,
        extract_image_block_types=["Table"],
        extract_image_block_to_payload=False,
        extract_image_block_output_dir=filepath
        )

    for filename in os.listdir(filepath):
        if not filename.endswith(('.jpg', '.png')):
            continue

        image_path = os.path.join(filepath, filename)
        base64_image = encode_image(image_path)
        # Declare the media type that matches the file instead of always JPEG.
        media_type = "image/png" if filename.endswith('.png') else "image/jpeg"

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": base64_image,
                        },
                    },
                    {
                        "type": "text",
                        "text": table_check_prompt,
                    },
                ],
            }
        ]

        response=get_response_sonnet(messages)

        # Replies containing any of these phrases are treated as a rejection.
        reply = response.lower()
        if "no information" in reply or "table does not" in reply or "image does not" in reply:
            os.remove(image_path)
        else:
            print(response)

    # A table is kept only if its folder still contains something.
    if os.listdir(filepath):
        finalPresentTableElement.append(table_element)
        finalPresentTableText.append(table_text)

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


27


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table and provided context do not contain any specific numeric information about Money from Greater China. It provides biographical details about NIKE's executive leadership team members and their roles within the company, but no financial data related to Greater China is mentioned.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table provides information about NIKE's revenues across different geographical regions and divisions for fiscal years 2023, 2022, and 2021. For the Greater China region, the revenue in fiscal 2023 was $7,248 million, down 4% from $7,547 million in fiscal 2022, and down 13% excluding currency changes compared to fiscal 2021.
The table provides information about NIKE's earnings breakdown by geographic regions and business segments for fiscal years 2023, 2022, and 2021. It lists the EBIT values in millions of dollars for Greater China as $2,283 million for fiscal 2023, $2,365 million for fiscal 2022 (a 3% decrease from the previous year), and $3,243 million for fiscal 2021.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


Yes, the table provides numeric information about revenues from Greater China for NIKE, Inc. In fiscal year 2023, Greater China revenues were $7,248 million, down 4% from $7,547 million in fiscal year 2022. The percentage change excluding currency impacts was a 4% increase for fiscal 2023 compared to a 13% decrease in fiscal 2021.
The table provides numeric information about EBIT (earnings before interest and taxes) for Nike's Greater China region, which was $2,283 million for fiscal year 2023, a 3% decrease from $2,365 million in fiscal year 2022. The values are reported in millions of dollars for the corresponding fiscal years.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table provides revenue figures for NIKE's Greater China region across different categories like Footwear ($5,435 million in Fiscal 2023), Apparel ($1,666 million in Fiscal 2023), and Equipment ($147 million in Fiscal 2023). It also breaks down revenues by sales channels like Wholesale Customers and NIKE Direct, along with Earnings Before Interest and Taxes for Fiscal 2023 ($2,283 million) and comparative figures for prior years.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.

The table provides revenue information for Nike, Inc. for the year ended May 31, 2023, broken down by geographic region and distribution channel. It includes a specific revenue figure of $7,248 million for the Greater China region, along with other regional and product category revenue details in millions of dollars.
The table provides revenue information for NIKE, Inc. for the year ended May 31, 2022, broken down by geography and product category. For the Greater China region, the revenues are listed as $7,547 million for Footwear ($5,416 million), Apparel ($1,938 million), and Equipment ($193 million).


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table provides revenue information for NIKE, Inc. for the year ended May 31, 2021, broken down by geography and product category. It shows that the revenue from the Greater China region was $8,290 million for that fiscal year, which includes footwear revenue of $5,748 million, apparel revenue of $2,347 million, and equipment revenue of $195 million.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table provides revenue and earnings information for NIKE, Inc. across different geographical regions and business segments for the fiscal years 2021, 2022, and 2023. For the Greater China region, it lists the revenues as $8,290 million in 2021, $7,248 million in 2022, and $7,248 million in 2023. It also shows earnings before interest and taxes for Greater China as $3,243 million in 2021, $2,365 million in 2022, and $2,283 million in 2023.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table provides information about accounts receivable, inventories, and property, plant and equipment for NIKE across different geographic regions, including Greater China. For accounts receivable in Greater China, the values are $162 million for 2023 and $406 million for 2022. For inventories, the values are $973 million for 2023 and $1,044 million for 2022. For property, plant and equipment, the value for Greater China is $292 million for 2023 and $303 million for 2022. All values are in millions of dollars.


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


The table provides information about Nike's accounts receivable, inventories, and property, plant, and equipment across different geographic regions and divisions for the fiscal years ending on May 31, 2023, and May 31, 2022. For the Greater China region specifically, the accounts receivable values are $162 million for 2023 and $406 million for 2022, the inventory values are $973 million for 2023 and $1,044 million for 2022, and the property, plant, and equipment values are $292 million for 2023 and $303 million for 2022.


In [16]:
# base64_image=encode_image(r'test/table-1-1.jpg')

# messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/jpeg",
#                             "data": base64_image,
#                         },
#                     },
#                     {
#                         "type": "text",
#                         "text":  f'''Does the following table give me any specific numeric information about {input_metric}?

#                         This table and surrounding context has been extracted from the annual report of {input_company}.
                        
#                         If no, then return the text 'no information' in your output.
                        
#                         Else, explain the table to me in very brief in a 50 word paragraph, make sure to use mention all the values mentioned in the table along with what they are, their corresponding years, and their units to explain.
                        
#                         Make sure to not go over the word limit and answer is 'no information' if table doesn't tell us about {input_metric}.'''
#                     }
#                 ],
#             }
#         ]

# response=get_response_sonnet(messages)

# print(response)

In [17]:
# For every table image that survived verification, ask the vision model for
# the metric value and collect the replies that contain an answer.
tableContext=[]

# A reply containing any of these phrases is treated as "metric not found".
refusal_markers = (
    "no information",
    "no numerical information",
    "no direct information",
    "is not directly",
    "does not explicitly",
)

# The question does not depend on the image, so it is built once.
# The third instruction used to ask for 'no infirmation', a spelling the
# refusal filter below cannot match; it now asks for 'no information'.
metric_prompt = f'''Give me information about the metric mentioned below for the company {input_company} in the year {input_year} on the basis of the following image provided consisting of a table and a context for the table above it.

                    Metric: {input_metric}

                    Do not assume the values in the table to be information about the metric before first reading what the table is about and what the values are from the context and headers for the table mentioned in the image. (e.g., don't assume all values with a currency symbol to be revenue or sales without it being explicitly mentioned in the table). Return 'no information' instead of assuming any values.
                    
                    If the data for what is asked for is not present, just simply say 'no information'.

                    If the context given about the table is not related to Metric and something else that is similar, make no assumptions and return 'no information'.
                    
                    If the information is present, then only return what is asked for along with the numerical value and why you think it is the metric we asked.'''

# sorted() makes the processing order (and so the order of tableContext) the
# same on every run; os.listdir returns entries in arbitrary order.
for folder_name in sorted(os.listdir("tableImages")):
    folderPath=os.path.join("tableImages",folder_name)
    if not os.path.isdir(folderPath):
        # Stray files next to the per-table folders would make listdir fail.
        continue

    for image_name in sorted(os.listdir(folderPath)):
        # Only send files with the image extensions the verification step accepts.
        if not image_name.endswith(('.jpg', '.png')):
            continue

        imagePath=os.path.join(folderPath,image_name)
        base64_image=encode_image(imagePath)
        # Declare the media type that matches the file instead of always JPEG.
        media_type = "image/png" if image_name.endswith('.png') else "image/jpeg"

        messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": base64_image,
                    },
                },
                {
                    "type": "text",
                    "text": metric_prompt,
                }
            ],
        }
        ]

        response=get_response_sonnet(messages)

        reply = response.lower()
        if any(marker in reply for marker in refusal_markers):
            continue

        print(response)

        tableContext.append(response)
        

Based on the image and the context provided, the metric "Money from greater china" for NIKE, Inc. in the year 2023 is $7,248 million.

The table shows financial data across different geographical regions for NIKE, Inc. for the "YEAR ENDED MAY 31, 2023". One of the columns is labeled "GREATER CHINA", which represents the Greater China region. Under the "TOTAL REVENUES" row for this column, the value is $7,248 million.

Since the question specifically asks for the "Money from greater china" metric for NIKE, Inc. in 2023, and this column represents the Greater China region's revenues for NIKE in that year, I can confidently state that $7,248 million is the money from Greater China for the company in 2023 based on the information provided in the image.
According to the table in the image, the row labeled "Greater China" under the "Asia Pacific & Latin America" column shows a revenue of $7,547 million for NIKE, Inc. in the year ended May 31, 2022. Therefore, the money from Greater China for

In [18]:
# Final step: merge the per-table answers and the keyword-matched text chunks
# into one context string and ask the model for the single metric value.
context = " ".join(tableContext + presentText)

final_prompt = f'''Give me numerical information about {input_metric} for the company {input_company} in the year {input_year} on the basis of the following context from their annual report:-
                    
                    Context: {context}
                    
                    Only give me the numeric information about what is asked and do not return anything extra. Make sure your answer is a value that is mentioned in the above context and also includes the unit of the value. In case their are two values that give us the required information, give me the one that also matches the unit I have mentioned.'''

messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": final_prompt}],
    }
]

operatingMetric = get_response_sonnet(messages)

print(operatingMetric)

$7,248 million
