# Gain valuable corporate insights by questioning a 10-K report in 5 easy steps

Accompanying Code

In [32]:
import os

import pandas as pd
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import FARMReader, DensePassageRetriever, PreProcessor, PDFToTextConverter
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import convert_files_to_docs, export_answers_to_csv

![1](images/process/1.png)

In [33]:
# Every report in the folder could be loaded at once with:
# all_docs = convert_files_to_docs(dir_path="reports/")

# Here only the Walmart report is converted; point `file_path` at a different
# PDF to question another company's report.
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    valid_languages=["en"],
)
converted = converter.convert(file_path="reports/walmart-10k.pdf", meta=None)
# Only the first element of the converter's result is kept.
all_docs = [converted[0]]

pdftotext version 4.03 [www.xpdfreader.com]
Copyright 1996-2021 Glyph & Cog, LLC


![2](images/process/2.png)

In [34]:
# Cleaning and splitting options handed to the preprocessor
preprocessing_options = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "split_by": "word",
    "split_length": 100,
    "split_respect_sentence_boundary": True,
    "split_overlap": 3,
}
preprocessor = PreProcessor(**preprocessing_options)

# Run the preprocessing over the converted report(s)
preprocessed_docs = preprocessor.process(all_docs)

100%|██████████| 1/1 [00:00<00:00,  6.16docs/s]


In [35]:
# SQLite file that backs the FAISS document store between runs
# (the same file the appendix resets with `rm faiss_document_store.db`).
faiss_db_file = "faiss_document_store.db"


def build_document_store(docs):
    """Create a flat-index FAISS document store and write `docs` into it."""
    store = FAISSDocumentStore(faiss_index_factory_str="Flat")
    store.write_documents(docs)
    return store


try:
    document_store = build_document_store(preprocessed_docs)
except ValueError:
    # A ValueError here may come from the constructor itself, in which case no
    # usable `document_store` exists to call `delete_documents()` on. Reset by
    # removing the leftover database file, then rebuild and write the documents
    # again so the following cells never run against an empty store.
    if os.path.exists(faiss_db_file):
        os.remove(faiss_db_file)
    document_store = build_document_store(preprocessed_docs)

![3](images/process/3.png)

In [36]:
# We set up our retriever with the preferred parameters.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)
# We calculate the embeddings for all of our documents in the document store.
document_store.update_embeddings(retriever)

INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
INFO 

![4](images/process/4.png)

In [37]:
# After having initialized our retriever, we initialize our reader
# The reader is configured with use_gpu=False.
reader_model_name = "deepset/roberta-base-squad2"
reader = FARMReader(model_name_or_path=reader_model_name, use_gpu=False)

INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.modeling.infer -  Got ya 9 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0     0     0     0     0     0     0     0     0  
INFO - haystack.modeling.infer -  /w\   /w\   /w\   /w\   /w\   /w\   /w\   /|\  /w\ 
INFO - haystack.modeling.infer -  /'\   / \   /'\   

![5](images/process/5.png)

In [38]:
# Combine reader and retriever into one extractive question-answering pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [39]:
# Questions put to the report; each one is run through the pipeline separately.
questions = [
    "What are efforts regarding digital experiences?",
    "What are the strategic priorities?",
    "What is the company's growth?",
]

In [40]:
# Create the output folder up front: the CSV export fails if it is missing.
os.makedirs("answers", exist_ok=True)

# Run every question through the pipeline and export its answers to
# answers/result-<index>.csv, where <index> is the question's position in `questions`.
for i, question in enumerate(questions):
    prediction = pipe.run(
        query=question,
        params={"Retriever": {"top_k": 20}, "Reader": {"top_k": 5}},
    )
    export_answers_to_csv(output_file=f"answers/result-{i}.csv", agg_results=prediction)

In [41]:
# # let us take a look at the answers

# # change to see answers to a diff question
# no_question = 0
# sample_answers = pd.read_csv(str('answers/result-'+ str(no_question) +'.csv'), sep=',')
# sample_answers.columns = ['query', str('Answer to: ' + questions[no_question]), 'Rank', 'prediction_context']
# sample_answers[['Rank', str('Answer to: ' + questions[no_question])]].style.set_properties(**{'text-align': 'left'})

__Answers to: What are efforts regarding digital experiences?__

Rank |  Answer
---|---
1|  eCommerce efforts and innovation
2|	investments in eCommerce, technology, acquisitions, joint ventures, store remodels and other customer initiatives
3|	Same Day Pickup and Same Day Delivery
4|	social media, online advertising, and email
5|	security of our digital platforms and keep them operating within acceptable parameters

__Answers to: What are the strategic priorities?__

Rank |  Answer
---|---
1|	improving our customer-facing initiatives in stores and clubs and creating a seamless omni-channel experience for our customers
2|	Price transparency, assortment of products, customer experience, convenience, ease and the speed and cost of shipping
3|	to make every day easier for busy families, operate with discipline, sharpen our culture and become digital, and make trust a competitive advantage
4|	improving our customer-facing initiatives in stores and clubs and creating a seamless omni-channel experience for our customers
5| strategic capital allocation

__Answers to: What is the company's growth?__

Rank |	Answer
---|---
1|	ticket and transaction growth
2|	net sales
3|	fiscal 2019
4|	$2.8 billion or 2.3%
5|	23% of our consolidated net sales

## Appendix

In [42]:
## Dependencies to install on a Mac OS M1X chip.

## In order to run this script on my local machine, I had to download / install the following dependencies.

## Download & install the xpdf tools (they provide `pdftotext`, which is used to extract text from PDF documents)
# !wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-mac-4.03.tar.gz
# !tar -xvf xpdf-tools-mac-4.03.tar.gz && sudo cp xpdf-tools-mac-4.03/bin64/pdftotext /usr/local/bin

## Download & install ocr (optical character recognition to read in pdfs with Python)
# !pip install 'farm-haystack[ocr]' -q

## Download & install FAISS (i.e., Facebook AI Similarity Search, a library that allows developers to quickly search for embeddings of multimedia documents that are similar to each other.)
# !pip install 'farm-haystack[faiss]' -q

## Reset the FAISS document store by deleting its database file if it throws an error. FAISS is a library for efficient similarity search and clustering of dense vectors.
# !rm faiss_document_store.db

## Reduce the noisy warning output produced by the tokenizers when the models are loaded
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

## configuration to have nicer printing in pandas
# pd.set_option('display.max_colwidth', None)

## Installing pygraphviz for pipe.draw() on Mac M1
# brew install graphviz
# python -m pip install \
#     --global-option=build_ext \
#     --global-option="-I$(brew --prefix graphviz)/include/" \
#     --global-option="-L$(brew --prefix graphviz)/lib/" \
#     pygraphviz