# Chat With Anything - From PDF Files to Image Documents
Author: Zoumana KEITA   
https://medium.com/@zoumanakeita

In [1]:
import warnings
warnings.filterwarnings('ignore')

### Install the requirements

In [4]:
#%%bash 

pip -q install langchain faiss-cpu unstructured
pip -q install openai tiktoken
pip -q install pytesseract pypdf

SyntaxError: invalid syntax (3190082503.py, line 3)

# Chat & Query Your PDF and Image Files

## Detect Document Type

In [5]:
from filetype import guess

def detect_document_type(document_path):
    """Classify a document as "pdf", "image" or "unkown".

    The decision is based on the extension reported by ``filetype.guess``
    for ``document_path``.

    Parameters
    ----------
    document_path
        Value forwarded unchanged to ``guess``.

    Returns
    -------
    str
        "pdf" for PDF files, "image" for jpg/jpeg/png/gif files and
        "unkown" for everything else. The historical spelling of "unkown"
        is kept on purpose so existing callers comparing against it keep
        working.
    """
    image_types = ('jpg', 'jpeg', 'png', 'gif')

    guess_file = guess(document_path)

    # guess() gives back None when it cannot recognise the file; without
    # this guard the attribute access below would raise AttributeError and
    # the "unkown" result could never be produced for such files.
    if guess_file is None:
        return "unkown"

    extension = guess_file.extension.lower()

    if extension == "pdf":
        return "pdf"

    if extension in image_types:
        return "image"

    return "unkown"
    

In [6]:
# Sample inputs used throughout the notebook: one PDF and one image.
research_paper_path = "1.pdf"
article_information_path = "zoumana_article_information.png"

# Sanity-check that each sample is detected as the expected document type.
research_paper_type = detect_document_type(research_paper_path)
print(f"Research Paper Type: {research_paper_type}")

article_information_type = detect_document_type(article_information_path)
print(f"Article Information Document Type: {article_information_type}")

Research Paper Type: pdf
Article Information Document Type: image


## Extract Documents Content

In [7]:
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.document_loaders import UnstructuredFileLoader

"""
YOU CAN UNCOMMENT THE CODE BELOW TO UNDERSTAND THE LOGIC OF THE FUNCTIONS
"""
"""

def extract_text_from_pdf(pdf_file):
    
    loader = UnstructuredFileLoader(pdf_file)
    documents = loader.load()
    pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
    
    return pdf_pages_content

def extract_text_from_image(image_file):

    loader = UnstructuredImageLoader(image_file)
    documents = loader.load()
    
    image_content = '\n'.join(doc.page_content for doc in documents)
    
    return image_content


"""

def extract_file_content(file_path):
    """Return the text content of a PDF or image document.

    The document type is obtained from ``detect_document_type`` and decides
    which loader is used: ``UnstructuredFileLoader`` for "pdf" and
    ``UnstructuredImageLoader`` for "image".

    Parameters
    ----------
    file_path
        Path of the document to read.

    Returns
    -------
    str
        The ``page_content`` of every loaded document, joined with newlines.

    Raises
    ------
    ValueError
        If the detected type is neither "pdf" nor "image". Previously this
        case fell through and failed with an UnboundLocalError on ``loader``.
    """
    file_type = detect_document_type(file_path)

    if file_type == "pdf":
        loader = UnstructuredFileLoader(file_path)
    elif file_type == "image":
        loader = UnstructuredImageLoader(file_path)
    else:
        raise ValueError(
            f"Unsupported document type {file_type!r} for file {file_path!r}; "
            "expected a PDF or an image (jpg, jpeg, png, gif)."
        )

    documents = loader.load()
    return '\n'.join(doc.page_content for doc in documents)

In [13]:
# Extract the raw text of both sample documents. Both variables are needed
# by the preview and chunking cells further down, so neither call may be
# left commented out.
# The image goes through OCR, which needs the Tesseract binary installed
# and available on PATH.
research_paper_content = extract_file_content(research_paper_path)
article_information_content = extract_file_content(article_information_path)

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [41]:
# Number of leading characters shown for each document.
nb_characters = 400

# Take the previews first so the print statements stay short.
paper_preview = research_paper_content[:nb_characters]
article_preview = article_information_content[:nb_characters]

print(f"First {nb_characters} Characters of the Paper: \n{paper_preview}...")
print("---"*5)
print(f"First {nb_characters} Characters of Article Information Document :\n {article_preview}...")


First 400 Characters of the Paper: 
Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.

Attention Is All You Need

3 2 0 2

Ashish Vaswani∗ Google Brain avaswani@google.com

Noam Shazeer∗ Google Brain noam@google.com

Niki Parmar∗ Google Research nikip@google.com

Jakob Uszkoreit∗ Google Research usz@google.com
...
---------------
First 400 Characters of Article Information Document :
 This document provides a quick summary of some of Zoumana’s article on Medium. It can be considered as the compilation of his 80+ articles about Data Science, Machine Learning and

Machine Learning Operations.

Whether you are just getting started or you're an experienced professional looking to upskill, these

materials can be helpful.

Data Science section covers basic to advanced concepts such ...


## Chat Implementation

### Create Chunks

In [42]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter shared by every document in this notebook: text is cut on blank
# lines, with a chunk_size of 1000 and a chunk_overlap of 200, both measured
# with len().
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

In [43]:
# Break each document's text into chunks with the shared splitter.
research_paper_chunks = text_splitter.split_text(research_paper_content)
article_information_chunks = text_splitter.split_text(article_information_content)

# Report how many chunks each document produced.
nb_paper_chunks = len(research_paper_chunks)
nb_article_chunks = len(article_information_chunks)
print(f"# Chunks in Research Paper: {nb_paper_chunks}")
print(f"# Chunks in Article Document: {nb_article_chunks}")

Created a chunk of size 1140, which is longer than the specified 1000


# Chunks in Research Paper: 51
# Chunks in Article Document: 2


### Create Embeddings

In [51]:
import os
from getpass import getpass

from langchain.embeddings.openai import OpenAIEmbeddings

# Never write the key into the notebook. Reuse OPENAI_API_KEY when it is
# already set in the environment; otherwise ask for it without echoing it,
# so it does not end up in the saved cell source or output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embedding model shared by every vector index built below.
embeddings = OpenAIEmbeddings()

### Create Vector Index

In [52]:
from langchain.vectorstores import FAISS

def get_doc_search(text_splitter):
    """Build a FAISS index from text chunks using the shared embeddings.

    Parameters
    ----------
    text_splitter
        Despite its name, this is the collection of texts handed to
        ``FAISS.from_texts`` (callers in this notebook pass the list of
        chunks returned by ``split_text``), not a splitter object.

    Returns
    -------
    The object returned by ``FAISS.from_texts``.
    """
    vector_index = FAISS.from_texts(text_splitter, embeddings)
    return vector_index

In [54]:
# Build the vector index for the research paper chunks and print the
# resulting object to confirm it was created.
doc_search_paper = get_doc_search(research_paper_chunks)
print(doc_search_paper)

<langchain.vectorstores.faiss.FAISS object at 0x141578340>


### Start chatting with your document

In [68]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
# Question-answering chain of type "map_rerank"; intermediate steps are
# requested because chat_with_file reads the answer and score from them.
chain = load_qa_chain(
    OpenAI(),
    chain_type="map_rerank",
    return_intermediate_steps=True,
)

def chat_with_file(file_path, query):
    """Answer ``query`` from the content of the document at ``file_path``.

    The document text is extracted, split into chunks, indexed, and the
    chunks returned by the similarity search are passed to the module-level
    question-answering ``chain``.

    Parameters
    ----------
    file_path
        Path of the PDF or image document to query.
    query : str
        Question to ask about the document.

    Returns
    -------
    dict
        The intermediate step with the highest score; callers read its
        "answer" and "score" entries. When several steps share the highest
        score, the earliest one is returned.

    Raises
    ------
    ValueError
        If the chain returns no intermediate steps.
    """

    def _score(step):
        # The notebook output shows scores such as "100"; they may arrive as
        # strings, so convert before comparing. Anything that cannot be read
        # as an integer is ranked last instead of aborting the whole call.
        try:
            return int(step["score"])
        except (KeyError, TypeError, ValueError):
            return -1

    file_content = extract_file_content(file_path)
    file_chunks = text_splitter.split_text(file_content)

    document_search = get_doc_search(file_chunks)
    documents = document_search.similarity_search(query)

    results = chain(
        {
            "input_documents": documents,
            "question": query,
        },
        return_only_outputs=True,
    )

    steps = results['intermediate_steps']
    if not steps:
        raise ValueError("The question-answering chain returned no intermediate steps.")

    # The steps are not ordered by score, so taking the first element would
    # return the answer for the first retrieved chunk rather than the best
    # one. Pick the highest-scoring step explicitly.
    return max(steps, key=_score)

##### Chat with the image file

In [70]:
# Ask a general question about the image document.
query = "What is the document about"

results = chat_with_file(article_information_path, query)
answer, confidence_score = results["answer"], results["score"]

print(f"Answer: {answer}\n\nConfidence Score: {confidence_score}")



Answer:  The document is a quick summary of Zoumana’s article on Medium related to Data Science, Machine Learning and Machine Learning Operations. It covers topics such as statistics, model evaluation metrics, SQL queries, NoSQL courses, data visualization, MLOps, and Natural Language Processing. 

Confidence Score: 100


##### Chat with the PDF file

In [71]:
# Ask a content-specific question about the research paper.
query = "Why is the self-attention approach used in this document?"

results = chat_with_file(research_paper_path, query)
answer, confidence_score = results["answer"], results["score"]

print(f"Answer: {answer}\n\nConfidence Score: {confidence_score}")

Created a chunk of size 1140, which is longer than the specified 1000


Answer:  Self-attention is used in this document to compute representations of its input and output without using sequence-aligned RNNs or convolution, making the Transformer the first transduction model relying entirely on self-attention.

Confidence Score: 100


# Congratulations!  

Made with ❤️ by Zoumana KEITA