In [3]:
import configparser

# Read the configuration file.
# ConfigParser.read() silently ignores files it cannot open and returns the
# list of files it actually parsed, so check that list and fail with a clear
# message instead of a confusing NoSectionError on the lookups below.
config = configparser.ConfigParser()
if not config.read('config.ini'):
    raise FileNotFoundError(
        "Could not read 'config.ini' from the current working directory"
    )

# Hugging Face Hub API token, taken from the [huggingface] section
huggingfacehub_api_token = config.get('huggingface', 'api_token')
# OpenAI API key, taken from the [openai] section
openai_api_key = config.get('openai', 'api_key')


In [6]:

# Load, split, vectorize and locally save a document using the langchain library (step by step)
# Uses the OpenAI API key (openai_api_key) read from config.ini above
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAI
import os



# Load the document; the loader returns a list of Document objects
loader = PyPDFLoader('database/MANIFESTO_OF_SURREALISM.pdf')
data = loader.load()
print(type(data))
print(data)

chunk_size = 200
chunk_overlap = 50

# Split the PDF into chunks using RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap)
docs = splitter.split_documents(data)
print(type(docs))
print(docs[:5])

# Define an OpenAI embeddings model and the target directory
embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)

persist_directory = 'persist_directory'

# Embed the chunks and store them in a Chroma vector DB backed by
# persist_directory. Constructing Chroma(...) directly only opens a collection
# and never inserts `docs`, so the saved database would contain no chunks;
# from_documents() embeds and adds them.
# NOTE(review): re-running this cell presumably appends the same chunks to the
# already-persisted collection again — confirm, or clear the directory first.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory=persist_directory,
)

# Explicit flush to disk, kept from the original cell.
# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call — confirm against the installed version.
vectordb.persist()



<class 'list'>
[Document(page_content='MANIFESTO OF SURREALISM BY ANDRÉ BRETON (1924)\u2029 \nMain Author is Andre  Breton. \nSo strong is the belief in life, in what is most fragile in life – real life, I mean – that in the end this belief\nis lost. Man, that inveterate dreamer, daily more discontent with his destiny, has trouble assessing the\nobjects he has been led to use, objects that his nonchalance has brought his way, or that he has earned\nthrough his own efforts, almost always through his own efforts, for he has agreed to work, at least he has\nnot refused to try his luck (or what he calls his luck!). At this point he feels extremely modest: he knows\nwhat women he has had, what silly affairs he has been involved in; he is unimpressed by his wealth or his\npoverty, in this respect he is still a newborn babe and, as for the approval of his conscience, I confess that\nhe does very nicely without it. If he still retains a certain lucidity, all he can do is turn back toward his\n

In [52]:
# Load, split and vectorize a document using the langchain library (all in one) and run a query on the document content

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA

# OpenAI is used below but is only imported by an earlier cell; import it here
# so this cell carries all of its own imports like the lines above.
from langchain_openai import OpenAI

# Load the PDF and split it into chunks, breaking only on '.'
loader = PyPDFLoader('database/MANIFESTO_OF_SURREALISM.pdf')
data = loader.load()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
    separators=['.'])
docs = splitter.split_documents(data)

# Embed the documents and store them in a Chroma DB
# NOTE(review): no persist_directory / collection_name is given, so this uses
# Chroma's default collection; re-running in the same kernel may accumulate
# duplicate chunks — confirm.
embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
docstorage = Chroma.from_documents(docs, embedding_model)

# Define the Retrieval QA chain to integrate the database and the LLM
llm = OpenAI(
    model_name="gpt-3.5-turbo-instruct",
    temperature=0,
    openai_api_key=openai_api_key,
)
qa = RetrievalQA.from_chain_type(
    llm,
    chain_type='stuff',
    retriever=docstorage.as_retriever(),
)

# Run the chain on the query provided.
# Chain.run() is deprecated; invoke() takes the input dict (key "query" for
# RetrievalQA) and returns a dict whose "result" entry is the answer text.
query = "how are qualified the people that dispute Breton's right to employ the term 'surrealism' in the sense he understand it?"
result = qa.invoke({"query": query})["result"]

# Print the retrieved answer
print("Answer:", result)

Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)


Answer:  They are being extremely dishonest.


In [54]:
# Same as above, but additionally display the source of the answer (i.e. whether it comes from the general model or from the newly loaded documents)
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain


# OpenAI is used below but is only imported by an earlier cell; import it here
# so this cell carries all of its own imports like the lines above.
from langchain_openai import OpenAI

# Load the PDF and split it into chunks, breaking only on '.'
loader = PyPDFLoader('database/MANIFESTO_OF_SURREALISM.pdf')
data = loader.load()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
    separators=['.'])
docs = splitter.split_documents(data)

# Embed the documents and store them in a Chroma DB
# NOTE(review): no persist_directory / collection_name is given, so this uses
# Chroma's default collection; re-running in the same kernel may accumulate
# chunks from earlier runs. The recorded output reports a source path
# ('../database/...') that differs from the loader path above, which may be a
# symptom of that — confirm.
embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
docstorage = Chroma.from_documents(docs, embedding_model)

# Define the Retrieval QA chain (with sources) to integrate the database and the LLM
llm = OpenAI(
    model_name="gpt-3.5-turbo-instruct",
    temperature=0,
    openai_api_key=openai_api_key,
)
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type='stuff',
    retriever=docstorage.as_retriever(),
)

# Run the chain on the query provided and get the answer together with its sources.
# Calling the chain object directly is deprecated; invoke() accepts the same
# input dict and the return_only_outputs flag.
results = qa.invoke(
    {"question": "how are qualified the people that dispute Breton's right to employ the term 'surrealism' in the sense he understand it?"},
    return_only_outputs=True,
)
print(results)

Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)


{'answer': " The people who dispute Breton's right to employ the term 'surrealism' in the sense he understands it are being extremely dishonest.\n", 'sources': '../database/MANIFESTO_OF_SURREALISM.pdf'}
