# Steps

* Embedding: Cohere
* Vector Database: Pinecone
* Generation: Openai

In [1]:
!pip install openai
!pip install cohere
!pip install pinecone-client

Collecting openai
  Obtaining dependency information for openai from https://files.pythonhosted.org/packages/46/85/8681046cd9cc13a36ac76e4a1b047338c90dbeab2e9b14fb36de7f314c93/openai-1.10.0-py3-none-any.whl.metadata
  Downloading openai-1.10.0-py3-none-any.whl.metadata (18 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Obtaining dependency information for distro<2,>=1.7.0 from https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl.metadata
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Obtaining dependency information for httpx<1,>=0.23.0 from https://files.pythonhosted.org/packages/39/9b/4937d841aee9c2c8102d9a4eeb800c7dad25386caabb4a1bf5010df81a57/httpx-0.26.0-py3-none-any.whl.metadata
  Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)
Collecting typing-extensions<5,>=4.7 (from openai)
  Obtaining dependency information for 

In [2]:
!pip install nltk
!pip install fitz
!pip install PyMuPDF
!pip install uuid

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl (20 kB)
Collecting configobj (from fitz)
  Downloading configobj-5.0.8-py2.py3-none-any.whl (36 kB)
Collecting configparser (from fitz)
  Obtaining dependency information for configparser from https://files.pythonhosted.org/packages/81/a3/0e5ed11da4b7770c15f6f319abf053f46b5a06c7d4273c48469b7899bd89/configparser-6.0.0-py3-none-any.whl.metadata
  Downloading configparser-6.0.0-py3-none-any.whl.metadata (11 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.8.6-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyxnat (from fitz)
  Obtaining dependency information for pyxnat from https://files.pythonhosted.org/packages/97/f6/dbe92707b35f3fec297228e81cbf35e0b1028e20c98a0cea1650e967f88a/pyxnat-1.6.2-py3-none-any.whl.metadata
  Downloading pyxnat-1.6.2-py3-none-any.whl.metadata (5.3 kB)
Collecting prov>=

In [3]:
from openai import OpenAI
import cohere
import os

# open_api = ""
# client = OpenAI(api_key=open_api)
# co = cohere.Client(api_key="")
# COHERE_EMBEDDING_MODEL = "embed-english-v3.0"

def fetch_embeddings(text, embedding_type):
    result = co.embed(texts=text, model=COHERE_EMBEDDING_MODEL, input_type=embedding_type).embeddings
    return result

In [4]:
def qna_prompt(question, context_lst):
    context_str = "\n".join(context_lst)
    return f"""
    Context information is below.
    ---------------------
    {context_str}
    ---------------------
    Given the context information and not prior knowledge, answer the query.
    Query: {question}
    Answer: 
    """

def synthesize_answer (question, context_lst):
    response = client.chat.completions.create(
        model = "gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": qna_prompt(question, context_lst)}
        ],
        temperature=0
    )
    
    answer = response.choices[0].message.content
    print(f"Price: {response.usage.total_tokens * 0.003 / 1000} $")
    return answer

In [5]:
import pinecone
from pinecone.core.client.configuration import Configuration as OpenApiConfiguration
from fastapi import HTTPException
import fitz

# TOP_K_DOCUMENTS = 3
# INDEX_NAME = 'document-indexer'

# openapi_config = OpenApiConfiguration.get_default_copy()

# pinecone.init(
#     api_key='', 
#     environment='gcp-starter',
#     openapi_config=openapi_config)

# if INDEX_NAME not in pinecone.list_indexes():
#     pinecone.create_index(INDEX_NAME, dimension=1024
#                           , metadata_config={"indexed": ["document_id"]})

# index = pinecone.Index(INDEX_NAME)

def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def add_document_to_db(document_id, paragraphs, embeddings):
    embeddings = [
        (f"{document_id}_{i}",
        embedding,
        {"document_id": document_id, "sentence_id":i, "text":paragraph})
        for i, (paragraph, embedding) in enumerate(zip(paragraphs, embeddings))
    ]

def fetch_top_paragraphs(document_id, embedding):
    query_response = index.query(top_k=TOP_K_DOCUMENTS, vector=embedding,
                                filter={
                                    "document_id": {"$eq": document_id},
                                },
                                include_metadata=True
                                )
    answers = [q['metadata']['text'] for q in query_response['matches']]
    return answers

In [6]:
from nltk.tokenize import sent_tokenize
import uuid
def split_document_to_paragraphs(document, paragraph_len):
    sentences = sent_tokenize(document)

    paragraphs = []
    paragraph = ''
    for sentence in sentences:
        paragraph += ' ' + sentence
        if len(paragraph) >= paragraph_len:
            paragraphs.append(paragraph)
            paragraph = ''

    if len(paragraph) > 0:
        paragraphs.append(paragraph)
    
    return paragraphs

def read_with_fitz(filepath):
    with fitz.open(filepath) as doc:
        text = '\n'.join([page.get_text() for page in doc])
        return text


def add_document(filepath):
    document_text = read_with_fitz(filepath)
    paragraphs = split_document_to_paragraphs(document_text, 1000)
    embeddings = fetch_embeddings(paragraphs, embedding_type="search_document")
    document_id = str(uuid.uuid4())
    add_document_to_db(document_id, paragraphs, embeddings)
    print(document_id)
    return document_id

def get_answer(question, document_id):
    embedding = fetch_embeddings([question], embedding_type='search_query')[0]
    relevant_paragraphs = fetch_top_paragraphs(document_id, embedding)
    return synthesize_answer(question, relevant_paragraphs)

In [7]:
# document_id = add_document("/kaggle/input/rag-pdf-file/actsc231.pdf")

In [8]:
# answer = get_answer("What is the differnce between yield and interest rate? And what is the notation for each of them?", document_id)
# answer