In [1]:
import chromadb

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import numpy as np
from pypdf import PdfReader
from tqdm import tqdm

In [2]:
## helper

def _read_pdf(filename):
    reader = PdfReader(filename)
    
    pdf_texts = [p.extract_text().strip() for p in reader.pages]

    # Filter the empty strings
    pdf_texts = [text for text in pdf_texts if text]
    return pdf_texts


def _chunk_texts(texts):
    character_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=0
    )
    character_split_texts = character_splitter.split_text('\n\n'.join(texts))

    token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

    token_split_texts = []
    for text in character_split_texts:
        token_split_texts += token_splitter.split_text(text)

    return token_split_texts


def load_chroma(filename, collection_name, embedding_function):
    texts = _read_pdf(filename)
    chunks = _chunk_texts(texts)

    chroma_cliet = chromadb.Client()
    chroma_collection = chroma_cliet.create_collection(name=collection_name, embedding_function=embedding_function)

    ids = [str(i) for i in range(len(chunks))]

    chroma_collection.add(ids=ids, documents=chunks)

    return chroma_collection

def word_wrap(string, n_chars=72):
    # Wrap a string at the next space after n_chars
    if len(string) < n_chars:
        return string
    else:
        return string[:n_chars].rsplit(' ', 1)[0] + '\n' + word_wrap(string[len(string[:n_chars].rsplit(' ', 1)[0])+1:], n_chars)

   
def project_embeddings(embeddings, umap_transform):
    umap_embeddings = np.empty((len(embeddings),2))
    for i, embedding in enumerate(tqdm(embeddings)): 
        umap_embeddings[i] = umap_transform.transform([embedding])
    return umap_embeddings

In [6]:

from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import numpy as np

In [7]:
embedding_function = SentenceTransformerEmbeddingFunction()

chroma_collection = load_chroma(filename='microsoft_annual_report_2022.pdf', collection_name='microsoft_annual_report_2022', embedding_function=embedding_function)
chroma_collection.count()

349

In [16]:
query = "What has been the investment in research and development?"
results =  chroma_collection.query(query_texts=query,n_results=10,include=['documents', 'embeddings'])
retrieved_results = results['documents'][0]

In [17]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [18]:
pairs = [[query,ans] for ans in retrieved_results]
scores = cross_encoder.predict(pairs)

In [19]:
scores

array([  0.9869355 ,   2.6445756 ,  -0.26802987, -10.731592  ,
        -7.7066054 ,  -5.6469975 ,  -4.2970343 , -10.933231  ,
        -7.0384264 ,  -7.3246937 ], dtype=float32)

In [21]:
for i in np.argsort(scores)[::-1]:
    print(i+1)

2
1
3
7
6
9
10
5
4
8


In [88]:
import os
import openai
from openai import OpenAI
%set_env OPENAI_API_KEY=
openai.api_key = os.environ['OPENAI_API_KEY']

openai_client = OpenAI()

env: OPENAI_API_KEY=


In [28]:
query = "What were the most important factors that contributed to increases in revenue?"
messages = [
    {
            "role": "system",
            "content" : "You are a helpful expert financial research assistant. Your users are asking questions about an annual report. "
            "Suggest up to five additional related questions to help them find the information they need, for the provided question. "
            "Suggest only short questions without compound sentences. Suggest a variety of questions that cover different aspects of the topic."
            "Make sure they are complete questions, and that they are related to the original question."
            "Output one question per line. Do not number the questions."
    },
    {
        "role":"user",
        "content":query
    }
    
]

In [32]:
response =  openai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages
)

In [33]:
content = response.choices[0].message.content


In [35]:
augement_queries = content

In [40]:
qrs = augement_queries.split("\n")

In [42]:
queries = [query] + qrs

In [44]:
queries

['What were the most important factors that contributed to increases in revenue?',
 '- What were the main sources of revenue for the company?',
 "- How did the company's revenue compare to the previous year?",
 '- Were there any significant changes in pricing strategies that impacted revenue?',
 '- What market trends or economic conditions influenced revenue growth?',
 '- Did the company expand into new markets or launch new products/services that drove revenue growth?']

In [60]:
results =  chroma_collection.query(query_texts=queries, n_results=10, include=['documents', 'embeddings'])
retrieved_documents = results['documents']

In [61]:
# retrieved_documents

In [62]:
# Deduplicate the retrieved documents
unique_documents = set()
for documents in retrieved_documents:
    for document in documents:
        unique_documents.add(document)

unique_documents = list(unique_documents)

In [64]:
# unique_documents

In [66]:
paris = [ [query,i] for i in unique_documents]

In [67]:
scores = cross_encoder.predict(pairs)

In [69]:
print("New Ordering:")
final_docs = []
l = []
for o in np.argsort(scores)[::-1]:
    l.append(o)

New Ordering:


In [72]:
top5_indx = l[:5]
top_5_docs = []
for i in top5_indx:
    top_5_docs.append(unique_documents[i])

In [74]:
# top_5_docs

In [75]:
template = """
Answer the question based on the following context:
{context}

Question:{question}
"""

In [78]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser

prompt = ChatPromptTemplate.from_template(template)
# prompt
model = ChatOpenAI(openai_api_key=os.environ['OPENAI_API_KEY'])

In [81]:
final_text = "\n".join(top_5_docs)


In [83]:
type(final_text)

str

In [85]:
chain  = (
    {"context":lambda x: final_text,"question":RunnablePassthrough()}
    |
    prompt
    |
    model
    |
    StrOutputParser()
)

In [86]:
chain.invoke("what is microsoft revenue?")

"Microsoft's revenue for the year ended June 30, 2022, was $198,270 million."

In [87]:
chain.invoke("reasons y revenue increased?")

'The revenue increased due to growth in various product and service offerings such as server products and cloud services, office products and cloud services, gaming, LinkedIn, search and news advertising, enterprise services, and devices. Additionally, there was significant growth in dynamics products and cloud services, particularly driven by dynamics 365. The increase in revenue was also attributed to investments in cloud services, including Azure, and improvements in office 365 commercial and LinkedIn, leading to higher gross margins and operating income.'