## Setup

In [1]:
# !pip install langchain
# !pip install gpt4all
# !pip install qdrant-client
# !pip install sentence-transformers

## Set huggingface cache home (optional)

In [2]:
import os
# Keep Hugging Face downloads in a folder relative to the working directory by
# exporting HF_HOME for this process.
cache_dir = "huggingface/"
os.environ.update({"HF_HOME": cache_dir})

## Check Device (assuming no GPU is available)

I have a GPU available, so this prints "cuda" for me; if you don't have one, it will print "cpu".

In [3]:
import torch
# Pick "cuda" when PyTorch can see a CUDA device, otherwise fall back to "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

## Enable logging to see logs while running (optional)

In [4]:
# import logging

# # Configure logging
# logging.basicConfig(level=logging.DEBUG,
#                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# # Create a logger
# logger = logging.getLogger(__name__)

## Load data, preprocess it, and create the Qdrant index

In [5]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceEmbeddings
import re

def preprocess_text(text):
    """Normalise raw page text before it is chunked and embedded.

    Steps, in order:
      1. lower-case the text;
      2. drop every character that is not a word character, whitespace, or
         one of ``$ % . , " ' ! ? ( )``;
      3. collapse each run of tab characters into a single space.

    Args:
        text: Raw text to clean (here, the ``page_content`` of a loaded page).

    Returns:
        The cleaned, lower-cased string.
    """
    text_lower = text.lower()
    text_no_punctuation = re.sub(r'[^\w\s\$\%\.\,\"\'\!\?\(\)]', '', text_lower)
    # Replace tab runs with one space instead of deleting them: deleting would
    # glue the neighbouring tokens together ("revenue\t2023" -> "revenue2023").
    text_normalized_tabs = re.sub(r'\t+', ' ', text_no_punctuation)
    return text_normalized_tabs

# Alternative sources that can be swapped in for the PDF loader:
#   loader = TextLoader(sample_texts)
#   loader = WebBaseLoader("https://cleartax.in/s/top-performing-nps-schemes")
loader = PyPDFLoader("data/msft_annual_2023_report.pdf")
documents = loader.load()

# Clean every loaded document in place before splitting.
for page in documents:
    page.page_content = preprocess_text(page.page_content)

# Split on newlines into chunks of up to 1000 characters, with no overlap.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)

embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": "cpu"},  # forced to CPU regardless of the detected device
)

# Local mode with in-memory storage only; the collection is recreated each run.
qdrant = Qdrant.from_documents(
    docs,
    embeddings,
    location=":memory:",
    collection_name="msft_data",
    force_recreate=True,
)

## Run this if GPT4All cannot download the model directly

In [6]:
# # download model

# !mkdir models
# !wget https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf -O models/mistral-7b-instruct-v0.1.Q4_0.gguf

## Directly calling GPT4All module 

In [7]:
from gpt4all import GPT4All
# Instantiate the model through the gpt4all bindings directly.
# allow_download=True lets gpt4all fetch the model file.
model = GPT4All(
    model_name="mistral-7b-instruct-v0.1.Q4_0.gguf",
    n_threads=4,
    allow_download=True,
)

In [8]:
# Quick smoke test of the raw model: generate up to 100 tokens for a fixed prompt.
prompt = "Jason Bourne movies list:"
completion = model.generate(prompt, max_tokens=100)
print(completion)


1. The Bourne Identity (2004)
2. The Bourne Supremacy (2006)
3. The Bourne Ultimatum (2009)
4. The Bourne Legacy (2012)
5. Jason Bourne (2016)


## Creating prompts, tuning parameters and creating Chain using Langchain

In [9]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import PromptTemplate, LLMChain
from langchain_community.llms import GPT4All

# Prompt for the Mistral instruct model, with {context} and {question} slots.
# The instruction is closed with "[/INST]" (forward slash). The previous
# "[\INST]" was an invalid "\I" string escape and not the tag the model's
# instruct format uses.
template = '''[INST]: You are a financial expert analyst bot, below presents a context from which the a question will be asked, give your valuable insights as well.[/INST]\n
Context: {context}.\n
Question: {question}\n
Answer: '''
rag_prompt = PromptTemplate(template=template, input_variables=["context","question"])

# NOTE: this handler list is defined but not passed to the LLM or the chain
# below, so it has no effect on them as written.
callbacks = [StreamingStdOutCallbackHandler()]

# LangChain wrapper around the same model file, with the generation settings
# used by the chain (fixed seed, 300-token cap, 4 threads).
llm = GPT4All(
            model="mistral-7b-instruct-v0.1.Q4_0.gguf",
            max_tokens=300,
            n_threads = 4, 
            temp=0.3,
            top_p=0.2,
            top_k=40,
            n_batch=8,
            seed=100,
            allow_download=True,
            verbose=True)

# verbose=True makes the chain print the fully formatted prompt on each call.
llm_chain = LLMChain(prompt=rag_prompt, llm=llm, verbose=True)

## Define format_docs for formatting context candidates.

In [11]:
def format_docs(query, k=1, store=None):
    """Retrieve the best-matching chunks for ``query`` and join them into one context string.

    Args:
        query: Text to search the vector store with.
        k: Number of results to request from the store. Defaults to 1, the
            value that was previously hard-coded.
        store: Object exposing ``similarity_search_with_score(query, k=...)``
            that returns ``(document, score)`` pairs. Defaults to the
            notebook-level ``qdrant`` index.

    Returns:
        The ``page_content`` of each returned document, separated by a blank
        line. The scores are not used.
    """
    if store is None:
        store = qdrant
    found_docs = store.similarity_search_with_score(query, k=k)
    return "\n\n".join(doc.page_content for doc, _score in found_docs)

## Run Queries

In [12]:
%%time
query = "who were the top performers in 2023 for microsoft?"
# Build the chain inputs first: the question plus the context retrieved for it.
chain_inputs = {"question": query, "context": format_docs(query)}
resp = llm_chain.invoke(input=chain_inputs)
print(resp["text"])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]: You are a financial expert analyst bot, below presents a context from which the a question will be asked, give your valuable insights as well.[\INST]

Context: and selling our other products and service s and income taxes.  
highlights from fiscal year 2023 compared with fiscal year 2022 included  
 microsoft cloud revenue increased 22% to $111.6  billion.  
 office commercial products and cloud services revenue increased 10% driven by office 365 commercial 
growth of 13%.  
 office consumer products and cloud services revenue increased 2% and microsoft 365 consumer subscribers 
increased to 67.0  million.  
 linkedin revenue increased 10%.  
 dynamics products and cloud services revenue increased 16% driven by dynamics 365 growth of 24%.  
 server products and cloud services revenue increased 19% driven by azure and other cloud services growth 
of 29%.  
 windows original equipment manufacturer li

In [13]:
%%time
query = "who were the worst performers in 2023 for microsoft?"
# Build the chain inputs first: the question plus the context retrieved for it.
chain_inputs = {"question": query, "context": format_docs(query)}
resp = llm_chain.invoke(input=chain_inputs)
print(resp["text"])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]: You are a financial expert analyst bot, below presents a context from which the a question will be asked, give your valuable insights as well.[\INST]

Context: and selling our other products and service s and income taxes.  
highlights from fiscal year 2023 compared with fiscal year 2022 included  
 microsoft cloud revenue increased 22% to $111.6  billion.  
 office commercial products and cloud services revenue increased 10% driven by office 365 commercial 
growth of 13%.  
 office consumer products and cloud services revenue increased 2% and microsoft 365 consumer subscribers 
increased to 67.0  million.  
 linkedin revenue increased 10%.  
 dynamics products and cloud services revenue increased 16% driven by dynamics 365 growth of 24%.  
 server products and cloud services revenue increased 19% driven by azure and other cloud services growth 
of 29%.  
 windows original equipment manufacturer li

In [14]:
query = "what was the revenue in united states versus in other countries in 2023?"
# Build the chain inputs first: the question plus the context retrieved for it.
chain_inputs = {"question": query, "context": format_docs(query)}
resp = llm_chain.invoke(input=chain_inputs)
print(resp["text"])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]: You are a financial expert analyst bot, below presents a context from which the a question will be asked, give your valuable insights as well.[\INST]

Context: year ended june  30, 2023  2022  2021  
        
united states (a) $ 106,744   $ 100,218   $ 83,953   
other countries   105,171    98,052    84,135         
total  $  211,915   $  198,270   $  168,088           
(a) includes billings to oems and certain multinational organizations because of the nature of these businesses and the 
impracticability of determining the geographic source of the revenue.  
revenue, classified by significant product and service offerings, was as follows  
  
(in millions)          
        
year ended june  30, 2023  2022  2021  
        
server products and cloud services  $ 79,970   $ 67,350   $ 52,589   
office products and cloud services   48,728    44,862    39,872   
windows   21,507    24,732    22,488   


## Other query ideas

query = "what were the expenses for research and development in 2023?"

query = "what is microsoft 365 consumer count in 2023?"

query = "what was the revenue generated by windows?"

query = "what was the main focus for 2023?"

query = "what's the overall stock information?"

query = "what sector and companies did microsoft invest in 2023?"

query = "how many people were laid off by microsoft and what were they offered?"