# Install LangChain and Ragas


In [1]:
!pip install langchain openai weaviate-client ragas

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.23.2-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting weaviate-client
  Downloading weaviate_client-4.5.5-py3-none-any.whl (306 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.8/306.8 kB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ragas
  Downloading ragas-0.1.7-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json<0.7,>=0.5.7
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting langsmith<0.2.0,>=0.1.17
  Downloading langsmith-0.1.49-py3-none-any.whl

# 1. Load Data and Split
Run this cell again whenever the kernel is restarted.

In [1]:
import requests
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Download the sample corpus. Fail fast on network/HTTP errors so that an
# error page is never saved and later indexed as if it were the speech text.
url = "https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt"
res = requests.get(url, timeout=30)
res.raise_for_status()
with open("state_of_the_union.txt", "w", encoding="utf-8") as f:
    f.write(res.text)

# Load the data (read with the same encoding used for the write above, so the
# result does not depend on the OS locale)
loader = TextLoader('./state_of_the_union.txt', encoding="utf-8")
documents = loader.load()

# Chunk the data (chunk_size=500 with chunk_overlap=50 between neighbours)
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# 2. Embedding

### Install Ollama Embedding

In [2]:
# Download the mxbai-embed-large model through the Ollama CLI.
# NOTE(review): this needs the Ollama CLI/server, which this notebook only
# installs and starts in later cells — run those first on a fresh machine.
!ollama pull mxbai-embed-large


[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest 
pulling 819c2adf5ce6... 100% ▕████████████████▏ 669 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling b837481ff855... 100% ▕████████████████▏   16 B                         
pulling 38badd946f91... 100% ▕████████████████▏  408 B                         
verifying sha256 digest ⠋ [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 819c2adf5ce6... 100% ▕████████████████▏ 669 MB               

### Embedding

In [3]:
from langchain_community.embeddings import OllamaEmbeddings
# Embed with the dedicated embedding model pulled in the cell above
# (mxbai-embed-large) instead of the gemma:2b chat model.
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)

# Vector Store

### Install ChromaDB

In [4]:
!pip install chromadb

[0m

In [9]:
!pip install -U pysqlite3-binary

[0m

### ChromaDB

In [5]:
# Add to ChromaDB vector store
from langchain_community.vectorstores import Chroma

# Embed every chunk with `embeddings` and index the result in the
# "rag-chroma" collection, then expose the store through a retriever.
vectorstore = Chroma.from_documents(
    chunks,
    embeddings,
    collection_name="rag-chroma",
)
retriever = vectorstore.as_retriever()


In [11]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions



# Vector database: an embedded Weaviate instance
client = weaviate.Client(embedded_options=EmbeddedOptions())

# Index the chunks using vectors produced by `embeddings`.
# The recorded run of this cell failed with "Connection refused" on
# localhost:11434, i.e. the Ollama server behind `embeddings` must already be
# running when this executes.
# NOTE: this rebinds `vectorstore`/`retriever` created by the Chroma cell.
vectorstore = Weaviate.from_documents(
    chunks,
    embeddings,
    client=client,
    by_text=False,
)

# Retriever view of the store, used for semantic search
retriever = vectorstore.as_retriever()

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            
{"level":"info","msg":"Created shard langchain_1ccf7cff11d64bd680c7330dc084c618_Ph1uW8a86rJt in 2.035784ms","time":"2024-04-20T05:21:25Z"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-04-20T05:21:25Z","took":143817}


embedded weaviate is already listening on port 8079


ValueError: Error raised by inference endpoint: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/embeddings (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f97c100cc10>: Failed to establish a new connection: [Errno 111] Connection refused'))

# Ollama

## Install Ollama On a Unix Machine

In [7]:
!curl https://ollama.ai/install.sh | sh


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0>>> Downloading ollama...
100 10044    0 10044    0     0  24378      0 --:--:-- --:--:-- --:--:-- 24378
######################################################################## 100.0%#=#=#                                                                         ################        91.8%
>>> Installing ollama to /usr/local/bin...
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


## Server for Ollama

In [8]:
# Install the neofetch utility with apt (needs sudo; -y answers the
# confirmation prompt automatically). Only used by the next cell.
!sudo apt install -y neofetch

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  chafa libchafa0
The following NEW packages will be installed:
  chafa libchafa0 neofetch
0 upgraded, 3 newly installed, 0 to remove and 3 not upgraded.
Need to get 149 kB of archives.
After this operation, 633 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 libchafa0 amd64 1.2.1-1 [42.1 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 chafa amd64 1.2.1-1 [29.7 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 neofetch all 7.0.0-1 [77.5 kB]
Fetched 149 kB in 0s (1468 kB/s)  [0m[33m

7[0;23r8[1ASelecting previously unselected package libchafa0:amd64.
(Reading database ... 69943 files and directories currently installed.)
Preparing to unpack .../libchafa0_1.2.1-1_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..............................

In [9]:
# Run neofetch to display the host's distro logo (and, presumably, system
# details) for the machine this notebook runs on.
!neofetch


[?25l[?7l[0m[31m[1m            .-/+oossssoo+/-.
        `:+ssssssssssssssssss+:`
      -+ssssssssssssssssssyyssss+-
    .ossssssssssssssssss[37m[0m[1mdMMMNy[0m[31m[1msssso.
   /sssssssssss[37m[0m[1mhdmmNNmmyNMMMMh[0m[31m[1mssssss/
  +sssssssss[37m[0m[1mhm[0m[31m[1myd[37m[0m[1mMMMMMMMNddddy[0m[31m[1mssssssss+
 /ssssssss[37m[0m[1mhNMMM[0m[31m[1myh[37m[0m[1mhyyyyhmNMMMNh[0m[31m[1mssssssss/
.ssssssss[37m[0m[1mdMMMNh[0m[31m[1mssssssssss[37m[0m[1mhNMMMd[0m[31m[1mssssssss.
+ssss[37m[0m[1mhhhyNMMNy[0m[31m[1mssssssssssss[37m[0m[1myNMMMy[0m[31m[1msssssss+
oss[37m[0m[1myNMMMNyMMh[0m[31m[1mssssssssssssss[37m[0m[1mhmmmh[0m[31m[1mssssssso
oss[37m[0m[1myNMMMNyMMh[0m[31m[1msssssssssssssshmmmh[0m[31m[1mssssssso
+ssss[37m[0m[1mhhhyNMMNy[0m[31m[1mssssssssssss[37m[0m[1myNMMMy[0m[31m[1msssssss+
.ssssssss[37m[0m[1mdMMMNh[0m[31m[1mssssssssss[37m[0m[1mhNMMMd[0m[31m[1mssssssss.
 /ssssssss[37m[0m[1mh

## Start the server

In [1]:
import subprocess
import time

# Start ollama as a background process.
# The command is an argument list (no shell, no trailing "&"), so
# process.pid is the PID of the launched command itself rather than of a
# wrapper shell that exits immediately.
command = ["nohup", "ollama", "serve"]

# Popen returns at once and leaves the server running. Output is discarded:
# piping stdout/stderr without ever reading them can block the server once
# the pipe buffer fills up.
process = subprocess.Popen(command,
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)
print("Process ID:", process.pid)

# NOTE: a `!OLLAMA_HOST=...` line does not configure anything — `!` runs in a
# throwaway subshell, so the variable is gone as soon as the line finishes.
# To point the `ollama` CLI at a remote host (e.g. a fly.io deployment), use
# `%env OLLAMA_HOST=<url>` or os.environ instead.
time.sleep(5)  # Give the server a few seconds to start listening

Process ID: 1912


## To install Ollama on Mac or Windows, please follow the instructions in my article here: https://www.linkedin.com/pulse/ollama-langchain-local-gemma-applications-rany-elhousieny-phd%25E1%25B4%25AC%25E1%25B4%25AE%25E1%25B4%25B0-mlomc/

In [26]:
# List the models currently available to the Ollama CLI (name, id, size, age).
!ollama list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


NAME                    	ID          	SIZE  	MODIFIED     
gemma:2b                	b50d6c999e59	1.7 GB	7 days ago  	
llama3:latest           	a6990ed6be41	4.7 GB	40 hours ago	
mxbai-embed-large:latest	468836162de7	669 MB	4 days ago  	
zephyr:latest           	bbe38b81adec	4.1 GB	4 days ago  	


### Pull gemma:2B

In [1]:
!pip install Ollama

[0m

In [20]:
# Download the gemma:2b model (about 1.7 GB per the recorded output).
!ollama pull gemma:2b

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest 
pulling c1864a5eb193...   0% ▕                ▏    0 B/1.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling c1864a5eb193...   0% ▕                ▏    0 B/1.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling c1864a5eb193...   0% ▕                ▏    0 B/1.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling c1864a5eb193...   0% ▕                ▏    0 B/1.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manife

## Create an instance of Gemma 2B

In [2]:
from langchain_community.llms import Ollama

In [3]:
# LangChain LLM handle bound to the "gemma:2b" model name.
gemma = Ollama(model="gemma:2b")


In [None]:
# Confirm that gemma:2b now shows up in the list of available models.
!ollama list

NAME    	ID          	SIZE  	MODIFIED       
gemma:2b	b50d6c999e59	1.7 GB	22 seconds ago	


In [8]:
# Smoke test: send one prompt to Gemma and show the returned text
joke_prompt = "tell me a joke?"
gemma.invoke(joke_prompt)

"What do you call a joke that's too long?\n\n... A drag on."

## Pull Zephyr

In [30]:
# Download the zephyr model (about 4.1 GB per the recorded output).
!ollama pull zephyr

[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest 
pulling 730ebed2578e...   0% ▕                ▏    0 B/4.1 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 730ebed2578e...   0% ▕                ▏    0 B/4.1 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 730ebed2578e...   0% ▕                ▏    0 B/4.1 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 730ebed2578e...   0% ▕                ▏    0 B/4.1 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manife

## Create an instance of Zephyr

In [5]:
# Confirm that zephyr:latest is available alongside gemma:2b.
!ollama list

NAME         	ID          	SIZE  	MODIFIED    
gemma:2b     	b50d6c999e59	1.7 GB	3 hours ago	
zephyr:latest	bbe38b81adec	4.1 GB	2 hours ago	


In [9]:
# LangChain LLM handle bound to the "zephyr:latest" model name.
zephyr = Ollama(model='zephyr:latest')

In [10]:
# Smoke test: send one prompt to Zephyr and show the returned text
joke_prompt = 'tell me a joke'
zephyr.invoke(joke_prompt)

'Why don\'t scientists trust atoms?\n\nBecause they make up everything! (This is a play on the phrase "everything is made up of atoms," which refers to how all matter in the universe is made up of tiny particles called atoms.)\n\nAlternative version: Why did the tomato turn red?\n\nBecause it saw the salad dressing! (This is a pun because the word "salad" can mean both a dish made with greens and vegetables, as well as a group of people working together. The joke plays off the confusion by making it seem like the tomato was somehow affected by the dressing.)'

# RAG chain

In [11]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser



def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Without this step the retriever's list of Document objects is rendered
    into the prompt as a Python repr (metadata, escaped newlines and all).
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Define prompt template
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use two sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

# Setup RAG pipeline: the incoming question is sent to the retriever (whose
# hits are flattened to plain text) and passed through unchanged as
# {question}; the filled prompt then goes to Zephyr and is parsed to a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | zephyr
    | StrOutputParser()
)

# Load VectorDB

In [1]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Embedding function for the persisted store loaded below.
# NOTE: this rebinds `embeddings` (earlier cells bound it to OllamaEmbeddings).
# NOTE(review): presumably must be the model the store was built with — confirm.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [2]:
from langchain_community.vectorstores import Chroma

# Open the "rag-chroma" collection stored under ./docs/chromaDB2/, using
# `embeddings` to embed incoming queries.
vectordb = Chroma(
    collection_name="rag-chroma",
    embedding_function=embeddings,
    persist_directory='./docs/chromaDB2/',
)

In [3]:
# Sanity check: number of entries in the loaded collection
# (`_collection` is a private attribute of the LangChain wrapper)
n_entries = vectordb._collection.count()
print(n_entries)

65183


In [4]:
# Probe the store with a sample question and report how many hits come back
# (the recorded run returned 4)
question = "What is NVidia CUDA Toolkit?"
docs = vectordb.similarity_search(query=question)
len(docs)

4

In [5]:
# Show which source URL each retrieved chunk came from
for doc in docs:
    source_url = doc.metadata['source']
    print(source_url)

https://docs.nvidia.com/
https://docs.nvidia.com/cuda/doc/index.html
https://docs.nvidia.com/deeplearning/frameworks/tensorflow-wheel-release-notes/tf-wheel-rel.html
https://docs.nvidia.com/nsight-developer-tools/index.html


In [6]:
# Retriever view over the persisted store; this is what `chain` below queries.
retriever = vectordb.as_retriever()

In [15]:
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

# Prompt
template = """Answer the question 



Question: {question}
Check if the following context can improve your response: {context}


"""
prompt = ChatPromptTemplate.from_template(template)

# Local LLM
ollama_llm = "gemma:2b"
model_local = ChatOllama(model=ollama_llm)


def docs_to_context(docs):
    """Concatenate the page_content of the retrieved documents.

    Feeding the retriever's raw Document list into the prompt would render it
    as a Python repr (metadata, escaped newlines); the model should only see
    the text itself, with a blank line between chunks.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Chain: question -> {retrieved text, question} -> prompt -> Gemma -> str
chain = (
    {
        "context": retriever | RunnableLambda(docs_to_context),
        "question": RunnablePassthrough(),
    }
    | prompt
    | model_local
    | StrOutputParser()
)


In [17]:
from pprint import pprint

# Ask the retrieval-augmented chain a comparison question and pretty-print it
comparison_question = "What is the difference between NVIDIA's BioMegatron and Megatron 530B LLM?"
pprint(chain.invoke(comparison_question))

("Sure, here's the difference between NVIDIA's BioMegatron and Megatron 530B "
 'LLM:\n'
 '\n'
 '**BioMegatron**\n'
 '\n'
 '* An AI framework specifically designed for biological sequence analysis.\n'
 '* Uses a hybrid architecture that combines transformer and recurrent '
 'networks.\n'
 '* Offers specialized modules for protein sequence analysis, RNA analysis, '
 'and more.\n'
 '* Has a large pre-trained model that can be fine-tuned for specific '
 'biological tasks.\n'
 '\n'
 '**Megatron 530B LLM**\n'
 '\n'
 '* A large language model (LLM) with 530 billion parameters.\n'
 '* Can be used for various natural language processing (NLP) tasks.\n'
 '* Uses a multi-head attention mechanism to capture relationships between '
 'words.\n'
 '* Has a wider range of pre-training data compared to BioMegatron.\n'
 '\n'
 '**Context and relevance:**\n'
 '\n'
 'The context you provided does not mention the difference between BioMegatron '
 'and Megatron 530B LLM, so I cannot answer this question from

In [19]:
from langchain_community.llms import Ollama
# Re-create the Zephyr handle (same definition as earlier in the notebook).
# NOTE(review): presumably needed because the kernel was restarted before the
# "Load VectorDB" section — confirm.
zephyr = Ollama(model='zephyr:latest')

In [20]:
# Same question sent straight to Zephyr (no retrieval), for comparison
comparison_question = "What is the difference between NVIDIA's BioMegatron and Megatron 530B LLM?"
pprint(zephyr.invoke(comparison_question))

('NVIDIA has announced two new large language models (LLMs) for training and '
 'inference: BioMegatron and Megatron 530B. While both are LLMs, there are a '
 'few key differences between the two:\n'
 '\n'
 '1. Size: The most obvious difference is their size. BioMegatron has 8 '
 'billion parameters, while Megatron 530B has 530 billion parameters. This '
 'makes Megatron 530B more than 60 times larger than BioMegatron and one of '
 'the largest LLMs in the world.\n'
 '\n'
 '2. Application: The primary application of BioMegatron is for biomedical '
 'research, specifically for analyzing and understanding biological data such '
 'as protein sequences, genetic code, and drug compounds. Megatron 530B, on '
 'the other hand, has more general-purpose applications, such as answering '
 'complex questions, generating human-like text, and making predictions based '
 'on large datasets.\n'
 '\n'
 '3. Architecture: Another difference is their architecture. BioMegatron was '
 'trained using a BERT

In [21]:
# LangChain LLM handle bound to the "llama3" model name
# (llama3:latest appears in the earlier `ollama list` output).
llama3 = Ollama(model="llama3")

In [None]:
# Same question sent straight to Llama 3 (no retrieval), for comparison
comparison_question = "What is the difference between NVIDIA's BioMegatron and Megatron 530B LLM?"
pprint(llama3.invoke(comparison_question))

In [42]:
# Query the store with a declarative sentence rather than a question, to
# inspect which chunks are returned for it.
vectordb.similarity_search("The NVIDIA CUDA Toolkit is a software development kit (SDK) that allows developers to program GPU-accelerated applications using the CUDA ")

[Document(page_content='NVIDIA CUDA - NVIDIA DocsSubmit SearchNVIDIA DeveloperBlogForumsJoinSubmit SearchNVIDIA DeveloperBlogForumsJoinMenuNVIDIA CUDASubmit SearchSubmit SearchNVIDIA Docs Hub\xa0\xa0NVIDIA CUDA        NVIDIA CUDA    The NVIDIA® CUDA® Toolkit provides a comprehensive development environment for C and C++ developers building GPU-accelerated applications. With the CUDA Toolkit, you can develop, optimize, and deploy your applications on GPU-accelerated embedded systems, desktop workstations, enterprise data centers, cloud-based platforms and HPC supercomputers. The toolkit includes GPU-accelerated libraries, debugging and optimization tools, a C/C++ compiler, and a runtime library to deploy your application.Using built-in capabilities for distributing computations across multi-GPU configurations, scientists and researchers can develop applications that scale from single GPU workstations to cloud installations with thousands of GPUs.Documentation CenterNVIDIA CUDA Toolkit D

# Chat Pod  for interactive testing
By running the following cell, you will be able to have an interactive Chat Pod for Q/A

In [23]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# Assuming chain is properly initialized elsewhere in your code
# chain = SomeInitializedChain()

# Function to handle the input and display the response
def handle_query(sender):
    """Button callback: run the input box text through `chain` and show the answer.

    Reads the query from the module-level `input_box`, invokes the
    module-level `chain`, and renders the result inside the `output` widget.
    Any exception raised while answering is reported in the output area
    instead of propagating.

    Args:
        sender: Argument supplied by the button's click event; not used.
    """
    from html import escape  # local import keeps this cell self-contained

    with output:
        clear_output(wait=True)  # Ensure the output is cleared only once ready to display new output
        print("Processing...")
        try:
            # Directly invoke the chain with the query from the input box
            response = chain.invoke(input_box.value)
            # Escape the model's text before embedding it in HTML: otherwise
            # any "<...>" in the answer is treated as markup (hidden from the
            # reader, or injected into the page) instead of shown verbatim.
            safe_response = escape(str(response))
            display(HTML(f"<div style='word-wrap: break-word; white-space: pre-wrap;'>Response: {safe_response}</div>"))
        except Exception as e:
            print("An error occurred:", str(e))

# Build the UI pieces: the area handle_query writes into, the query text
# field, and the submit button
output = widgets.Output()
input_box = widgets.Text(description="Enter a query:")
button = widgets.Button(description="Submit Query")

# Route button clicks to the query handler
button.on_click(handle_query)

# Render the widgets: input field, then button, then output area
display(input_box, button, output)


Text(value='', description='Enter a query:')

Button(description='Submit Query', style=ButtonStyle())

Output()

# Testing using test set

In [25]:



# Fixed set of CUDA-documentation questions used as a manual test set
questions = [
    "What is the CUDA Toolkit used for?",
    "What are some components included in the CUDA Toolkit?",
    "Can the CUDA Toolkit distribute computations across multiple GPUs?",
    "What is the purpose of the CUDA Programming Guide?",
    "What is the CUDA Best Practices Guide?",
    "How does the CUDA Compatibility Guide assist developers?",
    "What is the PTX ISA guide about?",
    "What is the role of the CUDA Math API?",
    "What does the CUDA Nsight Systems tool provide?",
]

# Inference: run each question through `chain` and print the question and
# its answer between two separator lines
for query in questions:
    print('********************')
    pprint(f"Query: {query}")
    answer = chain.invoke(query)
    pprint(answer)
    print('********************')

    
  


********************
'Query: What is the CUDA Toolkit used for?'
("Sure. Here's the answer to the question:\n"
 '\n'
 'The CUDA Toolkit is a comprehensive development environment for C and C++ '
 'developers building GPU-accelerated applications. It provides a wide range '
 'of tools and resources for developing, optimizing, and deploying '
 'applications on GPU-accelerated systems, including embedded systems, desktop '
 'workstations, enterprise data centers, cloud platforms, and HPC '
 'supercomputers.\n'
 '\n'
 'The context does not mention any ways in which the CUDA Toolkit can improve '
 'the response, so I cannot answer this question from the provided context.')
********************
********************
'Query: What are some components included in the CUDA Toolkit?'
("Sure, here's the information you requested:\n"
 '\n'
 '**Components included in the CUDA Toolkit:**\n'
 '\n'
 '* CUDA Runtime (GPU driver)\n'
 '* CUDA Toolkit (includes headers and libraries for developing and runni