In [1]:
! pip install langchain_community tiktoken langchain_openai langchainhub chromadb langchain
! pip install -q -U google-generativeai
! pip install langchain_google_genai
! pip install gpt4all


Collecting langchain_community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.2.5-py3-none-any.whl.metadata (2.6 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting chromadb
  Downloading chromadb-0.5.17-py3-none-any.whl.metadata (6.8 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain_community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading langchain-0.3.7-

In [2]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = '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'
os.environ['GOOGLE_API_KEY'] ="AIzaSyCRT014yGuVIhoT_ZaS5AG7Ssm5FOtVfM0"

In [8]:
import requests
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from io import StringIO
from io import BytesIO
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

def load_markdown_from_path(file_path):
    # Open and read the content of the local Markdown file
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_text = file.read()
    return markdown_text

# Define the path of the Markdown file
markdown_path = "/content/document.md"
# Load Markdown text from the local path
markdown_text = load_markdown_from_path(markdown_path)

# Create a Document object from the Markdown text
docs = [Document(page_content=markdown_text, metadata={'source': markdown_path})]

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed
vectorstore = Chroma.from_documents(documents=splits, embedding=GPT4AllEmbeddings())

retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
response = rag_chain.invoke("pyhton?")
print(response)

PyTorch is an open-source machine learning library developed by Facebook. It is known for its flexibility and dynamic computation graph, making it popular for research and production.


In [4]:
question = "python?"
document = "Python: Known for its simplicity and readability, widely used for web development, data science, AI, and scripting."

In [5]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

2

In [6]:
from langchain_community.embeddings import GPT4AllEmbeddings
embd = GPT4AllEmbeddings()
query_result = embd.embed_query(question)
document_result = embd.embed_query(document)
len(query_result)

384

In [7]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.7074665252502643


In [9]:
#### INDEXING ####

def load_markdown_from_path(file_path):
    # Open and read the content of the local Markdown file
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_text = file.read()
    return markdown_text

# Define the path of the Markdown file
markdown_path = "/content/document.md"
# Load Markdown text from the local path
markdown_text = load_markdown_from_path(markdown_path)

# Create a Document object from the PDF text
blog_docs = [Document(page_content=markdown_text, metadata={'source': markdown_path})]



In [10]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(blog_docs)

In [11]:
# Index

vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=GPT4AllEmbeddings())

retriever = vectorstore.as_retriever()

In [12]:
# Index

vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=GPT4AllEmbeddings())


retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [13]:
docs = retriever.get_relevant_documents("pyhton?")

  docs = retriever.get_relevant_documents("pyhton?")


In [14]:
len(docs)

2

In [15]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

In [16]:
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)

In [17]:
# Chain
chain = prompt | llm

In [18]:
chain.invoke({"context":docs,"question":"pyhton?"})

AIMessage(content='The provided context does not mention Python.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE', 'blocked': False}]}, id='run-e4278d19-b261-400c-9ea0-569d401a7762-0', usage_metadata={'input_tokens': 400, 'output_tokens': 8, 'total_tokens': 408, 'input_token_details': {'cache_read': 0}})

In [19]:
from langchain import hub
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [20]:
prompt_hub_rag

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [21]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("python?")

'Yes, Python is mentioned in the context as a beginner-friendly language for starting to code.'