In [None]:
pip install langchain_community

In [9]:
import os
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline
from langchain_text_splitters import MarkdownHeaderTextSplitter

# New section

In [None]:
# Step 1: Loading a File
from langchain_community.document_loaders import TextLoader
loader = TextLoader("/content/sample_data/tennis_details.md")
text_doc = loader.load()
print(text_doc[0].page_content)

In [11]:
# Step 2: split the data into chunks
from langchain_text_splitters import MarkdownHeaderTextSplitter
split_condition = [("##", "title")]
splitter = MarkdownHeaderTextSplitter(split_condition)
doc_splits = splitter.split_text(text_doc[0].page_content)
#print(doc_splits)
text_chunks = [split.page_content for split in doc_splits]
print(text_chunks)

['# Tennis', "Tennis is a popular sport played between two players (singles) or two teams of two players each (doubles). The game involves using a racket to hit a ball over a net into the opponent's court.", '- A match can be played as best of three or five sets.\n- Each set consists of games, and each game consists of points.\n- Points are scored as **0 (Love), 15, 30, 40**, and then **game**.\n- A player must win a game by at least **two points**.\n- The ball must land within the designated court boundaries.', '```plaintext\n0 points  -> Love\n1 point   -> 15\n2 points  -> 30\n3 points  -> 40\n4 points  -> Game (if leading by 2)\nDeuce     -> 40-40 (must win two consecutive points to win the game)\nAdvantage -> If a player wins a point at deuce, they gain the advantage\n```', '- **Grand Slam Events**:\n- Australian Open\n- French Open\n- Wimbledon\n- US Open', '- **Racket**: Used to hit the ball.\n- **Tennis Ball**: Yellow-green in color, designed for optimal bounce.\n- **Court**: Ca

In [12]:
# Step 3: Generate Embeddings

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunk(chunk):
  return embedding_model.encode([chunk], normalize_embeddings = True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
sample_embedding = embed_chunk(text_chunks[1]).tolist()[0]

In [None]:
print(text_chunks[1])

In [None]:
print(sample_embedding)

In [16]:
len(sample_embedding)

384

In [None]:
pip install chromadb

In [17]:
# Step 4: Store embeddings in ChromaDB

vector_db = Chroma.from_texts(text_chunks, HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"), persist_directory="/tmp/chroma_db")

  vector_db = Chroma.from_texts(text_chunks, HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"), persist_directory="/tmp/chroma_db")


In [None]:
vector_db._collection.get(include=['embeddings','documents'])

In [None]:
#step 5: Set up a LLM
pipe = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B-Instruct")

In [26]:
# Step 6: Retrieval and Generation
def retrieve_and_generate(query, threshold=1):
    """Retrieves relevant context from the vector database and generates an answer."""
    search_results = vector_db.similarity_search_with_score(query, k=1)

    print(search_results)

    if not search_results or search_results[0][1] > threshold:
        return "I don't know the answer. There is no available context in vector DB."

    retrieved_context = search_results[0][0].page_content
    similarity_score = search_results[0][1]
    print(f"Similarity Score: {similarity_score}")
    print(f"Retrieved Context: {retrieved_context}")

    prompt = f"Answer the question using the given context\nContext: {retrieved_context}\nQuestion: {query}\nAnswer: "
    print(prompt)
    response = pipe(prompt, max_new_tokens=100)
    return response[0]["generated_text"]

In [27]:
question = "what are famous tournaments?"
response = retrieve_and_generate(question)
print(response)


[(Document(metadata={}, page_content='- **Grand Slam Events**:\n- Australian Open\n- French Open\n- Wimbledon\n- US Open'), 0.9606983065605164)]
Similarity Score: 0.9606983065605164
Retrieved Context: - **Grand Slam Events**:
- Australian Open
- French Open
- Wimbledon
- US Open
Answer the question using the given context
Context: - **Grand Slam Events**:
- Australian Open
- French Open
- Wimbledon
- US Open
Question: what are famous tournaments?
Answer: 
Answer the question using the given context
Context: - **Grand Slam Events**:
- Australian Open
- French Open
- Wimbledon
- US Open
Question: what are famous tournaments?
Answer: 1. Grand Slam Events

2. Australian Open
3. French Open
4. Wimbledon
5. US Open


In [None]:
from google.colab import drive
drive.mount('/content/drive')