In [1]:
%%capture --no-stderr
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-chroma bs4
%pip install -qU langchain-openai
!pip install gradio_client==0.2.10
!pip install gradio==3.38.0

In [49]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders import WebBaseLoader
import os
import bs4

In [50]:
print(bs4.__version__)

4.12.3


In [51]:
from urllib.request import Request, urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import ssl

# Web Scraping for References

In [52]:
def get_sitemap(url):
    req = Request(
        url=url,
        headers={"User-Agent": "Mozilla/5.0"}
    )
    response = urlopen(req)
    xml = BeautifulSoup(
        response,
        "lxml-xml",
        from_encoding=response.info().get_param("charset")
    )
    return xml

In [53]:
def get_urls(xml, name=None, data=None, verbose=False):
    urls = []
    for url in xml.find_all("url"):
        if xml.find("loc"):
            loc = url.findNext("loc").text
            urls.append(loc)
    return urls

In [97]:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_glossary(url):
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    return soup

def extract_glossary(soup, sport_name):
    terms = []
    current_term = None

    # ✅ Loop through ALL <dl> blocks on the page
    for dl in soup.find_all("dl"):
        for tag in dl.children:
            if tag.name == "dt":
                current_term = tag.text.strip()
            elif tag.name == "dd" and current_term:
                definition = tag.text.strip()
                terms.append({
                    "sport": sport_name,
                    "term": current_term,
                    "definition": definition
                })
                current_term = None
    return terms




In [98]:
sport_glossary_urls = {
    "Basketball": "https://en.wikipedia.org/wiki/Glossary_of_basketball_terms",
    "Football": "https://en.wikipedia.org/wiki/Glossary_of_American_football",
    "Baseball": "https://en.wikipedia.org/wiki/Glossary_of_baseball_terms",
    "Hockey": "https://en.wikipedia.org/wiki/Glossary_of_ice_hockey_terms",
    "Soccer": "https://en.wikipedia.org/wiki/Glossary_of_association_football_terms",
    "Golf": "https://en.wikipedia.org/wiki/Glossary_of_golf",
    "Tennis": "https://en.wikipedia.org/wiki/Glossary_of_tennis_terms"
}

all_terms = []

for sport, url in sport_glossary_urls.items():
    print(f"Scraping {sport} glossary...")
    soup = get_wikipedia_glossary(url)
    terms = extract_glossary(soup, sport)
    all_terms.extend(terms)

print(f"\n✅ Total glossary terms collected: {len(all_terms)}")
print("Sample:", all_terms[:3])


Scraping Basketball glossary...
Scraping Football glossary...
Scraping Baseball glossary...
Scraping Hockey glossary...
Scraping Soccer glossary...
Scraping Golf glossary...
Scraping Tennis glossary...

✅ Total glossary terms collected: 1125
Sample: [{'sport': 'Basketball', 'term': '2-for-1', 'definition': 'A strategy used within the last minute of a period or quarter, in which the team with possession times its shot to ensure that it will regain possession with enough time to shoot again before time runs out. Applicable in competitions that use a shot clock (all except NFHS in most US states).[1]'}, {'sport': 'Basketball', 'term': '3-and-D', 'definition': 'Any player, typically not a star, who specializes mainly in three-point shooting ("3") and defense ("D"). The term is most often used in the NBA, where this specific skill set has been increasingly valued in the 21st century.[2][3]'}, {'sport': 'Basketball', 'term': '3x3', 'definition': 'A formalized version of a half-court basketba

In [100]:
print(all_terms[:5])  # Check first few
print(all_terms[-5:])  

[{'sport': 'Basketball', 'term': '2-for-1', 'definition': 'A strategy used within the last minute of a period or quarter, in which the team with possession times its shot to ensure that it will regain possession with enough time to shoot again before time runs out. Applicable in competitions that use a shot clock (all except NFHS in most US states).[1]'}, {'sport': 'Basketball', 'term': '3-and-D', 'definition': 'Any player, typically not a star, who specializes mainly in three-point shooting ("3") and defense ("D"). The term is most often used in the NBA, where this specific skill set has been increasingly valued in the 21st century.[2][3]'}, {'sport': 'Basketball', 'term': '3x3', 'definition': 'A formalized version of a half-court basketball game with three players on each team, officially sanctioned by FIBA. This variant made its Olympic debut in 2021 (delayed from 2020).'}, {'sport': 'Basketball', 'term': 'three seconds rule', 'definition': "A rule which requires that a player shall

# Building the RAG for WhistleWise

In [101]:
os.environ["GOOGLE_API_KEY"] = "AIzaSyDoOv96fCayBc-5vA_tBoIAFyNglFDizAQ"

# Vector Database for Gemini Embeddings

In [102]:
from langchain_core.documents import Document

docs = []
for item in all_terms:
    content = f"{item['term']}: {item['definition']}"
    metadata = {"sport": item['sport'], "term": item['term']}
    docs.append(Document(page_content=content, metadata=metadata))

In [109]:
docs[104]

Document(metadata={'sport': 'Basketball', 'term': 'four-point play'}, page_content='four-point play: A rare play in which a player is fouled while making a three-point field goal and then makes the resulting free throw, thereby scoring a total of four points.')

In [105]:
# 1. Split the text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# 2. Use Gemini-based embeddings
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")  # Gemini embedding model

# 3. Store in vector DB
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)

# 4. Create retriever
retriever = vectorstore.as_retriever()


In [112]:
splits[107]

Document(metadata={'sport': 'Basketball', 'term': 'free throw'}, page_content='free throw: An unopposed attempt to score a basket, worth one point, from the free-throw line. Generally, two attempts are awarded when the player is fouled in the act of shooting (three attempts are awarded in the case of a three-point shot), fouled flagrantly, or when the opposing team fouls while over the foul limit. For technical fouls, FIBA rules award one free throw; NBA and NFHS rules award two free throws; and NCAA rules award either one or two free throws, depending on the specific type of technical foul. In 3x3 rules, where regular baskets are worth 1\xa0point and shots from behind the arc are worth 2\xa0points, one attempt is normally awarded; two attempts are awarded when a player is fouled on a missed shot from behind the arc, the opposing team has committed more than six fouls in a game, and on any technical foul.')

# Testing

In [113]:
res = vectorstore.similarity_search("What is a four point play")

In [114]:
# prompt: create pretty print function for res

import pprint

def pretty_print(res):
  pp = pprint.PrettyPrinter(indent=2)
  pp.pprint(res)

pretty_print(res)


[ Document(metadata={'sport': 'Basketball', 'term': 'four-point play'}, page_content='four-point play: A rare play in which a player is fouled while making a three-point field goal and then makes the resulting free throw, thereby scoring a total of four points.'),
  Document(metadata={'term': 'four-point play', 'sport': 'Basketball'}, page_content="four-point play: A specialized type of layup shot where the ball is rolled off the tips of the player's fingers using the momentum of the jump. The advantage of the finger roll is that the ball can travel in a higher arc over a defender that might otherwise block the shot."),
  Document(metadata={'term': 'four-point stance', 'sport': 'Football'}, page_content='four-point stance: The final of a set of four downs. Unless a first down is achieved or a penalty forces a replay of the down, the team will lose control of the ball after this play. If a team does not think they can get a first down, they often punt on fourth down or attempt a field g

# LLM Prompting

In [123]:
# 2. Incorporate the retriever into a question-answering chain.
system_prompt = (
    "You are a sports terminology assistant with expert knowledge across multiple sports, including football, basketball, baseball, ice hockey, soccer, golf, and tennis. "
    "Your job is to accurately answer questions about sports terms, rules, player roles, and statistics based on the provided context. "
    "Use the retrieved context below to generate a clear and concise answer (no more than three sentences). Find the MOST relevant information for the question; the most"
    "important thing is to be concise and make sure to answer exactly what is being asked. Also I want you to specify exactly what SPORT the question is from. For example,"
    "say if the answer is for golf, football, or some other sport "
    "If the information is not found in the context, say you don't know. If the question is unclear, ask a clarifying follow-up."
    "\n\n"
    "{context}"
)

# the variable context is used by create_stuff_documents_chain to "stuff"/concatenate all context docs

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),  # input is a variable, also context in system prompt
    ]
)

In [124]:
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

In [125]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [126]:
response = rag_chain.invoke({"input": "What is a hole in one?"})
print("Response is\n------")
print(response["answer"])
print("------")
print(response["context"])

Response is
------
In golf, a hole in one is when a player hits the ball from the tee into the hole using only one stroke. Some tournaments offer large prizes if a player achieves this on a particular hole.
------
[Document(metadata={'term': 'Hole in one', 'sport': 'Golf'}, page_content='Hole in one: Hitting the ball from the tee into the hole, using only one stroke.'), Document(metadata={'sport': 'Golf', 'term': 'Hole in one'}, page_content='Hole in one: Hitting the ball from the tee into the hole, using only one stroke.'), Document(metadata={'term': 'Hole in one insurance', 'sport': 'Golf'}, page_content='Hole in one insurance: Many tournaments offer large prizes if a player shoots a hole in one on a particular hole. Indemnity insurance is often purchased to cover the cost should anyone make the hole in one. Hole in one insurance is also available for individuals to cover the cost of a round of drinks in the event of their achieving a hole in one.[13]'), Document(metadata={'term': 'H

# Sample Testing

In [131]:
response = rag_chain.invoke({"input": "What is a three point play?"})
print("Response is\n------")
print(response["answer"])

Response is
------
In basketball, a three-point play happens when a player is fouled while making a two-point field goal and then successfully makes the free throw, resulting in a total of three points scored. It is also known as an "and one".


# Adding History to WhistleBot

In [132]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# If there is no chat_history, then the input is just passed directly to the
# retriever. If there is chat_history, then the prompt and LLM will be used to
# generate a search query. That search query is then passed to the retriever.

# This chain prepends a rephrasing of the input query to our retriever,
# so that the retrieval incorporates the context of the conversation.

history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [133]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# history_aware_retriever and question_answer_chain in sequence, retaining
# intermediate outputs such as the retrieved context for convenience.
# It has input keys input and chat_history, and includes input, chat_history,
# context, and answer in its output.
rag_chain = create_retrieval_chain(history_aware_retriever,
                                   question_answer_chain)

# compare code above to QA chain (from the previos section)
#question_answer_chain = create_stuff_documents_chain(llm, prompt)
#rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [136]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

question = "What is a three point play?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

print("Question: ", question)
print(ai_msg_1["answer"])

second_question = "What sport is this?" # What are examples of index funds?
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print("--------------")
print("Question: ", second_question)
print(ai_msg_2["answer"])

Question:  What is a three point play?
In basketball, a three-point play is when a player is fouled while making a two-point field goal and then successfully makes the free throw, resulting in a total of three points scored. It is also known as an "and one."
--------------
Question:  What sport is this?
Basketball


In [137]:
ai_msg_2.keys()

dict_keys(['input', 'chat_history', 'context', 'answer'])

# Gradio Visualizations

In [138]:
import gradio as gr

In [139]:
from langchain_core.messages import AIMessage, HumanMessage

def predict(question, chat_history):
  history_for_llm = []
  for tup in chat_history:
    history_for_llm.extend([HumanMessage(tup[0]), AIMessage(tup[1])])

  ai_msg = rag_chain.invoke({"input": question, "chat_history": history_for_llm})
  return ai_msg["answer"]

In [142]:
demo = gr.ChatInterface(predict,
    chatbot=gr.Chatbot(height=200),
    textbox=gr.Textbox(placeholder="Hi I am WhistleBot, your virtual AI sports expert, how can I help you today?", container=False, scale=7),
    title="WhistleBot",
    theme="soft",
    examples=["What is a three point play?", "Where is a hole-in-one?"],
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",)
demo.launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7860
IMPORTANT: You are using gradio version 3.38.0, however version 4.44.1 is available, please upgrade.
--------
Running on public URL: https://5d3adfbd887600c102.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://5d3adfbd887600c102.gradio.live


