# Web Scraper and Vector Database Testing
Sources: 
- GROQ API COOKBOOK: https://github.com/groq/groq-api-cookbook/blob/dan/replit-conversion/presidential-speeches-rag/presidential-speeches-rag.ipynb
- OPEN AI COOKBOOK: https://cookbook.openai.com/examples/embedding_wikipedia_articles_for_search

## Imports + Setup

In [2]:
import pandas as pd
import numpy as np
from groq import Groq
import os
from pinecone import Pinecone
import mwclient  # for downloading example Wikipedia articles
import mwparserfromhell  # for splitting Wikipedia articles into sections
import re
import tiktoken
from dotenv import load_dotenv


from langchain_community.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import display, HTML

  from tqdm.autonotebook import tqdm


In [3]:
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
client = Groq(api_key = groq_api_key)
model = "mixtral-8x7b-32768"


## Get all wikipedia pages related to Taylor Swift

In [5]:
CATEGORY_TITLE = "Category:Taylor Swift"
WIKI_SITE = "en.wikipedia.org"


def titles_from_category(
    category: mwclient.listing.Category, max_depth: int
) -> set[str]:
    """Return a set of page titles in a given Wiki category and its subcategories."""
    titles = set()
    for cm in category.members():
        if type(cm) == mwclient.page.Page:
            # ^type() used instead of isinstance() to catch match w/ no inheritance
            titles.add(cm.name)
        elif isinstance(cm, mwclient.listing.Category) and max_depth > 0:
            deeper_titles = titles_from_category(cm, max_depth=max_depth - 1)
            titles.update(deeper_titles)
    return titles


site = mwclient.Site(WIKI_SITE)
category_page = site.pages[CATEGORY_TITLE]
titles = titles_from_category(category_page, max_depth=3)
# ^note: max_depth=1 means we go one level deep in the category tree
print(f"Found {len(titles)} article titles in {CATEGORY_TITLE}.")

Found 367 article titles in Category:Taylor Swift.


## Split pages into sections

In [6]:
# define functions to split Wikipedia pages into sections

SECTIONS_TO_IGNORE = [
    "See also",
    "References",
    "External links",
    "Further reading",
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Footnotes",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
]


def all_subsections_from_section(
    section: mwparserfromhell.wikicode.Wikicode,
    parent_titles: list[str],
    sections_to_ignore: set[str],
) -> list[tuple[list[str], str]]:
    """
    From a Wikipedia section, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    headings = [str(h) for h in section.filter_headings()]
    title = headings[0]
    if title.strip("=" + " ") in sections_to_ignore:
        # ^wiki headings are wrapped like "== Heading =="
        return []
    titles = parent_titles + [title]
    full_text = str(section)
    section_text = full_text.split(title)[1]
    if len(headings) == 1:
        return [(titles, section_text)]
    else:
        first_subtitle = headings[1]
        section_text = section_text.split(first_subtitle)[0]
        results = [(titles, section_text)]
        for subsection in section.get_sections(levels=[len(titles) + 1]):
            results.extend(all_subsections_from_section(subsection, titles, sections_to_ignore))
        return results


def all_subsections_from_title(
    title: str,
    sections_to_ignore: set[str] = SECTIONS_TO_IGNORE,
    site_name: str = WIKI_SITE,
) -> list[tuple[list[str], str]]:
    """From a Wikipedia page title, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    site = mwclient.Site(site_name)
    page = site.pages[title]
    text = page.text()
    parsed_text = mwparserfromhell.parse(text)
    headings = [str(h) for h in parsed_text.filter_headings()]
    if headings:
        summary_text = str(parsed_text).split(headings[0])[0]
    else:
        summary_text = str(parsed_text)
    results = [([title], summary_text)]
    for subsection in parsed_text.get_sections(levels=[2]):
        results.extend(all_subsections_from_section(subsection, [title], sections_to_ignore))
    return results

In [7]:
# split pages into sections
# may take ~1 minute per 100 articles
wikipedia_sections = []
for title in titles:
    wikipedia_sections.extend(all_subsections_from_title(title))
print(f"Found {len(wikipedia_sections)} sections in {len(titles)} pages.")

Found 3090 sections in 367 pages.


## Clean Text

In [8]:
# clean text
def clean_section(section: tuple[list[str], str]) -> tuple[list[str], str]:
    """
    Return a cleaned up section with:
        - <ref>xyz</ref> patterns removed
        - leading/trailing whitespace removed
    """
    titles, text = section
    text = re.sub(r"<ref.*?</ref>", "", text)
    text = text.strip()
    return (titles, text)


wikipedia_sections = [clean_section(ws) for ws in wikipedia_sections]

# filter out short/blank sections
def keep_section(section: tuple[list[str], str]) -> bool:
    """Return True if the section should be kept, False otherwise."""
    titles, text = section
    if len(text) < 16:
        return False
    else:
        return True


original_num_sections = len(wikipedia_sections)
wikipedia_sections = [ws for ws in wikipedia_sections if keep_section(ws)]
print(f"Filtered out {original_num_sections-len(wikipedia_sections)} sections, leaving {len(wikipedia_sections)} sections.")

Filtered out 99 sections, leaving 2991 sections.


In [9]:
# EXAMPLE DATA
for ws in wikipedia_sections[20:30]:
    print(ws[0])
    display(ws[1])
    print()

['Speak Now (song)', '==Charts==']


'{|class="wikitable sortable plainrowheaders" style="text-align:center;"\n|+Chart performance for "Speak Now"\n!scope="col"|Chart (2010)\n!scope="col"|Peak<br />position\n|-\n{{single chart|Australia|20|artist=Taylor Swift|song=Speak Now|rowheader=true|access-date=December 6, 2021}}\n|-\n{{singlechart|Canada|8|artist=Taylor Swift|rowheader=true|access-date=December 6, 2021}}\n|-\n{{single chart|New Zealand|34|artist=Taylor Swift|song=Speak Now|rowheader=true|access-date=December 6, 2021}}\n|-\n! scope="row"| South Korea ([[Circle Music Chart|Circle]])\n| 89\n|-\n{{singlechart|Billboardhot100|8|artist=Taylor Swift|rowheader=true|access-date=December 6, 2021}}\n|-\n{{singlechart|Billboardcountrysongs|58|artist=Taylor Swift|rowheader=true|access-date=December 6, 2021}}\n|}'


['Speak Now (song)', '==Certifications==']


'{{Certification Table Top|caption=Certifications for "Speak Now"}}\n{{Certification Table Entry|region=Australia|award=Platinum|type=single|relyear=2020|certyear=2024|access-date=February 14, 2024}}\n{{Certification Table Entry|region=United States|artist=Taylor Swift|title=Speak Now|type=single|award=Gold|relyear=2010|certyear=2011|digital=true|access-date= July 15, 2020}}\n{{Certification Table Bottom|noshipments=true|streaming=true}}'


['Speak Now (song)', '== Release history ==']


'{| class="wikitable"\n|+ Release date for "Speak Now"\n|-\n! scope="col" | Country\n! scope="col" | Date\n! scope="col" | Format\n! scope="col" | Label\n|-\n| rowspan="1" |United States\n| October 5, 2010\n| Digital download\n| rowspan="1" |[[Big Machine Records|Big Machine]]\n|}'


['Speak Now (song)', '== "Speak Now (Taylor\'s Version)" ==']


'{{Infobox song\n| name          = Speak Now (Taylor\'s Version)\n| artist        = [[Taylor Swift]]\n| album         = [[Speak Now (Taylor\'s Version)]]\n| released      = {{start date|2023|07|7}}\n| genre         = [[Country pop]]\n| length        = 4:02\n| label         = [[Republic Records|Republic]]\n| writer        = Taylor Swift\n| producer      = * Taylor Swift\n* [[Christopher Rowe (record producer) |Christopher Rowe]]\n| misc          = {{External music video|type=song|header=Lyric video|{{YouTube|JlZnvyBqceY|"Speak Now (Taylor\'s Version)"}}}}\n}}\nAfter signing a new contract with [[Republic Records]], Swift began re-recording her first six studio albums in November 2020. The decision came after the [[Taylor Swift masters dispute|public 2019 dispute]] between Swift and talent manager [[Scooter Braun]], who acquired Big Machine Records, including the [[Mastering (audio)|masters]] of Swift\'s albums the label had released. By re-recording her catalog, Swift had full ownership


['Speak Now (song)', '== "Speak Now (Taylor\'s Version)" ==', '=== Personnel ===']


"Adapted from ''Speak Now (Taylor's Version)'' digital album inline notes\n\n* Taylor Swift&nbsp;– vocals, background vocals, songwriter, producer\n* [[Christopher Rowe (record producer)|Christopher Rowe]]&nbsp;– producer, vocal engineer\n* David Payne&nbsp;– recording [[Audio engineer|engineer]]\n* Lowell Reynolds&nbsp;– assistant recording engineer, editor\n* Derek Garten&nbsp;– engineer, editor, [[Programming (music)|programming]]\n* [[Serban Ghenea]]&nbsp;– [[Audio mixing (recorded music)|mixing]]\n* Bryce Bordone&nbsp;– mix engineer\n* [[Randy Merrill]]&nbsp;– [[Mastering (audio)|mastering]]\n* Matt Billingslea&nbsp;– [[Drum kit|drums]], [[Percussion instrument|percussion]], [[clapping]]\n* Amos Heller&nbsp;– [[bass guitar]], clapping\n* Paul Sidoti&nbsp;– [[electric guitar]]\n* Mike Meadows&nbsp;– acoustic guitar, clapping, [[Organ (music)|organ]]\n* Max Bernstein&nbsp;– electric guitar\n* [[Liz Huett]]&nbsp;– background vocals"


['Speak Now (song)', '== "Speak Now (Taylor\'s Version)" ==', '=== Charts ===']


'{| class="wikitable sortable plainrowheaders" style="text-align:center"\n|+ Chart performance for "Speak Now (Taylor\'s Version)"\n! scope="col"| Chart (2023)\n! scope="col"| Peak<br />position\n|-\n! scope="row"| Australia ([[ARIA Charts|ARIA]])\n| 22\n|-\n{{single chart|Canada|31|artist=Taylor Swift|rowheader=true|access-date=July 18, 2023|refname=Canada2023}}\n|-\n{{single chart|Billboardglobal200|24|artist=Taylor Swift|rowheader=true|access-date=July 18, 2023|refname=Global2023}}\n|-\n! scope="row"| Greece ([[IFPI Greece|IFPI]])\n| 54\n|-\n! scope="row"| Malaysia International ([[Recording Industry Association of Malaysia|RIM]])\n| 17\n|-\n! scope="row"| New Zealand ([[Recorded Music NZ]])\n| 26\n|-\n! scope="row"| [[Philippines Songs|Philippines]] (\'\'[[Billboard (magazine)|Billboard]]\'\')\n| 5\n|-\n! scope="row"| Singapore ([[Recording Industry Association Singapore|RIAS]])\n| 11\n|-\n! scope="row"| [[UK Streaming Chart|UK Streaming]] ([[Official Charts Company|OCC]])\n| 45\n|


['Speak Now (song)', '==Certifications==']


'{{Certification Table Top|caption=Certifications for "Speak Now (Taylor\'s Version)"}}\n{{Certification Table Entry|region=Brazil|artist=Taylor Swift|title=Speak Now (Taylor\'s Version)|award=Gold|type=single|relyear=2023|certyear=2024|access-date=July 24, 2024}}\n{{Certification Table Bottom|streaming=true | nosales=true | noshipments=true}}'


['List of awards and nominations received by Taylor Swift']


'{{Short description|none}}\n{{Featured list}}\n{{Use American English|date=February 2021}}\n{{Use mdy dates|date=December 2019}}\n{{Infobox awards list\n| name = [[Taylor Swift]]\n| image = 191125 Taylor Swift at the 2019 American Music Awards (cropped).png\n| alt = Taylor Swift is looking towards the camera.\n| caption = Swift at the [[American Music Awards of 2019]]; she is the most-awarded artist by [[American Music Awards|the organization]].\n| wins = 667 <!-- Updated August 4, 2024 --->\n| nominations = 1,262 <!-- Updated August 7, 2024; includes pending awards --->\n<!-- | pending     = 41 Unsupported parameter; updated July 9, 2024 --->\n| awards = {{Custom award|[[Academy of Country Music Awards]]|8|31}}\n{{Custom award|[[ADG Excellence in Production Design Award]]|1|4}}\n{{Custom award|[[American Country Awards]]|4|24}}\n{{Custom award|[[American Country Countdown Awards]]|0|1}}\n{{Custom award|[[American Music Awards]]|40|48}}\n{{Custom award|[[Webby Awards#Anthem Awards|Ant


['List of awards and nominations received by Taylor Swift', '== Awards and nominations ==']


'{{Compact TOC|center=yes|name=no|v=[[#V2|V]]|k=|x=|z=|allowtoc=yes}}\n{| class="wikitable sortable plainrowheaders" style="width: 100%;"\n|+\n|-\n! scope="col" | Award\n! scope="col" | Year{{efn|Indicates the year of ceremony. Each year is linked to the article about the awards held that year, wherever possible.}}\n! scope="col" | Recipient(s)\n! scope="col" | Category\n! scope="col" width=9% | Result\n! scope="col" class="unsortable" |{{Abbr|Ref.|References}}\n|-\n! rowspan="31" scope="row" |[[Academy of Country Music Awards]] {{Anchor|A}}\n|[[42nd Academy of Country Music Awards|2007]]\n| rowspan="3" | Swift\n| rowspan="2" | New Female Vocalist of the Year\n|{{nominated}}\n| style="text-align:center;" |\n|-\n| rowspan="3" |[[43rd Academy of Country Music Awards|2008]]\n|{{won}}\n| rowspan="3" style="text-align:center;" |\n|-\n| Female Vocalist of the Year\n| {{nom}}\n|-\n|[[Taylor Swift (album)|\'\'Taylor Swift\'\']]\n| Album of the Year\n| {{nom}}\n|-\n| rowspan="4" | [[44th Academ


['List of awards and nominations received by Taylor Swift', '==Other accolades==', "=== ''Guinness World Records'' ==="]


'As of December 2023, Swift has acquired 118 \'\'[[Guinness World Records]]\'\'.\n{| class="wikitable" style="font-size:90%;"\n|+Key\n| style="background-color:#CCF6FF" |{{dagger|alt=Indicates a record that was eventually broken}}<!-- Do not remove the dagger; this is a requirement per Wikipedia\'s guidelines on color. -->\n| Indicates a former world-record holder\n|}\n{| class="wikitable sortable plainrowheaders"\n|- style="background:#ccc; text-align:center;"\n|+Year the record was awarded, title of the record, and the record holder\n! scope="col"| Year\n! scope="col"| Record\n! scope="col" | Record holder\n! scope="col" class="unsortable"| {{Abbr|Ref.|References}}\n|-\n| style="text-align:center;"| 2008\n| Most Entries in the [[Billboard Hot 100|US Top-20]] in a Year by a Solo Artist{{efn|In 2008, Swift had six tracks enter the top 20 of the [[Billboard Hot 100|\'\'Billboard\'\' Hot 100]]: "[[Change (Taylor Swift song)|Change]]", "[[Love Story (Taylor Swift song)|Love Story]]", "[[F




## Tokenize

In [10]:
tokenizer = tiktoken.get_encoding('p50k_base')

# create the length function
def token_len(text):
    tokens = tokenizer.encode(
        text
    )
    return len(tokens)

token_len(wikipedia_sections[0][1])
for section in wikipedia_sections:
    titles, text = section
    string = "\n\n".join(titles + [text])
    print(string)
    break

When Emma Falls in Love

{{Use mdy dates|date=January 2024}}{{short description|2023 song by Taylor Swift}}
{{good article}}
{{Infobox song
| name          = When Emma Falls in Love
| type          = song
| artist        = [[Taylor Swift]]
| genre         = 
| cover         = 
| alt           = 
| caption       = 
| album         = [[Speak Now (Taylor's Version)]]
| written       = 
| released      = {{start date|2023|7|7}}
| studio        = 
| length        = {{duration|m=4|s=12}}
| label         = [[Republic Records|Republic]]
| writer        = Taylor Swift
| producer      = *Taylor Swift
*[[Aaron Dessner]]
| misc       = {{External music video|type=song|header=Lyric video|{{YouTube|IYqgVYjN3Go|"When Emma Falls in Love"}}}}
}}
"'''When Emma Falls in Love'''"{{efn|Officially titled "'''When Emma Falls in Love (Taylor's Version) (From the Vault)'''"<ref name="Green" />}} is a song written and recorded by the American singer-songwriter [[Taylor Swift]]. Originally intended for but left 

In [11]:
text_splitter = TokenTextSplitter(
    chunk_size=450, # 500 tokens is the max
    chunk_overlap=20 # Overlap of N tokens between chunks (to reduce chance of cutting out relevant connected text like middle of sentence)
)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")




In [12]:
documents = []
for section in wikipedia_sections:
    title, text = section
    string = "\n\n".join(title + [text])
    chunks = text_splitter.split_text(string)
    total_chunks = len(chunks)
    for chunk_num in range(total_chunks):
        header = title[0]
        curr_chunk = chunks[chunk_num]
        documents.append(Document(page_content=header+curr_chunk, metadata={"title": title[0]}))
print(len(documents))

5464


## Store documents in Pinecone (Vector Database) - DO NOT RUN AGAIN
This took 20 minutes to run and create the vector database. Now that it's created, do not run the next cell! 

In [39]:
# docsearch = Chroma.from_documents(documents, embedding_function)
'''
index: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        text_key: Optional[str] = "text",
        namespace: Optional[str] = None,
        distance_strategy: Optional[DistanceStrategy] = DistanceStrategy.COSINE,
        *,
        pinecone_api_key: Optional[str] = None,
        index_name: Optional[str] = None,
'''
pinecone_index_name = "ai-bot"  
doc_search = PineconeVectorStore.from_documents(documents, embedding_function, index_name=pinecone_index_name)


# Connect to vector store
Now that vector store has been created, we can easily access it (we don't have to rebuild it each time; it's stored in the cloud)!

In [4]:
def get_relevant_excerpts(user_question, docsearch):
    """
    This function retrieves the most relevant excerpts based on the user's question.
    Parameters:
    user_question (str): The question asked by the user.
    docsearch (PineconeVectorStore): The Pinecone vector store containing the presidential speeches.
    Returns:
    str: A string containing the most relevant excerpts from presidential speeches.
    """

    # Perform a similarity search on the Pinecone vector store using the user's question
    relevent_docs = docsearch.similarity_search(user_question)

    # Extract the page content from the top 3 most relevant documents and join them into a single string
    relevant_excerpts = '\n\n------------------------------------------------------\n\n'.join([doc.page_content for doc in relevent_docs[:3]])

    return relevant_excerpts

In [5]:
def chatbot_completion(client, model, user_question, relevant_excerpts):
    """
    This function generates a response to the user's question using a pre-trained model.
    Parameters:
    client (Groq): The Groq client used to interact with the pre-trained model.
    model (str): The name of the pre-trained model.
    user_question (str): The question asked by the user.
    relevant_excerpts (str): A string containing the most relevant excerpts.
    Returns:
    str: A string containing the response to the user's question.
    """

    # Define the system prompt
    system_prompt = '''
    You are the Taylor Swift Expert Chatbot. Your job is to answer questions about Taylor Swift's albums and songs.
    Given the user's question and relevant content from the knowledge base of articles, answer the question accurately. 
    '''

    # Generate a response to the user's question using the pre-trained model
    chat_completion = client.chat.completions.create(
        messages = [
            {
                "role": "system",
                "content":  system_prompt
            },
            {
                "role": "user",
                "content": "User Question: " + user_question + "\n\nRelevant Content:\n\n" + relevant_excerpts,
            }
        ],
        model = model
    )

    # Extract the response from the chat completion
    response = chat_completion.choices[0].message.content

    return response

In [6]:
model = 'llama3-8b-8192'

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Initialize the Groq client
groq_api_key = os.getenv('GROQ_API_KEY')
pinecone_api_key=os.getenv('PINECONE_API_KEY')
pinecone_index_name = "presidential-speeches"
client = Groq(
    api_key=groq_api_key
)

pc = Pinecone(api_key = pinecone_api_key)
pinecone_index_name = "ai-bot"  
docsearch = PineconeVectorStore(index_name=pinecone_index_name, embedding=embedding_function)





## Get relevant docs by similarity search

In [8]:
while True:
    # Get the user's question
    user_question = input("Ask a question about Taylor swift or enter 'q' to quit: ")
    if user_question == 'q':
        break
    if user_question:
        relevant_excerpts = get_relevant_excerpts(user_question, docsearch)
        response = chatbot_completion(client, model, user_question, relevant_excerpts)
        print(response)

Taylor Swift is an American singer-songwriter! She is a renowned artist known for her incredibly successful music career, outstanding songwriting skills, and memorable lyrics. With over a decade of experience in the music industry, Taylor Swift has garnered a massive following, referred to as the "Swifties," who deeply admire and appreciate her work.

Throughout her career, Taylor Swift has consistently broken records, experimented with different genres, and pushed boundaries in the music industry. Her impact goes beyond music, as she has inspired creators from various fields, such as authors, film directors, and screenwriters, demonstrating her broad appeal and universal influence.

Taylor Swift has redefined the artist-fan relationship by foster a deep connection with her Swifties. Her commitment to engaging with her fans, offering support, and prioritizing their creativity has led to an unprecedented level of devotion and loyalty.

That's Taylor Swift in a nutshell!
Based on the con