In [7]:
import re
import requests
from bs4 import BeautifulSoup 
import pickle
import json
from tqdm import tqdm

In [23]:
scraped = {}
pages = [
    "https://starwars.fandom.com/wiki/N-1_starfighter", 
    "https://starwars.fandom.com/wiki/Ahsoka_Tano", 
    "https://starwars.fandom.com/wiki/Din_Djarin"]

last_number = 0
for page_url in pages:
    try:
        
        # Get page
        result = requests.get(page_url)
        content = result.content
        soup = BeautifulSoup(content, "html.parser")

        # Get title
        heading = soup.find('h1', id='firstHeading')
        if heading is None: continue
        heading = heading.text

        # Extract Sidebar
        is_character = False
        side_bar = {}
        sec = soup.find_all('section', class_='pi-item')
        for s in sec:
            title = s.find('h2')
            if title is None:
                title = '<no category>'
            else:
                title = title.text
            side_bar[title] = {}
            items = s.find_all('div', class_='pi-item')
            for item in items:
                attr = item.find('h3', class_='pi-data-label')
                if attr is None:
                    attr = '<no attribute>'
                else:
                    attr = attr.text
                if attr == 'Species': is_character = True
                value = re.sub("[\(\[].*?[\)\]]" ,'', '], '.join(item.find('div', class_='pi-data-value').text.split(']')))
                value = value.strip()[:-1].replace(',,', ',')
                if ',' in value:
                    value = [i.strip() for i in value.split(',') if i.strip() != '']
                side_bar[title][attr] = value

        # Raw page content
        raw_content = soup.find('div', class_='mw-parser-output')
        if raw_content is not None:
            content_pgs = []
            for raw_paragraph in raw_content.find_all('p', recursive=False):
                if 'aside' in str(raw_paragraph): continue
                content_pgs.append(re.sub("[\(\[].*?[\)\]]" ,'', raw_paragraph.text) )
            # paragraph = value = re.sub("[\(\[].*?[\)\]]" ,'', raw_paragraph.text)


        else:
            # Empty page
            paragraph = ''

        # Data object
        scraped[page_url] = {
            'url': page_url,
            'title': heading,
            'is_character': is_character,
            'side_bar': side_bar,
            'paragraph': content_pgs
        }

    except:
        print(f'Failed! {page_url}')


# Save final part to disk
fn =  './starwars_small_canon_data.pickle'
with open(fn, 'wb') as f:
    pickle.dump(scraped, f, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
## Let's do a quick test to make sure it worked we. Even if the data is big
## we can chunk it up with the above code and load it in sections.

from pathlib import Path

bookFilePath = "starwars_*_canon_data*.pickle"
files = sorted(Path('.').glob(bookFilePath))
for fn in files:
  with open(fn,'rb') as f:
      part = pickle.load(f)
      for key, value in part.items():
          title = value['title'].strip()
          print(title)

N-1 starfighter
Ahsoka Tano
Din Djarin


In [4]:
from langchain.embeddings import HuggingFaceEmbeddings

def setup_embeddings():
    # Huggingface embedding setup
    print(">> Prep. Huggingface embedding setup")
    model_name = "sentence-transformers/msmarco-roberta-base-v2"
    return HuggingFaceEmbeddings(model_name=model_name)

hf = setup_embeddings()

>> Prep. Huggingface embedding setup


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
##example: es_url = f"http://******:**********@localhost:9200/"

es_url = f"http://{username}:{password}@{localhost}:{port}/"

In [5]:
from langchain.vectorstores import ElasticVectorSearch

index_name = "book_wookieepedia_small1"

db = ElasticVectorSearch(embedding=hf, elasticsearch_url=es_url, index_name=index_name)




In [27]:
from pathlib import Path

count = 0
bookFilePath = "starwars_*_canon_data*.pickle"
files = sorted(Path('.').glob(bookFilePath))
print(files)
batchtext = []
for fn in files:
    print(f"Starting book: {fn}")
    with open(fn,'rb') as f:
        part = pickle.load(f)
        
        for ix, (key, value) in tqdm(enumerate(part.items()), total=len(part)):
            paragraphs = value['paragraph']
            for px, p in enumerate(paragraphs):
                # print(f"{ix} {px} {title}")
                batchtext.append(p)
                count = count + 1
        
print("")
print(len(batchtext))
db.from_texts(batchtext, embedding=hf, elasticsearch_url=es_url, index_name=index_name)

[WindowsPath('starwars_small_canon_data.pickle')]
Starting book: starwars_small_canon_data.pickle


100%|██████████| 3/3 [00:00<?, ?it/s]


627





<langchain.vectorstores.elastic_vector_search.ElasticVectorSearch at 0x226641e9dd0>

In [28]:
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

topic = "Star Wars"
index_name = "book_wookieepedia_small1"

cache_dir = "./cache"

def getFlanLarge():
    model_id = 'google/flan-t5-large'
    print(f">> Prep. Get {model_id} ready to go")
    tokenizer = AutoTokenizer.from_pretrained(model_id) 
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_id, cache_dir=cache_dir) 
    
    pipe = pipeline(
        "text2text-generation",
        model=model, 
        tokenizer=tokenizer, 
        max_length=100
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm

def make_the_llm():
    template_informed = """
    I am a helpful AI that answers questions. When I don't know the answer I say I don't know. 
    I know context: {context}
    when asked: {question}
    my response using only information in the context is: """

    prompt_informed = PromptTemplate(template=template_informed, input_variables=["context", "question"])

    llm = getFlanLarge()

    return LLMChain(prompt=prompt_informed, llm=llm)

llm_chain_informed= make_the_llm()




>> Prep. Get google/flan-t5-large ready to go


In [29]:

## how to ask a question
def ask_a_question(question):
    similar_docs = db.similarity_search(question)
    print(similar_docs)
    print(f'The most relevant passage: \n\t{similar_docs[0].page_content}')

    ## Ask Local LLM context informed prompt
    informed_context= similar_docs[0].page_content
    informed_response = llm_chain_informed.run(context=informed_context,question=question)
    
    return informed_response



print(f'I am a trivia chat bot, ask me any question about {topic}')

while True:
    question = input("User Question >> ")
    response= ask_a_question(question)
    print(f"\tAnswer  : {response}")


I am a trivia chat bot, ask me any question about Star Wars
[Document(page_content='The N-1 starfighter first appeared in the 1999 film Star Wars: Episode I The Phantom Menace, the first installment of the Star Wars prequel trilogy. A few practical models of the ship were built for the prequel trilogy. N-1 starfighters were later added to Star Wars: Episode VI Return of the Jedi as part of the celebration on Naboo in the ending montage in the DVD release.\n', metadata={}), Document(page_content='The Mandalorian bounty hunter Din Djarin acquired a modified N-1 starfighter following the destruction of his previous starship, the Razor Crest. Djarin and the engineer Peli Motto constructed the ship at Hangar 3-5 in Mos Eisley, Tatooine.\n', metadata={}), Document(page_content='In 35 ABY, N-1 starfighters were part of a fleet assembled by Lando Calrissian to assist the Resistance during the battle against the Sith Eternal forces over the planet Exegol.\n', metadata={}), Document(page_content