In [42]:
import requests
import gutenbergpy.textget
import os
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

# Data Collection

In [43]:
def fetch_gutenberg_text(book_id, save_dir="gutenberg_data"):
    os.makedirs(save_dir, exist_ok=True)
    text = gutenbergpy.textget.get_text_by_id(book_id)
    if text:
        with open(os.path.join(save_dir, f"gutenberg_{book_id}.txt"), "w", encoding="utf-8") as f:
            f.write(text.decode("utf-8"))
        print(f"Saved Gutenberg text: {book_id}")
    else:
        print(f"Gutenberg book {book_id} not found.")

gutenberg_books = [14994, 19789]  # Example book IDs: Bulfinch's Mythology, The Library of Apollodorus
for book_id in gutenberg_books:
    fetch_gutenberg_text(book_id)

Saved Gutenberg text: 14994
Saved Gutenberg text: 19789


In [44]:
def fetch_all_urls(start_url, save_dir="theoi_urls"):
    os.makedirs(save_dir, exist_ok=True)
    
    # Get the main page content
    response = requests.get(start_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract all URLs from <a> tags
        all_urls = set()
        for a_tag in soup.find_all("a", href=True):
            href = a_tag.get("href")
            # Join relative URLs with the base URL
            full_url = urljoin(start_url, href)
            if full_url.startswith("https://www.theoi.com"):
                all_urls.add(full_url)
        
        # Save all URLs to a text file
        file_name = os.path.join(save_dir, "all_urls.txt")
        with open(file_name, "w", encoding="utf-8") as f:
            for url in all_urls:
                f.write(url + "\n")
        
        print(f"Saved {len(all_urls)} URLs from: {start_url}")
    else:
        print(f"Failed to fetch data from: {start_url}")

# Example usage
start_url = "https://www.theoi.com/"  # Starting URL to get links from
fetch_all_urls(start_url)


Saved 162 URLs from: https://www.theoi.com/


In [None]:
import requests
from bs4 import BeautifulSoup
import os

# Set the path to the file containing the list of URLs
urls_file_path = os.path.join('theoi_urls', 'all_urls.txt')

# Function to fetch and parse the content of a URL
def fetch_and_parse_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for any HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to process the URLs and save the content to text files
def process_urls_from_file(file_path):
    # Open and read the file containing URLs
    try:
        with open(file_path, 'r') as file:
            urls = file.readlines()

        # Clean up the URLs (strip any extra whitespace or newline characters)
        urls = [url.strip() for url in urls]

        # Create a folder to store the text files if it doesn't already exist
        if not os.path.exists('page_contents'):
            os.makedirs('page_contents')

        # Loop through the URLs and fetch the content
        for url in urls:
            print(f"Processing URL: {url}")
            soup = fetch_and_parse_url(url)
            if soup:
                # Extract the title of the page to use as the filename
                title = soup.title.string if soup.title else "no_title"
                filename = title.replace(" ", "_").replace("/", "_") + ".txt"
                filepath = os.path.join('theoi_data', filename)

                # Save the page content to a text file
                with open(filepath, 'w', encoding='utf-8') as f:
                    # Example: Save the whole HTML content of the page to the text file
                    f.write(soup.prettify())

                print(f"Page content saved to {filepath}")
            print('-' * 40)

    except FileNotFoundError:
        print(f"The file {file_path} was not found.")

# Run the process for the given file
process_urls_from_file(urls_file_path)


Processing URL: https://www.theoi.com/Heros/Midas.html
Page content saved to theoi_data/MIDAS_-_Phrygian_King_of_Greek_Mythology.txt
----------------------------------------
Processing URL: https://www.theoi.com/Khthonios/Persephone.html
Page content saved to theoi_data/PERSEPHONE_-_Greek_Goddess_of_Spring,_Queen_of_the_Underworld_(Roman_Proserpina).txt
----------------------------------------
Processing URL: https://www.theoi.com/gallery-mosaics-1.html
Page content saved to theoi_data/Greco-Roman_Mosaics_Gallery_1.txt
----------------------------------------
Processing URL: https://www.theoi.com/Olympios/Aphrodite.html
Page content saved to theoi_data/APHRODITE_-_Greek_Goddess_of_Love_&_Beauty_(Roman_Venus).txt
----------------------------------------
Processing URL: https://www.theoi.com/Heros/Lykaon.html
Page content saved to theoi_data/LYCAON_(Lykaon)_-_Arcadian_King_of_Greek_Mythology.txt
----------------------------------------
Processing URL: https://www.theoi.com/Titan/Hekatonk

In [None]:
import torch
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(use_bm25=True)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
from haystack.pipelines.standard_pipelines import TextIndexingPipeline

doc_dir = "theoi_data"

files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rachel/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Converting files: 100%|██████████| 148/148 [00:02<00:00, 58.16it/s]
Preprocessing:   0%|          | 0/148 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Document c225f9fc53d79be17c0931337aacf2e0 is 11347 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document 6bee08f69e9e6aa54634a2148c81956b is 18532 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing:  15%|█▍        | 22/148 [00:00<00:

{'documents': [<Document: {'content': '<!DOCTYPE html>\n<html lang="en">\n<!-- InstanceBegin template="/Templates/Entry_Olympian.dwt" codeOutsideHTMLIsLocked="false" -->\n<head>\n<meta charset="utf-8"/>\n<meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n<meta content="width=device-width, initial-scale=1" name="viewport"/>\n<!-- InstanceBeginEditable name="doctitle" -->\n<title>\nDEMETER - Greek Goddess of Grain &amp; Agriculture (Roman Ceres)\n</title>\n<meta content="Demeter was the ancient Greek goddess of agriculture, grain and bread who sustained mankind with the earth\'s rich bounty. She was depicted as a mature woman, often wearing a crown and bearing sheafs of wheat or a cornucopia (horn of plenty), and a torch. Her Roman name was Ceres."name="description"/>\n<meta content="Demeter, Olympian goddess, Greek mythology, Greek goddess of agriculture, Greek goddess of grain, Greek goddess of farming, Greek goddess of bread, Demeter pictures, Ceres" name="keywords"/>\n<meta cont

# Initializaing Retriever

In [None]:
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store=document_store)

# Initializing Reader

In [None]:
from haystack.nodes import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# Creating the Retriever-Reader Pipeline

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

pipe = ExtractiveQAPipeline(reader, retriever)

# Asking a Question

In [17]:
import difflib
from pprint import pprint

In [18]:
correct_answers = {
    "Who was the king of the Greek gods?": "Zeus",
    "What was the name of the Greek goddess of wisdom, courage, and warfare?": "Athena",
    "Which hero performed the Twelve Labors?": "Heracles",
    "Who was the Greek god of the sea?": "Poseidon",
    "What is the name of the monster with the body of a lion, the head of a goat, and a tail that ends in a snake’s head?": "Chimera",
    "Who was the Greek god of the underworld?": "Hades",
    "Which mortal hero was known for his journey to the Underworld to rescue his wife, Eurydice?": "Orpheus",
    "What is the name of the winged horse born from the blood of Medusa when she was slain by Perseus?": "Pegasus",
    "Who was the goddess of love and beauty in Greek mythology?": "Aphrodite",
    "Who was turned into a spider after challenging Athena to a weaving contest?": "Arachne"
}

# Run the pipeline for each question
questions = list(correct_answers.keys())

In [19]:
predictions = {}
for question in questions:
    prediction = pipe.run(query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
    predictions[question] = prediction

Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.22s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.08 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.60s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.17s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.34s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.43s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.28s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.05s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.35s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.19s/ Batches]


In [20]:
# Function to calculate string similarity
def get_similarity(a, b):
    return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()

In [37]:
# Compare predictions with the correct answers
similarity_scores = {}
for question, prediction in predictions.items():
    predicted_answer = prediction['answers'][0].answer
    correct_answer = correct_answers[question]
    
    similarity = get_similarity(predicted_answer, correct_answer)
    similarity_scores[question] = {
        'predicted_answer': predicted_answer,
        'correct_answer': correct_answer,
        'similarity_score': similarity
    }

# Print out the results
pprint(similarity_scores)

{'What is the name of the monster with the body of a lion, the head of a goat, and a tail that ends in a snake’s head?': {'correct_answer': 'Chimera',
                                                                                                                          'predicted_answer': 'Chimera',
                                                                                                                          'similarity_score': 1.0},
 'What is the name of the winged horse born from the blood of Medusa when she was slain by Perseus?': {'correct_answer': 'Pegasus',
                                                                                                       'predicted_answer': 'Pegasos',
                                                                                                       'similarity_score': 0.8571428571428571},
 'What was the name of the Greek goddess of wisdom, courage, and warfare?': {'correct_answer': 'Athena',
                                 