In [7]:

import gutenbergpy.textget
from urllib.parse import urljoin

import requests
import os
import asyncio
import aiohttp
import aiofiles
from bs4 import BeautifulSoup
from pathlib import Path

import torch
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline

import difflib
from pprint import pprint
import json

# Data Collection

In [None]:
def fetch_gutenberg_text(book_id, save_dir="gutenberg_data"):
    os.makedirs(save_dir, exist_ok=True)
    text = gutenbergpy.textget.get_text_by_id(book_id)
    if text:
        with open(os.path.join(save_dir, f"gutenberg_{book_id}.txt"), "w", encoding="utf-8") as f:
            f.write(text.decode("utf-8"))
        print(f"Saved Gutenberg text: {book_id}")
    else:
        print(f"Gutenberg book {book_id} not found.")

gutenberg_books = [14994, 19789]  # Example book IDs: Bulfinch's Mythology, The Library of Apollodorus
for book_id in gutenberg_books:
    fetch_gutenberg_text(book_id)

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_urls_from_page(url, session, all_urls, urls_to_process):
    try:
        response = session.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            
            # Extract all URLs from <a> tags on the current page
            for a_tag in soup.find_all("a", href=True):
                href = a_tag.get("href")
                full_url = urljoin(url, href)
                if full_url.startswith("https://www.theoi.com") and full_url not in all_urls:
                    all_urls.add(full_url)
                    urls_to_process.append(full_url)
            
            # Find all "More" links
            more_links = soup.find_all("a", string="More")
            for more_link in more_links:
                more_url = urljoin(url, more_link.get("href"))
                if more_url not in all_urls:
                    all_urls.add(more_url)
                    urls_to_process.append(more_url)
                    print(f"Following 'More' link: {more_url}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")

def fetch_all_urls(start_url, save_dir="theoi_urls", max_workers=10):
    os.makedirs(save_dir, exist_ok=True)
    
    all_urls = set()  # Store all unique URLs
    urls_to_process = [start_url]  # Queue of URLs to process
    
    with requests.Session() as session:
        # Create a ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            
            # Loop through the URLs to process
            while urls_to_process:
                # Fetch a new batch of URLs to process
                batch_urls = urls_to_process[:min(max_workers, len(urls_to_process))]
                urls_to_process = urls_to_process[min(max_workers, len(urls_to_process)):]
                
                for url in batch_urls:
                    futures.append(executor.submit(fetch_urls_from_page, url, session, all_urls, urls_to_process))
                
                # Wait for all tasks to complete
                for future in as_completed(futures):
                    future.result()  # Retrieve result and check for exceptions
                
                futures = []  # Reset futures after processing the batch
                
    # Save all URLs to a text file
    file_name = os.path.join(save_dir, "all_urls.txt")
    with open(file_name, "w", encoding="utf-8") as f:
        for url in all_urls:
            f.write(url + "\n")
    
    print(f"Saved {len(all_urls)} URLs.")

# Example usage
start_url = "https://www.theoi.com/"  # Starting URL to get links from
fetch_all_urls(start_url)

# Data Preparation

In [9]:
urls_file_path = os.path.join('theoi_urls', 'all_urls.txt')

with open(urls_file_path, 'r') as file:
    urls = file.readlines()
    print(urls)
    print(f"Total URLs: {len(urls)}")

['https://www.theoi.com/Georgikos/Ariadne.html#Corona\n', 'https://www.theoi.com/articles/top-10-greek-heroes-in-mythology/greek-mythology/bestiary.html\n', 'https://www.theoi.com/Olympios/ArtemisFavour.html#Klinis\n', 'https://www.theoi.com/Text/ValeriusFlaccus8.html#11\n', 'https://www.theoi.com/Text/OvidFasti5.html#36\n', 'https://www.theoi.com/Text/PlutarchTheseus.html#n41\n', 'https://www.theoi.com/Olympios/HephaistosMyths.html#Skamandros\n', 'https://www.theoi.com/Gallery/T24.2.html\n', 'https://www.theoi.com/Text/ApEc.html#234\n', 'https://www.theoi.com/Text/Parthenius.html#n26\n', 'https://www.theoi.com/Titan/AsterEosphoros.html#Encyclopedia\n', 'https://www.theoi.com/Text/StatiusThebaid5.html#52\n', 'https://www.theoi.com/Text/SenecaAgamemnon.html#100\n', 'https://www.theoi.com/articles/who-was-hercules-mother-and-was-she-a-god/greek-mythology/bestiary.html\n', 'https://www.theoi.com/Text/Ap3b.html#172\n', 'https://www.theoi.com/Cult/DionysosCult.html#Messenia\n', 'https://www

In [60]:
# Directory to save scraped text
theoi_data_dir = "theoi_data"
os.makedirs(theoi_data_dir, exist_ok=True)  # Ensure directory exists

# Async function to fetch and parse a URL
async def fetch_and_parse_url(session, url):
    try:
        async with session.get(url) as response:
            response.raise_for_status()  # Raise error if response is not 200
            html_content = await response.text()
            soup = BeautifulSoup(html_content, "html.parser")

            # Try extracting <main> tag first
            main_content = soup.find("main") or soup.find(id="main") or soup.find("html")
            
            if main_content:
                title = soup.title.string if soup.title else "no_title"
                print(f"✅ Processed: {url} ({'Main found' if main_content.name == 'main' else 'Using full HTML'})")
                return title, main_content.get_text(separator="\n", strip=True)
            else:
                print(f"⚠️ No extractable content found for {url}")
                return None, None
    except Exception as e:
        print(f"❌ Error fetching {url}: {e}")
        return None, None

# Async function to save text files
async def save_text_file(title, text_content):
    filename = title.replace(" ", "_").replace("/", "_") + ".txt"
    filepath = os.path.join(theoi_data_dir, filename)

    try:
        if text_content.strip():
            async with aiofiles.open(filepath, "w", encoding="utf-8") as f:
                await f.write(text_content)
            print(f"✅ Saved: {filepath}")
            
            # Verify if file exists
            if os.path.exists(filepath):
                print(f"✅ File confirmed: {filepath}")
            else:
                print(f"❌ Failed to confirm file save: {filepath}")
        else:
            print(f"⚠️ No content to save for {title}.")
    except Exception as e:
        print(f"❌ Error saving {filename}: {e}")


# Async function to process URLs
async def process_urls(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_and_parse_url(session, url) for url in urls]
        results = await asyncio.gather(*tasks)

        # Save files concurrently
        save_tasks = [save_text_file(title, text) for title, text in results if title and text]
        saved_files = await asyncio.gather(*save_tasks)
        
        saved_count = sum(saved_files)
        failed_count = len(urls) - saved_count
        
        print(f"\n📊 Summary: {saved_count} saved, {failed_count} not saved.")

# Function to load URLs from file and run the scraper
def run_async_scraper(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            urls = [url.strip() for url in file.readlines()]

        if not urls:
            print("⚠️ No URLs found in the file.")
            return

        # Check if an event loop is already running (Jupyter issue)
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop and loop.is_running():
            print("⚠️ Running inside an active event loop. Using create_task instead.")
            task = asyncio.create_task(process_urls(urls))  # Non-blocking
            return task  # This will allow the Jupyter event loop to handle the task
        else:
            asyncio.run(process_urls(urls))  # Normal execution in standalone scripts
    
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")

# Run the scraper
run_async_scraper(urls_file_path)

⚠️ Running inside an active event loop. Using create_task instead.


<Task pending name='Task-213857' coro=<process_urls() running at /var/folders/wm/x1bxnlw9385b04dkk5my3x8m0000gn/T/ipykernel_1999/4197183328.py:50>>

✅ Processed: https://www.theoi.com/Text/Parthenius.html#n26 (Main found)
✅ Processed: https://www.theoi.com/Nymphe/NympheAnippe.html (Main found)
❌ Error fetching https://www.theoi.com/articles/troy-greece-was-troy-greek/greek-mythology/nymphs.html: 404, message='Not Found', url='https://www.theoi.com/articles/troy-greece-was-troy-greek/greek-mythology/nymphs.html'
✅ Processed: https://www.theoi.com/Text/LycophronAlexandra2.html#b2 (Main found)
✅ Processed: https://www.theoi.com/Text/OvidHeroides5.html#s3 (Main found)
✅ Processed: https://www.theoi.com/Text/PatternPoems.html#a25 (Main found)
✅ Processed: https://www.theoi.com/Ouranios/Hestia.html#Sources (Main found)
✅ Processed: https://www.theoi.com/Gallery/Z26.1B.html (Main found)
✅ Processed: https://www.theoi.com/Text/PlutarchParallelStories.html#15 (Main found)
✅ Processed: https://www.theoi.com/Text/NonnusDionysiaca6.html#25 (Main found)
✅ Processed: https://www.theoi.com/Text/AeschylusFragments2.html#Phryges (Main found)
✅ Proc

In [66]:
def count_files(directory):
    return len(list(Path(directory).glob("*")))

directory_path = Path("theoi_data")
print(f"Number of files: {count_files(directory_path)}")

Number of files: 1383


# Data Storage

In [43]:
document_store = InMemoryDocumentStore(use_bm25=True)

In [44]:
doc_dirs = ["theoi_data1", "gutenberg_data"]

files_to_index = []
for doc_dir in doc_dirs:
    files_to_index.extend([doc_dir + "/" + f for f in os.listdir(doc_dir)])

indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

Converting files: 100%|██████████| 1996/1996 [00:07<00:00, 277.25it/s]
Preprocessing:   3%|▎         | 52/1996 [00:01<00:44, 43.88docs/s]We found one or more sentences whose split count is higher than the split length.
Preprocessing: 100%|██████████| 1996/1996 [00:43<00:00, 46.17docs/s]
Updating BM25 representation...: 100%|██████████| 28513/28513 [00:01<00:00, 20478.27 docs/s]


{'documents': [<Document: {'content': "Greek Mythology\n>>\nGreek Gods\n>>\nSea Gods\n>> Ichthyocentaurs (Ikhthyokentauroi)\nIKHTHYOKENTAUROI\nGreek Name\nΙχθυοκενταυρος\nΙχθυοκενταυροι\nTransliteration\nIkhthyokentauros\nIkhthyokentauroi\nEnglish Spelling\nIchthyocentaur\nIchthyocentaurs\nTranslation\nFish-Centaur\n(\nikhthys, kentauros\n)\nAphrodite and the fish-centaurs Aphros and Bythos, Greco-Roman mosaic from Zeugma C1st-2nd A.D., Gaziantep Museum of Archaeology\nTHE IKHTHYOKENTAUROI (Ichthyocentaurs) were a pair of centaurine sea-gods with the upper bodies of men, the lower fore-quarters of horses, and the serpentine tails of fish. Their brows were crowned with a pair of lobster-claw horns.\nThe fish-centaurs were named Bythos (Sea-Depths) and Aphros (Sea-Foam). They were brothers of the wise kentauros Kheiron (centaur Chiron) and like him were perhaps regarded as wise teachers.\nThe sea-centaurs were probably derived from the\nFish-Deities\nof Syrian mythology which carried Ash

In [45]:
docs = document_store.get_all_documents()
with open("documents1.json", "w") as f:
    json.dump([doc.to_dict() for doc in docs], f)

# Initializaing Retriever

In [46]:
retriever = BM25Retriever(document_store=document_store)

# Initializing Reader

In [47]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# Fine Tuning Reader

In [78]:
fine_tunded_dir = "fine_tuned_reader"

In [79]:
reader_fine_tunned = FARMReader(model_name_or_path=fine_tunded_dir, use_gpu=True)

Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: {"training": false, "num_labels": 2, "ph_output_type": "per_token_squad", "model_type": "span_classification", "label_tensor_name": "question_answering_label_ids", "label_list": ["start_token", "end_token"], "metric": "squad", "name": "QuestionAnsweringHead"}


In [None]:
training_dir = './training'
training_json = 'train_greek_mythology.json'

reader.train(
    data_dir=training_dir,  
    train_filename=training_json,
    use_gpu=True,
    n_epochs=3,
    batch_size=8,
    save_dir="fine_tuned_reader2"
)

Preprocessing dataset: 100%|██████████| 2/2 [00:00<00:00,  8.62 Dicts/s]
  self.scaler = torch.cuda.amp.GradScaler(enabled=self.use_amp)
  with torch.cuda.amp.autocast(enabled=self.use_amp):
Train epoch 0/2 (Cur. train loss: 0.0019):  41%|████      | 51/125 [4:02:38<8:01:18, 390.25s/it] 

In [80]:
pipe_fine_tuned = ExtractiveQAPipeline(reader_fine_tunned, retriever)

# Creating the Retriever-Reader Pipeline

In [48]:
pipe = ExtractiveQAPipeline(reader, retriever)

# Asking a Question

In [13]:
correct_answers = {
    "Who was the king of the Greek gods?": "Zeus",
    "What was the name of the Greek goddess of wisdom, courage, and warfare?": "Athena",
    "Which hero performed the Twelve Labors?": "Heracles",
    "Who was the Greek god of the sea?": "Poseidon",
    "What is the name of the monster with the body of a lion, the head of a goat, and a tail that ends in a snake’s head?": "Chimera",
    "Who was the Greek god of the underworld?": "Hades",
    "Which mortal hero was known for his journey to the Underworld to rescue his wife, Eurydice?": "Orpheus",
    "What is the name of the winged horse born from the blood of Medusa when she was slain by Perseus?": "Pegasus",
    "Who was the goddess of love and beauty in Greek mythology?": "Aphrodite",
    "Who was turned into a spider after challenging Athena to a weaving contest?": "Arachne",
    "What was the name of the three-headed dog that guarded the underworld?": "Cerberus",
    "Who was the Greek god of war?": "Ares",
    "Who was the mother of Achilles?": "Thetis",
    "Which Titan was punished to hold up the sky for eternity?": "Atlas",
    "Who was the Greek god of wine and revelry?": "Dionysus",
    "What was the name of the river that separated the world of the living from the underworld?": "Styx",
    "Who was the father of Icarus, the boy who flew too close to the sun?": "Daedalus",
    "Who was the goddess of the harvest and agriculture?": "Demeter",
    "Which Greek hero was known for his cunning and played a key role in the Trojan War?": "Odysseus",
    "What is the name of the Greek goddess of the hunt and the moon?": "Artemis"
}

# Run the pipeline for each question
questions = list(correct_answers.keys())

In [49]:
predictions = {}
for question in questions:
    prediction = pipe.run(query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
    predictions[question] = prediction

Inferencing Samples: 100%|██████████| 1/1 [00:03<00:00,  3.83s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.14 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.58s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.57s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.76s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.23 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.04 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.09s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.50s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.16 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.07 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.02s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.16 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00

In [15]:
# Function to calculate string similarity
def get_similarity(a, b):
    return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()

In [None]:
# Compare predictions with the correct answers
similarity_scores = {}
total_similarity = 0
num_questions = len(predictions)

for question, prediction in predictions.items():
    predicted_answer = prediction['answers'][0].answer
    correct_answer = correct_answers[question]
    
    similarity = get_similarity(predicted_answer, correct_answer)
    similarity_scores[question] = {
        'predicted_answer': predicted_answer,
        'correct_answer': correct_answer,
        'similarity_score': similarity
    }
    
    total_similarity += similarity

# Compute average similarity score
average_similarity = total_similarity / num_questions if num_questions > 0 else 0

# Print out the results
pprint(similarity_scores)
print(f"\n🔹 Average Similarity Score: {average_similarity:.4f}")

{'What is the name of the Greek goddess of the hunt and the moon?': {'correct_answer': 'Artemis',
                                                                     'predicted_answer': 'Selene',
                                                                     'similarity_score': 0.15384615384615385},
 'What is the name of the monster with the body of a lion, the head of a goat, and a tail that ends in a snake’s head?': {'correct_answer': 'Chimera',
                                                                                                                          'predicted_answer': 'Chimera',
                                                                                                                          'similarity_score': 1.0},
 'What is the name of the winged horse born from the blood of Medusa when she was slain by Perseus?': {'correct_answer': 'Pegasus',
                                                                                                       'pred

In [50]:
# Compare predictions with the correct answers
similarity_scores = {}
total_similarity = 0
num_questions = len(predictions)

for question, prediction in predictions.items():
    predicted_answer = prediction['answers'][0].answer
    correct_answer = correct_answers[question]
    
    similarity = get_similarity(predicted_answer, correct_answer)
    similarity_scores[question] = {
        'predicted_answer': predicted_answer,
        'correct_answer': correct_answer,
        'similarity_score': similarity
    }
    
    total_similarity += similarity

# Compute average similarity score
average_similarity = total_similarity / num_questions if num_questions > 0 else 0

# Print out the results
pprint(similarity_scores)
print(f"\n🔹 Average Similarity Score: {average_similarity:.4f}")

{'What is the name of the Greek goddess of the hunt and the moon?': {'correct_answer': 'Artemis',
                                                                     'predicted_answer': 'Selene',
                                                                     'similarity_score': 0.15384615384615385},
 'What is the name of the monster with the body of a lion, the head of a goat, and a tail that ends in a snake’s head?': {'correct_answer': 'Chimera',
                                                                                                                          'predicted_answer': 'Chimera',
                                                                                                                          'similarity_score': 1.0},
 'What is the name of the winged horse born from the blood of Medusa when she was slain by Perseus?': {'correct_answer': 'Pegasus',
                                                                                                       'pred