In [2]:
import requests
import time
import json
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from transformers import pipeline
import sentencepiece

In [None]:
# Initialize the question generation pipeline
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

In [35]:
# Stack Exchange API
def fetch_stackexchange_qa(site='space', tagged='rocket', pages=2):
    questions = []
    for page in range(1, pages+1):
        url = f"https://api.stackexchange.com/2.3/questions?page={page}&pagesize=20&order=desc&sort=activity&tagged={tagged}&site={site}&filter=withbody"
        resp = requests.get(url).json()
        for item in resp.get('items', []):
            q = {
                'question': item['title'],
                'body': item.get('body', ''),
                'answers': []
            }
            if item.get('is_answered'):
                a_url = f"https://api.stackexchange.com/2.3/questions/{item['question_id']}/answers?order=desc&sort=votes&site={site}&filter=withbody"
                a_resp = requests.get(a_url).json()
                for answer in a_resp.get('items', []):
                    q['answers'].append(answer['body'])
            questions.append(q)
        time.sleep(1)  # Respect rate limits
    return questions

 
# arXiv API Q&A extraction from abstract (pseudo-QA from title and abstract)
def fetch_arxiv_abstracts(query='rocket propulsion', max_results=10):
    url = f"http://export.arxiv.org/api/query?search_query=all:{query.replace(' ', '+')}&start=0&max_results={max_results}"
    resp = requests.get(url)
    root = ET.fromstring(resp.content)

    qas = []
    ns = {'atom': 'http://www.w3.org/2005/Atom'}  # arXiv uses Atom XML namespace

    for entry in root.findall('atom:entry', ns):
        title = entry.find('atom:title', ns).text.strip()
        
        summary = entry.find('atom:summary', ns).text.strip()
        # Generate question from text
        input_text = f"generate question: {summary}"
        generated_question = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']
        
        qas.append({
            'title': title,
            'question': generated_question,
            'answer': summary
        })
    return qas


In [26]:
# Wikipedia scraping

# Initialize the question generation pipeline
# qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

def clean_text(text):
    return ' '.join(text.strip().split())

def get_all_rocket_propulsion_links(base_url='https://en.wikibooks.org/wiki/Rocket_Propulsion'):
    """Grab all unique subpage links under Rocket Propulsion."""
    resp = requests.get(base_url)
    soup = BeautifulSoup(resp.text, 'html.parser')

    content_div = soup.select_one('#mw-content-text')
    links = content_div.find_all('a', href=True)
    
    urls = set()
    for link in links:
        href = link['href']
        if 'https' in href:
            full_url = href
        else:
            full_url = 'https://en.wikibooks.org' + href
            # print(f'full_url {full_url}')
        urls.add(full_url)
        # if href.startswith('/wiki/Rocket_Propulsion/') and ':' not in href:
        #     print(f'url: https://en.wikibooks.org + { href}')
        #     urls.add('https://en.wikibooks.org' + href)

    return list(urls)

def scrape_pages_with_qg(urls, visited, M, batch_size=10):
    """Scrape up to `batch_size` new pages not in `visited`, return new QAs."""
    qa_pairs = []
    count = 0
    print(f'visited {visited}')

    for url in urls:
        if url in visited:
            continue
        visited.add(url)
        try:
            resp = requests.get(url)
            soup = BeautifulSoup(resp.text, 'html.parser')
            content_div = soup.select_one('#mw-content-text')
            paragraphs = content_div.find_all('p')
            text = clean_text(' '.join(p.get_text() for p in paragraphs[:3]))
            if not text or len(text) < M:
                continue

            # Generate question from text
            input_text = f"generate question: {text}"
            output = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']

            qa_pairs.append({
                'question': output,
                'answer': text,
                'source_url': url
            })
            print(f"✅ Generated Q&A from: {url}")
            count += 1
            time.sleep(1)  # polite scraping

            if count >= batch_size:
                break
        except Exception as e:
            print(f"⚠️ Failed to process {url}: {e}")
    return qa_pairs

def scrape_wikibook_qas(M):
    all_links = get_all_rocket_propulsion_links()
    visited = set()
    all_qas = []

    for i in range(10):
        print(f"\n🔁 Batch {i+1}/10")
        batch_qas = scrape_pages_with_qg(all_links, visited, M, batch_size=10)
        all_qas.extend(batch_qas)
        if len(all_qas) >= M:
            break

    # with open('wikibook_qas_100.json', 'w', encoding='utf-8') as f:
    #     json.dump(all_qas, f, indent=2)

    print(f"\n✅ Done. Saved {len(all_qas)} Q&A pairs to wikibook_qas_100.json")
    return all_qas

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


full_url https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&veaction=edit&section=1
full_url https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&action=edit&section=1
full_url https://en.wikibooks.org/wiki/File:STS-135_final_flyaround_of_ISS_1.jpg
full_url https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Introduction
full_url https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&veaction=edit&section=2
full_url https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&action=edit&section=2
full_url https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Fundamentals
full_url https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&veaction=edit&section=3
full_url https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&action=edit&section=3
full_url https://en.wikibooks.org/wiki/Space_

Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors


✅ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Methodologies
✅ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Existing_Programs2
✅ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Environment_Ranges
✅ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Existing_Programs
✅ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Guns_and_Accelerators
✅ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Combined_Systems
✅ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Photon_Engines

🔁 Batch 2/10
visited {'https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/System_Elements', 'https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Orbital_Mechanics', 'https://en.wikibooks.org/wik

In [36]:
if __name__ == "__main__":
    # se_data = fetch_stackexchange_qa()
    arxiv_data = fetch_arxiv_abstracts()
    # reddit_data = fetch_reddit_qa()
    # wikibook_data = scrape_wikibook_qas(M=100)

    # all_qas = {
    #     "stackexchange": se_data,
    #     "arxiv": arxiv_data,
    #     "reddit": reddit_data,
    #     "wikibook": wikibook_data
    # }

    # save_qas_to_file(all_qas)
    # print("Data collection complete. Saved to qa_data.json")


In [37]:
arxiv_data

[{'title': 'What fuel for a rocket?',
  'question': 'What is the exit velocity of gases?',
  'answer': 'Elementary concepts from general physics and thermodynamics have been used to\nanalyze rocket propulsion. Making some reasonable assumptions, an expression\nfor the exit velocity of the gases is found. From that expression one can\nconclude what are the desired properties for a rocket fuel.'},
 {'title': 'The Ultimate Limits of the Relativistic Rocket Equation. The Planck\n  Photon Rocket',
  'question': 'What is the maximum velocity for a photon propulsion rocket?',
  'answer': 'In this paper we look at the ultimate limits of a photon propulsion rocket.\nThe maximum velocity for a photon propulsion rocket is just below the speed of\nlight and is a function of the reduced Compton wavelength of the heaviest\nsubatomic particles in the rocket. We are basically combining the relativistic\nrocket equation with Haug\'s new insight on the maximum velocity for anything\nwith rest mass.\n  A