In [2]:
import requests
import time
import json
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import requests
import sentencepiece

In [None]:
# Stack Exchange API
def fetch_stackexchange_qa(site='space', tagged='rocket', pages=2):
    questions = []
    for page in range(1, pages+1):
        url = f"https://api.stackexchange.com/2.3/questions?page={page}&pagesize=20&order=desc&sort=activity&tagged={tagged}&site={site}&filter=withbody"
        resp = requests.get(url).json()
        for item in resp.get('items', []):
            q = {
                'question': item['title'],
                'body': item.get('body', ''),
                'answers': []
            }
            if item.get('is_answered'):
                a_url = f"https://api.stackexchange.com/2.3/questions/{item['question_id']}/answers?order=desc&sort=votes&site={site}&filter=withbody"
                a_resp = requests.get(a_url).json()
                for answer in a_resp.get('items', []):
                    q['answers'].append(answer['body'])
            questions.append(q)
        time.sleep(1)  # Respect rate limits
    return questions

 
# arXiv API Q&A extraction from abstract (pseudo-QA from title and abstract)
def fetch_arxiv_abstracts(query='rocket propulsion', max_results=10):
    url = f"http://export.arxiv.org/api/query?search_query=all:{query.replace(' ', '+')}&start=0&max_results={max_results}"
    resp = requests.get(url)
    root = ET.fromstring(resp.content)

    qas = []
    ns = {'atom': 'http://www.w3.org/2005/Atom'}  # arXiv uses Atom XML namespace

    for entry in root.findall('atom:entry', ns):
        title = entry.find('atom:title', ns).text.strip()
        summary = entry.find('atom:summary', ns).text.strip()
        qas.append({
            'question': title,
            'answer': summary
        })
    return qas

# def fetch_arxiv_abstracts(query='rocket propulsion', max_results=10):
#     url = f"http://export.arxiv.org/api/query?search_query=all:{query.replace(' ', '+')}&start=0&max_results={max_results}"
#     resp = requests.get(url)
#     print(f"resp {resp.content}")
#     soup = BeautifulSoup(resp.content, 'lxml-xml')
#     qas = []
#     for entry in soup.find_all('entry'):
#         qas.append({
#             'question': entry.title.text,
#             'answer': entry.summary.text
#         })
#     return qas

# Reddit (Pushshift API via requests) - subreddit Q&A
import praw

def fetch_reddit_qa_praw(subreddit_name='rocketry', limit=10):
    reddit = praw.Reddit(
        client_id='ct6SQ8LOeOQby1osL4ttbQ',
        client_secret='nZTk52Ls5pq0CLCNzntswcMKrdt1YQ',
        user_agent='MutedMidnight9302',
        username='Shadrack Antwi'
        # password='YOUR_PASSWORD'
    )

    subreddit = reddit.subreddit(subreddit_name)
    qas = []

    for post in subreddit.hot(limit=limit):
        if post.selftext:
            qas.append({
                'question': post.title,
                'answer': post.selftext
            })

    return qas


# def fetch_reddit_qa(subreddit='rocketry', size=10):
#     url = f"https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&size={size}&selftext:not=null"
#     resp = requests.get(url).json()
#     print(f'reddit resp {resp}')
#     qas = []
#     for post in resp.get('data', []):
#         qas.append({
#             'question': post.get('title'),
#             'answer': post.get('selftext')
#         })
#     return qas

# Wikibooks scraping
def scrape_wikibook_qa(url='https://en.wikibooks.org/wiki/Rocket_Propulsion'):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # print(f'wiki soup {soup}')
    content = soup.select('#mw-content-text')[0].get_text()
    # print(f'wiki content {content}')
    sections = content.split('\n\n')
    # print(f'sections {sections[5]}')
    qas = []
    for i in range(len(sections)-1):
        print(f'section {sections[i]}')
        if 'Chapter' in sections[i]:
            title = sections[i].split(' - ')[1]
            print(f'i {i} ==> {title}')
        if sections[i].endswith('?'):
            qas.append({
                'question': sections[i],
                'answer': sections[i+1]
            })
    return qas


In [12]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import time
import json
import sentencepiece

# Initialize the question generation pipeline
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
# qg_pipeline = pipeline("text2text-generation", model="iarfmoose/t5-base-question-generator")

def clean_text(text):
    return ' '.join(text.strip().split())

def scrape_wikibook_qa_with_qg(base_url='https://en.wikibooks.org/wiki/Rocket_Propulsion', max_links=10):
    resp = requests.get(base_url)
    soup = BeautifulSoup(resp.text, 'html.parser')

    content_div = soup.select_one('#mw-content-text')
    links = content_div.find_all('a', href=True)
    # print(f'links {links}')

    qa_pairs = []
    visited = set()

    for link in links:
        href = link['href']
        
        # if not href in visited:#if not href.startswith('/wiki/Rocket_Propulsion/') or href in visited:
        #     continue
        visited.add(href)

        full_url = 'https://en.wikibooks.org' + href
        # print(f'full_url {full_url}')
        try:
            sub_resp = requests.get(full_url)
            # print(f'sub_resp {sub_resp}')
            sub_soup = BeautifulSoup(sub_resp.text, 'html.parser')
            sub_text_div = sub_soup.select_one('#mw-content-text')
            paragraphs = sub_text_div.find_all('p')
            text = clean_text(' '.join(p.get_text() for p in paragraphs[:3]))  # Limit to first few paragraphs
            # print(f'text {text}')
            if not text or len(text) < 100:
                continue

            # Format input for T5-style question generation
            input_text = f"generate question: {text}"
            output = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']

            qa_pairs.append({
                'question': output,
                'answer': text,
                'source_url': full_url
            })
            print(f"Generated Q&A from: {full_url}")
            time.sleep(1)  # Respectful scraping
        except Exception as e:
            print(f"Failed to process {full_url}: {e}")
        
        if len(qa_pairs) >= max_links:
            break

    return qa_pairs

def save_qas_to_file(data, filename='wikibook_qas.json'):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

# Run everything
if __name__ == "__main__":
    qa_data = scrape_wikibook_qa_with_qg()
    save_qas_to_file(qa_data)
    print("Q&A data saved to wikibook_qas.json")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Failed to process https://en.wikibooks.orghttps://en.wikibooks.org/wiki/Seed_Factories: HTTPSConnectionPool(host='en.wikibooks.orghttps', port=443): Max retries exceeded with url: /en.wikibooks.org/wiki/Seed_Factories (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x157dd8250>: Failed to resolve 'en.wikibooks.orghttps' ([Errno 8] nodename nor servname provided, or not known)"))
Generated Q&A from: https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&action=edit&section=1
Generated Q&A from: https://en.wikibooks.org/wiki/File:STS-135_final_flyaround_of_ISS_1.jpg
Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Introduction
Generated Q&A from: https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&action=edit&section=2
Generated Q&A from: https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&action=edit&section=3
Generated Q&A from

Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors


Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Methodologies
Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/System_Elements
Q&A data saved to wikibook_qas.json


In [13]:
qa_data

[{'question': 'What is the name of the book?',
  'answer': '{{}} {{{}}} {{BookCat}} [[Category:]] [[Media:]] --~~~~ <math></math> <includeonly></includeonly> <noinclude></noinclude> <blockquote><ref></ref></blockquote> <!----> {{BOOKNAME}} {{FULLPAGENAME}} {{NAMESPACE}} {{PAGENAME}} {{SUBPAGENAME}} ← → ↑ ↓ ↖ ↗ ↘ ↙ ↔ ⇐ ⇑ ⇒ ⇓ ⇔ ✓ ✗ ♀ ♂ ± − × ÷ ⋅ √ ≠ ≤ ≥ ≡ ∼ ≈ ∞ ∅ § © ® ™ ‿ ‘’ ‹› ‚‘ Ȝ ȝ ʻ',
  'source_url': 'https://en.wikibooks.org/w/index.php?title=Space_Transport_and_Engineering_Methods&action=edit&section=1'},
 {'question': 'What is the name of the file from Wikimedia Commons?',
  'answer': 'Original file (4,256 × 2,832 pixels, file size: 962 KB, MIME type: image/jpeg) This is a file from the Wikimedia Commons. The description on its description page there is shown below. Commons is a freely licensed media file repository. You can help.',
  'source_url': 'https://en.wikibooks.org/wiki/File:STS-135_final_flyaround_of_ISS_1.jpg'},
 {'question': 'What is the title of the book?',
  'answer

In [None]:
def save_qas_to_file(data, filename='qa_data.json'):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)


In [None]:
if __name__ == "__main__":
    # se_data = fetch_stackexchange_qa()
    arxiv_data = fetch_arxiv_abstracts()
    reddit_data = fetch_reddit_qa()
    wikibook_data = scrape_wikibook_qa()

    # all_qas = {
    #     "stackexchange": se_data,
    #     "arxiv": arxiv_data,
    #     "reddit": reddit_data,
    #     "wikibook": wikibook_data
    # }

    # save_qas_to_file(all_qas)
    # print("Data collection complete. Saved to qa_data.json")


In [None]:
wikibook_data