In [1]:
import html
import json
import time
import xml.etree.ElementTree as ET
from urllib.parse import urldefrag, urljoin

import requests
import sentencepiece
from bs4 import BeautifulSoup
from transformers import pipeline

In [2]:
# Initialize the question generation pipeline.
# The resulting `qg_pipeline` is a module-level global that the arXiv and
# Wikibooks helpers below call directly to turn a passage into a question.
# NOTE(review): no `device` argument is passed, so (per the warning this cell
# emits) the model runs on CPU even when an accelerator is available.
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [3]:
# Stack Exchange API
# def fetch_stackexchange_qa(site='space', tagged='rocket', pages=5):
#     questions = []
#     for page in range(1, pages+1):
#         url = f"https://api.stackexchange.com/2.3/questions?page={page}&pagesize=20&order=desc&sort=activity&tagged={tagged}&site={site}&filter=withbody"
#         resp = requests.get(url).json()
#         print(f'resp {resp}')
#         for item in resp.get('items', []):
#             q = {
#                 'question': item['title'],
#                 'body': item.get('body', ''),
#                 'answers': []
#             }
#             if item.get('is_answered'):
#                 a_url = f"https://api.stackexchange.com/2.3/questions/{item['question_id']}/answers?order=desc&sort=votes&site={site}&filter=withbody"
#                 a_resp = requests.get(a_url).json()
#                 for answer in a_resp.get('items', []):
#                     q['answers'].append(answer['body'])
#                     questions.append(q)
#             # questions.append(q)
#         time.sleep(1)  # Respect rate limits
#     return questions

 
# arXiv API: pseudo-Q&A extraction (question generated from the abstract; title kept as metadata)
def fetch_arxiv_abstracts(query='rocket propulsion', max_results=10):
    """Build pseudo Q&A pairs from arXiv abstracts matching `query`.

    Each returned abstract is fed to the module-level `qg_pipeline`, and the
    generated question is paired with the abstract itself as the answer.

    Args:
        query: Free-text search terms; searched across all arXiv fields.
        max_results: Maximum number of entries requested from the API.

    Returns:
        A list of dicts with keys 'title', 'question' and 'answer'.

    Raises:
        requests.HTTPError: if the arXiv API answers with an error status.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """
    # Let requests do the URL encoding so that characters such as '&', '#'
    # or quotes inside `query` cannot corrupt the query string.
    resp = requests.get(
        'https://export.arxiv.org/api/query',
        params={
            'search_query': f'all:{query}',
            'start': 0,
            'max_results': max_results,
        },
        timeout=30,  # never hang the notebook on a stalled connection
    )
    resp.raise_for_status()
    root = ET.fromstring(resp.content)

    qas = []
    ns = {'atom': 'http://www.w3.org/2005/Atom'}  # arXiv uses Atom XML namespace

    for entry in root.findall('atom:entry', ns):
        # findtext() yields '' for a missing or empty element instead of
        # raising AttributeError on `.text.strip()`.
        title = entry.findtext('atom:title', default='', namespaces=ns).strip()
        summary = entry.findtext('atom:summary', default='', namespaces=ns).strip()
        if not summary:
            # Nothing to generate a question from (or to use as an answer).
            continue

        # Generate question from text
        input_text = f"generate question: {summary}"
        generated_question = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']

        qas.append({
            'title': title,
            'question': generated_question,
            'answer': summary
        })
    return qas


In [4]:
# Wikibooks scraping (en.wikibooks.org "Rocket Propulsion" book)

# Initialize the question generation pipeline
# qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

def clean_text(text):
    """Trim `text` and collapse every run of whitespace into a single space."""
    words = text.strip().split()
    return ' '.join(words)

def get_all_rocket_propulsion_links(base_url='https://en.wikibooks.org/wiki/Rocket_Propulsion'):
    """Collect the unique absolute URLs linked from the body of `base_url`.

    Every `<a href=...>` inside the page's `#mw-content-text` element is
    resolved against `base_url`. In-page anchors and non-HTTP schemes
    (e.g. `mailto:`) are dropped, and fragments are stripped so that the same
    page is not listed once per section anchor.

    Args:
        base_url: Page whose content links should be harvested.

    Returns:
        A sorted list of unique absolute http(s) URLs (empty when the page
        has no `#mw-content-text` element).

    Raises:
        requests.HTTPError: if `base_url` answers with an error status.
    """
    resp = requests.get(base_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    content_div = soup.select_one('#mw-content-text')
    if content_div is None:
        return []

    urls = set()
    for link in content_div.find_all('a', href=True):
        href = link['href'].strip()
        if not href or href.startswith('#'):
            # Anchor within the page itself: not a separate page to scrape.
            continue
        # urljoin resolves absolute, root-relative, page-relative and
        # protocol-relative ('//host/path') hrefs correctly, unlike a
        # substring test for 'https'.
        full_url, _fragment = urldefrag(urljoin(base_url, href))
        if full_url.startswith(('http://', 'https://')):
            urls.add(full_url)

    # Sorted so repeated runs visit pages in the same order.
    return sorted(urls)

def scrape_pages_with_qg(urls, visited, M, batch_size=10, timeout=30):
    """Scrape up to `batch_size` new pages not in `visited`, return new QAs.

    For each unvisited URL the first three paragraphs of `#mw-content-text`
    are joined into one passage; the module-level `qg_pipeline` generates a
    question for it and the passage becomes the answer.

    Args:
        urls: Iterable of page URLs to consider, in order.
        visited: Set of URLs already attempted. Mutated in place: every URL
            attempted here is added, whether or not it yielded a Q&A pair.
        M: Minimum passage length in characters; shorter passages are skipped.
        batch_size: Stop after this many Q&A pairs have been produced.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of dicts with keys 'question', 'answer' and 'source_url'.
    """
    qa_pairs = []
    count = 0
    print(f'visited {visited}')

    for url in urls:
        if url in visited:
            continue
        visited.add(url)
        try:
            resp = requests.get(url, timeout=timeout)
            # Do not turn 404/500 error pages into "answers".
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            content_div = soup.select_one('#mw-content-text')
            if content_div is None:
                # Not a MediaWiki-style page (e.g. an external link).
                print(f"⚠️ Failed to process {url}: no #mw-content-text element")
                continue
            paragraphs = content_div.find_all('p')
            text = clean_text(' '.join(p.get_text() for p in paragraphs[:3]))
            if not text or len(text) < M:
                continue

            # Generate question from text
            input_text = f"generate question: {text}"
            output = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']

            qa_pairs.append({
                'question': output,
                'answer': text,
                'source_url': url
            })
            print(f"✅ Generated Q&A from: {url}")
            count += 1

            if count >= batch_size:
                break
        except Exception as e:
            # Best-effort scraping: report the page and move on.
            print(f"⚠️ Failed to process {url}: {e}")
        finally:
            # Polite scraping: pause after every request attempt, including
            # pages that were rejected or failed, not only after successes.
            time.sleep(1)
    return qa_pairs

def scrape_wikibook_qas(M):
    """Collect generated Q&A pairs from the pages linked by the wikibook.

    Runs at most 10 batches of `scrape_pages_with_qg` over the links returned
    by `get_all_rocket_propulsion_links`, stopping early once `M` pairs have
    been gathered or every link has been attempted.

    Args:
        M: Target number of Q&A pairs. NOTE: the same value is also forwarded
            to `scrape_pages_with_qg` as its minimum passage length in
            characters.

    Returns:
        A list of Q&A dicts (may exceed `M`, since whole batches are added).
        Nothing is written to disk.
    """
    all_links = get_all_rocket_propulsion_links()
    visited = set()
    all_qas = []

    for i in range(10):
        print(f"\n🔁 Batch {i+1}/10")
        batch_qas = scrape_pages_with_qg(all_links, visited, M, batch_size=10)
        all_qas.extend(batch_qas)
        if len(all_qas) >= M:
            break
        if visited.issuperset(all_links):
            # Every link has been attempted; further batches would be empty.
            break

    # with open('wikibook_qas_100.json', 'w', encoding='utf-8') as f:
    #     json.dump(all_qas, f, indent=2)

    # The save above is disabled, so report what actually happened.
    print(f"\n✅ Done. Collected {len(all_qas)} Q&A pairs (not written to disk).")
    return all_qas

In [19]:


# Stack Exchange API

def clean_html_text(html_content):
    """Remove hyperlinks and strip HTML tags from content.

    Args:
        html_content: An HTML fragment (e.g. a Stack Exchange post body).

    Returns:
        The fragment's visible text, with the text of separate nodes joined by
        single spaces and leading/trailing whitespace removed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Replace <a> tags with their inner text
    for a in soup.find_all('a'):
        a.replace_with(a.get_text())

    # Get cleaned text. BeautifulSoup has already decoded HTML entities while
    # parsing, so no further html.unescape() is applied: a second pass would
    # corrupt text that legitimately shows an entity (e.g. a post that
    # displays "&amp;" would be turned into "&").
    text = soup.get_text(separator=' ')
    return text.strip()

def fetch_stackexchange_qa_data(site='space.stackexchange', tag='rockets', pagesize=20, max_pages=2):
    """Download question/answer pairs for one tag from the Stack Exchange API.

    One record is produced per answer, so a question with several answers
    appears several times with the same 'question' text.

    Args:
        site: Stack Exchange site identifier passed as the API `site` parameter.
        tag: Tag the questions must carry.
        pagesize: Number of questions requested per page.
        max_pages: Maximum number of question pages to fetch.

    Returns:
        A list of dicts with keys:
            'question'   -- "<title>\\n<body text>" with HTML removed,
            'answer'     -- answer body text with HTML removed,
            'source_url' -- link to the question.
    """
    base_url = 'https://api.stackexchange.com/2.3/questions'
    answers_url = 'https://api.stackexchange.com/2.3/questions/{ids}/answers'
    request_timeout = 30  # seconds; never hang the notebook on a stalled call
    all_qas = []

    def get_json(url, params):
        """GET one API resource and return its decoded JSON wrapper object."""
        payload = requests.get(url, params=params, timeout=request_timeout).json()
        if 'error_id' in payload:
            # Surface throttling / bad-parameter errors instead of silently
            # returning an empty result set.
            print(f"⚠️ Stack Exchange API error {payload.get('error_id')} "
                  f"({payload.get('error_name')}): {payload.get('error_message')}")
        if payload.get('backoff'):
            # The API asks clients to wait this many seconds before retrying
            # the same method.
            time.sleep(payload['backoff'])
        return payload

    for page in range(1, max_pages + 1):
        params = {
            'site': site,
            'tagged': tag,
            'pagesize': pagesize,
            'page': page,
            'filter': 'withbody'
        }
        data = get_json(base_url, params)

        for question in data.get('items', []):
            q_id = question['question_id']
            if question.get('answer_count') == 0:
                # No answers means no records; skip the request and save quota.
                continue
            q_body = clean_html_text(question.get('body', ''))
            title = html.unescape(question.get('title', ''))
            # Prefer the canonical link from the API; the hand-built URL is
            # only valid when `site` is a '<name>.stackexchange'-style value.
            source_url = question.get('link') or f"https://{site}.com/questions/{q_id}"

            # Get answers
            a_params = {
                'site': site,
                'filter': 'withbody'
            }
            answers = get_json(answers_url.format(ids=q_id), a_params).get('items', [])

            for ans in answers:
                a_body = clean_html_text(ans.get('body', ''))
                all_qas.append({
                    'question': f"{title}\n{q_body}",
                    'answer': a_body,
                    'source_url': source_url
                })

            time.sleep(1)  # polite API usage

        if not data.get('has_more'):
            # Last page reached (or the request failed): stop paging.
            break

    return all_qas


In [20]:
# Driver cell: only the Stack Exchange collection is currently enabled.
# `se_data` is the list of {'question', 'answer', 'source_url'} dicts that the
# inspection cells below index into.
if __name__ == "__main__":
    se_data = fetch_stackexchange_qa_data()
    # The remaining sources and the combined save step are disabled.
    # NOTE(review): `fetch_reddit_qa` and `save_qas_to_file` are not defined in
    # the visible part of this notebook -- confirm before re-enabling.
    # arxiv_data = fetch_arxiv_abstracts()
    # reddit_data = fetch_reddit_qa()
    # wikibook_data = scrape_wikibook_qas(M=100)

    # all_qas = {
    #     "stackexchange": se_data,
    #     "arxiv": arxiv_data,
    #     "reddit": reddit_data,
    #     "wikibook": wikibook_data
    # }

    # save_qas_to_file(all_qas)
    # print("Data collection complete. Saved to qa_data.json")


In [33]:
# Spot-check one record: 'question' holds "<title>\n<cleaned question body>".
se_data[5]['question']

"The Martian: Does it really take a supercomputer to calculate spaceflight maneuvers?\nMy preemptive apologies for asking a question about a movie, and the spoilers within said question, but considering the  widespread support for its scientific plausibility , I'm hoping you'll let it slide :) \n In the movie  The Martian , the character Rich Purnell is shown using the Pleiades supercomputer at the NASA Ames Research Center to confirm the calculations for his maneuver designed to safely redirect the Hermes spacecraft back to Mars, and then to Earth. Why? \n Space is just about the most ideal place possible for predictable physics. Little in the way of air or external forces, short of gravity which can be calculated between the spacecraft and the Sun and planets and little else, centrifugal force only during the maneuver itself, almost none of the fluid mechanics that otherwise make simple calculations complicated... Basically, with so few moving parts and sources of complexity, does it

In [34]:
# The title is the first line of the 'question' field.
se_data[5]['question'].split('\n')[0]

'The Martian: Does it really take a supercomputer to calculate spaceflight maneuvers?'

In [36]:
# The answer text for the same record, with HTML already removed.
se_data[5]['answer']

"I think the answer is probably no, but not for the reasons other answers give.  First of all we can ignore the whole multi-body problem: it's a really good approximation that the planets & Sun  run on rails since they are hugely more massive than the spacecraft.  let's also assume that modelling a trajectory between two points is tractable, whether or not you use continuous thrust or not (this could well be a reasonably hard optimisation problem to minimise fuel &c but I suspect that's very doable on a modern personal computer. \n That's not what makes it hard: what makes it hard is that this is a search problem merely dressed up as a physics problem, and search problems, famously, have combinatorial explosions.  Search problems require machines like  Deep Blue  to solve them, and these things are definitely supercomputers (albeit specialised ones). \n Why is is a search problem?  Well, because the way you get around the Solar System isn't in fact by computing a trajectory between two

In [18]:
# NOTE(review): stale cell. Its execution count (18) predates the cells that
# define the current `se_data`, and records built by
# fetch_stackexchange_qa_data have no "body" key (only 'question', 'answer',
# 'source_url'), so this lookup raises KeyError on Restart & Run All.
# The manual tag stripping is presumably superseded by clean_html_text -- confirm
# and delete or rewrite this cell.
se_data[1]["body"].replace("<p>","").replace('"','').replace("</blockquote>"," ").replace("<a>","").replace("<blockquote>"," ")

"Roscosmos <a href=https://arstechnica.com/science/2022/07/russia-says-its-space-station-partnership-will-end-after-two-more-years/ rel=noreferrer>has announced their ISS partnership will end in 2 years</a></p>\n \nThis might have been seen as an indication that the new administration at Roscosmos was in a more cooperative mood. Any such hopes were dashed on Tuesday, when Borisov announced that Russia would not be renewing its current commitment to the ISS, which ends in 2024. NASA's current plans involve keeping the station occupied through the end of the decade.</p>\n \nIt's not clear what their plans are, but the new head of Roscosmos did say this</p>\n \nAccording to The New York Times, Borisov told Russian President Vladimir Putin that the 2024 date gives his country time as well. “I think that by this time, we will begin to form the Russian orbital station,” he said.</p>\n \nIn theory, Russia may want some of its ISS segments back for this &quot;new&quot; Russian space station. T