In [1]:
import requests
import time
import json
import html
import time
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from transformers import pipeline
import sentencepiece
import pandas as pd
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import os
import re

In [2]:
# Initialize the question generation pipeline. The cells below call it with
# prompts of the form "generate question: <text>".
# NOTE: no `device` argument is passed, so (as the warning recorded under this
# cell reports) the model is placed on CPU even though an accelerator is available.
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [3]:
def load_and_regex_paragraphs(file_path):
    """Read a UTF-8 text file and strip figure references and digits from it.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The cleaned text as one string: "Figure N-M" references are deleted,
        every remaining run of digits is replaced by a space, each run of two
        or more whitespace characters (line breaks included) is collapsed to a
        single space, and leading/trailing whitespace is removed.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Drop figure references such as "Figure 3-12" (case-insensitive).
    text = re.sub(r"\bFigure\s+\d+-\d+\b", "", text, flags=re.IGNORECASE)
    # Replace every run of digits with a space so neighbouring words stay apart.
    text = re.sub(r"\d+", " ", text)
    # Collapse the whitespace runs left behind by the substitutions above.
    cleaned_text = re.sub(r"\s{2,}", " ", text).strip()

    # Return the collapsed text; the previous version returned `text`, which
    # silently discarded the whitespace clean-up.
    return cleaned_text

In [4]:
def load_and_split_paragraphs(file_path):
    """Load a UTF-8 text file and return its cleaned, non-empty paragraphs.

    Digit runs and "Figure <n>" references are replaced by a space, the text is
    split on blank lines, and each paragraph is trimmed and has the "- . " and
    " . " artifacts removed.
    """
    # Either a run of digits (with adjacent spaces) or a "Figure <n>" reference.
    noise_pattern = r"[ ]*(\d+[ ]*)|\(*Figure -*\s*\d+\)?"

    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    without_noise = re.sub(noise_pattern, " ", raw_text)
    without_digits = re.sub(r"\s*\d+\s*", "", without_noise)

    paragraphs = []
    for chunk in re.split(r'\n\s*\n', without_digits):
        # Emptiness is judged on the trimmed chunk, before artifact removal.
        if not chunk.strip():
            continue
        paragraphs.append(chunk.strip().replace("- . ", "").replace(" . ", ""))

    return paragraphs


In [5]:
# Example usage
# paragraphs = load_and_split_paragraphs("/Users/rckyi/Documents/Data/Nasa-Lessons-learned-in-engineering.txt")
# for i, para in enumerate(paragraphs[200:300]):  # Generate a question for paragraphs 200-299
#     input_text = f"generate question: {para}"
#     generated_question = qg_pipeline(input_text, max_length=512, do_sample=False)[0]['generated_text']
#     print(f"\n--- Paragraph {i + 1} ---\n{generated_question}\n{para}")
        

In [17]:
def func(paragraph):
    """Placeholder metric: count the whitespace-separated words in `paragraph`.

    Swap the body for whatever per-paragraph logic is actually needed.
    """
    words = paragraph.split()
    return len(words)

def process_paragraphs(json_file_path):
    """Lazily yield one question/answer record per paragraph in a JSON file.

    The file is expected to hold a JSON object mapping paragraph ids to
    paragraph text. For each paragraph, `qg_pipeline` is prompted with
    "generate question: <paragraph>" and a dict with the keys 'question'
    (the generated text) and 'answer' (the paragraph itself) is yielded.
    """
    with open(json_file_path, 'r', encoding='utf-8') as handle:
        paragraphs_by_id = json.load(handle)

    # The ids are not needed for the output records, only the paragraph text.
    for paragraph in paragraphs_by_id.values():
        prompt = f"generate question: {paragraph}"
        generated = qg_pipeline(prompt, max_length=712, do_sample=False)
        question = generated[0]['generated_text']
        yield {'question': question, 'answer': paragraph}


In [18]:
# Example usage: generate one question per paragraph stored in the JSON file.
json_file_path = "/Users/rckyi/Documents/Data/paragraphs_with_ids.json"  # or your full path
results = [qa_record for qa_record in process_paragraphs(json_file_path)]

In [16]:
# Display the generated question/answer records.
results

[[{'question': 'What is the name of the NASA Technical Paper 3653 A History of Aerospace Problems, Their Solutions, Their Lessons R.S. Ryan?',
   'answer': 'NASA Technical Paper 3653 A History of Aerospace Problems, \nTheir Solutions, Their Lessons \nR.S. Ryan September 1996 NASA Technical Paper 3653 A History of Aerospace Problems, \nTheir Solutions, Their Lessons R.S. Ryan \nMarshall Space Flight Center ° MSFC, Alabama \nNational Aeronautics and Space Administration \nMarshall Space Flight Center ° MSFC, Alabama 35812 September 1996 NASA Technical Paper 3653 A History of Aerospace Problems, \nTheir Solutions, Their Lessons R.S. Ryan \nMarshall Space Flight Center ,, MSFC, Alabama \nNational Aeronautics and Space Administration \nMarshall Space Flight Center • MSFC, Alabama 35812 September 1996 TABLE OF CONTENTS I° INTRODUCTION .....................................................................................'}],
 [{'question': 'What is the general general of the general public?',


In [8]:
# arXiv API Q&A extraction from abstract (pseudo-QA from title and abstract)
def fetch_arxiv_abstracts(query='rocket propulsion', max_results=10):
    """Query the arXiv API and build one pseudo Q&A record per returned abstract.

    Args:
        query: Free-text search terms; searched across all fields (`all:`).
        max_results: Maximum number of entries requested from the API.

    Returns:
        A list of dicts with the keys 'title', 'question' (generated by
        `qg_pipeline` from the abstract) and 'answer' (the abstract itself).
        Entries without an abstract are skipped.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    # Let requests build the query string so characters such as '&', '#' or
    # spaces in `query` are encoded instead of corrupting the URL.
    resp = requests.get(
        'http://export.arxiv.org/api/query',
        params={
            'search_query': f'all:{query}',
            'start': 0,
            'max_results': max_results,
        },
        timeout=30,  # never hang the notebook on a stalled connection
    )
    resp.raise_for_status()
    root = ET.fromstring(resp.content)

    qas = []
    ns = {'atom': 'http://www.w3.org/2005/Atom'}  # arXiv uses Atom XML namespace

    for entry in root.findall('atom:entry', ns):
        # findtext returns '' rather than raising when the element or its text
        # is missing.
        title = entry.findtext('atom:title', default='', namespaces=ns).strip()
        summary = entry.findtext('atom:summary', default='', namespaces=ns).strip()
        if not summary:
            continue  # nothing to generate a question from

        # Generate question from text
        input_text = f"generate question: {summary}"
        generated_question = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']

        qas.append({
            'title': title,
            'question': generated_question,
            'answer': summary
        })
    return qas


In [9]:
# Wikipedia scraping

# Initialize the question generation pipeline
# qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

def clean_text(text):
    """Collapse every whitespace run in `text` to one space and trim both ends."""
    # split() with no arguments already ignores leading/trailing whitespace.
    tokens = text.split()
    return ' '.join(tokens)

def get_all_rocket_propulsion_links(base_url='https://en.wikibooks.org/wiki/Rocket_Propulsion'):
    """Collect the unique absolute URLs linked from the main content of `base_url`.

    Every `<a href>` inside `#mw-content-text` is considered (not only book
    subpages). Hrefs are resolved against `base_url`, fragments are dropped,
    and same-page anchors and non-HTTP(S) links are ignored.

    Args:
        base_url: Page whose content links are collected.

    Returns:
        A list of unique absolute http(s) URLs, in no particular order. The
        list is empty when the page has no `#mw-content-text` element.
    """
    from urllib.parse import urldefrag, urljoin  # only needed by this helper

    resp = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(resp.text, 'html.parser')

    content_div = soup.select_one('#mw-content-text')
    if content_div is None:
        return []

    urls = set()
    for link in content_div.find_all('a', href=True):
        href = link['href']
        if href.startswith('#'):
            continue  # anchor within the same page, not a new page to scrape
        # urljoin handles absolute, protocol-relative ("//host/...") and
        # site-relative ("/wiki/...") hrefs; the old substring test on 'https'
        # produced malformed URLs for anything but the last form.
        absolute_url, _fragment = urldefrag(urljoin(base_url, href))
        if absolute_url.startswith(('http://', 'https://')):
            urls.add(absolute_url)

    return list(urls)

def scrape_pages_with_qg(urls, visited, M, batch_size=10):
    """Scrape up to `batch_size` new pages not in `visited`, return new QAs.

    For each unvisited URL, the text of the first three `<p>` elements inside
    `#mw-content-text` is whitespace-normalised with `clean_text` and passed to
    `qg_pipeline` behind a "generate question: " prefix.

    Args:
        urls: Page URLs to consider, in iteration order.
        visited: Set of URLs already attempted. It is updated in place, so a
            later call with the same set resumes where this one stopped.
        M: Minimum length, in characters, the extracted text must have for the
            page to be used.
        batch_size: Maximum number of Q&A pairs produced by this call.

    Returns:
        A list of dicts with the keys 'question', 'answer' and 'source_url'.
    """
    qa_pairs = []
    print(f'visited {visited}')

    for url in urls:
        if url in visited:
            continue
        visited.add(url)
        try:
            resp = requests.get(url, timeout=30)  # do not hang on a stalled host
            soup = BeautifulSoup(resp.text, 'html.parser')
            content_div = soup.select_one('#mw-content-text')
            if content_div is None:
                print(f"⚠️ Failed to process {url}: no #mw-content-text element")
                continue
            paragraphs = content_div.find_all('p')
            text = clean_text(' '.join(p.get_text() for p in paragraphs[:3]))
            if not text or len(text) < M:
                continue

            # Generate question from text
            input_text = f"generate question: {text}"
            output = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']

            qa_pairs.append({
                'question': output,
                'answer': text,
                'source_url': url
            })
            print(f"✅ Generated Q&A from: {url}")

            if len(qa_pairs) >= batch_size:
                break
        except Exception as e:
            # Best effort: one bad page must not abort the whole batch.
            print(f"⚠️ Failed to process {url}: {e}")
        finally:
            # Polite scraping: pause after every attempted request, including
            # pages that were skipped or failed, not only after successes.
            time.sleep(1)
    return qa_pairs

def scrape_wikibook_qas(M):
    """Scrape the Rocket Propulsion wikibook links in batches and return the Q&As.

    Runs at most 10 batches of up to 10 Q&A pairs each, sharing one `visited`
    set so no URL is fetched twice.

    Args:
        M: Used in two ways: it is forwarded to `scrape_pages_with_qg` as its
            `M` argument, and it is the number of collected Q&A pairs at which
            this function stops early.

    Returns:
        The list of Q&A dicts gathered across all batches. Nothing is written
        to disk.
    """
    all_links = get_all_rocket_propulsion_links()
    visited = set()
    all_qas = []

    for i in range(10):
        print(f"\n🔁 Batch {i+1}/10")
        batch_qas = scrape_pages_with_qg(all_links, visited, M, batch_size=10)
        if not batch_qas:
            # An empty batch means every link has already been attempted, so
            # further batches could not yield anything either.
            break
        all_qas.extend(batch_qas)
        if len(all_qas) >= M:
            break

    # Persisting the result is currently disabled:
    # with open('wikibook_qas_100.json', 'w', encoding='utf-8') as f:
    #     json.dump(all_qas, f, indent=2)

    # The message no longer claims a file was saved, because none is written.
    print(f"\n✅ Done. Collected {len(all_qas)} Q&A pairs (not saved to disk).")
    return all_qas

In [10]:
# Stack Exchange API

def clean_html_text(html_content):
    """Remove hyperlinks and strip HTML tags from content.

    Args:
        html_content: HTML fragment as a string.

    Returns:
        The fragment's visible text with text nodes joined by single spaces and
        leading/trailing whitespace removed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Replace <a> tags with their inner text so each link contributes one
    # contiguous piece of text.
    for a in soup.find_all('a'):
        a.replace_with(a.get_text())

    # The parser has already decoded HTML entities, so the text is returned as
    # is. Running html.unescape() on it again would wrongly decode text that
    # was literally written as an entity in the post (e.g. "&amp;" in a code
    # sample).
    text = soup.get_text(separator=' ')
    return text.strip()

def fetch_stackexchange_qas(site='space.stackexchange', tag='rockets', pagesize=20, max_pages=2):
    """Fetch tagged questions and their answers from the Stack Exchange API.

    Args:
        site: Stack Exchange site identifier passed as the API `site` parameter.
        tag: Tag the questions must carry.
        pagesize: Number of questions requested per page.
        max_pages: Maximum number of question pages to fetch.

    Returns:
        A list with one dict per answer, holding the keys 'question' (title and
        cleaned body, newline-separated), 'answer' (cleaned answer body) and
        'source_url' (link to the question).
    """
    base_url = 'https://api.stackexchange.com/2.3/questions'
    answers_url = 'https://api.stackexchange.com/2.3/questions/{ids}/answers'
    all_qas = []

    for page in range(1, max_pages + 1):
        params = {
            'site': site,
            'tagged': tag,
            'pagesize': pagesize,
            'page': page,
            'filter': 'withbody'
        }
        resp = requests.get(base_url, params=params, timeout=30)
        data = resp.json()

        for question in data.get('items', []):
            q_id = question['question_id']
            q_body = clean_html_text(question.get('body', ''))
            title = html.unescape(question.get('title', ''))
            # Prefer the canonical link from the API; "https://{site}.com" is
            # only right when `site` happens to be the full host name.
            source_url = question.get('link') or f"https://{site}.com/questions/{q_id}"

            # Get answers
            a_params = {
                'site': site,
                'filter': 'withbody'
            }
            a_resp = requests.get(answers_url.format(ids=q_id), params=a_params, timeout=30)
            a_data = a_resp.json()

            for ans in a_data.get('items', []):
                a_body = clean_html_text(ans.get('body', ''))
                all_qas.append({
                    'question': f"{title}\n{q_body}",
                    'answer': a_body,
                    'source_url': source_url
                })

            # Polite API usage: wait at least one second, or longer when the
            # API asks for a backoff.
            time.sleep(max(1, a_data.get('backoff') or 0))

        if data.get('backoff'):
            time.sleep(data['backoff'])
        if not data.get('has_more'):
            break  # no further pages (or an error response): stop paging

    return all_qas


In [11]:
# Driver cell: collect Q&A data. Only the Stack Exchange fetch is active; the
# other sources and the save step below are commented out.
# NOTE(review): `fetch_reddit_qa` and `save_qas_to_file` are not defined in the
# cells visible here. Confirm they exist before re-enabling those lines.
if __name__ == "__main__":
    se_data = fetch_stackexchange_qas()
    # arxiv_data = fetch_arxiv_abstracts()
    # reddit_data = fetch_reddit_qa()
    # wikibook_data = scrape_wikibook_qas(M=100)

    # all_qas = {
    #     "stackexchange": se_data,
    #     "arxiv": arxiv_data,
    #     "reddit": reddit_data,
    #     "wikibook": wikibook_data
    # }

    # save_qas_to_file(all_qas)
    # print("Data collection complete. Saved to qa_data.json")


In [12]:
# Preview the first three Stack Exchange Q&A records.
se_data[0:3]

[{'question': "At a height of 100 km, what speed do you need to be going to escape Earth's gravity?\nI understand that the escape velocity of Earth is 11 km/s. However, Earth's gravitational sphere of influence is not infinite, so it is possible to go slower than that and still escape the sphere of influence (Because of the sun.) If a rocket starts accelerating from 0 on Earth's surface, what speed would it have to be going at, say, 100 km above the Earth's surface for it to escape Earth's gravity? How would you calculate this?",
  'answer': "Earth's gravitational sphere of influence is not infinite \n \n That's your problem. The force of gravity does have an infinite range. There is no place in the universe where earth's gravity is not felt.  \n As a result, it does not matter where you start, in order to escape earth, you need 11 km/s relative to the earth. If you first get into orbit at 100 km, then you already need a speed of 7 km/s for that. From that orbit, you only need an addit