In [1]:
import requests
import time
import json
import html
import time
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from transformers import pipeline
import sentencepiece
import pandas as pd
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import os
import re

In [2]:
# Initialize the question generation pipeline
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [3]:
def load_and_regex_paragraphs(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    
    text = re.sub(r"\bFigure\s+\d+-\d+\b", "", text, flags=re.IGNORECASE)
    # Remove any number surrounded by spaces (e.g., " 42 ")
    text = re.sub(r"\d+", " ", text)
    # Optional: strip redundant whitespace
    cleaned_text = re.sub(r"\s{2,}", " ", text).strip()

    return text

In [4]:
def load_and_split_paragraphs(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Split on one or more blank lines (handles multiple \n between paragraphs)
    import re
    pattern = r"(*Figure -*\s*\s*-\s*(\d+)\s*"
    pat = r"[ ]*(\d+[ ]*)|\(*Figure -*\s*\d+\)?"
    # paragraphs = [para.strip() for para in re.split(r'\n\s*\n', re.sub(pattern, " ", text)) if para.strip()]
    paragraphs = [para.strip().replace("- . ","").replace(" . ","") for para in re.split(r'\n\s*\n', re.sub(r"\s*\d+\s*", "", re.sub(pat, " ", text))) if para.strip()]
    
    return paragraphs


In [5]:
paragraphs = load_and_split_paragraphs("/Users/rckyi/Documents/Data/Nasa-Lessons-learned-in-engineering.txt")
lessons_learned_qa = []
for i, para in enumerate(paragraphs):  # Print first 5 paragraph
    input_text = f"generate question: {para}"
    generated_question = qg_pipeline(input_text, max_length=512, do_sample=False)[0]['generated_text']
    lessons_learned_qa.append({
            'question': generated_question,
            'answer': para,
            'source': 'nasa: lessons learned'
    })
    
    print(f"\n--- Paragraph {i + 1} ---\n{generated_question}\n{para}")
        

Token indices sequence length is longer than the specified maximum sequence length for this model (1393 > 512). Running this sequence through the model will result in indexing errors



--- Paragraph 1 ---
What is the purpose of the Lessons Learned in Engineering report?
Ôªø
 NASA/CR‚Äî ‚Äì 
Lessons Learned in Engineering
J.C. Blair, R.S. Ryan, and L.A. Schutzenhofer
Al Signal Research, Inc., Huntsville, Alabama
Prepared for Marshall Space Flight Center
under Contract NNM AA C
June 
The NASA STI Program‚Ä¶in Profile
Since its founding, NASA has been dedicated to the advancement of aeronautics and space science. The NASA Scientific and Technical Information (STI) Program Office plays a key part in helping NASA maintain this important role.
The NASA STI Program Office is operated by Langley Research Center, the lead center for NASA‚Äôs scientific and technical information. The NASA STI Program Office provides access to the NASA STI Database, the largest collection of aeronautical and space science STI in the world. The Program Office is also NASA‚Äôs institutional mechanism for disseminating the results of its research and development activities. These results are publ

In [6]:
# paragraphs_ps = split_into_paragraphs("/Users/rckyi/Documents/Data/A HISTORY OF AEROSPACE PROBLEMS, THEIR SOLUTIONS, THEIR LESSONS")

In [7]:
# for i, para in enumerate(paragraphs_ps[200:300]):  # Print first 5 paragraph
#     input_text = f"generate question: {para}"
#     generated_question = qg_pipeline(input_text, max_length=512, do_sample=False)[0]['generated_text']
#     print(f"\n--- Paragraph {i + 1} ---\n{generated_question}\n{para}")

In [8]:
def process_paragraphs(json_file_path):
    with open(json_file_path, 'r', encoding='utf-8') as f:
        paragraphs = json.load(f)

    results = []
    for para_id, para_text in paragraphs.items():
        input_text = f"generate question: {para_text}"
        results.append({'question': qg_pipeline(input_text, max_length=712, do_sample=False)[0]['generated_text'], 
                        'answer': para_text,
                       'source':  f'nasa: a history of aerospace problems and solns'
                       })
        # yield [qg_pipeline(input_text, max_length=512, do_sample=False)[0]['generated_text'], para_text]

    return results

In [9]:
# Example usage:
json_file_path = "/Users/rckyi/Documents/Data/paragraphs_with_ids.json"  # or your full path
nasa_probs_solns = process_paragraphs(json_file_path)

In [10]:
# arXiv API Q&A extraction from abstract (pseudo-QA from title and abstract)
def fetch_arxiv_abstracts(query='rocket propulsion', max_results=100):
    url = f"http://export.arxiv.org/api/query?search_query=all:{query.replace(' ', '+')}&start=0&max_results={max_results}"
    resp = requests.get(url)
    root = ET.fromstring(resp.content)

    qas = []
    ns = {'atom': 'http://www.w3.org/2005/Atom'}  # arXiv uses Atom XML namespace

    for entry in root.findall('atom:entry', ns):
        title = entry.find('atom:title', ns).text.strip()
        
        summary = entry.find('atom:summary', ns).text.strip()
        # Generate question from text
        input_text = f"generate question: {summary}"
        generated_question = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']
        
        qas.append({
            # 'title': title,
            'question': generated_question,
            'answer': summary,
            'source': f'arxiv'
        })
    return qas


In [11]:
# Wikipedia scraping

# Initialize the question generation pipeline
# qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

def clean_text(text):
    return ' '.join(text.strip().split())

def get_all_rocket_propulsion_links(base_url='https://en.wikibooks.org/wiki/Rocket_Propulsion'):
    """Grab all unique subpage links under Rocket Propulsion."""
    resp = requests.get(base_url)
    soup = BeautifulSoup(resp.text, 'html.parser')

    content_div = soup.select_one('#mw-content-text')
    links = content_div.find_all('a', href=True)
    
    urls = set()
    for link in links:
        href = link['href']
        if 'https' in href:
            full_url = href
        else:
            full_url = 'https://en.wikibooks.org' + href
        urls.add(full_url)

    return list(urls)

def scrape_pages_with_qg(urls, visited, M, batch_size=10):
    """Scrape up to `batch_size` new pages not in `visited`, return new QAs."""
    qa_pairs = []
    count = 0

    for url in urls:
        if url in visited:
            continue
        visited.add(url)
        try:
            resp = requests.get(url)
            soup = BeautifulSoup(resp.text, 'html.parser')
            content_div = soup.select_one('#mw-content-text')
            paragraphs = content_div.find_all('p')
            text = clean_text(' '.join(p.get_text() for p in paragraphs[:3]))
            if not text or len(text) < M:
                continue

            # Generate question from text
            input_text = f"generate question: {text}"
            output = qg_pipeline(input_text, max_length=64, do_sample=False)[0]['generated_text']

            qa_pairs.append({
                'question': output,
                'answer': text,
                'source': f"rocketry wiki"
            })
            print(f"‚úÖ Generated Q&A from: {url}")
            count += 1
            time.sleep(1)  # polite scraping

            if count >= batch_size:
                break
        except Exception as e:
            print(f"‚ö†Ô∏è Failed to process {url}: {e}")
    return qa_pairs

def scrape_wikibook_qas(M):
    all_links = get_all_rocket_propulsion_links()
    visited = set()
    all_qas = []

    for i in range(10):
        print(f"\nüîÅ Batch {i+1}/10")
        batch_qas = scrape_pages_with_qg(all_links, visited, M, batch_size=10)
        all_qas.extend(batch_qas)
        if len(all_qas) >= M:
            break

    return all_qas

In [29]:
# Stack Exchange API

def clean_html_text(html_content):
    """Remove hyperlinks and strip HTML tags from content."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Replace <a> tags with their inner text
    for a in soup.find_all('a'):
        a.replace_with(a.get_text())

    # Get cleaned text
    text = soup.get_text(separator=' ')
    return html.unescape(text.strip())

def fetch_stackexchange_qas(site='space.stackexchange', tag='rockets', pagesize=20, max_pages=5):
    base_url = 'https://api.stackexchange.com/2.3/questions'
    answers_url = 'https://api.stackexchange.com/2.3/questions/{ids}/answers'
    all_qas = []

    for page in range(1, max_pages + 1):
        params = {
            'site': site,
            'tagged': tag,
            'pagesize': pagesize,
            'page': page,
            'filter': 'withbody'
        }
        resp = requests.get(base_url, params=params)
        print(resp)
        data = resp.json()

        for question in data.get('items', []):
            q_id = question['question_id']
            q_body = clean_html_text(question.get('body', ''))
            title = html.unescape(question.get('title', ''))

            # Get answers
            a_params = {
                'site': site,
                'filter': 'withbody'
            }
            a_resp = requests.get(answers_url.format(ids=q_id), params=a_params)
            answers = a_resp.json().get('items', [])

            for ans in answers:
                a_body = clean_html_text(ans.get('body', ''))
                all_qas.append({
                    'question': f"{title}\n{q_body}",
                    'answer': a_body,
                    'source': f"https://{site}.com/questions/{q_id}"
                })

            time.sleep(1)  # polite API usage

    return all_qas


In [30]:
se_data = fetch_stackexchange_qas()

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [31]:
if __name__ == "__main__":
    
    arxiv_data = fetch_arxiv_abstracts()
    # reddit_data = fetch_reddit_qa()
    wikibook_data = scrape_wikibook_qas(M=1000)

    all_qas = {
        "stackexchange": se_data,
        "arxiv": arxiv_data,
        # "reddit": reddit_data,
        "wikibook": wikibook_data,
        "nasa lessons learned": lessons_learned_qa,
        "nasa problems and solns": nasa_probs_solns
    }

    with open('/Users/rckyi/Documents/Data/all_qas_data.json', 'w', encoding='utf-8') as f:
        json.dump(all_qas, f, indent=2)

    print(f"\n‚úÖ Done. Saved {len(all_qas)} Q&A pairs to qas_data.json")



üîÅ Batch 1/10
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Engineering_Methods
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Recycling_Methods
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Operations
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Existing_Programs
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Economics
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Later_Projects
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Resource_Uses
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Hypervelocity_Launcher
‚úÖ Generated Q&A from: https://en.wikibooks.org/wiki/Space_Transport_and_Engineering_Methods/Space_El

In [None]:
len(se_data)