Installing the dependencies

In [None]:
!pip install openai
!pip install tiktoken



In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
from collections import deque
import time
import json
import tiktoken
from openai import OpenAI
import os
from sklearn.model_selection import train_test_split
import pandas as pd


In [None]:
# Seed URLs; each one is crawled independently and saved to its own JSON file.
DOMAINS_TO_CRAWL = ['https://iss.ku.edu', 'https://engr.ku.edu', 'https://eecs.ku.edu', 'https://registrar.ku.edu', 'https://catalog.ku.edu']
# Maximum link distance from a seed URL that the crawler will follow.
MAX_DEPTH = 2

# Both paths are concatenated directly with file names in f-strings further
# down (f"{RAW_DATA_PATH}{filename}"), so they must end with a path separator.
# The raw-data location matches the path reported in the notebook's own
# traceback ('/content/raw_data/.ipynb_checkpoints').
RAW_DATA_PATH = '/content/raw_data/'
os.makedirs(RAW_DATA_PATH, exist_ok=True)
# NOTE(review): the original value was missing from the notebook; this sibling
# directory is an assumption - confirm before re-running.
QA_DATA_PATH = '/content/qa_data/'
os.makedirs(QA_DATA_PATH, exist_ok=True)

## Crawling through the webpages and extracting raw data

In [None]:
def is_link_valid(url: str, crawl_domain: str) -> bool:
  """Decide whether `url` should be added to the crawl of `crawl_domain`.

  A link is accepted only when all of the following hold:
    * its scheme is http or https;
    * its path/query contains none of the login/auth/redirect keywords;
    * its path does not end with a known binary/document/media extension;
    * its host is the crawl host itself or one of its subdomains.

  Args:
    url: Absolute URL of the candidate link.
    crawl_domain: Seed URL of the crawl (e.g. 'https://iss.ku.edu'); a bare
      host name is accepted as well.

  Returns:
    True if the link should be crawled, False otherwise (including when the
    URL cannot be parsed).
  """
  bad_extensions = (
        ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
        ".zip", ".rar", ".gz", ".tar", ".7z",
        ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg",
        ".mp4", ".avi", ".mov", ".mp3", ".wav"
  )
  login_auth_redirect = ("/cas", "/login", "/signin", "auth", "session", "returnto")
  try:
    parsed_url = urlparse(url)
    if parsed_url.scheme not in ("http", "https"):
      return False

    # Compare case-insensitively so that e.g. "Form.PDF" or "?returnTo=" are
    # filtered just like their lower-case spellings.
    path = parsed_url.path.lower()
    query = parsed_url.query.lower()
    if any(keyword in path or keyword in query for keyword in login_auth_redirect):
      return False
    if path.endswith(bad_extensions):
      return False

    # Use the parsed host name (no port / credentials) instead of slicing a
    # fixed number of characters off the seed URL.
    host = (parsed_url.hostname or "").lower()
    crawl_host = (urlparse(crawl_domain).hostname or crawl_domain).strip("/").lower()
    if not host or not crawl_host:
      return False

    # Require a label boundary: "bio.engr.ku.edu" belongs to "engr.ku.edu",
    # but "notengr.ku.edu" does not.
    return host == crawl_host or host.endswith("." + crawl_host)
  except Exception:
    # Link filtering is best-effort: a malformed URL is simply not crawled.
    return False

In [None]:
def get_visible_text(html_content, header_nav_flag):
    """Extract the human-visible text of an HTML document.

    Script, style and noscript elements are always removed. When
    `header_nav_flag` is non-zero, header, footer and nav elements are removed
    as well, so that shared site chrome is only kept for the call made with a
    flag of 0. If the page has a <main> element, only its text is returned;
    otherwise the text of the whole document is returned. Text fragments are
    stripped and joined with single spaces.
    """
    tags_to_drop = ["script", "style", "noscript"]
    if header_nav_flag != 0:
        tags_to_drop = tags_to_drop + ["header", "footer", "nav"]

    document = BeautifulSoup(html_content, "html.parser")
    for element in document(tags_to_drop):
        element.decompose()

    main_section = document.find("main")
    text_root = main_section if main_section else document
    return text_root.get_text(separator=" ", strip=True)

In [None]:
def crawl(domain: str):
  """Breadth-first crawl starting at `domain`, up to MAX_DEPTH links away.

  Every successfully fetched page is reduced to its visible text with
  get_visible_text and its token count is measured with the tiktoken encoding
  for "gpt-4". Header/footer/nav elements are kept only for the first page
  that is fetched successfully; later pages have them stripped.

  Args:
    domain: Seed URL; also passed to is_link_valid to keep the crawl on-site.

  Returns:
    A tuple (pages, total_num_tokens) where pages is a list of dicts with the
    keys "url", "text" and "num_tokens", in the order the pages were fetched.
  """
  # Every URL that has ever been queued. Recording URLs at enqueue time keeps
  # the frontier free of duplicates while preserving breadth-first order.
  urls_seen = {domain}
  frontier = deque([(domain, 0)])
  enc = tiktoken.encoding_for_model("gpt-4")
  res = []
  total_num_tokens = 0
  header_nav_flag = 0
  print(f'Starting to crawl through {domain} ')

  while frontier:
    url, depth = frontier.popleft()
    if depth > MAX_DEPTH:
      continue
    # print(f'Visiting: {url}')

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next URL.
        print(f"Failed to fetch {url}: {e}")
        continue

    text = get_visible_text(response.text, header_nav_flag)
    header_nav_flag = 1
    num_tokens = len(enc.encode(text))
    total_num_tokens += num_tokens
    res.append({"url": url, "text": text, "num_tokens": num_tokens})

    # Links found on a page at MAX_DEPTH would be discarded when dequeued, so
    # do not parse or queue them at all.
    if depth < MAX_DEPTH:
        # get_visible_text works on its own parse tree (and removes tags from
        # it), so the page is parsed again here to see every link.
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all("a", href=True):
            url_to_crawl = urljoin(url, link["href"])
            url_to_crawl, _ = urldefrag(url_to_crawl)
            if url_to_crawl in urls_seen:
                continue
            if is_link_valid(url_to_crawl, domain):
                urls_seen.add(url_to_crawl)
                frontier.append((url_to_crawl, depth + 1))

    # Be polite to the server between successful requests.
    time.sleep(0.5)

  # Count only pages that were actually fetched, not failed attempts.
  print(f'Total num of pages crawled - {len(res)}')
  print(f'Total num of tokens - {total_num_tokens}')
  return res, total_num_tokens

In [None]:
# Crawl every seed domain in turn and store each result as
# <RAW_DATA_PATH><host with dots replaced by underscores>.json.
total_num_tokens = 0
for domain in DOMAINS_TO_CRAWL:
  crawled_urls_text, num_tokens = crawl(domain)
  total_num_tokens = total_num_tokens + num_tokens

  # domain[8:] drops the leading 'https://' of the seed URL.
  host_part = domain[8:].replace('.','_')
  raw_file_path = f"{RAW_DATA_PATH}{host_part}.json"
  with open(raw_file_path, "w", encoding="utf-8") as f:
    json.dump(crawled_urls_text, f, indent=2)

  print('')
  # Pause between domains.
  time.sleep(5.0)

print('')
print(f'Total num of tokens for all webpages - {total_num_tokens}')

Starting to crawl through https://iss.ku.edu 
Failed to fetch https://iss.ku.edu/node/356: 404 Client Error: Not Found for url: https://iss.ku.edu/node/356
Failed to fetch https://iss.ku.edu/newly-admitted-ku-ids: 404 Client Error: Not Found for url: https://iss.ku.edu/newly-admitted-ku-ids
Failed to fetch https://iss.ku.edu/node/353: 404 Client Error: Not Found for url: https://iss.ku.edu/node/353
Total num of pages crawled - 164
Total num of tokens - 125815

Starting to crawl through https://engr.ku.edu 
Failed to fetch https://me.engr.ku.edu/faculty-research: HTTPSConnectionPool(host='me.engr.ku.edu', port=443): Max retries exceeded with url: /faculty-research (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'me.engr.ku.edu'. (_ssl.c:1006)")))
Failed to fetch https://engr.ku.edu/departments-0: 403 Client Error: Forbidden for url: https://engr.ku.edu/departments-0
Failed to fet

Total number of tokens across all five crawled domains: 236974 + 135571 + 67943 + 109610 + 1088978 = 1639076

## Generating questions, answers, context from raw data

In [None]:
# System message sent with every Q&A-generation request.
# An additional instruction is currently disabled:
#   "Avoid common and frequently asked topics like admissions, visas, application deadlines, or department overviews unless there is something truly novel."
system_prompt = "".join([
    "You are a knowledgeable assistant trained to extract Q&A from university-related text data. ",
    "Your goal is to identify and surface overlooked or under-discussed information from the passage. ",
])

# User-message template, filled in with str.format(text=..., url=...).
# {text} and {url} are the only placeholders; the doubled braces ({{ and }})
# in the example are escapes that render as literal JSON braces.
user_prompt_template = """Given the following passage and URL, generate 3 to 5 high-quality question-answer (Q&A) pairs that are:

- Fresh and unique: Prioritize lesser-known or underused aspects of the passage.
- Factual and grounded: Base everything strictly on the given text.
- Diverse and non-redundant: Cover different ideas without overlap.
- Clearly supported: Include a supporting excerpt from the text as 'context'.

Each Q&A pair must have:
- question: A clear, specific question based on the content.
- answer: A concise, factually correct answer from the passage.
- context: A supporting excerpt from the passage.
- source: The URL of the passage.

Avoid repeating topics across questions. Focus especially on detailed points that are easy to overlook but still valuable to know.

Passage:
{text}

Source:
{url}

Return the output as a JSON array, where each item has:
- question: the question text
- answer: the answer text
- context: the supporting excerpt from the text
- source: the original URL

Example format:
[
  {{
    "question": "...",
    "answer": "...",
    "context": "...",
    "source": "{url}"
  }},
  ...
]
Output only the JSON array in the right format. Do not include anything before or after it.
"""

# Render the template once with a sample crawled page to inspect the prompt.
url = "https://iss.ku.edu/f-1-transfer"
# The passage contains one line break; it is written as an explicit \n escape
# because a raw newline inside a single-quoted string literal is a SyntaxError.
text = (
    "F-1 Transfer All students currently in F-1 status at any type of institution (high school, college, university, intensive English institute) in the U.S. (including those on Optional Practical Training) who plan to change schools must complete the transfer procedure through SEVIS.\u00a0 It is the student\u2019s responsibility to maintain his or her F-1 student status and to complete the procedures below. Transfer Procedure Complete admission to the University of Kansas. Only after you have decided to attend KU: Notify your current school (or if on Optional Practical Training your former school who recommended the OPT) of your intent to transfer to KU. Speak with an international student advisor at your current school to request the release of your SEVIS (I-20) record to KU and set the date for the release of your record. To release your SEVIS record to the University of Kansas, your advisor will need KU\u2019s SEVIS school code: KAN214F00634000. Complete and submit the \"Intent to Transfer to KU\" e-form online: Step 1 \u2013 Setup KU Online ID (you will need your 7-digit KUID number [e.g. 2991234]) Step 2 \u2013 Log in to iHawk by using the blue \u201cLogin\u201d button. Step 3 \u2013 Complete and submit the \u201cIntent to Transfer to KU from a U.S. School\u201d e-form, under the \u201cGet Your I-20/DS-2019\u201d tab.\u00a0 You will need to provide the email address of your current international student advisor at the transfer-out school and any Reduced Course Load (RCL) information. After the release date, KU will produce an I-20 if all e-forms have been completed and approved. KU cannot produce the I-20 before the release date . Once the I-20 is issued, you will receive it in the manner which you selected in iHawk. To complete the transfer, you are required to report to an international student advisor at KU within 15 days of the report date on your new KU I-20, during orientation or at an ISS Check-in session. \n"
    "Driver License & SSN Issues Your transferred in SEVIS record is typically not registered (changed to active status) until orientation/ISS Check-ins are completed, you have enrolled, and classes have started.\u00a0 If you can, you should wait to apply for your driver\u2019s license or social security number until after classes have started.\u00a0 If you absolutely cannot wait, consult the ISS office. Re-admitted Students, Transferring Back to KU Checklist of things to do: Have your SEVIS record released (transferred) to KU: See instructions above Complete a required ISS Check In: As a readmitted student, you are not required (although certainly welcome) to attend the International Student Orientation, however, you are still required to complete the ISS Check-in. Review the details on Readmitted Student Arrival and Check-in Processes ."
)

user_prompt = user_prompt_template.format(text=text.strip(), url=url)
print(user_prompt)

Given the following passage and URL, generate 3 to 5 high-quality question-answer (Q&A) pairs that are:

- Fresh and unique: Prioritize lesser-known or underused aspects of the passage.
- Factual and grounded: Base everything strictly on the given text.
- Diverse and non-redundant: Cover different ideas without overlap.
- Clearly supported: Include a supporting excerpt from the text as 'context'.

Each Q&A pair must have:
- question: A clear, specific question based on the content.
- answer: A concise, factually correct answer from the passage.
- context: A supporting excerpt from the passage.
- source: The URL of the passage.

Avoid repeating topics across questions. Focus especially on detailed points that are easy to overlook but still valuable to know.

Passage:
F-1 Transfer All students currently in F-1 status at any type of institution (high school, college, university, intensive English institute) in the U.S. (including those on Optional Practical Training) who plan to change sc

In [None]:
# OPENAI_API_KEY is not assigned anywhere in the visible notebook. Keep a value
# that an earlier cell may have set; otherwise read it from the environment so
# the key is never hardcoded in the notebook.
try:
  OPENAI_API_KEY
except NameError:
  OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

client = OpenAI(
  api_key = OPENAI_API_KEY
)

# Single request with the sample prompt to check the model's output format
# before running the full generation loop.
completion = client.chat.completions.create(
  model="gpt-4o-mini",
  store=True,
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt}
  ]
)
print(completion.choices[0].message)


In [None]:
# Generate Q&A pairs for every crawled-domain file in RAW_DATA_PATH and write
# them to a file of the same name in QA_DATA_PATH.

# One client is reused for all requests instead of being rebuilt per page.
client = OpenAI(
  api_key = OPENAI_API_KEY
)

for filename in sorted(os.listdir(RAW_DATA_PATH)):
  raw_file_path = os.path.join(RAW_DATA_PATH, filename)
  # os.listdir also returns sub-directories such as '.ipynb_checkpoints';
  # opening one raises IsADirectoryError, so only regular .json files are read.
  if not (filename.endswith(".json") and os.path.isfile(raw_file_path)):
    continue

  with open(raw_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)
  res = []
  print(f'Gnerating Q&As for {filename}')
  for item in data:
    url = item['url']
    text = item['text']
    user_prompt = user_prompt_template.format(text=text.strip(), url=url)

    # print(f'  Generating Q&As for {url}')
    try:
      completion = client.chat.completions.create(
        model="gpt-4o-mini",
        store=True,
        messages=[
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": user_prompt}
        ]
      )
      raw_output = completion.choices[0].message.content
      # The model sometimes wraps the JSON in a Markdown code fence.
      raw_output = raw_output.strip().replace("```json", "").replace("```", "").strip()
      output = json.loads(raw_output)
      # list.extend on a dict would silently add its keys as rows.
      if not isinstance(output, list):
        raise ValueError(f'expected a JSON array, got {type(output).__name__}')
      res.extend(output)
    except Exception as e:
      # Best-effort: a page that fails is reported (with its URL) and skipped.
      print(f'Failed to generate prompts for {url}: {e}')

    # Throttle requests to the API.
    time.sleep(1.5)

  with open(os.path.join(QA_DATA_PATH, filename), "w", encoding="utf-8") as f:
    json.dump(res, f, indent=2)

Generating Q&As for https://engr.ku.edu
Generating Q&As for https://engr.ku.edu/navigator
Generating Q&As for https://engr.ku.edu/
Generating Q&As for https://engr.ku.edu/about
Generating Q&As for https://engr.ku.edu/visit-us
Generating Q&As for https://engr.ku.edu/our-mission
Generating Q&As for https://engr.ku.edu/explore
Generating Q&As for https://engr.ku.edu/facilities-maps
Generating Q&As for https://engr.ku.edu/recognition-ceremonies
Generating Q&As for https://engr.ku.edu/news
Generating Q&As for https://engr.ku.edu/alumni-friends
Generating Q&As for https://engr.ku.edu/history
Generating Q&As for https://engr.ku.edu/contact-ku-school-engineering
Generating Q&As for https://engr.ku.edu/academics
Generating Q&As for https://engr.ku.edu/departments
Generating Q&As for http://bio.engr.ku.edu/
Generating Q&As for http://cpe.engr.ku.edu/
Generating Q&As for http://ephx.engr.ku.edu/
Generating Q&As for https://engr.ku.edu/current-undergraduate-students
Generating Q&As for https://eng

IsADirectoryError: [Errno 21] Is a directory: '/content/raw_data/.ipynb_checkpoints'

In [None]:
# Generate Q&A pairs for a single crawled domain (iss.ku.edu) only.
with open(f"{RAW_DATA_PATH}iss_ku_edu.json", "r", encoding="utf-8") as f:
  data = json.load(f)

# One client is reused for all requests instead of being rebuilt per page.
client = OpenAI(
  api_key = OPENAI_API_KEY
)

res = []
for item in data:
  url = item['url']
  text = item['text']
  user_prompt = user_prompt_template.format(text=text.strip(), url=url)

  print(f'Generating Q&As for {url}')
  try:
    completion = client.chat.completions.create(
      model="gpt-4o-mini",
      store=True,
      messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
      ]
    )
    raw_output = completion.choices[0].message.content
    # The model sometimes wraps the JSON in a Markdown code fence.
    raw_output = raw_output.strip().replace("```json", "").replace("```", "").strip()
    output = json.loads(raw_output)
    # list.extend on a dict would silently add its keys as rows.
    if not isinstance(output, list):
      raise ValueError(f'expected a JSON array, got {type(output).__name__}')
    res.extend(output)
  except Exception as e:
    # Best-effort: a page that fails is reported (with its URL) and skipped.
    print(f'Failed to generate prompts for {url}: {e}')

  # Throttle requests to the API.
  time.sleep(1.5)

with open(f'{QA_DATA_PATH}iss_ku_edu.json', "w", encoding="utf-8") as f:
  json.dump(res, f, indent=2)

Generating Q&As for https://iss.ku.edu
Generating Q&As for https://iss.ku.edu/navigator
Generating Q&As for https://iss.ku.edu/
Generating Q&As for https://iss.ku.edu/contact-us
Generating Q&As for https://iss.ku.edu/our-services
Generating Q&As for https://iss.ku.edu/people
Generating Q&As for https://iss.ku.edu/advising
Generating Q&As for https://iss.ku.edu/hours-operation-location
Generating Q&As for https://iss.ku.edu/messages-iss
Generating Q&As for https://iss.ku.edu/apply
Generating Q&As for https://iss.ku.edu/jayhawk-semester-students
Generating Q&As for https://iss.ku.edu/sponsored-students
Generating Q&As for https://iss.ku.edu/newly-admitted-students
Generating Q&As for https://iss.ku.edu/student-id-number
Generating Q&As for https://iss.ku.edu/applying-your-i-20ds-2019-form
Generating Q&As for https://iss.ku.edu/estimated-cost-attendance
Generating Q&As for https://iss.ku.edu/proof-finances
Generating Q&As for https://iss.ku.edu/f-1-transfer
Generating Q&As for https://iss

## Creating the dataset

In [None]:
# Collect the Q&A records of every per-domain file into one DataFrame.
rows = []

# Sorted so the row order (and therefore the seeded train/val/test split
# below) does not depend on the arbitrary order returned by os.listdir.
for filename in sorted(os.listdir(QA_DATA_PATH)):
  qa_file_path = os.path.join(QA_DATA_PATH, filename)
  # Skip anything that is not a regular .json file, e.g. the
  # '.ipynb_checkpoints' directory that Jupyter/Colab may create.
  if not (filename.endswith(".json") and os.path.isfile(qa_file_path)):
    continue
  with open(qa_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)
  rows.extend(data)
df = pd.DataFrame(rows)
df.head(7)

In [None]:
# Tag every Q&A row with the host name of its source URL; the column is used
# to stratify the dataset split.
df['subdomain'] = [urlparse(str(source_url)).netloc for source_url in df['source']]
# Drop all rows whose source is people.eecs.ku.edu.
# NOTE(review): the reason for excluding this host is not recorded - confirm.
df = df.loc[df['subdomain'] != 'people.eecs.ku.edu']

In [None]:
# Step 1: hold out 20% of all rows as the test set, keeping the subdomain
# proportions the same in both parts.
train_val_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["subdomain"]
)

# Step 2: take 12.5% of the remaining 80% as validation data
# (0.125 * 0.8 = 0.10), giving roughly a 70 / 10 / 20 train / val / test split.
train_df, val_df = train_test_split(
    train_val_df, test_size=0.125, random_state=42, stratify=train_val_df["subdomain"]
)