# RAG System for IEP Goal Generation

_A Retrieval-Augmented Generation (RAG) project for generating IEP transition goals for special education, using NLP and standards alignment._

---


In [1]:
# Install required libraries
!pip install langchain sentence-transformers faiss-cpu streamlit gradio beautifulsoup4 requests pypdf openai python-dotenv

Collecting langchain
  Downloading langchain-0.2.17-py3-none-any.whl.metadata (7.1 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting streamlit
  Downloading streamlit-1.40.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.14.2-py3-none-any.whl.metadata (3.8 kB)
Collecting requests
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting pypdf
  Downloading pypdf-5.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting openai
  Downloading openai-2.7.1-py3-none-any.whl.metadata (29 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting PyYAML>=5.3 (from langchain)
  Downloading PyYAML-6.0.3-cp

In [2]:
import os
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter


  from tqdm.autonotebook import tqdm, trange


## 1. Data Collection & Preprocessing

In [3]:
def scrape_bls_occupation(url, timeout=10):
    """Scrape occupation information from a BLS Occupational Outlook Handbook page.

    Args:
        url: Address of the occupation page to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        dict with the keys 'duties', 'work_env', 'training' and 'salary'. Each
        value is the text of the page's <div> whose id is 'duties',
        'work-environment', 'requirements' or 'pay' respectively, or '' when
        the page has no such <div>.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page rather than parsing it into four empty strings.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    def ext(section):
        """Return the text of the <div> with id `section`, or '' if it is missing."""
        tag = soup.find('div', id=section)
        # Join text fragments with a space; without a separator the words of
        # adjacent elements are fused together.
        return tag.get_text(separator=' ', strip=True) if tag else ''

    return {
        'duties': ext('duties'),
        'work_env': ext('work-environment'),
        'training': ext('requirements'),
        'salary': ext('pay'),
    }

# Example BLS scrape
# retail = scrape_bls_occupation('https://www.bls.gov/ooh/sales/retail-sales-workers.htm')

In [4]:
# Standards corpus (stub). Extracting the standards text from a local PDF
# (e.g. with pypdf or pdfplumber) is skipped here due to online notebook
# limitations, so a short placeholder string stands in for the extracted text.
# TODO: replace with the real extracted standards text.
standards_text = "Employability: Communicate and work productively; adapt to roles; demonstrate ethical behavior; show initiative; accountability;..."

In [5]:
# Exemplar IEP transition goals, hard-coded here as placeholders.
# TODO: replace with realistic exemplars gathered from multiple external sources.
sample_iep_goals = [
    "After high school, Clarence will obtain a full-time job at Walmart as a sales associate.",
    "After high school, Clarence will complete on-the-job training provided by Walmart and participate in employer-sponsored customer service workshops.",
    "In 36 weeks, Clarence will demonstrate effective workplace communication and customer service skills..."
]


In [6]:
def chunk_documents(documents, chunk_size=512, chunk_overlap=50):
    """Split documents into overlapping character-based chunks.

    Args:
        documents: Iterable whose items are LangChain Document objects and/or
            plain strings.
        chunk_size: Maximum chunk length, measured in characters (len).
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        list of Document chunks, in the order of the input items.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = []
    for item in documents:
        if isinstance(item, str):
            # split_documents reads .page_content/.metadata, which a bare string
            # lacks; create_documents wraps raw text in Document objects first.
            chunks.extend(splitter.create_documents([item]))
        else:
            chunks.extend(splitter.split_documents([item]))
    return chunks
# Placeholder: actual call would use real data
# chunks = chunk_documents([standards_text, *sample_iep_goals])

## 2. Embedding & Vector Store

In [7]:
# Embed the demo corpus: the exemplar goals followed by the standards excerpt.
model = SentenceTransformer('all-MiniLM-L6-v2')
dummy_chunks = sample_iep_goals + [standards_text]
embeddings = model.encode(dummy_chunks, normalize_embeddings=True)

# One metadata record per chunk; 'idx' is the chunk's position in the corpus,
# which is also its row in the index built below.
meta = [
    {'text': chunk_text, 'idx': position}
    for position, chunk_text in enumerate(dummy_chunks)
]

# The embeddings are normalised, so an inner-product index ranks by cosine similarity.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(np.asarray(embeddings, dtype='float32'))

# Persist both artefacts so they can be reloaded without re-embedding.
faiss.write_index(index, 'iep_faiss.index')
with open('iep_metadata.pkl', 'wb') as f:
    pickle.dump(meta, f)

In [8]:
def retrieve_relevant_context(query, model, index, meta, k=3):
    """Return the metadata records of the chunks most similar to a query.

    Args:
        query: Free-text search string.
        model: Encoder exposing encode(list_of_str, normalize_embeddings=True).
        index: Vector index exposing search(vectors, k); if it has an `ntotal`
            attribute, that is used to cap k.
        meta: Sequence of per-chunk records; record i is assumed to describe
            the vector stored at row i of the index.
        k: Maximum number of records to return.

    Returns:
        Up to k records from meta, in the order the index returned them. Fewer
        than k are returned when the index holds fewer than k vectors.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with id -1, and meta[-1] would silently return the last record.
    k = min(k, getattr(index, 'ntotal', k))
    if k <= 0:
        return []
    q_emb = model.encode([query], normalize_embeddings=True)
    _, ids = index.search(np.array(q_emb, dtype='float32'), k)
    # Defensive filter in case the index still reports padding or stale ids.
    return [meta[int(i)] for i in ids[0] if 0 <= i < len(meta)]
# Example: look up the closest chunks for a sample career query and show their text
query = 'retail sales associate requirements'
retrieved = retrieve_relevant_context(query, model, index, meta)
for hit in retrieved:
    print(hit['text'])


After high school, Clarence will obtain a full-time job at Walmart as a sales associate.
After high school, Clarence will complete on-the-job training provided by Walmart and participate in employer-sponsored customer service workshops.
In 36 weeks, Clarence will demonstrate effective workplace communication and customer service skills...


## 3. Prompt Engineering & Generation Prep

In [None]:
def construct_prompt(student_info, context):
    """Assemble the LLM prompt for generating IEP transition goals.

    Args:
        student_info: Mapping with the keys 'name', 'age', 'grade',
            'interests' and 'assessment'.
        context: Iterable of records whose 'text' entry is a string; each one
            becomes a bullet under the career/standards heading.

    Returns:
        The prompt as one string: student header, context bullets, then the
        generation instructions.
    """
    header = (
        "You are an expert in special education transition planning.\n"
        f"Student: {student_info['name']}, Age: {student_info['age']}, Grade: {student_info['grade']}\n"
        f"Interests: {student_info['interests']}\n"
        f"Assessment: {student_info['assessment']}\n"
        "\n"
        "Relevant Career/Standards Info:\n"
    )
    bullets = "".join("- " + item['text'] + "\n" for item in context)
    instructions = (
        "\nGenerate: (1) measurable postsecondary employment goal; "
        "(2) one annual IEP objective aligned to standards; "
        "(3) short-term objectives.\n"
        "Explain alignment to standards."
    )
    return header + bullets + instructions

# Example: build and show the prompt for a sample student using the retrieved context
student = {
    'name': 'Clarence',
    'age': 15,
    'grade': '10',
    'interests': 'Retail sales',
    'assessment': 'Strong in Enterprising',
}
prompt = construct_prompt(student, retrieved)
print(prompt)


You are an expert in special education transition planning.
Student: Clarence, Age: 15, Grade: 10
Interests: Retail sales
Assessment: Strong in Enterprising

Relevant Career/Standards Info:
- After high school, Clarence will obtain a full-time job at Walmart as a sales associate.
- After high school, Clarence will complete on-the-job training provided by Walmart and participate in employer-sponsored customer service workshops.
- In 36 weeks, Clarence will demonstrate effective workplace communication and customer service skills...

Generate: (1) measurable postsecondary employment goal; (2) one annual IEP objective aligned to standards; (3) short-term objectives.
Explain alignment to standards.


In [None]:
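# Outline only: for local use of OpenAI, set OPENAI_API_KEY in the environment and uncomment.
# Uses the openai>=1.0 client interface; the legacy openai.ChatCompletion API was removed in 1.0.
# from openai import OpenAI
# client = OpenAI()  # reads OPENAI_API_KEY from the environment
# response = client.chat.completions.create(model='gpt-4', messages=[
#     {"role":"system", "content": "You are an expert in special education..."},
#     {"role":"user", "content": prompt}
# ])
# print(response.choices[0].message.content)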


In [None]:
# Streamlit UI (export this notebook to a .py script first, then run with: streamlit run thisnotebook.py)
# import streamlit as st
# Implement inputs for student name, age, interests, assessment
# On submit: retrieve context, create prompt, run LLM, display/output goals
# See project writeup for full UI design


In [None]:
# Reload the index and metadata saved earlier for repeated querying (only unpickle files you created yourself)
# with open('iep_metadata.pkl', 'rb') as f: meta = pickle.load(f)
# index = faiss.read_index('iep_faiss.index')
