# Syllabus Similarity Notebook
This notebook parses two syllabi (PDF or DOCX), splits each into named sections, computes sentence embeddings for those sections, and outputs a similarity score.

In [16]:
import os
from pathlib import Path
from typing import Dict, Optional, Sequence, Tuple, Union

import docx
import nltk
import numpy as np
import pandas as pd
import pdfplumber
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Sentence-embedding model used for every similarity computation in this notebook.
# all-mpnet-base-v2 is a Sentence-Transformers checkpoint built on MPNet (not BERT).
MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(MODEL_NAME)
nlp = spacy.blank("en")  # Blank English pipeline; or use a full model for more advanced parsing

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, joined by newlines.

    A page for which `extract_text()` returns nothing contributes an empty
    string, so the number of newline-separated chunks equals the page count.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def extract_text_from_docx(docx_path):
    """Return the text of the document's paragraphs, one paragraph per line.

    Only `Document.paragraphs` is read.
    """
    document = docx.Document(docx_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

def clean_and_tokenize(text):
    """Tokenize `text` with the module-level `nlp` pipeline and rejoin the tokens with single spaces."""
    token_texts = (token.text for token in nlp(text))
    return " ".join(token_texts)


In [21]:
import re
import difflib

# Canonical section titles, in the order used for per-section features
# (expand as needed).
SECTION_TITLES = [
    "Course Description",
    "Learning Outcomes",
    "Objectives",
    "Prerequisites",
    "Textbook",
    "Required Materials",
    "Grading",
    "Schedule",
    "Policies",
    "Attendance",
    "Assignments",
    "Instructor",
    "Contact",
    "Office Hours",
]

def find_closest_section(line, section_titles, cutoff=0.75):
    """Fuzzy-match a line of text against the known section titles.

    The line is stripped and lower-cased, then compared with the lower-cased
    titles via `difflib.get_close_matches`.

    Args:
        line: Candidate header line.
        section_titles: Canonical titles to match against.
        cutoff: Minimum similarity ratio (0..1) required for a match.

    Returns:
        The matching title with its canonical capitalization, or None when no
        title reaches `cutoff`.
    """
    lowered_titles = [title.lower() for title in section_titles]
    query = line.strip().lower()
    best = difflib.get_close_matches(query, lowered_titles, n=1, cutoff=cutoff)
    if not best:
        return None
    # Map the lower-cased winner back to its canonical spelling.
    return section_titles[lowered_titles.index(best[0])]

In [22]:
def _store_section(sections, title, lines):
    """Record `lines` as the body of `title`, appending to any body stored earlier."""
    if not title or not lines:
        return
    body = "\n".join(lines).strip()
    earlier = sections.get(title)
    # A header can legitimately (or through a fuzzy false positive) occur more
    # than once; keep every occurrence instead of overwriting the first one.
    sections[title] = f"{earlier}\n{body}".strip() if earlier else body


def section_syllabus_text(text, section_titles=None):
    """Parse syllabus text into sections using fuzzy header matching.

    Each line is compared with the known titles via `find_closest_section`.
    A matching line starts a new section; every other line is appended to the
    section currently open. Lines before the first header are discarded.

    Args:
        text: Raw syllabus text.
        section_titles: Titles to look for. Defaults to the module-level
            `SECTION_TITLES`.

    Returns:
        Dict mapping canonical section title to its stripped body text. When
        a title occurs more than once, the bodies are concatenated in document
        order, separated by a newline. A header followed directly by another
        header (no body lines) is not recorded.
    """
    titles = SECTION_TITLES if section_titles is None else section_titles
    sections = {}
    current_section = None
    buffer = []
    for line in text.splitlines():
        match = find_closest_section(line, titles)
        if match:
            # Close the section that was open before switching to the new one.
            _store_section(sections, current_section, buffer)
            buffer = []
            current_section = match
        elif current_section:
            buffer.append(line)
    _store_section(sections, current_section, buffer)
    return sections

In [24]:
def parse_syllabus(file_path):
    """Extract the text of a syllabus file and split it into sections.

    Args:
        file_path: Path to a `.pdf` or `.docx` file, as a string or any
            `os.PathLike` (e.g. `pathlib.Path`). The extension check is
            case-insensitive.

    Returns:
        The dict produced by `section_syllabus_text` for the extracted text.

    Raises:
        ValueError: If the path ends in neither `.pdf` nor `.docx`.
    """
    # Normalise path-likes to str so the suffix test below works for Path too
    # (Path objects have no .lower()).
    path = os.fspath(file_path)
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        text = extract_text_from_pdf(path)
    elif lowered.endswith(".docx"):
        text = extract_text_from_docx(path)
    else:
        raise ValueError("Unsupported file type")
    # Section the syllabus
    return section_syllabus_text(text)

def syllabus_to_json(file_path):
    """Parse the syllabus at `file_path` and return its sections.

    Thin alias for `parse_syllabus`: the result is whatever that function
    returns (a dict), not a JSON string.
    """
    sections = parse_syllabus(file_path)
    return sections

In [25]:
def embed_section(text):
    """Encode `text` with the module-level `model`; falsy input is encoded as the empty string."""
    payload = text if text else ""
    return model.encode(payload, show_progress_bar=False)

def compute_section_similarities(s1, s2, section_titles=SECTION_TITLES):
    """Return one cosine similarity per title in `section_titles`.

    Args:
        s1, s2: Section dicts (title -> text) of the two syllabi.
        section_titles: Titles to score, in output order.

    Returns:
        List aligned with `section_titles`. An entry is 0.0 when the section
        is missing or whitespace-only in either syllabus.
    """
    def pair_similarity(title):
        first = s1.get(title, "")
        second = s2.get(title, "")
        # Nothing to compare unless both syllabi have real content here.
        if not (first.strip() and second.strip()):
            return 0.0
        first_vec = embed_section(first)
        second_vec = embed_section(second)
        return cosine_similarity([first_vec], [second_vec])[0][0]

    return [pair_similarity(title) for title in section_titles]

In [None]:
# Example: DataFrame with columns: pdf1, pdf2, label
df = pd.read_csv("syllabus_pairs.csv")
results = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    syl1 = syllabus_to_json(row['pdf1'])
    syl2 = syllabus_to_json(row['pdf2'])
    # The original call to `compute_similarity` referenced a function that is
    # not defined in this notebook (NameError). Score each pair with the
    # unweighted mean of its per-section cosine similarities instead.
    section_sims = compute_section_similarities(syl1, syl2)
    sim = float(np.mean(section_sims)) if section_sims else 0.0
    results.append({"sim": sim, "label": row['label']})

# Fix the column set so downstream cells still find 'sim' and 'label' when the
# CSV has no rows.
df_results = pd.DataFrame(results, columns=["sim", "label"])

In [19]:
import json

# Save parsed syllabi as JSON for schema consistency check.
SUPPORTED_SUFFIXES = {".pdf", ".docx"}
for file in sorted(Path("syllabi").glob("*")):
    # Only feed files the parser understands. Without this filter the .json
    # files written by a previous run of this cell (and any other stray file
    # or sub-directory) make parse_syllabus raise "Unsupported file type".
    if not file.is_file() or file.suffix.lower() not in SUPPORTED_SUFFIXES:
        continue
    parsed = syllabus_to_json(str(file))
    # Output sits next to the source, e.g. foo.docx -> foo.docx.json.
    with open(f"{file}.json", "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2)

In [None]:
# Example: DataFrame with columns: pdf1, pdf2, label
df = pd.read_csv("syllabus_pairs.csv")

# One feature row (the per-section similarities) and one label per pair.
feature_rows, labels = [], []
for _, row in tqdm(df.iterrows(), total=len(df)):
    s1, s2 = parse_syllabus(row['pdf1']), parse_syllabus(row['pdf2'])
    feature_rows.append(compute_section_similarities(s1, s2))
    labels.append(row['label'])
X, y = np.array(feature_rows), np.array(labels)

from sklearn.linear_model import LogisticRegression

# No intercept: the fitted coefficients are read directly as section weights.
clf = LogisticRegression(fit_intercept=False, solver='liblinear')
clf.fit(X, y)
print("Learned section weights:")
section_weights = zip(SECTION_TITLES, clf.coef_[0])
for section, weight in section_weights:
    print(f"{section}: {weight:.3f}")

In [None]:
from sklearn.metrics import average_precision_score

# Average precision of the similarity scores against the pair labels.
pair_labels = df_results['label']
pair_scores = df_results['sim']
map_score = average_precision_score(pair_labels, pair_scores)
print(f"Mean Average Precision (MAP): {map_score:.2f}")

## 1. Syllabus Parsing and Sectioning

In [8]:
# Section headers recognised by the parser, in canonical capitalization.
SECTION_TITLES = [
    'Course Description',
    'Learning Outcomes',
    'Objectives',
    'Prerequisites',
    'Textbook',
    'Required Materials',
    'Grading',
    'Schedule',
    'Policies',
    'Attendance',
    'Assignments',
    'Instructor',
    'Contact',
    'Office Hours',
]

def extract_text_from_pdf(pdf_path: str) -> str:
    """Concatenate the extracted text of all PDF pages, separated by newlines.

    Pages with no extractable text are represented by an empty string.
    """
    collected = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            collected.append(page_text if page_text else '')
    return '\n'.join(collected)

def extract_text_from_docx(docx_path: str) -> str:
    """Join the text of every entry in `Document.paragraphs` with newlines."""
    paragraphs = docx.Document(docx_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

def _store_section(sections, title, lines):
    """Record `lines` as the body of `title`, appending to any body stored earlier."""
    if not title or not lines:
        return
    body = '\n'.join(lines).strip()
    earlier = sections.get(title)
    # Keep every occurrence of a repeated header instead of overwriting.
    sections[title] = f'{earlier}\n{body}'.strip() if earlier else body


def section_syllabus_text(text: str, section_titles: Optional[Sequence[str]] = None) -> Dict[str, str]:
    """Split syllabus text into sections on exact, case-insensitive header lines.

    A line is a header when, after stripping surrounding whitespace, it equals
    one of the titles ignoring case. Titles are matched literally, so regex
    metacharacters in a title (e.g. "Textbook(s)") are safe. Lines before the
    first header are discarded.

    Args:
        text: Raw syllabus text.
        section_titles: Titles to look for. Defaults to the module-level
            `SECTION_TITLES`.

    Returns:
        Dict mapping canonical title to stripped body text. Bodies of a title
        that occurs more than once are concatenated in document order,
        separated by a newline. A header with no body lines is not recorded.
    """
    titles = SECTION_TITLES if section_titles is None else section_titles
    # Compile once; re.escape makes each title a literal pattern.
    header_patterns = [
        (title, re.compile(re.escape(title), re.IGNORECASE)) for title in titles
    ]

    sections: Dict[str, str] = {}
    current_section = None
    buffer = []
    for line in text.splitlines():
        line_strip = line.strip()
        header = next(
            (title for title, pattern in header_patterns if pattern.fullmatch(line_strip)),
            None,
        )
        if header is not None:
            _store_section(sections, current_section, buffer)
            buffer = []
            current_section = header
        elif current_section:
            buffer.append(line)
    _store_section(sections, current_section, buffer)
    return sections

def parse_syllabus(file_path: Union[str, os.PathLike]) -> Dict[str, str]:
    """Extract the text of a syllabus file and split it into sections.

    Args:
        file_path: Path to a `.pdf` or `.docx` file, as a string or any
            `os.PathLike` (e.g. `pathlib.Path`). The extension check is
            case-insensitive.

    Returns:
        The section dict produced by `section_syllabus_text`.

    Raises:
        ValueError: If the path ends in neither `.pdf` nor `.docx`.
    """
    # Normalise path-likes to str; Path objects have no .lower().
    path = os.fspath(file_path)
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        text = extract_text_from_pdf(path)
    elif lowered.endswith('.docx'):
        text = extract_text_from_docx(path)
    else:
        raise ValueError('Unsupported file type')
    return section_syllabus_text(text)


In [None]:
def print_found_sections(syllabus_dict, label):
    """Print each parsed section of a syllabus with its whitespace-delimited word count.

    Args:
        syllabus_dict: Mapping of section title to section text.
        label: Name shown in the report's heading line.
    """
    report = [f"Sections found in {label}:"]
    report.extend(
        f"  {section}: {len(content.split())} words"
        for section, content in syllabus_dict.items()
    )
    # The trailing empty entry yields the blank line that ends the report.
    report.append("")
    print("\n".join(report))

In [13]:
def embed_text(text, model):
    """Encode `text` with the given `model`; falsy input is encoded as the empty string."""
    payload = text if text else ''
    return model.encode(payload, show_progress_bar=False)

def fallback_document_similarity(s1_text, s2_text, model):
    """Return the cosine similarity between the embeddings of two whole documents.

    Args:
        s1_text, s2_text: Full text of each document.
        model: Encoder passed through to `embed_text`.
    """
    first_vec, second_vec = (embed_text(doc, model) for doc in (s1_text, s2_text))
    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
    score_matrix = cosine_similarity([first_vec], [second_vec])
    return score_matrix[0][0]

## 2. Syllabus Similarity Model

In [None]:
# Embedding model used by embed_section / compare_syllabi below.
# Rebinds MODEL_NAME and model with the same checkpoint name the first code cell uses.
MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(MODEL_NAME)

# Relative weight of each section in the overall similarity score.
# Only the sections listed here take part in the comparison.
SECTION_WEIGHTS = {
    "Course Description": 2.0,
    "Learning Outcomes": 3.0,
    "Objectives": 2.0,
    "Prerequisites": 1.0,
    "Grading": 0.5,
    "Schedule": 1.0,
}
# Section names in weight-table order.
ALL_SECTIONS = list(SECTION_WEIGHTS)

def embed_section(section_text: str) -> np.ndarray:
    """Encode one section with the module-level `model`; falsy input is encoded as the empty string."""
    payload = section_text if section_text else ''
    return model.encode(payload, show_progress_bar=False)

def compare_syllabi(s1: Dict[str, str], s2: Dict[str, str]) -> Tuple[float, Dict[str, float]]:
    """Score two sectioned syllabi with a weighted average of per-section similarities.

    Every section in `ALL_SECTIONS` is scored. A section that is missing or
    whitespace-only in either syllabus scores 0.0 and still counts with its
    full weight in the average.

    Args:
        s1, s2: Section dicts (title -> text) of the two syllabi.

    Returns:
        `(overall_score, similarities)`: the weighted mean over
        `SECTION_WEIGHTS`, and the per-section cosine similarities.
    """
    per_section = {}
    numerator = 0.0
    denominator = 0.0
    for name in ALL_SECTIONS:
        first = s1.get(name, '')
        second = s2.get(name, '')
        score = 0.0
        if first.strip() and second.strip():
            first_vec = embed_section(first)
            second_vec = embed_section(second)
            score = cosine_similarity([first_vec], [second_vec])[0][0]
        per_section[name] = score
        weight = SECTION_WEIGHTS[name]
        numerator += score * weight
        denominator += weight
    if not denominator:
        # No weight at all (empty weight table): define the score as 0.
        return 0.0, per_section
    return numerator / denominator, per_section


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


## 3. Demo: Compare Two Syllabi

In [14]:
# Example usage: replace the file names with the two syllabi to compare
# (.pdf or .docx). Paths are relative to the notebook's working directory,
# matching the Path("syllabi") folder used by the JSON export cell, so the
# notebook does not depend on one user's home directory.
SYLLABI_DIR = Path('syllabi')
syllabus1_path = str(SYLLABI_DIR / 'AERG200_syllabus.docx')
syllabus2_path = str(SYLLABI_DIR / 'AERG101_syllabus.docx')

# Section-based scores below this value trigger the whole-document fallback.
FALLBACK_THRESHOLD = 0.1


def _extract_raw_text(path):
    """Return the unsectioned text of a syllabus, choosing the extractor by extension."""
    if path.lower().endswith('.pdf'):
        return extract_text_from_pdf(path)
    return extract_text_from_docx(path)


s1 = parse_syllabus(syllabus1_path)
s2 = parse_syllabus(syllabus2_path)

print_found_sections(s1, "Syllabus 1")
print_found_sections(s2, "Syllabus 2")

overall_score, section_scores = compare_syllabi(s1, s2)
print(f'Overall Section-Based Similarity Score: {overall_score:.3f}')
print('Section Similarities:')
for section, score in section_scores.items():
    print(f'  {section}: {score:.3f}')

# Fallback: If section-based similarity is too low, use whole-document similarity
if overall_score < FALLBACK_THRESHOLD:
    print("\nSection-based similarity is very low. Falling back to whole-document similarity...")
    # Re-extract raw text for both syllabi
    s1_text = _extract_raw_text(syllabus1_path)
    s2_text = _extract_raw_text(syllabus2_path)
    doc_score = fallback_document_similarity(s1_text, s2_text, model)
    print(f'Whole-Document Similarity Score: {doc_score:.3f}')


Sections found in Syllabus 1:
  Course Description: 58 words
  Textbook: 615 words

Sections found in Syllabus 2:
  Course Description: 66 words
  Textbook: 1135 words

Overall Section-Based Similarity Score: 0.069
Section Similarities:
  Course Description: 0.326
  Learning Outcomes: 0.000
  Objectives: 0.000
  Prerequisites: 0.000
  Grading: 0.000
  Schedule: 0.000

Section-based similarity is very low. Falling back to whole-document similarity...
Whole-Document Similarity Score: 0.536
