# Doctoral parser logic

In [1]:
from PyPDF2 import PdfReader, PdfWriter
import os

# Ok so we are going to load in the data and start everything again for the Phd guys
input_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"
output_name = "Doctoral baseline.pdf"
output_path = os.path.join(os.path.dirname(input_path), output_name)

# read the original PDF, copy all pages into a new PdfWriter and write to the output file
reader = PdfReader(input_path)
pdf_writer = PdfWriter()
for page in reader.pages:
    pdf_writer.add_page(page)

with open(output_path, "wb") as f:
    pdf_writer.write(f)

# to confirm that the file has been saved correctly, you can print a success message
print("The whole document has been successfully extracted and saved as 'Doctoral baseline.pdf'.")

The whole document has been successfully extracted and saved as 'Doctoral baseline.pdf'.


In [2]:
# ALRIGHT its time to begin the parser logic again but for the doctoral data this time
import pdfplumber
import re
import pandas as pd

# Full PDF path
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Output Excel file
output_excel = "Doctoral output.xlsx"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)

# Instructor: capture up to end-of-line
instructor_pattern = re.compile(r"^Instructor:\s*([^\n\r]+)$", re.IGNORECASE)

# Semester: allow trailing parenthetical (e.g., (UNDG))
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Strip any semester tokens if they bleed into the instructor line
strip_semester_in_line = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE),
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]

# Table and header noise to exclude
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a", "na", "none", "nil", "ok", "good", "fair", "excellent", "poor"}

def repair_broken_words(text):
    text = re.sub(
        r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b',
        lambda m: m.group(0).replace(" ", ""),
        text
    )
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    if "%" in line:
        return True
    if len(re.findall(r"\d+", line)) >= 2:
        return True
    return False

def flush_atomic_or_comment(candidate, records,
                            current_course, current_section,
                            current_instructor, current_semester,
                            current_type):
    tokens = candidate.split()

    def append_record(text):
        records.append({
            "Course Code": current_course,
            "Section Code": current_section,
            "Instructor": current_instructor,
            "Semester": current_semester,
            "Comment Type": current_type,
            "Comment Text": text
        })

    # If the whole candidate is atomic, save directly
    if candidate.lower() in atomic_responses:
        append_record(candidate)
        return

    # Otherwise, split tokens and save atomics separately
    buffer_tokens = []
    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                append_record(" ".join(buffer_tokens))
                buffer_tokens = []
            append_record(tok)
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        append_record(" ".join(buffer_tokens))

records = []
current_course = current_section = current_instructor = current_semester = current_type = None
buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section: reset instructor & semester on new section header
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                current_instructor = None
                current_semester = None
                continue

            # Detect instructor (strip any inline semester)
            instr_match = instructor_pattern.search(line)
            if instr_match:
                instructor_raw = instr_match.group(1).strip()
                instructor_clean = strip_semester_in_line.sub("", instructor_raw).strip().rstrip(",")
                current_instructor = instructor_clean
                # Also capture semester if it appears inline with instructor
                sem_inline = semester_pattern.search(instructor_raw)
                if sem_inline:
                    current_semester = sem_inline.group(0).strip()
                continue

            # Detect semester (on its own line)
            sem_match = semester_pattern.search(line)
            if sem_match:
                current_semester = sem_match.group(0).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    buffer = ""
                    break

            # Skip noise/prompt/table lines
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                buffer = ""
                continue

            # If atomic response, save immediately (only if metadata present)
            if line.lower() in atomic_responses:
                if current_course and current_section and current_instructor and current_semester and current_type:
                    flush_atomic_or_comment(line, records, current_course, current_section, current_instructor, current_semester, current_type)
                buffer = ""
                continue

            # Accumulate partial sentences
            buffer += " " + line
            if re.search(r"[.?!]$", line):
                candidate = buffer.strip()
                buffer = ""
                # Only write if all metadata present
                if current_course and current_section and current_instructor and current_semester and current_type:
                    flush_atomic_or_comment(candidate, records, current_course, current_section, current_instructor, current_semester, current_type)

# Convert to DataFrame and save to Excel
df_comments = pd.DataFrame(records)
print(f"Extracted {len(df_comments)} comments")
print(df_comments.head(20).to_string(index=False))

df_comments.to_excel(output_excel, index=False)



Extracted 0 comments
Empty DataFrame
Columns: []
Index: []


ok so clearly the UNG parser logic does not at all. So we have to start to overhaul it 

## Overhaul 1.2

In [3]:
import pdfplumber
import re
import pandas as pd

# Full PDF path
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Output Excel file
output_excel = "Doctoral output.xlsx"

# Map question prompts to categories (expanded for PhD)
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall",
    "group work": "Feedback Evaluation",
    "case studies": "Resources",
    "lecturer was readily available": "Overall",
    "feedback was very insightful": "Suggestions",
    "analysis and interpretation": "Strengths",
    "financial statement": "Strengths",
    "peacebuilding": "Course Text",
    "conflict resolution": "Course Text",
}

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)

instructor_pattern = re.compile(r"^Instructor:\s*([^\n\r]+)$", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)
strip_semester_in_line = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Instructor fallbacks
name_last_first = re.compile(r"^[A-Z][a-z]+,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?$")
name_first_last = re.compile(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$")

# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE),
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]

# Table and header noise to exclude
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a", "na", "none", "nil", "ok", "good", "fair", "excellent", "poor"}

def repair_broken_words(text):
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b', lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line): return any(p.search(line) for p in noise_patterns)
def is_prompt(line): return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    if "%" in line: return True
    if len(re.findall(r"\d+", line)) >= 2: return True
    return False

def flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype):
    candidate = candidate.strip()
    if not candidate or candidate in {".", ".."}: return
    tokens = candidate.split()
    buffer_tokens = []

    def append_record(text):
        records.append({
            "Course Code": course,
            "Section Code": section,
            "Instructor": instructor,
            "Semester": semester,
            "Comment Type": ctype,
            "Comment Text": text
        })
        print(f"[DEBUG] Added comment: {text}")

    if candidate.lower() in atomic_responses:
        append_record(candidate)
        return

    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                append_record(" ".join(buffer_tokens))
                buffer_tokens = []
            append_record(tok)
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        append_record(" ".join(buffer_tokens))

def parse_pdf(pdf_path):
    records = []
    course = section = instructor = semester = ctype = None
    buffer = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text: continue
            lines = text.split("\n")
            print(f"[DEBUG] Page {page_num}, {len(lines)} lines")

            for raw_line in lines:
                line = clean_line(raw_line)
                if not line: continue

                cs_match = section_header_pattern.search(line)
                if cs_match:
                    course, section = cs_match.groups()
                    instructor = None
                    semester = None
                    print(f"[DEBUG] Found course={course}, section={section}")
                    continue

                instr_match = instructor_pattern.search(line)
                if instr_match:
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = strip_semester_in_line.sub("", instructor_raw).strip().rstrip(",")
                    instructor = instructor_clean
                    sem_inline = semester_pattern.search(instructor_raw)
                    if sem_inline: semester = sem_inline.group(0).strip()
                    print(f"[DEBUG] Found instructor={instructor}, semester={semester}")
                    continue

                if instructor is None:
                    if name_last_first.match(line) or name_first_last.match(line):
                        instructor = line.strip()
                        print(f"[DEBUG] Fallback instructor={instructor}")

                sem_match = semester_pattern.search(line)
                if sem_match:
                    semester = sem_match.group(0).strip()
                    print(f"[DEBUG] Found semester={semester}")
                    continue

                for key, label in question_map.items():
                    if key in line.lower():
                        ctype = label
                        buffer = ""
                        print(f"[DEBUG] Found type={ctype}")
                        break

                if is_prompt(line) or is_noise(line) or looks_like_table(line):
                    buffer = ""
                    continue

                if line.lower() in atomic_responses:
                    if course and section and instructor and semester and ctype:
                        flush_atomic_or_comment(line, records, course, section, instructor, semester, ctype)
                    buffer = ""
                    continue

                buffer += " " + line
                if re.search(r"[.?!]$", line):
                    candidate = buffer.strip()
                    buffer = ""
                    if course and section and instructor and semester and ctype:
                        flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype)

    df_comments = pd.DataFrame(records)
    print(f"Extracted {len(df_comments)} comments")
    return df_comments

# Run parser
df_comments = parse_pdf(pdf_path)
df_comments.to_excel(output_excel, index=False)

[DEBUG] Page 1, 67 lines
[DEBUG] Found semester=Summer Semester 2019 (DOC)
[DEBUG] Found type=Overall
[DEBUG] Found type=Course Text
[DEBUG] Found type=Course Text
[DEBUG] Fallback instructor=Sometim Dept All
[DEBUG] Page 2, 53 lines
[DEBUG] Found type=Strengths
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Found type=Resources
[DEBUG] Page 3, 46 lines
[DEBUG] Found type=Strengths
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Weaknesses
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Page 4, 47 lines
[DEBUG] Found type=Suggestions
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Suggestions
[DEBUG] Page 5, 46 lines
[DEBUG] Found type=Suggestions
[DEBUG] Found type=Course Text
[DEBUG] Found type=Course Text
[DEBUG] Found type=Course Text
[DEBUG] Found type=Course Text
[DEBUG] Page 6, 48 lines
[DEBUG] Fou

ok so what we know from all of that
What We Learned from the Debug Run
Semester detection works
We see repeated [DEBUG] Found semester=Summer Semester 2019 (DOC) — so semester regex is fine.
Comment type detection works
Strengths, Weaknesses, Suggestions, Feedback Evaluation, Resources, Course Text, Overall are all being picked up.
That means our expanded question_map is firing correctly.
Instructor detection is inconsistent
Sometimes we get real names: Kimotho, Stephen, Muturi, Lucy, Nyaga, Nancy.
Other times we get placeholders: Sometim Dept All, Staff, Faculty.
These placeholders are polluting the Instructor field and preventing flushes.
Zero comments extracted
Even though types and semesters are detected, flushes aren’t happening because the can_flush condition requires valid instructor + course + section + semester + type.
Since course/section regex isn’t firing (no [DEBUG] Found course=... lines), and instructor is often invalid, nothing passes the flush guard


so time to fix it

Overhaul 1.3

In [4]:
import pdfplumber
import re
import pandas as pd

# Full PDF path
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Output Excel file
output_excel = "Doctoral output.xlsx"

# Expanded question map for doctoral prompts
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall",
    # Doctoral-specific expansions
    "group work": "Feedback Evaluation",
    "case studies": "Resources",
    "lecturer was readily available": "Overall",
    "feedback was very insightful": "Suggestions",
    "analysis and interpretation": "Strengths",
    "financial statement": "Strengths",
    "peacebuilding": "Course Text",
    "conflict resolution": "Course Text",
    "guest lectures": "Resources",
    "journal critiques": "Feedback Evaluation",
    "presentations": "Strengths",
    "videos": "Resources",
    "assignments": "Feedback Evaluation",
    "timely feedback": "Feedback Evaluation",
    "strictness": "Weaknesses",
    "time management": "Suggestions",
    "entrepreneurship": "Course Text",
    "research interests": "Strengths",
    "mock meta-synthesis": "Suggestions",
}

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)

instructor_pattern = re.compile(r"^Instructor:\s*([^\n\r]+)$", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)
strip_semester_in_line = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Instructor fallbacks
name_last_first = re.compile(r"^[A-Z][a-z]+,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?$")
name_first_last = re.compile(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$")

# Placeholders to ignore
invalid_instructors = {"sometim dept all", "staff, faculty", "dept all", "all aspect"}

# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE),
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]

# Table and header noise to exclude
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a", "na", "none", "nil", "ok", "good", "fair", "excellent", "poor"}

def repair_broken_words(text):
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b', lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line): return any(p.search(line) for p in noise_patterns)
def is_prompt(line): return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    if "%" in line: return True
    if len(re.findall(r"\d+", line)) >= 2: return True
    return False

def can_flush(course, section, instructor, semester, ctype):
    valid_course = bool(course)
    valid_instructor = instructor and instructor.lower() not in invalid_instructors
    return ctype and semester and (valid_course or valid_instructor)

def flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype):
    candidate = candidate.strip()
    if not candidate or candidate in {".", ".."}: return
    tokens = candidate.split()
    buffer_tokens = []

    def append_record(text):
        records.append({
            "Course Code": course,
            "Section Code": section,
            "Instructor": instructor,
            "Semester": semester,
            "Comment Type": ctype,
            "Comment Text": text
        })
        print(f"[DEBUG] Flushed comment: {text}")

    if candidate.lower() in atomic_responses:
        append_record(candidate)
        return

    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                append_record(" ".join(buffer_tokens))
                buffer_tokens = []
            append_record(tok)
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        append_record(" ".join(buffer_tokens))

def parse_pdf(pdf_path):
    records = []
    course = section = instructor = semester = ctype = None
    buffer = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text: continue
            lines = text.split("\n")
            print(f"[DEBUG] Page {page_num}, {len(lines)} lines")

            for raw_line in lines:
                line = clean_line(raw_line)
                if not line: continue

                cs_match = section_header_pattern.search(line)
                if cs_match:
                    course, section = cs_match.groups()
                    instructor = None
                    semester = None
                    print(f"[DEBUG] Found course={course}, section={section}")
                    continue

                instr_match = instructor_pattern.search(line)
                if instr_match:
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = strip_semester_in_line.sub("", instructor_raw).strip().rstrip(",")
                    if instructor_clean.lower() in invalid_instructors:
                        instructor = None
                    else:
                        instructor = instructor_clean
                    sem_inline = semester_pattern.search(instructor_raw)
                    if sem_inline: semester = sem_inline.group(0).strip()
                    print(f"[DEBUG] Found instructor={instructor}, semester={semester}")
                    continue

                if instructor is None:
                    if name_last_first.match(line) or name_first_last.match(line):
                        instructor = line.strip()
                        print(f"[DEBUG] Fallback instructor={instructor}")

                sem_match = semester_pattern.search(line)
                if sem_match:
                    semester = sem_match.group(0).strip()
                    print(f"[DEBUG] Found semester={semester}")
                    continue

                for key, label in question_map.items():
                    if key in line.lower():
                        ctype = label
                        buffer = ""
                        print(f"[DEBUG] Found type={ctype}")
                        break

                if is_prompt(line) or is_noise(line) or looks_like_table(line):
                    buffer = ""
                    continue

                if line.lower() in atomic_responses:
                    if can_flush(course, section, instructor, semester, ctype):
                        flush_atomic_or_comment(line, records, course, section, instructor, semester, ctype)
                    buffer = ""
                    continue

                buffer += " " + line
                if re.search(r"[.?!]$", line):
                    candidate = buffer.strip()
                    buffer = ""
                    if can_flush(course, section, instructor, semester, ctype):
                        flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype)
                    continue

    df_comments = pd.DataFrame(records)
    print(f"Extracted {len(df_comments)} comments")
    return df_comments

# Run parser
df_comments = parse_pdf(pdf_path)
df_comments.to_excel(output_excel, index=False)

[DEBUG] Page 1, 67 lines
[DEBUG] Found semester=Summer Semester 2019 (DOC)
[DEBUG] Found type=Overall
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Course Text
[DEBUG] Found type=Course Text
[DEBUG] Fallback instructor=Sometim Dept All
[DEBUG] Page 2, 53 lines
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Strengths
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Page 3, 46 lines
[DEBUG] Found type=Strengths
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Weaknesses
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Page 4, 47 lines
[DEBUG] Found type=Weaknesses
[DEBUG] Found type=Suggestions
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Suggestions
[DEBUG] Page 5, 46 lines
[DEBU

So course code and course section is not being picked up at all. there is bleed through of the word 'week?'. As well as names are not being picked up, cause we only have 4 names being registered, yet the pdf has multiple ones that are not being registered. But so far we have improment

Overhaul 1.4

In [5]:
import pdfplumber
import re
import pandas as pd
from datetime import datetime

# Full PDF path
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Output Excel file
output_excel = "Doctoral output.xlsx"

# Expanded question map for doctoral prompts
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall",
    # Doctoral-specific expansions
    "group work": "Feedback Evaluation",
    "case studies": "Resources",
    "lecturer was readily available": "Overall",
    "feedback was very insightful": "Suggestions",
    "analysis and interpretation": "Strengths",
    "financial statement": "Strengths",
    "peacebuilding": "Course Text",
    "conflict resolution": "Course Text",
    "guest lectures": "Resources",
    "journal critiques": "Feedback Evaluation",
    "presentations": "Strengths",
    "videos": "Resources",
    "assignments": "Feedback Evaluation",
    "timely feedback": "Feedback Evaluation",
    "strictness": "Weaknesses",
    "time management": "Suggestions",
    "entrepreneurship": "Course Text",
    "research interests": "Strengths",
    "mock meta-synthesis": "Suggestions",
}

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)

instructor_pattern = re.compile(r"^Instructor:\s*([^\n\r]+)$", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)
strip_semester_in_line = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Instructor fallbacks
name_last_first = re.compile(r"^[A-Z][a-z]+,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?$")
name_first_last = re.compile(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$")

# Placeholders to ignore
invalid_instructors = {"sometim dept all", "staff, faculty", "dept all", "all aspect"}

# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE),
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]

# Table and header noise to exclude
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a", "na", "none", "nil", "ok", "good", "fair", "excellent", "poor"}

def repair_broken_words(text):
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b', lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line): return any(p.search(line) for p in noise_patterns)
def is_prompt(line): return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    if "%" in line: return True
    if len(re.findall(r"\d+", line)) >= 2: return True
    return False

def can_flush(course, section, instructor, semester, ctype):
    valid_course = bool(course)
    valid_instructor = instructor and instructor.lower() not in invalid_instructors
    return ctype and semester and (valid_course or valid_instructor)

def flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype):
    candidate = candidate.strip()
    if not candidate or candidate in {".", ".."}: return
    tokens = candidate.split()
    buffer_tokens = []

    def append_record(text):
        records.append({
            "Course Code": course,
            "Section Code": section,
            "Instructor": instructor,
            "Semester": semester,
            "Comment Type": ctype,
            "Comment Text": text
        })
        print(f"[DEBUG] Flushed comment: {text}")

    if candidate.lower() in atomic_responses:
        append_record(candidate)
        return

    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                append_record(" ".join(buffer_tokens))
                buffer_tokens = []
            append_record(tok)
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        append_record(" ".join(buffer_tokens))

def parse_pdf(pdf_path):
    records = []
    course = section = instructor = semester = ctype = None
    buffer = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text: continue
            lines = text.split("\n")
            print(f"[DEBUG] Page {page_num}, {len(lines)} lines")

            for raw_line in lines:
                line = clean_line(raw_line)
                if not line: continue

                cs_match = section_header_pattern.search(line)
                if cs_match:
                    course, section = cs_match.groups()
                    instructor = None
                    semester = None
                    print(f"[DEBUG] Found course={course}, section={section}")
                    continue

                instr_match = instructor_pattern.search(line)
                if instr_match:
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = strip_semester_in_line.sub("", instructor_raw).strip().rstrip(",")
                    if instructor_clean.lower() in invalid_instructors:
                        instructor = None
                    else:
                        instructor = instructor_clean
                    sem_inline = semester_pattern.search(instructor_raw)
                    if sem_inline: semester = sem_inline.group(0).strip()
                    print(f"[DEBUG] Found instructor={instructor}, semester={semester}")
                    continue

                if instructor is None:
                    if name_last_first.match(line) or name_first_last.match(line):
                        instructor = line.strip()
                        print(f"[DEBUG] Fallback instructor={instructor}")

                sem_match = semester_pattern.search(line)
                if sem_match:
                    semester = sem_match.group(0).strip()
                    print(f"[DEBUG] Found semester={semester}")
                    continue

                for key, label in question_map.items():
                    if key in line.lower():
                        ctype = label
                        buffer = ""
                        print(f"[DEBUG] Found type={ctype}")
                        break

                if is_prompt(line) or is_noise(line) or looks_like_table(line):
                    buffer = ""
                    continue

                if line.lower() in atomic_responses:
                    if can_flush(course, section, instructor, semester, ctype):
                        flush_atomic_or_comment(line, records, course, section, instructor, semester, ctype)
                    buffer = ""
                    continue

                buffer += " " + line
                if re.search(r"[.?!]$", line):
                    candidate = buffer.strip()
                    buffer = ""
                    if can_flush(course, section, instructor, semester, ctype):
                        flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype)
                    continue

    df_comments = pd.DataFrame(records)
    print(f"Extracted {len(df_comments)} comments")
    return df_comments
# Run parser
df_comments = parse_pdf(pdf_path)

def save_dataframe_safely(df, base_path):
    try:
        df.to_excel(base_path, index=False)
        print(f"[INFO] Saved output to: {base_path}")
        return base_path
    except PermissionError:
        # If the target file is locked (open in Excel) create a timestamped alternative filename
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        if base_path.lower().endswith(".xlsx"):
            alt_path = base_path[:-5] + f"_{ts}.xlsx"
        else:
            alt_path = base_path + f"_{ts}"
        try:
            df.to_excel(alt_path, index=False)
            print(f"[WARN] Could not write to {base_path} (file may be open). Saved to: {alt_path}")
            return alt_path
        except Exception as e:
            print(f"[ERROR] Failed to save to alternate path {alt_path}: {e}")
            raise

output_path = save_dataframe_safely(df_comments, output_excel)
df_comments.to_excel(output_excel, index=False)

[DEBUG] Page 1, 67 lines
[DEBUG] Found semester=Summer Semester 2019 (DOC)
[DEBUG] Found type=Overall
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Course Text
[DEBUG] Found type=Course Text
[DEBUG] Fallback instructor=Sometim Dept All
[DEBUG] Page 2, 53 lines
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Strengths
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Page 3, 46 lines
[DEBUG] Found type=Strengths
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Weaknesses
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Page 4, 47 lines
[DEBUG] Found type=Weaknesses
[DEBUG] Found type=Suggestions
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Suggestions
[DEBUG] Page 5, 46 lines
[DEBU

ok so i decided to actually read over the pdf(its just 89 pages) and ive learnt that the code was working as in intended, there are actuall sections that have no instructor name at all but just have ' Instructor: Staff, Faculty'. or a class like oral presentation that would have a panel instead, thats why there is no staff name but instea d staff, faculty. So that means we don't have to filter it out, but we still need to understand why the only names we are picking up are kimotho Stephan, nyaga nancy and muturi lucy.

ok so after review ive learnt this
- Their names appear on the same line as "Instructor:"
- The line ends cleanly — no semester bleed-through
those names get picked up because they dont extend or start on the next line so we have to factor that into the code

In [6]:
import pdfplumber
import re
import pandas as pd

# Full PDF path
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Output Excel file
output_excel = "Doctoral output.xlsx"

# Expanded question map for doctoral prompts
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall",
    "group work": "Feedback Evaluation",
    "case studies": "Resources",
    "lecturer was readily available": "Overall",
    "feedback was very insightful": "Suggestions",
    "analysis and interpretation": "Strengths",
    "financial statement": "Strengths",
    "peacebuilding": "Course Text",
    "conflict resolution": "Course Text",
    "guest lectures": "Resources",
    "journal critiques": "Feedback Evaluation",
    "presentations": "Strengths",
    "videos": "Resources",
    "assignments": "Feedback Evaluation",
    "timely feedback": "Feedback Evaluation",
    "strictness": "Weaknesses",
    "time management": "Suggestions",
    "entrepreneurship": "Course Text",
    "research interests": "Strengths",
    "mock meta-synthesis": "Suggestions",
}

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)

instructor_pattern = re.compile(r"^Instructor:\s*(.+)", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)
strip_semester_in_line = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Instructor fallbacks
name_fallback_pattern = re.compile(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$")

# Placeholders to accept as valid
valid_placeholders = {"staff, faculty", "sometim dept all", "dept all", "all aspect"}

# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE),
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]

# Table and header noise to exclude
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a", "na", "none", "nil", "ok", "good", "fair", "excellent", "poor"}
fragment_tokens = {"week?", "times", "dept all", "aspect", "section"}

def repair_broken_words(text):
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b', lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line): return any(p.search(line) for p in noise_patterns)
def is_prompt(line): return any(p.search(line) for p in prompt_patterns)
def is_fragment(line): return line.lower().strip() in fragment_tokens or len(line.split()) <= 2

def looks_like_table(line):
    if "%" in line: return True
    if len(re.findall(r"\d+", line)) >= 2: return True
    return False

def can_flush(course, section, instructor, semester, ctype):
    valid_course = bool(course)
    valid_instructor = instructor and (instructor.lower() in valid_placeholders or name_fallback_pattern.match(instructor))
    return ctype and semester and (valid_course or valid_instructor)

def flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype):
    candidate = candidate.strip()
    if not candidate or candidate in {".", ".."} or is_fragment(candidate):
        return
    tokens = candidate.split()
    buffer_tokens = []

    def append_record(text):
        records.append({
            "Course Code": course,
            "Section Code": section,
            "Instructor": instructor,
            "Semester": semester,
            "Comment Type": ctype,
            "Comment Text": text
        })
        print(f"[DEBUG] Flushed comment: {text}")

    if candidate.lower() in atomic_responses:
        append_record(candidate)
        return

    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                append_record(" ".join(buffer_tokens))
                buffer_tokens = []
            append_record(tok)
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        append_record(" ".join(buffer_tokens))

def parse_pdf(pdf_path):
    records = []
    course = section = instructor = semester = ctype = None
    buffer = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text: continue
            lines = text.split("\n")
            print(f"[DEBUG] Page {page_num}, {len(lines)} lines")

            for raw_line in lines:
                line = clean_line(raw_line)
                if not line: continue

                # Detect course/section
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    course, section = cs_match.groups()
                    instructor = None
                    semester = None
                    print(f"[DEBUG] Found course={course}, section={section}")
                    continue

                # Detect instructor
                instr_match = instructor_pattern.search(line)
                if instr_match:
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = strip_semester_in_line.sub("", instructor_raw).strip().rstrip(",")
                    instructor = instructor_clean
                    sem_inline = semester_pattern.search(instructor_raw)
                    if sem_inline: semester = sem_inline.group(0).strip()
                    print(f"[DEBUG] Found instructor={instructor}, semester={semester}")
                    continue

                # Fallback instructor detection
                if instructor is None and not looks_like_table(line):
                    candidate = line.strip()
                    if name_fallback_pattern.match(candidate) or candidate.lower() in valid_placeholders:
                        instructor = candidate
                        print(f"[DEBUG] Fallback instructor={instructor}")

                # Detect semester
                sem_match = semester_pattern.search(line)
                if sem_match:
                    semester = sem_match.group(0).strip()
                    print(f"[DEBUG] Found semester={semester}")
                    continue

                # Detect question type
                for key, label in question_map.items():
                    if key in line.lower():
                        ctype = label
                        buffer = ""
                        print(f"[DEBUG] Found type={ctype}")
                        break

                # Skip noise/prompt/table
                if is_prompt(line) or is_noise(line) or looks_like_table(line):
                    buffer = ""
                    continue

                # Atomic responses
                if line.lower() in atomic_responses:
                    if can_flush(course, section, instructor, semester, ctype):
                        flush_atomic_or_comment(line, records, course, section, instructor, semester, ctype)
                    buffer = ""
                    continue

                # Accumulate sentences
                buffer += " " + line
                if re.search(r"[.?!]$", line):
                    candidate = buffer.strip()
                    buffer = ""
                    if can_flush(course, section, instructor, semester, ctype):
                        flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype)

    df_comments = pd.DataFrame(records)
    print(f"Extracted {len(df_comments)} comments")
    df_comments.to_excel(output_excel, index=False)
    return df_comments

# Run parser
df_comments = parse_pdf(pdf_path)

[DEBUG] Page 1, 67 lines
[DEBUG] Found semester=Summer Semester 2019 (DOC)
[DEBUG] Found type=Overall
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Course Text
[DEBUG] Found type=Course Text
[DEBUG] Fallback instructor=Sometim Dept All
[DEBUG] Page 2, 53 lines
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Strengths
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Page 3, 46 lines
[DEBUG] Found type=Strengths
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Flushed comment: I enjoyed all PointPoint lectures; explanation of financial statements; case studies and group work.
[DEBUG] Flushed comment: The lectures were great because lecturer is very interesting and he simplified explanations enough for all to understand and be able to apply in critical business advisory settings.
[DEBUG] Flushed comment: Practical application Their real life exper

ok so what do we now know.
ok so some comments for some reason are split into different lines. so in the screenshot in the section overall, there is a clear sentence that has been split up, but good news is that 'week?' is no longer being picked up. but now weve lost all name instructors and we are only picking up staff faculty as we wanted, but also 'Sometim Dept All' that we don't want. and still we are not picking up course code, and course section.

In [7]:
import pdfplumber
import re
import pandas as pd

# Full PDF path
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Output Excel file
output_excel = "Doctoral output.xlsx"

# Expanded question map for doctoral prompts
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall",
    "group work": "Feedback Evaluation",
    "case studies": "Resources",
    "lecturer was readily available": "Overall",
    "feedback was very insightful": "Suggestions",
    "analysis and interpretation": "Strengths",
    "financial statement": "Strengths",
    "peacebuilding": "Course Text",
    "conflict resolution": "Course Text",
    "guest lectures": "Resources",
    "journal critiques": "Feedback Evaluation",
    "presentations": "Strengths",
    "videos": "Resources",
    "assignments": "Feedback Evaluation",
    "timely feedback": "Feedback Evaluation",
    "strictness": "Weaknesses",
    "time management": "Suggestions",
    "entrepreneurship": "Course Text",
    "research interests": "Strengths",
    "mock meta-synthesis": "Suggestions",
}

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)

instructor_pattern = re.compile(r"^Instructor:\s*(.+)", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)
strip_semester_in_line = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Instructor fallbacks
name_fallback_pattern = re.compile(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$")

# Placeholders to accept as valid
valid_placeholders = {"staff, faculty", "sometim dept all", "dept all", "all aspect"}

# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE),
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]

# Table and header noise to exclude
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a", "na", "none", "nil", "ok", "good", "fair", "excellent", "poor"}
fragment_tokens = {"week?", "times", "dept all", "aspect", "section"}

def repair_broken_words(text):
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b', lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line): return any(p.search(line) for p in noise_patterns)
def is_prompt(line): return any(p.search(line) for p in prompt_patterns)
def is_fragment(line): return line.lower().strip() in fragment_tokens or len(line.split()) <= 2

def looks_like_table(line):
    if "%" in line: return True
    if len(re.findall(r"\d+", line)) >= 2: return True
    return False



def can_flush(course, section, instructor, semester, ctype):
    valid_course = bool(course)
    valid_instructor = instructor and (instructor.lower() in valid_placeholders or name_fallback_pattern.match(instructor))
    return ctype and semester and (valid_course or valid_instructor)

def flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype):
    candidate = candidate.strip()
    if not candidate or candidate in {".", ".."} or is_fragment(candidate):
        return
    tokens = candidate.split()
    buffer_tokens = []

    def append_record(text):
        records.append({
            "Course Code": course,
            "Section Code": section,
            "Instructor": instructor,
            "Semester": semester,
            "Comment Type": ctype,
            "Comment Text": text
        })
        print(f"[DEBUG] Flushed comment: {text}")

    if candidate.lower() in atomic_responses:
        append_record(candidate)
        return

    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                append_record(" ".join(buffer_tokens))
                buffer_tokens = []
            append_record(tok)
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        append_record(" ".join(buffer_tokens))

def parse_pdf(pdf_path):
    records = []
    course = section = instructor = semester = ctype = None
    buffer = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text: continue
            lines = text.split("\n")
            print(f"[DEBUG] Page {page_num}, {len(lines)} lines")

            for raw_line in lines:
                line = clean_line(raw_line)
                if not line: continue

                # Detect course/section
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    course, section = cs_match.groups()
                    instructor = None
                    semester = None
                    print(f"[DEBUG] Found course={course}, section={section}")
                    continue

                # Detect instructor from dedicated line or embedded in course header
                if "Instructor:" in line:
                    # case 1 or 3 instructor line
                    instr_match = re.search(r"Instructor:\s*([A-Za-z ,]+)", line)
                    if instr_match:
                        instructor_raw = instr_match.group(1).strip()
                        instructor_clean = strip_semester_in_line.sub("", instructor_raw).strip().rstrip(",")
                        instructor = instructor_clean
                        sem_inline = semester_pattern.search(line)
                        if sem_inline:
                            semester = sem_inline.group(0).strip()
                        print(f"[DEBUG] Found instructor={instructor}, semester={semester}")
                        continue

                # Case 2: instructor on one line, semester on next
                if instructor is None and name_fallback_pattern.match(line):
                    instructor = line.strip()
                    print(f"[DEBUG] Fallback instructor={instructor}")
                    continue
                if semester is None:
                    sem_match = semester_pattern.search(line)
                    if sem_match:
                        semester = sem_match.group(0).strip()
                        print(f"[DEBUG] Found semester={semester}")
                        continue

            

                # Detect semester
                sem_match = semester_pattern.search(line)
                if sem_match:
                    semester = sem_match.group(0).strip()
                    print(f"[DEBUG] Found semester={semester}")
                    continue

                # Detect question type
                for key, label in question_map.items():
                    if key in line.lower():
                        ctype = label
                        buffer = ""
                        print(f"[DEBUG] Found type={ctype}")
                        break

                # Skip noise/prompt/table
                if is_prompt(line) or is_noise(line) or looks_like_table(line):
                    buffer = ""
                    continue

                # Atomic responses
                if line.lower() in atomic_responses:
                    if can_flush(course, section, instructor, semester, ctype):
                        flush_atomic_or_comment(line, records, course, section, instructor, semester, ctype)
                    buffer = ""
                    continue

                # Accumulate sentences
                buffer += " " + line
                if re.search(r"[.?!]$", line):
                    candidate = buffer.strip()
                    buffer = ""
                    if can_flush(course, section, instructor, semester, ctype):
                        flush_atomic_or_comment(candidate, records, course, section, instructor, semester, ctype)

    df_comments = pd.DataFrame(records)
    print(f"Extracted {len(df_comments)} comments")
    df_comments.to_excel(output_excel, index=False)
    return df_comments

# Run parser
df_comments = parse_pdf(pdf_path)

[DEBUG] Page 1, 67 lines
[DEBUG] Found course=DBA7030, section=A
[DEBUG] Found semester=Summer Semester 2019 (DOC)
[DEBUG] Found type=Overall
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Course Text
[DEBUG] Found type=Course Text
[DEBUG] Fallback instructor=Sometim Dept All
[DEBUG] Page 2, 53 lines
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Strengths
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Found type=Resources
[DEBUG] Page 3, 46 lines
[DEBUG] Found type=Strengths
[DEBUG] Found type=Feedback Evaluation
[DEBUG] Flushed comment: I enjoyed all PointPoint lectures; explanation of financial statements; case studies and group work.
[DEBUG] Flushed comment: The lectures were great because lecturer is very interesting and he simplified explanations enough for all to understand and be able to apply in critical business advisory settings.
[DEBUG] Flushed comment: Pra

ayt so update, decided to ran the parser logic for the UNDG with the doctoral pdf and i ended up getting more names than i do with my current code. so i decided to run things back, went to the pdf and copied pasted all the names dirrectly so that ik how many lecs we should be looking for as well as to see any formatting differences as to why certain names were being picked and why others werent. 
Undergraduate parser (surprisingly effective):
- Caught most named instructors, including Arasa, Kamau, Koshal, Brown, Omollo
- Missed Ali, Achoki, Mbae, Veney, Webbo, Sungi
Doctoral parser (more structured but stricter):
- Caught Kimotho, Nyaga, Muturi, Ongecha
- Missed Arasa, Kamau, Koshal, Brown, Omollo
Common failure mode:



so imma run a rededicated instructor logic code 

In [8]:
import pdfplumber
import re
import pandas as pd

# Path to your doctoral PDF
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns for course/section and instructor
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

def clean_instructor(raw):
    """Strip semester info and trailing commas from instructor names."""
    if not raw:
        return None
    # Remove semester suffix if present
    cleaned = semester_pattern.sub("", raw).strip().rstrip(",")
    return cleaned

def extract_instructors(pdf_path):
    records = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")
            for line in lines:
                # Detect course/section
                cs_match = section_header_pattern.search(line)
                course, section = None, None
                if cs_match:
                    course, section = cs_match.groups()

                # Detect instructor
                instr_match = instructor_pattern.search(line)
                if instr_match:
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    semester_match = semester_pattern.search(line)
                    semester = semester_match.group(0).strip() if semester_match else None

                    records.append({
                        "Course Code": course,
                        "Section Code": section,
                        "Instructor": instructor_clean,
                        "Semester": semester
                    })
    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 16 instructors
   Course Code Section Code               Instructor  \
0      PSY7735            A  Arasa, Josephine Summer   
1      PSY7736            A              Brown, Dana   
2      DBA7020            A            Kamau, Joseph   
3         None         None         Kimotho, Stephen   
4      DBA7050            A  Koshal, Jeremiah Summer   
5         None         None             Muturi, Lucy   
6      PSY7722            A             Muturi, Lucy   
7         None         None             Nyaga, Nancy   
8      PSY7760            A                   Omollo   
9         None         None       Ongecha, Francisca   
10     PSY7733            A    Staff, Faculty Summer   
11        None         None           Staff, Faculty   
12     IRL7010            A                    Sungi   
13     IRL7017            A                    Sungi   
14        None         None         Veney, Cassandra   
15        None         None         Webbo, Roselynne   

                      

ok so summer is in the wrong place, semester year is not being fullt picked up, we got 16 instructurs being picked up out of 24

What This Fixes
- Arasa, Josephine Summer → Arasa, Josephine
- Koshal, Jeremiah Summer → Koshal, Jeremiah
- Staff, Faculty Summer → Staff, Faculty
- Instructors now inherit the last seen course/section even if split across lines.
- Output should match your gold-standard list of 24 instructors.


In [9]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
    "sometim dept all": "Staff, Faculty",
    "dept all": "Staff, Faculty",
    "all aspect": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info and normalize placeholders."""
    if not raw:
        return None
    cleaned = semester_pattern.sub("", raw).strip().rstrip(",")
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section = None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for line in lines:
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect instructor
                instr_match = instructor_pattern.search(line)
                if instr_match:
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    semester_match = semester_pattern.search(line)
                    semester = semester_match.group(0).strip() if semester_match else None

                    records.append({
                        "Course Code": last_course,
                        "Section Code": last_section,
                        "Instructor": instructor_clean,
                        "Semester": semester
                    })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 16 instructors
   Course Code Section Code               Instructor  \
0      PSY7735            A  Arasa, Josephine Summer   
1      PSY7736            A              Brown, Dana   
2      DBA7020            A            Kamau, Joseph   
3      PSY7727            A         Kimotho, Stephen   
4      DBA7050            A  Koshal, Jeremiah Summer   
5      PSY7743            A             Muturi, Lucy   
6      PSY7722            A             Muturi, Lucy   
7      PSY7734            A             Nyaga, Nancy   
8      PSY7760            A                   Omollo   
9      PSY7760            A       Ongecha, Francisca   
10     PSY7733            A    Staff, Faculty Summer   
11     PSY7733            A           Staff, Faculty   
12     IRL7010            A                    Sungi   
13     IRL7017            A                    Sungi   
14     IRL7017            A         Veney, Cassandra   
15     IRL7017            A         Webbo, Roselynne   

                      

ok so summer is still being picked up, sungi simuon is just being registered as sungi the current cleanup strips “Summer Semester…” but leaves only “Sungi,” because the comma split logic isn’t handling the second token.
, section code is good, and course code is alright, summer semester is not being registered for all some rows show None for semester even though it’s present in the PDF.This happens because the semester sometimes appears on the next line, not the same line as the instructor.


In [10]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
instructor_pattern = re.compile(r"Instructor:\s*([A-Za-z ,.'-]+)", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
    "sometim dept all": "Staff, Faculty",
    "dept all": "Staff, Faculty",
    "all aspect": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info and normalize placeholders."""
    if not raw:
        return None
    # Cut off at the word "Semester" if present
    cleaned = re.split(r"\bSemester\b", raw)[0].strip().rstrip(",")
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for line in lines:
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Detect instructor
                instr_match = instructor_pattern.search(line)
                if instr_match:
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    # Use semester from same line if present, else last seen
                    semester = semester_pattern.search(line)
                    semester = semester.group(0).strip() if semester else last_semester

                    records.append({
                        "Course Code": last_course,
                        "Section Code": last_section,
                        "Instructor": instructor_clean,
                        "Semester": semester
                    })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 15 instructors
   Course Code Section Code                 Instructor  \
0      PSY7735            A    Arasa, Josephine Summer   
1      PSY7736            A                Brown, Dana   
2      DBA7020            A              Kamau, Joseph   
3      PSY7727            A    Kimotho, Stephen Summer   
4      DBA7050            A    Koshal, Jeremiah Summer   
5      PSY7743            A        Muturi, Lucy Summer   
6      PSY7722            A               Muturi, Lucy   
7      PSY7734            A        Nyaga, Nancy Summer   
8      PSY7760            A                     Omollo   
9      PSY7760            A  Ongecha, Francisca Summer   
10     PSY7733            A      Staff, Faculty Summer   
11     IRL7010            A                      Sungi   
12     IRL7017            A                      Sungi   
13     IRL7017            A           Veney, Cassandra   
14     IRL7017            A    Webbo, Roselynne Summer   

                      Semester  
0   Summer Se

ok so weve fixed the semester year issue, but sungi is still one name as well as omollo, we are missing the second name,names are still too few as we are missing, george, ali,justus, with some still having summer attached to them

In [11]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
instructor_pattern = re.compile(r"Instructor:\s*([A-Za-z ,.'-]+)?", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
    "sometim dept all": "Staff, Faculty",
    "dept all": "Staff, Faculty",
    "all aspect": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info and normalize placeholders."""
    if not raw:
        return None
    # Remove anything after 'Semester' or 'Summer Semester'
    cleaned = re.split(r"(Summer\s+Semester|Semester)", raw)[0].strip().rstrip(",")
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Detect instructor inline
                instr_match = instructor_pattern.search(line)
                if instr_match and instr_match.group(1):
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    semester = semester_pattern.search(line)
                    semester = semester.group(0).strip() if semester else last_semester

                    records.append({
                        "Course Code": last_course,
                        "Section Code": last_section,
                        "Instructor": instructor_clean,
                        "Semester": semester
                    })

                # Handle split-line instructors (line is just "Instructor:")
                elif line.strip().lower() == "instructor:" and i+1 < len(lines):
                    instructor_raw = lines[i+1].strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    semester = semester_pattern.search(lines[i+1])
                    semester = semester.group(0).strip() if semester else last_semester

                    records.append({
                        "Course Code": last_course,
                        "Section Code": last_section,
                        "Instructor": instructor_clean,
                        "Semester": semester
                    })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 16 instructors
   Course Code Section Code               Instructor  \
0      PSY7735            A  Arasa, Josephine Summer   
1      PSY7736            A              Brown, Dana   
2      DBA7020            A            Kamau, Joseph   
3      PSY7727            A         Kimotho, Stephen   
4      DBA7050            A  Koshal, Jeremiah Summer   
5      PSY7743            A             Muturi, Lucy   
6      PSY7722            A             Muturi, Lucy   
7      PSY7734            A             Nyaga, Nancy   
8      PSY7760            A                   Omollo   
9      PSY7760            A       Ongecha, Francisca   
10     PSY7733            A    Staff, Faculty Summer   
11     PSY7733            A           Staff, Faculty   
12     IRL7010            A                    Sungi   
13     IRL7017            A                    Sungi   
14     IRL7017            A         Veney, Cassandra   
15     IRL7017            A         Webbo, Roselynne   

                      

AAAAAAAAAAAHHHHHHHHHHHHH, summer is still bieng picked up, we are still missing names, and now staff faculty suffering from the summer issue, and names like sungi and omollo are still missing their second names

In [12]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
instructor_pattern = re.compile(r"Instructor:\s*([A-Za-z ,.'-]+)?", re.IGNORECASE)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
    "sometim dept all": "Staff, Faculty",
    "dept all": "Staff, Faculty",
    "all aspect": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info, dangling 'Summer', and normalize placeholders."""
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer' if left at the end
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Detect instructor inline
                instr_match = instructor_pattern.search(line)
                if instr_match and instr_match.group(1):
                    instructor_raw = instr_match.group(1).strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    semester = semester_pattern.search(line)
                    semester = semester.group(0).strip() if semester else last_semester

                    records.append({
                        "Course Code": last_course,
                        "Section Code": last_section,
                        "Instructor": instructor_clean,
                        "Semester": semester
                    })

                # Handle split-line instructors (line is just "Instructor:")
                elif line.strip().lower() == "instructor:" and i+1 < len(lines):
                    instructor_raw = lines[i+1].strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    semester = semester_pattern.search(lines[i+1])
                    semester = semester.group(0).strip() if semester else last_semester

                    records.append({
                        "Course Code": last_course,
                        "Section Code": last_section,
                        "Instructor": instructor_clean,
                        "Semester": semester
                    })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 15 instructors
   Course Code Section Code          Instructor                    Semester
0      PSY7735            A    Arasa, Josephine  Summer Semester 2019 (DOC)
1      PSY7736            A         Brown, Dana  Summer Semester 2019 (DOC)
2      DBA7020            A       Kamau, Joseph  Summer Semester 2019 (DOC)
3      PSY7727            A    Kimotho, Stephen  Summer Semester 2019 (DOC)
4      DBA7050            A    Koshal, Jeremiah  Summer Semester 2019 (DOC)
5      PSY7743            A        Muturi, Lucy  Summer Semester 2019 (DOC)
6      PSY7722            A        Muturi, Lucy  Summer Semester 2019 (DOC)
7      PSY7734            A        Nyaga, Nancy  Summer Semester 2019 (DOC)
8      PSY7760            A              Omollo  Summer Semester 2019 (DOC)
9      PSY7760            A  Ongecha, Francisca  Summer Semester 2019 (DOC)
10     PSY7733            A      Staff, Faculty  Summer Semester 2019 (DOC)
11     IRL7010            A               Sungi  Summer Semeste

feels like we are back at square 4. sungi and omollo are still missing, but atleast summer is no longer an issue, and we still dont have all the instructors.

- The current code only grabs the line after "Instructor:" if it’s immediately adjacent, but it doesn’t parse the comma-separated Last, First correctly.
- Regex too permissive
Instructor:\s*([A-Za-z ,.'-]+)? stops at the first match, so "Omollo, Joseph Summer Semester 2019 (DOC)" is being truncated to "Omollo".
- Missed instructors (Achoki, Ali, Mbae)
These are also split-line cases, and because we don’t scan the next line properly, they’re skipped.


In [13]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
    "sometim dept all": "Staff, Faculty",
    "dept all": "Staff, Faculty",
    "all aspect": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info and normalize placeholders."""
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Case 1: Instructor inline
                if line.strip().lower().startswith("instructor:"):
                    # If name is on same line
                    parts = line.split("Instructor:")
                    if len(parts) > 1 and parts[1].strip():
                        instructor_raw = parts[1].strip()
                    # If name is on next line
                    elif i+1 < len(lines):
                        instructor_raw = lines[i+1].strip()
                    else:
                        instructor_raw = None

                    instructor_clean = clean_instructor(instructor_raw)
                    semester = semester_pattern.search(line)
                    semester = semester.group(0).strip() if semester else last_semester

                    if instructor_clean:
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": semester
                        })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 4 instructors
  Course Code Section Code        Instructor                    Semester
0     PSY7727            A  Kimotho, Stephen  Summer Semester 2019 (DOC)
1     PSY7743            A      Muturi, Lucy  Summer Semester 2019 (DOC)
2     PSY7734            A      Nyaga, Nancy  Summer Semester 2019 (DOC)
3     PSY7733            A    Staff, Faculty  Summer Semester 2019 (DOC)


and now we have a completed a full loop back to issues we were dealing with our parser, its become to strcit, and is picking up those 3 names plus the staff.

ok so gotta make a hybrid code
What This Hybrid Does
- Permissive detection: grabs both inline "Instructor:" lines and standalone Last, First lines.
- Lookahead: if "Instructor:" is on one line, it captures the next line.
- Cleanup: strips semester suffixes and dangling “Summer”.
- Normalization: converts placeholders like "Staff, Faculty Summer" → "Staff, Faculty".
- Course/section linking: attaches the last seen course/section to each instructor.


In [14]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
    "sometim dept all": "Staff, Faculty",
    "dept all": "Staff, Faculty",
    "all aspect": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info, dangling 'Summer', and normalize placeholders."""
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Case 1: Instructor inline
                if line.strip().lower().startswith("instructor:"):
                    # If name is on same line
                    parts = line.split("Instructor:")
                    if len(parts) > 1 and parts[1].strip():
                        instructor_raw = parts[1].strip()
                    # If name is on next line
                    elif i+1 < len(lines):
                        instructor_raw = lines[i+1].strip()
                    else:
                        instructor_raw = None

                    instructor_clean = clean_instructor(instructor_raw)
                    semester = semester_pattern.search(line)
                    semester = semester.group(0).strip() if semester else last_semester

                    if instructor_clean:
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": semester
                        })

                # Case 2: Instructor name appears without prefix (fallback)
                elif "," in line and not line.lower().startswith("section report"):
                    # Likely a "Last, First" name line
                    instructor_raw = line.strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    if instructor_clean and ("," in instructor_clean or instructor_clean.lower() in placeholder_normalize):
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": last_semester
                        })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 175 instructors
    Course Code Section Code  \
0       DBA7030            A   
1       DBA7030            A   
2       DBA7030            A   
3       DBA7030            A   
4       DBA7030            A   
..          ...          ...   
170     IRL7017            A   
171     IRL7017            A   
172     IRL7017            A   
173     IRL7017            A   
174     IRL7017            A   

                                            Instructor  \
0                                       Achoki, George   
1    assignments, quizzes and exams. 80.0% 20.0% 0....   
2    For this course, I expect to receive a grade 2...   
3    The delivery of the course was overall excelle...   
4    Group work , it gave unfair advantage to those...   
..                                                 ...   
170  assignments, quizzes and exams. 66.7% 33.3% 0....   
171  For this course, I expect to receive a grade 4...   
172  the feedback on assignments, quizzes and The l...   
173      

weve gone to far with it. I want a farm.

What went wrong
- In the last hybrid version, I added a fallback rule:
elif "," in line and not line.lower().startswith("section report"):
     treat as instructor

- That’s far too broad. It happily grabs any line with a comma, including comments, tables, and evaluation text.

We need to tighten the fallback so it only accepts lines that look like names. That means:
- Must match the pattern Last, First (optionally with a middle name).
- Must not contain numbers, %, or long sentences.
- Must be short (say ≤ 4 tokens).




In [15]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Strict name pattern: Last, First [Middle]
name_pattern = re.compile(r"^[A-Z][a-z]+,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?$")

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
    "sometim dept all": "Staff, Faculty",
    "dept all": "Staff, Faculty",
    "all aspect": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info, dangling 'Summer', and normalize placeholders."""
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Case 1: Instructor inline or split-line
                if line.strip().lower().startswith("instructor:"):
                    parts = line.split("Instructor:")
                    if len(parts) > 1 and parts[1].strip():
                        instructor_raw = parts[1].strip()
                    elif i+1 < len(lines):
                        instructor_raw = lines[i+1].strip()
                    else:
                        instructor_raw = None

                    instructor_clean = clean_instructor(instructor_raw)
                    semester = semester_pattern.search(line)
                    semester = semester.group(0).strip() if semester else last_semester

                    if instructor_clean:
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": semester
                        })

                # Case 2: Fallback for standalone names
                elif name_pattern.match(line.strip()):
                    instructor_raw = line.strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    if instructor_clean:
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": last_semester
                        })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 4 instructors
  Course Code Section Code        Instructor                    Semester
0     PSY7727            A  Kimotho, Stephen  Summer Semester 2019 (DOC)
1     PSY7743            A      Muturi, Lucy  Summer Semester 2019 (DOC)
2     PSY7734            A      Nyaga, Nancy  Summer Semester 2019 (DOC)
3     PSY7733            A    Staff, Faculty  Summer Semester 2019 (DOC)


and we looped back to square one again.

Why We’re Stuck at 4
- The strict name_pattern only matches Last, First with capitalized tokens. That excludes lines like “Achoki, George Summer Semester 2019 (DOC)” because the regex cuts too early.
- Split-line cases (e.g. "Instructor:" on one line, name on the next) are still being missed if the next line doesn’t perfectly match the regex.
- We’ve been oscillating between too permissive (175 “instructors”) and too strict (4 instructors).


The Balanced Fix
Instead of relying on regex alone, let’s use a two-step heuristic:
- Anchor on “Instructor:”
Always capture the next non-empty line after "Instructor:". Don’t try to regex it too tightly — just grab the whole line.
- Clean the captured line
- Strip semester phrases (Summer Semester 2019 (DOC) etc.).
- Remove dangling “Summer”.
- Normalize placeholders (Staff, Faculty).
- Keep everything before the semester phrase intact (so “Sungi, Simeon Summer Semester 2019 (DOC)” → “Sungi, Simeon”).
- Skip obvious non-names
If the captured line contains numbers, %, or more than ~5 words, discard it. That prevents comment text from being misclassified


In [17]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
    "sometim dept all": "Staff, Faculty",
    "dept all": "Staff, Faculty",
    "all aspect": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info, dangling 'Summer', and normalize placeholders."""
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def looks_like_name(text):
    """Heuristic: short, contains a comma, no digits or % signs, not a long sentence."""
    if not text:
        return False
    if len(text.split()) > 4:  # too long
        return False
    if re.search(r"\d|%", text):
        return False
    if "," not in text:
        return False
    return True

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Case 1: Instructor inline or split-line
                if line.strip().lower().startswith("instructor:"):
                    parts = line.split("Instructor:")
                    if len(parts) > 1 and parts[1].strip():
                        instructor_raw = parts[1].strip()
                    else:
                        # Look ahead for next non-empty line
                        j = i+1
                        while j < len(lines) and not lines[j].strip():
                            j += 1
                        instructor_raw = lines[j].strip() if j < len(lines) else None

                    instructor_clean = clean_instructor(instructor_raw)
                    if instructor_clean and looks_like_name(instructor_clean):
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": last_semester
                        })

                # Case 2: Standalone name line (fallback)
                elif looks_like_name(line.strip()):
                    instructor_raw = line.strip()
                    instructor_clean = clean_instructor(instructor_raw)
                    if instructor_clean:
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": last_semester
                        })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 13 instructors
   Course Code Section Code                           Instructor  \
0      DBA7030            A                   good , transparent   
1      DBA7030            A              Quite good, transparent   
2      IRL7019            A    Discussions, films, presentations   
3      PSY7736            A         EXCELLENT, CLEAR AND HELPFUL   
4      DBA7020            A                      great, relevant   
5      PSY7727            A                     Kimotho, Stephen   
6      PSY7743            A                         Muturi, Lucy   
7      PSY7734            A                         Nyaga, Nancy   
8      PSY7734            A     An excellent, competent Lecturer   
9      PSY7733            A                       Staff, Faculty   
10     IRL7010            A  Excellent, supportive and inspiring   
11     IRL7017            A                                 none   
12     IRL7017            A            Good , Competent Lecturer   

                      

i dont even know how to describe this, weve gottent eh reverse.
- Our fallback rule 'looks_like_name' is still too loose. It’s letting through any short line with a comma or capital letters, which matches evaluation comments like “Good, Competent Lecturer”.
- The real instructor lines in your PDF are always anchored by “Instructor:”. That’s the reliable signal we should stick to.
- By trying to be permissive, we accidentally opened the door to non‑name text.


In [18]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info, dangling 'Summer', and normalize placeholders."""
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Instructor detection anchored only
                if line.strip().lower().startswith("instructor:"):
                    parts = line.split("Instructor:")
                    if len(parts) > 1 and parts[1].strip():
                        instructor_raw = parts[1].strip()
                    else:
                        # Look ahead for next non-empty line
                        j = i+1
                        while j < len(lines) and not lines[j].strip():
                            j += 1
                        instructor_raw = lines[j].strip() if j < len(lines) else None

                    instructor_clean = clean_instructor(instructor_raw)

                    # Skip if clearly feedback (contains words like 'good', 'excellent', 'lecturer')
                    if instructor_clean and not re.search(r"(good|excellent|lecturer|supportive|clear|helpful)", instructor_clean.lower()):
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": last_semester
                        })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 4 instructors
  Course Code Section Code        Instructor                    Semester
0     PSY7727            A  Kimotho, Stephen  Summer Semester 2019 (DOC)
1     PSY7743            A      Muturi, Lucy  Summer Semester 2019 (DOC)
2     PSY7734            A      Nyaga, Nancy  Summer Semester 2019 (DOC)
3     PSY7733            A    Staff, Faculty  Summer Semester 2019 (DOC)


and we are just looping back again, theres a fine line that we are not getting.

In [19]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{2,})\s*(?:\([A-Z0-9]+\))?\)?\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalize placeholders
placeholder_normalize = {
    "staff, faculty": "Staff, Faculty",
}

def clean_instructor(raw):
    """Strip semester info, dangling 'Summer', and normalize placeholders."""
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def looks_like_name(text):
    """Heuristic: must contain a comma, short length, no obvious feedback words."""
    if not text:
        return False
    if "," not in text:
        return False
    if len(text.split()) > 4:  # too long for a name
        return False
    if re.search(r"(good|excellent|lecturer|supportive|clear|helpful)", text.lower()):
        return False
    return True

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Instructor detection
                if line.strip().lower() == "instructor:":
                    # Name is on next non-empty line
                    j = i+1
                    while j < len(lines) and not lines[j].strip():
                        j += 1
                    instructor_raw = lines[j].strip() if j < len(lines) else None

                elif line.strip().lower().startswith("instructor:"):
                    # Inline instructor
                    instructor_raw = line.split("Instructor:")[1].strip()

                else:
                    instructor_raw = None

                if instructor_raw:
                    instructor_clean = clean_instructor(instructor_raw)
                    if instructor_clean and looks_like_name(instructor_clean):
                        records.append({
                            "Course Code": last_course,
                            "Section Code": last_section,
                            "Instructor": instructor_clean,
                            "Semester": last_semester
                        })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 4 instructors
  Course Code Section Code        Instructor                    Semester
0     PSY7727            A  Kimotho, Stephen  Summer Semester 2019 (DOC)
1     PSY7743            A      Muturi, Lucy  Summer Semester 2019 (DOC)
2     PSY7734            A      Nyaga, Nancy  Summer Semester 2019 (DOC)
3     PSY7733            A    Staff, Faculty  Summer Semester 2019 (DOC)


its like nothing that i do works😭😭😭😭😭. im just tired fam.
ok let me run a diagnostics to see how the pdfplumber is reading things

In [22]:
with pdfplumber.open(pdf_path) as pdf:
    page = pdf.pages[0]
    text = page.extract_text()
    for idx, line in enumerate(text.split("\n")):
        print(idx, repr(line))

0 'Section Report: ACCOUNTING & FINANCIAL MANAGEMENT (DBA7030 (DC18)) Section: A, Instructor:'
1 'Achoki, George Summer Semester 2019 (DOC)'
2 'Satisfact Below Dept All'
3 'COURSE EVALUATION Excellent Good Poor Total Mean'
4 'ory Average Mean College'
5 'The lecturer clearly communicated the 27 3 0 0 0 30'
6 '4.90 4.90 4.60'
7 'learning outcomes. 90.0% 10.0% 0.0% 0.0% 0.0% 100.0%'
8 'The course activities were related to the 27 3 0 0 0 30'
9 '4.90 4.90 4.60'
10 'learning outcomes. 90.0% 10.0% 0.0% 0.0% 0.0% 100.0%'
11 '27 3 0 0 0 30'
12 'The lecturer was well prepared for class. 4.90 4.90 4.60'
13 '90.0% 10.0% 0.0% 0.0% 0.0% 100.0%'
14 'The lecturer used a variety of teaching 24 6 0 0 0 30'
15 '4.80 4.80 4.55'
16 'methods. 80.0% 20.0% 0.0% 0.0% 0.0% 100.0%'
17 'The lecturer demonstrated thorough 26 4 0 0 0 30'
18 '4.87 4.87 4.63'
19 'knowledge of the subject. 86.7% 13.3% 0.0% 0.0% 0.0% 100.0%'
20 'The lecturer was open to diverse view points 24 6 0 0 0 30'
21 '4.80 4.80 4.55'
22 'and o

- Line 0: "… Section: A, Instructor:"
- Line 1: "Achoki, George Summer Semester 2019 (DOC)"
That means the instructor name is always on the line immediately following the “Instructor:” marker, not inline (except for a few cases like Kimotho, Muturi, Nyaga, Staff).
Our earlier extractor was only checking inline or skipping the next line because of filtering. That’s why we kept collapsing back to 4 names


In [27]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\US 2019 - Doctorate Faculty Evaluation.pdf"

section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{4})\s*(?:\([A-Z0-9]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

placeholder_normalize = {"staff, faculty": "Staff, Faculty"}

def clean_instructor(raw):
    if not raw:
        return None
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Instructor detection
                if "instructor:" in line.lower():
                    # Always take the next non-empty line
                    j = i+1
                    while j < len(lines) and not lines[j].strip():
                        j += 1
                    if j < len(lines):
                        instructor_raw = lines[j].strip()
                        # Extract semester if embedded in instructor line
                        sem_match = semester_pattern.search(instructor_raw)
                        semester = sem_match.group(0).strip() if sem_match else last_semester
                        instructor_clean = clean_instructor(instructor_raw)
                        if instructor_clean and "," in instructor_clean:
                            records.append({
                                "Course Code": last_course,
                                "Section Code": last_section,
                                "Instructor": instructor_clean,
                                "Semester": semester
                            })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 4 instructors
  Course Code Section Code      Instructor                    Semester
0     DBA7030            A  Achoki, George  Summer Semester 2019 (DOC)
1     IRL7019            A     Ali, Fatuma  Summer Semester 2019 (DOC)
2     IRL7006            A    Mbae, Justus  Summer Semester 2019 (DOC)
3     PSY7733            A  Staff, Faculty  Summer Semester 2019 (DOC)


so now we got new names, so thats something, but not yet there

In [28]:
import pdfplumber

pdf_path = r"C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\US 2019 - Doctorate Faculty Evaluation.pdf"

with pdfplumber.open(pdf_path) as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for i, line in enumerate(lines):
            if "instructor:" in line.lower():
                print(f"\nPage {page_num}, Line {i}: {line}")
                if i+1 < len(lines):
                    print(f"   Next line: {lines[i+1]}")


Page 1, Line 0: Section Report: ACCOUNTING & FINANCIAL MANAGEMENT (DBA7030 (DC18)) Section: A, Instructor:
   Next line: Achoki, George Summer Semester 2019 (DOC)

Page 8, Line 0: Section Report: CONFLICT RESOLUTION & PEACE BUILDING (IRL7019 (DC18)) Section: A, Instructor:
   Next line: Ali, Fatuma Summer Semester 2019 (DOC)

Page 13, Line 0: Section Report: DISSERTATION (PSY7735 (DC18)) Section: A, Instructor: Arasa, Josephine Summer
   Next line: Semester 2019 (DOC)

Page 16, Line 0: Section Report: MEMORY & COGNITION (PSY7736 (DC18)) Section: A, Instructor: Brown, Dana
   Next line: Summer Semester 2019 (DOC)

Page 20, Line 0: Section Report: ENTREPRENEURSHIP (DBA7020 (DC18)) Section: A, Instructor: Kamau, Joseph
   Next line: Summer Semester 2019 (DOC)

Page 27, Line 1: Instructor: Kimotho, Stephen Summer Semester 2019 (DOC)
   Next line: Satisfact Below Dept All

Page 31, Line 0: Section Report: LEADERSHIP (DBA7050 (DC18)) Section: A, Instructor: Koshal, Jeremiah Summer
   Next l

In [29]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\US 2019 - Doctorate Faculty Evaluation.pdf"

section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{4})\s*(?:\([A-Z0-9]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

placeholder_normalize = {"staff, faculty": "Staff, Faculty"}

def clean_instructor(raw):
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Instructor detection
                if "instructor:" in line.lower():
                    # Inline instructor case
                    if not line.strip().lower().endswith("instructor:"):
                        instructor_raw = line.split("Instructor:")[1].strip()
                        sem_match = semester_pattern.search(instructor_raw)
                        semester = sem_match.group(0).strip() if sem_match else last_semester
                        instructor_clean = clean_instructor(instructor_raw)
                        if instructor_clean and "," in instructor_clean:
                            records.append({
                                "Course Code": last_course,
                                "Section Code": last_section,
                                "Instructor": instructor_clean,
                                "Semester": semester
                            })
                    else:
                        # Next line case
                        j = i+1
                        while j < len(lines) and not lines[j].strip():
                            j += 1
                        if j < len(lines):
                            instructor_raw = lines[j].strip()
                            sem_match = semester_pattern.search(instructor_raw)
                            semester = sem_match.group(0).strip() if sem_match else last_semester
                            instructor_clean = clean_instructor(instructor_raw)
                            if instructor_clean and "," in instructor_clean:
                                records.append({
                                    "Course Code": last_course,
                                    "Section Code": last_section,
                                    "Instructor": instructor_clean,
                                    "Semester": semester
                                })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 15 instructors
   Course Code Section Code          Instructor                    Semester
0      DBA7030            A      Achoki, George  Summer Semester 2019 (DOC)
1      IRL7019            A         Ali, Fatuma  Summer Semester 2019 (DOC)
2      PSY7735            A    Arasa, Josephine  Summer Semester 2019 (DOC)
3      PSY7736            A         Brown, Dana  Summer Semester 2019 (DOC)
4      DBA7020            A       Kamau, Joseph  Summer Semester 2019 (DOC)
5      PSY7727            A    Kimotho, Stephen  Summer Semester 2019 (DOC)
6      DBA7050            A    Koshal, Jeremiah  Summer Semester 2019 (DOC)
7      IRL7006            A        Mbae, Justus  Summer Semester 2019 (DOC)
8      PSY7743            A        Muturi, Lucy  Summer Semester 2019 (DOC)
9      PSY7722            A        Muturi, Lucy  Summer Semester 2019 (DOC)
10     PSY7734            A        Nyaga, Nancy  Summer Semester 2019 (DOC)
11     PSY7760            A  Ongecha, Francisca  Summer Semeste

OK so this will be my final attempt for a while. if it was work, damn it all we focus on the undergraduate level and we return to this when we finish at the undergradaute level

In [1]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\US 2019 - Doctorate Faculty Evaluation.pdf"

section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{4})\s*(?:\([A-Z0-9]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

placeholder_normalize = {"staff, faculty": "Staff, Faculty"}

def clean_instructor(raw):
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                cs_match = section_header_pattern.search(line)
                if cs_match:
                    last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Instructor detection
                if "instructor:" in line.lower():
                    # Inline instructor case
                    if not line.strip().lower().endswith("instructor:"):
                        instructor_raw = line.split("Instructor:")[1].strip()
                    else:
                        # Next line case
                        j = i+1
                        while j < len(lines) and not lines[j].strip():
                            j += 1
                        instructor_raw = lines[j].strip() if j < len(lines) else None

                    if instructor_raw:
                        sem_match = semester_pattern.search(instructor_raw)
                        semester = sem_match.group(0).strip() if sem_match else last_semester
                        instructor_clean = clean_instructor(instructor_raw)
                        if instructor_clean and "," in instructor_clean:
                            records.append({
                                "Course Code": last_course,
                                "Section Code": last_section,
                                "Instructor": instructor_clean,
                                "Semester": semester
                            })

    # Don’t drop duplicates yet — let’s see the raw capture
    df = pd.DataFrame(records).reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 22 instructors
   Course Code Section Code          Instructor                    Semester
0      DBA7030            A      Achoki, George  Summer Semester 2019 (DOC)
1      IRL7019            A         Ali, Fatuma  Summer Semester 2019 (DOC)
2      PSY7735            A    Arasa, Josephine  Summer Semester 2019 (DOC)
3      PSY7736            A         Brown, Dana  Summer Semester 2019 (DOC)
4      DBA7020            A       Kamau, Joseph  Summer Semester 2019 (DOC)
5      PSY7727            A    Kimotho, Stephen  Summer Semester 2019 (DOC)
6      DBA7050            A    Koshal, Jeremiah  Summer Semester 2019 (DOC)
7      IRL7006            A        Mbae, Justus  Summer Semester 2019 (DOC)
8      PSY7743            A        Muturi, Lucy  Summer Semester 2019 (DOC)
9      PSY7722            A        Muturi, Lucy  Summer Semester 2019 (DOC)
10     PSY7734            A        Nyaga, Nancy  Summer Semester 2019 (DOC)
11     PSY7760            A  Ongecha, Francisca  Summer Semeste

ok we are so close with it, most names are being registered as they should be and with their course code, but now Simeon is no where to be seen. Neither is omollo. next is its repeating the course code 'PSY773' despite it not being for those lectures. So its just names that are not being capture and imporper course codes.

- Our current extractor only grabs the next line if "Instructor:" ends the line. But here, "Instructor: Sungi," is inline, and "Simeon Summer Semester 2019 (DOC)" is on the next line. We need to merge both lines when the instructor name is split across two lines.
- Omollo Joseph: Same issue — "Instructor: Omollo," on one line, "Joseph Summer Semester 2019 (DOC)" on the next. Again, we need to merge.
- Repeating PSY7733: This happens because we’re carrying forward the last seen course code until the next header. For Staff, Faculty, multiple dissertation reports are being collapsed into PSY7733. We need to re‑parse the course code from the same line as "Section Report:" every time, instead of carrying it forward too loosely.

i dont want to give up just yet so we are going to do 3 more pushes before we accpet defeat

In [1]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{4})\s*(?:\([A-Z0-9]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

placeholder_normalize = {"staff, faculty": "Staff, Faculty"}

def clean_instructor(raw):
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                if line.lower().startswith("section report:"):
                    cs_match = section_header_pattern.search(line)
                    if cs_match:
                        last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Instructor detection
                if "instructor:" in line.lower():
                    instructor_raw = None
                    if not line.strip().lower().endswith("instructor:"):
                        # Inline instructor, but may be split
                        instructor_raw = line.split("Instructor:")[1].strip()
                        # If next line looks like continuation (capitalized, not a header), merge
                        if i+1 < len(lines) and re.match(r"^[A-Z][a-z]+", lines[i+1]):
                            instructor_raw = instructor_raw + " " + lines[i+1].strip()
                    else:
                        # Next line case
                        j = i+1
                        while j < len(lines) and not lines[j].strip():
                            j += 1
                        if j < len(lines):
                            instructor_raw = lines[j].strip()

                    if instructor_raw:
                        sem_match = semester_pattern.search(instructor_raw)
                        semester = sem_match.group(0).strip() if sem_match else last_semester
                        instructor_clean = clean_instructor(instructor_raw)
                        if instructor_clean and "," in instructor_clean:
                            records.append({
                                "Course Code": last_course,
                                "Section Code": last_section,
                                "Instructor": instructor_clean,
                                "Semester": semester
                            })

    df = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 18 instructors
   Course Code Section Code          Instructor                    Semester
0      DBA7030            A      Achoki, George  Summer Semester 2019 (DOC)
1      IRL7019            A         Ali, Fatuma  Summer Semester 2019 (DOC)
2      PSY7735            A    Arasa, Josephine  Summer Semester 2019 (DOC)
3      PSY7736            A         Brown, Dana  Summer Semester 2019 (DOC)
4      DBA7020            A       Kamau, Joseph  Summer Semester 2019 (DOC)
5      PSY7727            A    Kimotho, Stephen  Summer Semester 2019 (DOC)
6      DBA7050            A    Koshal, Jeremiah  Summer Semester 2019 (DOC)
7      IRL7006            A        Mbae, Justus  Summer Semester 2019 (DOC)
8      PSY7743            A        Muturi, Lucy  Summer Semester 2019 (DOC)
9      PSY7722            A        Muturi, Lucy  Summer Semester 2019 (DOC)
10     PSY7734            A        Nyaga, Nancy  Summer Semester 2019 (DOC)
11     PSY7760            A      Omollo, Joseph  Summer Semeste

ok great more progess weve finally solved the omollo and Simeon issue, and most course codes are being addressed to their respective courses apart from ongesha who was the coruse code 'PSY7760' yet its meant to be 'PSY7719 ' FIN7030 hasn't been registered at all, the course and the fact that it is under staff,faculty. Same as to 'MGT7030D' and 'LED7030'. And lastly cassandrais not being picked up twice for 'IRL7900C' and 'IRL7900D' it seems as if the bleed down of the last read is affecting the reading.

1. 	Ongecha mis‑coded:
➝ Cause: the parser is carrying forward the last course code (PSY7760) instead of re‑parsing the header line for her section.
2. 	FIN7030D, MGT7030D, LED7030 missing: These Staff, Faculty dissertation reports aren’t being registered.
➝ Cause: the regex is too strict — it only matches , so codes with a trailing letter (like ) are skipped.
3. 	Veney Cassandra only once: She should appear twice (IRL7900C and IRL7900D).
➝ Cause: bleed‑down of course code — the parser is reusing IRL7017 instead of re‑parsing IRL7900C/D headers.

In [2]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{4}[A-Z]?)\s*(?:\([A-Z0-9]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

placeholder_normalize = {"staff, faculty": "Staff, Faculty"}

def clean_instructor(raw):
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                if line.lower().startswith("section report:"):
                    cs_match = section_header_pattern.search(line)
                    if cs_match:
                        last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Instructor detection
                if "instructor:" in line.lower():
                    instructor_raw = None
                    if not line.strip().lower().endswith("instructor:"):
                        # Inline instructor, but may be split
                        instructor_raw = line.split("Instructor:")[1].strip()
                        # Merge continuation line if needed
                        if i+1 < len(lines) and re.match(r"^[A-Z][a-z]+", lines[i+1]):
                            instructor_raw = instructor_raw + " " + lines[i+1].strip()
                    else:
                        # Next line case
                        j = i+1
                        while j < len(lines) and not lines[j].strip():
                            j += 1
                        if j < len(lines):
                            instructor_raw = lines[j].strip()

                    if instructor_raw:
                        sem_match = semester_pattern.search(instructor_raw)
                        semester = sem_match.group(0).strip() if sem_match else last_semester
                        instructor_clean = clean_instructor(instructor_raw)
                        if instructor_clean and "," in instructor_clean:
                            records.append({
                                "Course Code": last_course,
                                "Section Code": last_section,
                                "Instructor": instructor_clean,
                                "Semester": semester
                            })

    df = pd.DataFrame(records).reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 25 instructors
   Course Code Section Code          Instructor                    Semester
0      DBA7030            A      Achoki, George  Summer Semester 2019 (DOC)
1      IRL7019            A         Ali, Fatuma  Summer Semester 2019 (DOC)
2      PSY7735            A    Arasa, Josephine  Summer Semester 2019 (DOC)
3      PSY7736            A         Brown, Dana  Summer Semester 2019 (DOC)
4      DBA7020            A       Kamau, Joseph  Summer Semester 2019 (DOC)
5      PSY7727            A    Kimotho, Stephen  Summer Semester 2019 (DOC)
6      DBA7050            A    Koshal, Jeremiah  Summer Semester 2019 (DOC)
7      IRL7006            A        Mbae, Justus  Summer Semester 2019 (DOC)
8      PSY7743            A        Muturi, Lucy  Summer Semester 2019 (DOC)
9      PSY7722            A        Muturi, Lucy  Summer Semester 2019 (DOC)
10     PSY7734            A        Nyaga, Nancy  Summer Semester 2019 (DOC)
11     PSY7760            A      Omollo, Joseph  Summer Semeste

ok big growth but we have an issue with ongecha she has th wrong course code, shes meant to have 'PSY7719' and webbo rosyslynne  has the wrong course code, shes meant to have 'PSY7718'. just those two tweaks and i think we have it done right

In [4]:
import pdfplumber
import re
import pandas as pd

pdf_path = r"C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\US 2019 - Doctorate Faculty Evaluation.pdf"

# Regex patterns
section_header_pattern = re.compile(
    r"\((?P<course>[A-Z]{2,}\d{4}[A-Z]?)\s*(?:\([A-Z0-9]+\))?\)\s*Section:\s*(?P<section>[A-Za-z0-9]+)",
    re.IGNORECASE
)
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

placeholder_normalize = {"staff, faculty": "Staff, Faculty"}

def clean_instructor(raw):
    if not raw:
        return None
    # Remove semester phrases
    cleaned = re.split(r"(Spring\s+Semester|Summer\s+Semester|Fall\s+Semester|Winter\s+Semester)", raw)[0].strip().rstrip(",")
    # Remove dangling 'Summer'
    cleaned = re.sub(r"\bSummer$", "", cleaned).strip()
    key = cleaned.lower()
    if key in placeholder_normalize:
        return placeholder_normalize[key]
    return cleaned

def extract_instructors(pdf_path):
    records = []
    last_course, last_section, last_semester = None, None, None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")

            for i, line in enumerate(lines):
                # Detect course/section header
                if line.lower().startswith("section report:"):
                    cs_match = section_header_pattern.search(line)
                    if cs_match:
                        last_course, last_section = cs_match.groups()

                # Detect semester (carry forward if on separate line)
                sem_match = semester_pattern.search(line)
                if sem_match:
                    last_semester = sem_match.group(0).strip()

                # Instructor detection
                if "instructor:" in line.lower():
                    instructor_raw = None
                    if not line.strip().lower().endswith("instructor:"):
                        # Inline instructor, but may be split
                        instructor_raw = line.split("Instructor:")[1].strip()
                        # Merge continuation line if needed
                        if i+1 < len(lines) and re.match(r"^[A-Z][a-z]+", lines[i+1]):
                            instructor_raw = instructor_raw + " " + lines[i+1].strip()
                    else:
                        # Next line case
                        j = i+1
                        while j < len(lines) and not lines[j].strip():
                            j += 1
                        if j < len(lines):
                            instructor_raw = lines[j].strip()

                    if instructor_raw:
                        sem_match = semester_pattern.search(instructor_raw)
                        semester = sem_match.group(0).strip() if sem_match else last_semester
                        instructor_clean = clean_instructor(instructor_raw)
                        if instructor_clean and "," in instructor_clean:
                            # Always attach the most recent course code from Section Report
                            records.append({
                                "Course Code": last_course,
                                "Section Code": last_section,
                                "Instructor": instructor_clean,
                                "Semester": semester
                            })

    df = pd.DataFrame(records).reset_index(drop=True)
    print(f"Extracted {len(df)} instructors")
    return df

# Run extractor
df_instructors = extract_instructors(pdf_path)
print(df_instructors)

Extracted 25 instructors
   Course Code Section Code          Instructor                    Semester
0      DBA7030            A      Achoki, George  Summer Semester 2019 (DOC)
1      IRL7019            A         Ali, Fatuma  Summer Semester 2019 (DOC)
2      PSY7735            A    Arasa, Josephine  Summer Semester 2019 (DOC)
3      PSY7736            A         Brown, Dana  Summer Semester 2019 (DOC)
4      DBA7020            A       Kamau, Joseph  Summer Semester 2019 (DOC)
5      PSY7727            A    Kimotho, Stephen  Summer Semester 2019 (DOC)
6      DBA7050            A    Koshal, Jeremiah  Summer Semester 2019 (DOC)
7      IRL7006            A        Mbae, Justus  Summer Semester 2019 (DOC)
8      PSY7743            A        Muturi, Lucy  Summer Semester 2019 (DOC)
9      PSY7722            A        Muturi, Lucy  Summer Semester 2019 (DOC)
10     PSY7734            A        Nyaga, Nancy  Summer Semester 2019 (DOC)
11     PSY7760            A      Omollo, Joseph  Summer Semeste

yeah i tried, well come back to this another time. 