In [1]:
from PyPDF2 import PdfReader, PdfWriter

reader = PdfReader("C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\US 2019 - Undergraduate Faculty Evaluation.pdf")
writer = PdfWriter()

for i in range(19):  # pages are zero-indexed
    writer.add_page(reader.pages[i])

with open("output_first19.pdf", "wb") as f:
    writer.write(f)

# This script extracts the first 19 pages from a specified PDF file and saves them into a new PDF file.
# here i want to save the output with a custom name so that it can be easily identified later but in the same folder as the original file
with open("C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\output_first19.pdf", "wb") as f:
    writer.write(f)

# to confirm that the file has been saved correctly, you can print a success message
print("The first 19 pages have been successfully extracted and saved as 'output_first19.pdf'.")


The first 19 pages have been successfully extracted and saved as 'output_first19.pdf'.


In [2]:
import re
import pandas as pd
from PyPDF2 import PdfReader

# -----------------------------
# CONFIGURATION
# -----------------------------
pdf_path = "output_first19.pdf"   # your PDF sample
excel_path = "US 2019 Faculty Evaluation.xlsx"  # your Excel file
output_path = "unified_feedback_output.csv"

# Mapping of question prompts to clean categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
course_section_pattern = re.compile(
    r"\((?P<course>[A-Z]{3,}\d{3,})\s*\(UG\d+\)\)\s*Section:\s*(?P<section>[A-Z])",
    re.IGNORECASE
)
instructor_pattern = re.compile(r"Instructor:\s*(?P<instructor>.+)", re.IGNORECASE)
footer_pattern = re.compile(r"Course Evaluation Section Report", re.IGNORECASE)

# -----------------------------
# STEP 1: Extract comments from PDF
# -----------------------------
reader = PdfReader(pdf_path)
records = []
current_course = current_section = current_instructor = current_type = None

for page in reader.pages:
    text = page.extract_text()
    if not text:
        continue
    lines = text.split("\n")
    for line in lines:
        line = line.strip()
        if not line or footer_pattern.search(line):
            continue

        # Detect course/section
        cs_match = course_section_pattern.search(line)
        if cs_match:
            current_course = cs_match.group("course").strip()
            current_section = cs_match.group("section").strip()
            continue

        # Detect instructor
        instr_match = instructor_pattern.search(line)
        if instr_match:
            current_instructor = re.sub(
                r"\s+Summer Semester.*", "", instr_match.group("instructor")
            ).strip()
            continue

        # Detect question type
        matched_type = None
        for key, label in question_map.items():
            if key in line.lower():
                current_type = label
                matched_type = True
                break
        if matched_type:
            continue

        # Otherwise treat as comment text
        if current_type and current_course and current_section and current_instructor:
            if re.match(r"^[\.\,\-\s]*$", line):
                continue
            records.append({
                "Course Code": current_course,
                "Section Code": current_section,
                "Instructor": current_instructor,
                "Comment Type": current_type,
                "Comment Text": line
            })

comments_df = pd.DataFrame(records)

# Ensure expected comment columns exist (even if empty) to avoid KeyError later
_expected_cols = [
    "Course Code", "Section Code", "Instructor", "Comment Type",
    "Comment Text", "Course_Section"
]
for _c in _expected_cols:
    if _c not in comments_df.columns:
        comments_df[_c] = None

# -----------------------------
# STEP 2: Debug check
# -----------------------------
print("\nSample parsed rows (before merge):")
_cols_to_show = [c for c in ["Course Code", "Section Code", "Instructor", "Comment Type"] if c in comments_df.columns]
# If there are rows, show the requested columns; otherwise print helpful diagnostic info
if len(comments_df) > 0 and _cols_to_show:
    print(comments_df[_cols_to_show].drop_duplicates().head(15))
else:
    print("No parsed comment rows available.")
    print("comments_df columns:", list(comments_df.columns))
    # show a small sample if any rows exist (even if they contain None)
    if len(comments_df) > 0:
        print(comments_df.head(5))

# -----------------------------
# STEP 3: Load Excel closed-ended data
# -----------------------------
excel_df = pd.read_excel(excel_path)

# Normalize Excel identifiers
excel_df["Instructor"] = excel_df["Full Name of Faculty"].str.strip()
excel_df["Course Code"] = excel_df["Course Code"].astype(str).str.strip()
excel_df["Section Code"] = excel_df["Section Code"].astype(str).str.strip()

# -----------------------------
# STEP 4: Merge datasets (robust)
# -----------------------------
# Helper: find first existing candidate column in comments_df
def _first_existing(col_candidates, df):
    for c in col_candidates:
        if c in df.columns:
            return c
    return None

# candidate name lists (extend as needed)
course_candidates = ["Course Code","Course Code_comments","Course Code_x","Course_Code","Course_Code_comments","Course Code_comments "]
section_candidates = ["Section Code","Section Code_comments","Section Code_x","Section_Code"]
instr_candidates = ["Instructor","Instructor_name","Full Name of Faculty","Full Name","Instructor_x","Instructor_y"]
ctype_candidates = ["Comment Type","Comment_Type"]
ctext_candidates = ["Comment Text","Comment_Text","Comment_Text_x","Comment Text_x"]

# detect actual columns and prepare rename map
rename_map = {}
_course = _first_existing(course_candidates, comments_df)
_section = _first_existing(section_candidates, comments_df)
_instr = _first_existing(instr_candidates, comments_df)
_ctype = _first_existing(ctype_candidates, comments_df)
_ctext = _first_existing(ctext_candidates, comments_df)

if _course and _course != "Course Code":
    rename_map[_course] = "Course Code"
if _section and _section != "Section Code":
    rename_map[_section] = "Section Code"
if _instr and _instr != "Instructor":
    rename_map[_instr] = "Instructor"
if _ctype and _ctype != "Comment Type":
    rename_map[_ctype] = "Comment Type"
if _ctext and _ctext != "Comment Text":
    rename_map[_ctext] = "Comment Text"

if rename_map:
    comments_df = comments_df.rename(columns=rename_map)

# If Course/Section still missing but Course_Section exists, try to parse it
if (("Course Code" not in comments_df.columns or "Section Code" not in comments_df.columns) 
        and "Course_Section" in comments_df.columns):
    def _parse_course_section(s):
        if pd.isna(s):
            return (None, None)
        s = str(s)
        # capture course code like IST2040 and last token as section letter if present
        m_course = re.search(r'([A-Z]{3,}\d{3})', s)
        parts = s.strip().split()
        m_section = None
        if parts:
            last = parts[-1]
            if re.fullmatch(r'[A-Z]', last):
                m_section = last
        course = m_course.group(1) if m_course else None
        section = m_section if m_section else None
        return (course, section)
    parsed = comments_df["Course_Section"].apply(_parse_course_section)
    if "Course Code" not in comments_df.columns:
        comments_df["Course Code"] = parsed.apply(lambda x: x[0])
    if "Section Code" not in comments_df.columns:
        comments_df["Section Code"] = parsed.apply(lambda x: x[1])

# Ensure Instructor column exists; try to infer if missing
if "Instructor" not in comments_df.columns:
    possible_instr_cols = [c for c in comments_df.columns if "instruct" in c.lower() or "full name" in c.lower()]
    if possible_instr_cols:
        comments_df = comments_df.rename(columns={possible_instr_cols[0]: "Instructor"})
    else:
        # create Instructor column so merge doesn't crash - values may be None
        comments_df["Instructor"] = None

# Now verify required columns exist before merging
required = ["Course Code","Section Code","Instructor"]
missing = [c for c in required if c not in comments_df.columns]
if missing:
    raise ValueError(f"Cannot merge: missing columns in comments_df: {missing}")

# Trim whitespace and coerce to string for merge keys
for c in required:
    comments_df[c] = comments_df[c].astype(str).str.strip()
    # ensure corresponding excel columns exist and are normalized
    if c in excel_df.columns:
        excel_df[c] = excel_df[c].astype(str).str.strip()

# If excel_df uses a different instructor column, ensure it has 'Instructor' normalized earlier
# (original code already set excel_df['Instructor'] = excel_df['Full Name of Faculty'].str.strip())

merged = comments_df.merge(
    excel_df,
    on=["Course Code","Section Code","Instructor"],
    how="left",
    suffixes=("_comments","_excel")
)

# -----------------------------
# STEP 5: Save unified dataset
# -----------------------------
merged.to_csv(output_path, index=False)

print(f"\nUnified dataset saved to: {output_path}")
print("Merged sample:")
print(merged.head(10))


Sample parsed rows (before merge):
No parsed comment rows available.
comments_df columns: ['Course Code', 'Section Code', 'Instructor', 'Comment Type', 'Comment Text', 'Course_Section']

Unified dataset saved to: unified_feedback_output.csv
Merged sample:
Empty DataFrame
Columns: [Course Code, Section Code, Instructor, Comment Type, Comment Text, Course_Section, Faculty ID, Highest Degree of Faculty, Title of Faculty, Full Name of Faculty, Total Number of Students in the Class, Total Respondents, % of Respondents, Mean Score, % Score, Letter Grade, Course Description, School, Are they Adjunct (Y/N), Program, Total Number of Classes Being Taught by the Faculty, Total Number of Students  Being Taught by the Faculty]
Index: []

[0 rows x 22 columns]


In [3]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

records = []
current_course = current_section = current_instructor = current_type = None

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        # Normalize whitespace
        text = re.sub(r"\s+", " ", text)

        # Detect course/section
        cs_match = re.search(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\) Section: ([A-Z])", text)
        if cs_match:
            current_course = cs_match.group(1)
            current_section = cs_match.group(2)

        # Detect instructor
        instr_match = re.search(r"Instructor:\s*([^,]+)", text)
        if instr_match:
            current_instructor = instr_match.group(1).strip()

        # Split into sentences
        sentences = re.split(r"(?<=[.?!])\s+", text)

        for sentence in sentences:
            # Detect question type
            for key, label in question_map.items():
                if key in sentence.lower():
                    current_type = label
                    break

            # Capture comment text
            if current_type and len(sentence.strip()) > 2:
                # Skip if sentence is just the question itself
                if any(key in sentence.lower() for key in question_map):
                    continue
                records.append({
                    "Course Code": current_course,
                    "Section Code": current_section,
                    "Instructor": current_instructor,
                    "Comment Type": current_type,
                    "Comment Text": sentence.strip()
                })

            if len(records) >= 50:
                break
        if len(records) >= 50:
            break

# Convert to DataFrame for inspection
df = pd.DataFrame(records)

print("\nFirst 50 parsed comment rows:\n")
print(df.head(50).to_string(index=False))





First 50 parsed comment rows:

Course Code Section Code Instructor        Comment Type                                                                                                                                                                                                                                                                                                                                                                                                  Comment Text
    IST3005            A     Afundi         Course Text                                                                                                                                                                                                                                                                                                                44.8% 44.8% 10.3% 0.0% 0.0% 100.0% 18 6 5 0 0 29 The lecturer attended all the class sessions.
    IST3005            A     Afundi         Course Text     

In [4]:
# Here i want to simplify the output a bit more so i will now view the first 15 rows instead
print(df.head(15).to_string(index=False))



Course Code Section Code Instructor Comment Type                                                                                                                                                                                                                                                                                                                                                                                                  Comment Text
    IST3005            A     Afundi  Course Text                                                                                                                                                                                                                                                                                                                44.8% 44.8% 10.3% 0.0% 0.0% 100.0% 18 6 5 0 0 29 The lecturer attended all the class sessions.
    IST3005            A     Afundi  Course Text                                                          

In [5]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\) Section: ([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)

# Noise filters
noise_patterns = [
    re.compile(r"\bAverage\b", re.IGNORECASE),
    re.compile(r"\bMean\b", re.IGNORECASE),
    re.compile(r"\bDept Mean\b", re.IGNORECASE),
    re.compile(r"\bAll College\b", re.IGNORECASE),
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"^\s*[\d\.\%\s]+$")  # mostly numbers/percentages
]

def is_noise(line):
    # Skip if line matches any known noise pattern
    if any(p.search(line) for p in noise_patterns):
        return True
    # Skip if more than half tokens are numeric/percentages
    tokens = line.split()
    numeric_tokens = [t for t in tokens if re.match(r"^[\d\.\%]+$", t)]
    return len(tokens) > 0 and len(numeric_tokens) > len(tokens) / 2

records = []
current_course = current_section = current_instructor = current_type = None

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for line in lines:
            line = re.sub(r"\s+", " ", line).strip()
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"\s+Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    break

            # Skip noise
            if is_noise(line):
                continue

            # Keep only lines with alphabetic content
            if re.search(r"[A-Za-z]{3,}", line) and current_type:
                records.append({
                    "Course Code": current_course,
                    "Section Code": current_section,
                    "Instructor": current_instructor,
                    "Comment Type": current_type,
                    "Comment Text": line
                })

# Convert to DataFrame and print first 50 rows
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

# Here i want save the output to a csv file for easier viewing later on
df_comments.to_csv("parsed_comments_output.csv", index=False)

# Here i am going to save it within the same folder
df_comments.to_csv("C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\parsed_comments_output.csv", index=False)



First 50 parsed comment rows:

Course Code Section Code      Instructor Comment Type                                                                      Comment Text
    IST3005            A Afundi, Patrick  Course Text                       My rating of the course text for this course 13 13 3 0 0 29
    IST3005            A Afundi, Patrick  Course Text                      The lecturer attended all the class sessions. 4.45 4.41 4.48
    IST3005            A Afundi, Patrick  Course Text evaluation and discussed the results in class. 48.3% 27.6% 10.3% 6.9% 6.9% 100.0%
    IST3005            A Afundi, Patrick  Course Text                         The course materials availed on Blackboard 15 10 3 0 1 29
    IST3005            A Afundi, Patrick  Course Text                                                                 COURSE EVALUATION
    IST3005            A Afundi, Patrick  Course Text                                                                           Most of
    IST3005     

Noise filtering: drops lines dominated by numbers, percentages, or known table keywords.
Instructor cleanup: strips semester/session text.
Alphabetic check: ensures only comment‑like text is kept.
Debug output: prints the first 50 rows so you can visually confirm the parsing is clean.


In [6]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)

# Prompts to skip (used only to set context)
prompt_patterns = [
    re.compile(r"Your lecturer would like to know", re.IGNORECASE),
    re.compile(r"Explain your evaluation", re.IGNORECASE),
    re.compile(r"What other materials", re.IGNORECASE),
    re.compile(r"My overall evaluation", re.IGNORECASE)
]

# Noise filters
noise_patterns = [
    re.compile(r"Average|Mean|Dept Mean|All College|Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"^\s*[\d\.\%\s]+$"),
]

def clean_line(line):
    # Normalize whitespace
    line = re.sub(r"\s+", " ", line).strip()
    # Repair broken words like "D is c u s s ion" → "Discussion"
    line = re.sub(r"(\w)\s+(\w)", r"\1\2", line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

records = []
current_course = current_section = current_instructor = current_type = None
buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for line in lines:
            line = clean_line(line)
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"\s+Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    buffer = ""
                    break

            # Skip prompts and noise
            if is_prompt(line) or is_noise(line):
                continue

            # Accumulate buffer until punctuation
            buffer += " " + line
            if re.search(r"[.!?]$", line):
                comment_text = buffer.strip()
                if re.search(r"[A-Za-z]{3,}", comment_text) and current_type:
                    records.append({
                        "Course Code": current_course,
                        "Section Code": current_section,
                        "Instructor": current_instructor,
                        "Comment Type": current_type,
                        "Comment Text": comment_text
                    })
                buffer = ""

# Convert to DataFrame and save/inspect
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

# Here i want it to be saved within the same folder
df_comments.to_csv("C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\parsed_comments_cleaned.csv", index=False)


First 50 parsed comment rows:

Course Code Section Code                               Instructor Comment Type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

Concatenates fragments: uses a buffer until punctuation is reached, so prompts/comments aren’t split across rows.
Repairs broken words: collapses intra‑word spaces (D is c u s s ion → Discussion).
Skips prompts: “Your lecturer would like to know…” lines are used only to set Comment Type, not saved as comments.
Filters noise: drops table rows, percentages, and evaluation headers.
Outputs clean comments: only genuine student responses remain, grouped under the right category.


In [7]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)

# Prompts to skip (used only to set context)
prompt_patterns = [
    re.compile(r"Your lecturer would like to know", re.IGNORECASE),
    re.compile(r"Explain your evaluation", re.IGNORECASE),
    re.compile(r"What other materials", re.IGNORECASE),
    re.compile(r"My overall evaluation", re.IGNORECASE)
]

# Noise filters
noise_patterns = [
    re.compile(r"Average|Mean|Dept Mean|All College|Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"^\s*[\d\.\%\s]+$"),
]

def clean_line(line):
    # Normalize whitespace
    line = re.sub(r"\s+", " ", line).strip()
    # Repair broken words like "D is c u s s ion" → "Discussion"
    line = re.sub(r"(\w)\s+(\w)", r"\1\2", line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    # If more digits than letters, treat as table row
    digits = sum(c.isdigit() for c in line)
    letters = sum(c.isalpha() for c in line)
    return digits > letters

records = []
current_course = current_section = current_instructor = current_type = None
buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for line in lines:
            line = clean_line(line)
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    # Flush buffer before switching type
                    if buffer and current_type:
                        records.append({
                            "Course Code": current_course,
                            "Section Code": current_section,
                            "Instructor": current_instructor,
                            "Comment Type": current_type,
                            "Comment Text": buffer.strip()
                        })
                        buffer = ""
                    current_type = label
                    break

            # Skip prompts, noise, and table-like rows
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                buffer = ""  # reset so junk doesn't merge
                continue

            # Only keep lines with real words
            if re.search(r"[A-Za-z]{3,}\s+[A-Za-z]{3,}", line) and current_type:
                buffer += " " + line
                if re.search(r"[.!?]$", line):
                    records.append({
                        "Course Code": current_course,
                        "Section Code": current_section,
                        "Instructor": current_instructor,
                        "Comment Type": current_type,
                        "Comment Text": buffer.strip()
                    })
                    buffer = ""

# Final flush
if buffer and current_type:
    records.append({
        "Course Code": current_course,
        "Section Code": current_section,
        "Instructor": current_instructor,
        "Comment Type": current_type,
        "Comment Text": buffer.strip()
    })

# Convert to DataFrame and save/inspect
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

df_comments.to_csv("parsed_comments_cleaned.csv", index=False)

# then i want it to be saved within the same folder
df_comments.to_csv("C:\\Users\\Admin\\OneDrive\\Documents\\Schoolwork\\Projects\\UNI finals project\\Code stuff\\parsed_comments_cleaned.csv", index=False)



First 50 parsed comment rows:

Course Code Section Code                               Instructor Comment Type                                                                                            Comment Text
    APT1040            A Afundi, PatrickSummerSemester2019 (UNDG)  Suggestions                        Thecoursetexthasa lotofinterestingcontentandithasexpandedmyknowledgeonwebdesign.
    APT1040            A Afundi, PatrickSummerSemester2019 (UNDG)  Suggestions expertiseanddiverseknowledgeandexperiencetoexplain. Also, wheneveri wantedhelp, hewouldalwayshelpmeeven


instructor names are still glued to the semester tag (Afundi, PatrickSummerSemester2019…).
Table rows and evaluation headers are sneaking through and getting concatenated into the comment buffer.
Multiple distinct comments are being merged into one giant string because the buffer isn’t being flushed often enough


This version ensures that table rows and headers are dropped, instructor names are cleaned, and each student comment is kept as its own row without being merged into long strings.

In [8]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)

# Prompts to skip (used only to set context)
prompt_patterns = [
    re.compile(r"Your lecturer would like to know", re.IGNORECASE),
    re.compile(r"Explain your evaluation", re.IGNORECASE),
    re.compile(r"What other materials", re.IGNORECASE),
    re.compile(r"My overall evaluation", re.IGNORECASE)
]

# Noise filters
noise_patterns = [
    re.compile(r"Average|Mean|Dept Mean|All College|Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"^\s*[\d\.\%\s]+$"),
]

def clean_line(line):
    # Normalize whitespace
    line = re.sub(r"\s+", " ", line).strip()
    # Repair broken words like "D is c u s s ion" → "Discussion"
    line = re.sub(r"(?<=\w)\s(?=\w)", "", line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    tokens = line.split()
    numeric_tokens = [t for t in tokens if re.search(r"\d", t)]
    digits = sum(c.isdigit() for c in line)
    letters = sum(c.isalpha() for c in line)
    return len(numeric_tokens) >= 3 or digits > letters

def is_valid_comment(line):
    # At least two alphabetic words
    return bool(re.search(r"[A-Za-z]{3,}\s+[A-Za-z]{3,}", line))

records = []
current_course = current_section = current_instructor = current_type = None

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    continue

            # Skip prompts, noise, and table-like rows
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                continue

            # Save as comment if valid
            if current_course and current_section and current_instructor and current_type and is_valid_comment(line):
                records.append({
                    "Course Code": current_course,
                    "Section Code": current_section,
                    "Instructor": current_instructor,
                    "Comment Type": current_type,
                    "Comment Text": line
                })

# Convert to DataFrame and save/inspect
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

df_comments.to_csv("parsed_comments_cleaned.csv", index=False)


First 50 parsed comment rows:

Empty DataFrame
Columns: []
Index: []


looks_like_table too aggressive: many real comments are short (e.g. “Good”, “None”, “Excellent”), which don’t have many letters. Our filter requiring “more letters than digits” or “≥3 numeric tokens” is wiping them out.
is_valid_comment too strict: requiring at least two alphabetic words excludes single‑word but valid comments like “Good” or “Fair”.
Buffer reset logic: by resetting too often, we may be discarding lines before they’re saved.


In [9]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)

# Prompts to skip (used only to set context)
prompt_patterns = [
    re.compile(r"Your lecturer would like to know", re.IGNORECASE),
    re.compile(r"Explain your evaluation", re.IGNORECASE),
    re.compile(r"What other materials", re.IGNORECASE),
    re.compile(r"My overall evaluation", re.IGNORECASE)
]

# Noise filters
noise_patterns = [
    re.compile(r"Average|Mean|Dept Mean|All College|Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"^\s*[\d\.\%\s]+$"),
]

def clean_line(line):
    # Normalize whitespace
    line = re.sub(r"\s+", " ", line).strip()
    # Repair broken words like "D is c u s s ion" → "Discussion"
    line = re.sub(r"(?<=\w)\s(?=\w)", "", line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    tokens = line.split()
    numeric_tokens = [t for t in tokens if re.fullmatch(r"[0-9\.\%]+", t)]
    # Only drop if numeric tokens dominate
    return len(numeric_tokens) >= 3 and len(numeric_tokens) > len(tokens) / 2

def is_valid_comment(line):
    # Accept any line with at least one alphabetic word of length ≥2
    return bool(re.search(r"[A-Za-z]{2,}", line))

records = []
current_course = current_section = current_instructor = current_type = None

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    continue

            # Skip prompts, noise, and table-like rows
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                continue

            # Save as comment if valid
            if current_course and current_section and current_instructor and current_type and is_valid_comment(line):
                records.append({
                    "Course Code": current_course,
                    "Section Code": current_section,
                    "Instructor": current_instructor,
                    "Comment Type": current_type,
                    "Comment Text": line
                })

# Convert to DataFrame and save/inspect
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

df_comments.to_csv("parsed_comments_cleaned.csv", index=False)


First 50 parsed comment rows:

Course Code Section Code                               Instructor Comment Type                                                                         Comment Text
    IST3005            A Afundi, PatrickSummerSemester2019 (UNDG)  Suggestions                                                  improvehis/herteachinginthiscourse.
    IST3005            A Afundi, PatrickSummerSemester2019 (UNDG)  Suggestions                                                                                 none
    IST3005            A Afundi, PatrickSummerSemester2019 (UNDG)  Suggestions                                CourseEvaluationSectionReport - 10/27/2025 - 16:15:35
    IST3005            A Afundi, PatrickSummerSemester2019 (UNDG)  Suggestions                                                                      postnotesontime
    IST3005            A Afundi, PatrickSummerSemester2019 (UNDG)  Suggestions                                                                 useof

Short comments preserved: “Good”, “None”, “Excellent” are no longer filtered out.
Table rows dropped: only lines dominated by numbers/percentages are skipped.
Instructor names cleaned: semester tags removed properly.
One comment per row: no more merged blocks; each valid line is saved individually.

OK MASSIVE improvements. its picking up the comments and even the short ones. But its now combing long comments into one combined statement with no spaces like such 'Heispositiveandteacheswell' and this 'Classesaregoodbutifthereisnocoursetextandblackboardisbeingunderutilizedthenstudyingoutsideclassbecome'  and others have this digits in them 'consultationwiththestudentsduringstated4.504.384.46 ' and this '15+ 12-149-117-84 -61-3DeptAll ',  'Forthiscourse, Iexpecttoreceiveagrade251210038 '  and sadly its still picking up the footer 'CourseEvaluationSectionReport - 10/27/2025 - 16:15:35 ' and also picking up column titles for the tables 'Afundi, PatrickSummerSemester2019 (UNDG) '

In [10]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)

# Prompts to skip (used only to set context)
prompt_patterns = [
    re.compile(r"Your lecturer would like to know", re.IGNORECASE),
    re.compile(r"Explain your evaluation", re.IGNORECASE),
    re.compile(r"What other materials", re.IGNORECASE),
    re.compile(r"My overall evaluation", re.IGNORECASE)
]

# Noise filters (footers, headers, table labels)
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"Afundi, Patrick.*Semester", re.IGNORECASE),
]

def repair_broken_words(text):
    # Fix cases like "D i s c u s s i o n" → "Discussion"
    return re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b',
                  lambda m: m.group(0).replace(" ", ""), text)

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    tokens = line.split()
    numeric_tokens = [t for t in tokens if re.fullmatch(r"[0-9\.\%]+", t)]
    # Drop if line has ≥2 numeric tokens and also letters
    return len(numeric_tokens) >= 2 and re.search(r"[A-Za-z]", line)

def is_valid_comment(line):
    # Accept any line with at least one alphabetic word of length ≥2
    return bool(re.search(r"[A-Za-z]{2,}", line))

records = []
current_course = current_section = current_instructor = current_type = None

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    continue

            # Skip prompts, noise, and table-like rows
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                continue

            # Save as comment if valid
            if current_course and current_section and current_instructor and current_type and is_valid_comment(line):
                records.append({
                    "Course Code": current_course,
                    "Section Code": current_section,
                    "Instructor": current_instructor,
                    "Comment Type": current_type,
                    "Comment Text": line
                })

# Convert to DataFrame and save/inspect
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

df_comments.to_csv("parsed_comments_cleaned.csv", index=False)


First 50 parsed comment rows:

Course Code Section Code      Instructor Comment Type                                                                                     Comment Text
    IST3005            A Afundi, Patrick  Course Text                                                                                COURSE EVALUATION
    IST3005            A Afundi, Patrick  Course Text                                                                                          Most of
    IST3005            A Afundi, Patrick  Course Text                                                                                 Sometim Dept All
    IST3005            A Afundi, Patrick  Course Text                                                                                            Times
    IST3005            A Afundi, Patrick  Course Text                                                                          STUDENT SELF EVALUATION
    IST3005            A Afundi, Patrick  Course Text         

ok it worked and we getting much more closer. But a few more fine tuning the questions that the students are responding to are appearing spreadout through multiple lines like here 'Your lecturer would also like to know what specific things you believe might be done to good improve his/her teaching in this course. ' and here 'aspects of his/her teaching you enjoyed most D is cuss ion and why. ' with the discussion error from before. the table headins are almost gone but we have something like this 'STUDENT SELF EVALUATION How much time outside class did you spend per week? STUDENT SELF EVAL 2 Dept All STUDENT SELF EVAL 3 ' still bleeding in


In [11]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)

# Prompt patterns (expanded)
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching.*enjoyed most", re.IGNORECASE),
    re.compile(r"specific things you believe.*improve", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE)
]

# Noise patterns (expanded)
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

def repair_broken_words(text):
    # Fix cases like "D i s c u s s i o n" → "Discussion"
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b',
                  lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    tokens = line.split()
    numeric_tokens = [t for t in tokens if re.search(r"\d", t)]
    return len(numeric_tokens) >= 2 and re.search(r"[A-Za-z]", line)

def is_valid_comment(line):
    return bool(re.search(r"[A-Za-z]{2,}", line))

records = []
current_course = current_section = current_instructor = current_type = None
buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    buffer = ""  # reset buffer when switching type
                    break

            # Skip noise/prompt/table lines before buffering
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                buffer = ""
                continue

            # Accumulate partial sentences
            buffer += " " + line
            if re.search(r"[.?!]$", line):
                candidate = buffer.strip()
                buffer = ""
                if current_course and current_section and current_instructor and current_type and is_valid_comment(candidate):
                    records.append({
                        "Course Code": current_course,
                        "Section Code": current_section,
                        "Instructor": current_instructor,
                        "Comment Type": current_type,
                        "Comment Text": candidate
                    })

# Convert to DataFrame and save/inspect
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

df_comments.to_csv("parsed_comments_cleaned.csv", index=False)


First 50 parsed comment rows:

Course Code Section Code      Instructor        Comment Type                                                                                                                                                                                                                                              Comment Text
    IST3005            A Afundi, Patrick         Course Text                                                                                                                                                                                                        3.4% 37.9% 24.1% 17.2% 10.3% 6.9% 100.0% per week?
    IST3005            A Afundi, Patrick           Strengths                                                                                                                                                                                                                                                  and why.
    IST3005            A Afundi, Pa

Prompt reconstruction: multi‑line questions are concatenated and then filtered, so fragments don’t leak in.
Expanded prompt filters: catches “aspects of his/her teaching you enjoyed most” and “specific things you believe might be done to improve”.
Expanded noise filters: removes “STUDENT SELF EVALUATION” headers and their variants.
Word repair: fixes “D is cuss ion” → “Discussion” while preserving normal spacing.
One comment per row: only genuine student responses are saved.


In [12]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)

# Prompt patterns (expanded to catch fragments)
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE)
]

# Noise patterns (expanded)
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

def repair_broken_words(text):
    # Fix cases like "D i s c u s s i o n" → "Discussion"
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b',
                  lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    # Drop if line has % or multiple numbers
    if "%" in line:
        return True
    if len(re.findall(r"\d+", line)) >= 2:
        return True
    return False

def is_valid_comment(line):
    # Skip very short fragments unless they are known valid comments
    valid_short = {"good","none","fair","excellent","poor","ok"}
    words = line.lower().split()
    if len(words) < 3 and words[0] not in valid_short:
        return False
    return bool(re.search(r"[A-Za-z]{2,}", line))

records = []
current_course = current_section = current_instructor = current_type = None
buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    buffer = ""  # reset buffer when switching type
                    break

            # Skip noise/prompt/table lines before buffering
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                buffer = ""
                continue

            # Accumulate partial sentences
            buffer += " " + line
            if re.search(r"[.?!]$", line):
                candidate = buffer.strip()
                buffer = ""
                if current_course and current_section and current_instructor and current_type and is_valid_comment(candidate):
                    records.append({
                        "Course Code": current_course,
                        "Section Code": current_section,
                        "Instructor": current_instructor,
                        "Comment Type": current_type,
                        "Comment Text": candidate
                    })

# Convert to DataFrame and save/inspect
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

df_comments.to_csv("parsed_comments_cleaned.csv", index=False)


First 50 parsed comment rows:

Course Code Section Code      Instructor        Comment Type                                                                                                                                                                                                                                                                                                                                           Comment Text
    IST3005            A Afundi, Patrick           Strengths                                                                                                                                                                                                                           slides are understandable The discussions social media platforms exploration good constant engagement Lecture sessions N/A .
    IST3005            A Afundi, Patrick          Weaknesses                                                                                                    

improvment here by some of the one word texts are being merged into long text parts. Thus we are getting under represenation of response in comparision to students. ok we have improvement. But we are getting an under count for response from students incomparison to the responses. Im seeing this being due to the fact N/A na and NA responses are being merged into one response or into other responses like here 'n/a N/A n/a None N/A practical books None none N/A N/A Non none at the moment GOOD good - good n None Videos Coding references . ' , 'NA Non n/a n A more one on one assessment in the following coding languages instead of just letting us figure it out ourselves. '  and here '; none none more code G NA .... 


In [13]:
import pdfplumber
import re
import pandas as pd

pdf_path = "output_first19.pdf"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)

# Capture only up to end of line (prevent spillover)
instructor_pattern = re.compile(r"^Instructor:\s*([^\n\r]+)$", re.IGNORECASE)

# Capture semester anywhere on a line
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# If semester appears inside instructor due to OCR merging, strip it
strip_semester_in_line = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Normalization
def normalize(text):
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE),
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]

# Noise patterns
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a","na","none","nil","ok","good","fair","excellent","poor"}

def repair_broken_words(text):
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b',
                  lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    if "%" in line:
        return True
    if len(re.findall(r"\d+", line)) >= 2:
        return True
    return False

def flush_atomic_or_comment(candidate, records, current_course, current_section, current_instructor, current_semester, current_type):
    tokens = candidate.split()
    def append_record(text):
        records.append({
            "Course Code": current_course,
            "Section Code": current_section,
            "Instructor": current_instructor,
            "Semester": current_semester,
            "Comment Type": current_type,
            "Comment Text": text
        })

    # If the whole candidate is atomic, save directly
    if candidate.lower() in atomic_responses:
        append_record(candidate)
        return

    # Otherwise, split tokens and save atomics separately
    buffer_tokens = []
    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                append_record(" ".join(buffer_tokens))
                buffer_tokens = []
            append_record(tok)
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        append_record(" ".join(buffer_tokens))

records = []
current_course = current_section = current_instructor = current_semester = current_type = None
buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section: reset metadata on new header
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                # when new section starts, clear instructor & semester until found again
                current_instructor = None
                current_semester = None
                continue

            # Detect instructor (strip any semester that bled into the same line)
            instr_match = instructor_pattern.search(line)
            if instr_match:
                instructor_raw = instr_match.group(1).strip()
                instructor_clean = strip_semester_in_line.sub("", instructor_raw).strip().rstrip(",")
                current_instructor = instructor_clean
                # If semester also appears on the same line, capture it
                sem_inline = semester_pattern.search(instructor_raw)
                if sem_inline:
                    current_semester = sem_inline.group(0).strip()
                continue

            # Detect semester (on its own line)
            sem_match = semester_pattern.search(line)
            if sem_match:
                current_semester = sem_match.group(0).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    buffer = ""
                    break

            # Skip noise/prompt/table lines
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                buffer = ""
                continue

            # If atomic response, save immediately (only if metadata present)
            if line.lower() in atomic_responses:
                if current_course and current_section and current_instructor and current_semester and current_type:
                    flush_atomic_or_comment(line, records, current_course, current_section, current_instructor, current_semester, current_type)
                buffer = ""
                continue

            # Accumulate partial sentences
            buffer += " " + line
            if re.search(r"[.?!]$", line):
                candidate = buffer.strip()
                buffer = ""
                # Only write if all metadata present to avoid partial/duplicated instructor-semester issues
                if current_course and current_section and current_instructor and current_semester and current_type:
                    flush_atomic_or_comment(candidate, records, current_course, current_section, current_instructor, current_semester, current_type)

# Convert to DataFrame and save/inspect
df_comments = pd.DataFrame(records)
print("\nFirst 50 parsed comment rows:\n")
print(df_comments.head(50).to_string(index=False))

df_comments.to_csv("parsed_comments_cleaned.csv", index=False)


First 50 parsed comment rows:

Course Code Section Code      Instructor                    Semester Comment Type                                                                                     Comment Text
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths                                                                                              N/A
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths                                                                                               ok
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths                                                                                              n/a
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths                                                                                             Good
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    S

Its now perfect. As all things should be.It finally works as it should. Ive confirmed some of the comments with the pdf and they match up. Theres no underrepresentation, nothing is being merged, and fullstops and ellipse responses that were placed in as comments are being registered. Its possible to filter down by course code and course letter. So that means its should be possible to filter down a lectures by name. 

So we can finally upsacle it to the full pdf

had to comeback here to update it twice
once because of a section of his and her that was being picked up,
then a second time because i did not have a semester column that i needed down the line

In [14]:
import pdfplumber
import re
import pandas as pd

# Full PDF path
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Undergraduate Faculty Evaluation.pdf"

# Output Excel file
output_excel = "faculty_evaluations_cleaned.xlsx"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)
instructor_pattern = re.compile(r"Instructor:\s*(.+)", re.IGNORECASE)
# Here we are normalizing beofre fitting
def normalize(text):
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text
# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE), # Yup the change was made here as well
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]



# Noise patterns
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a","na","none","nil","ok","good","fair","excellent","poor"}

def repair_broken_words(text):
    text = re.sub(r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b',
                  lambda m: m.group(0).replace(" ", ""), text)
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    if "%" in line:
        return True
    if len(re.findall(r"\d+", line)) >= 2:
        return True
    return False

def flush_atomic_or_comment(candidate, records, current_course, current_section, current_instructor, current_type):
    tokens = candidate.split()
    if candidate.lower() in atomic_responses:
        records.append({
            "Course Code": current_course,
            "Section Code": current_section,
            "Instructor": current_instructor,
            "Comment Type": current_type,
            "Comment Text": candidate
        })
        return
    buffer_tokens = []
    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                records.append({
                    "Course Code": current_course,
                    "Section Code": current_section,
                    "Instructor": current_instructor,
                    "Comment Type": current_type,
                    "Comment Text": " ".join(buffer_tokens)
                })
                buffer_tokens = []
            records.append({
                "Course Code": current_course,
                "Section Code": current_section,
                "Instructor": current_instructor,
                "Comment Type": current_type,
                "Comment Text": tok
            })
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        records.append({
            "Course Code": current_course,
            "Section Code": current_section,
            "Instructor": current_instructor,
            "Comment Type": current_type,
            "Comment Text": " ".join(buffer_tokens)
        })

records = []
current_course = current_section = current_instructor = current_type = None
buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                continue

            # Detect instructor
            instr_match = instructor_pattern.search(line)
            if instr_match:
                current_instructor = re.sub(r"Summer Semester.*", "", instr_match.group(1)).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    buffer = ""
                    break

            # Skip noise/prompt/table lines
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                buffer = ""
                continue

            # If atomic response, save immediately
            if line.lower() in atomic_responses:
                flush_atomic_or_comment(line, records, current_course, current_section, current_instructor, current_type)
                buffer = ""
                continue

            # Accumulate partial sentences
            buffer += " " + line
            if re.search(r"[.?!]$", line):
                candidate = buffer.strip()
                buffer = ""
                if current_course and current_section and current_instructor and current_type:
                    flush_atomic_or_comment(candidate, records, current_course, current_section, current_instructor, current_type)

# Convert to DataFrame and save to Excel
df_comments = pd.DataFrame(records)
print(f"Extracted {len(df_comments)} comments")
print(df_comments.head(20).to_string(index=False))

df_comments.to_excel(output_excel, index=False)

Extracted 73801 comments
Course Code Section Code      Instructor Comment Type  Comment Text
    IST3005            A Afundi, Patrick    Strengths           N/A
    IST3005            A Afundi, Patrick    Strengths            ok
    IST3005            A Afundi, Patrick    Strengths           n/a
    IST3005            A Afundi, Patrick    Strengths          Good
    IST3005            A Afundi, Patrick    Strengths          Good
    IST3005            A Afundi, Patrick    Strengths          good
    IST3005            A Afundi, Patrick    Strengths           N/A
    IST3005            A Afundi, Patrick    Strengths             .
    IST3005            A Afundi, Patrick    Strengths PowerPoints .
    IST3005            A Afundi, Patrick   Weaknesses           n/a
    IST3005            A Afundi, Patrick   Weaknesses           N/A
    IST3005            A Afundi, Patrick   Weaknesses            ok
    IST3005            A Afundi, Patrick   Weaknesses           n/a
    IST3005            

ok so it works but we still have some minute bleed thru Its looking great, but I am still noticing some slight bleeding of  the term 'per week? ' from one of the tables and 'should be added to support your learning? ' and 'improve his/her teaching in this course.  ' from browsing through the completed data. so we are going to fix that.
- Removes all known prompt fragments and table bleed-through, including:
- “per week?”
- “should be added to support your learning?”
- “improve his/her teaching in this course.”
- “hours or by appointment”
- “comment on your evaluation…”


that improve his/her teaching thing came to bite me back like a week later. youll understand in the next markdown.

In [1]:
import pdfplumber
import re
import pandas as pd

# Full PDF path
pdf_path = r"C:\Users\Admin\OneDrive\Documents\Schoolwork\Projects\UNI finals project\Code stuff\US 2019 - Undergraduate Faculty Evaluation.pdf"

# Output Excel file
output_excel = "faculty_evaluations_cleaned.xlsx"

# Map question prompts to categories
question_map = {
    "enjoyed most": "Strengths",
    "didn't like": "Weaknesses",
    "improve": "Suggestions",
    "feedback on assignments": "Feedback Evaluation",
    "course text": "Course Text",
    "materials or resources": "Resources",
    "overall evaluation": "Overall"
}

# Regex patterns
section_header_pattern = re.compile(r"\(([A-Z]{3,}\d{3,}) \(UG\d+\)\)\s*Section:\s*([A-Z])", re.IGNORECASE)

# Instructor: capture up to end-of-line
instructor_pattern = re.compile(r"^Instructor:\s*([^\n\r]+)$", re.IGNORECASE)

# Semester: allow trailing parenthetical (e.g., (UNDG))
semester_pattern = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Strip any semester tokens if they bleed into the instructor line
strip_semester_in_line = re.compile(r"(Spring|Summer|Fall|Winter)\s+Semester\s+\d{4}(?:\s*\([^)]+\))?", re.IGNORECASE)

# Prompt fragments to exclude
prompt_patterns = [
    re.compile(r"your lecturer would like to know", re.IGNORECASE),
    re.compile(r"aspects of his/her teaching", re.IGNORECASE),
    re.compile(r"specific things you believe", re.IGNORECASE),
    re.compile(r"and why", re.IGNORECASE),
    re.compile(r"comments$", re.IGNORECASE),
    re.compile(r"comment on your evaluation", re.IGNORECASE),
    re.compile(r"explain your evaluation", re.IGNORECASE),
    re.compile(r"what other materials", re.IGNORECASE),
    re.compile(r"my overall evaluation", re.IGNORECASE),
    re.compile(r"per week\?", re.IGNORECASE),
    re.compile(r"should be added to support your learning\?", re.IGNORECASE),
    re.compile(r"improve (his/her|hisher) teaching in this course", re.IGNORECASE),
    re.compile(r"hours or by appointment", re.IGNORECASE),
    re.compile(r"satisfact below dept all", re.IGNORECASE),
]

# Table and header noise to exclude
noise_patterns = [
    re.compile(r"Course Evaluation Section Report", re.IGNORECASE),
    re.compile(r"Average|Mean|Dept Mean|All College", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVALUATION", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 2", re.IGNORECASE),
    re.compile(r"STUDENT SELF EVAL 3", re.IGNORECASE),
    re.compile(r"15\+\s*12-14|9-11|7-8|4-6|1-3", re.IGNORECASE),
    re.compile(r"COURSE EVALUATION", re.IGNORECASE),
]

# Known atomic responses
atomic_responses = {"n/a", "na", "none", "nil", "ok", "good", "fair", "excellent", "poor"}

def repair_broken_words(text):
    text = re.sub(
        r'\b(?:[A-Za-z]\s+){2,}[A-Za-z]\b',
        lambda m: m.group(0).replace(" ", ""),
        text
    )
    text = text.replace("D is cuss ion", "Discussion")
    return text

def clean_line(line):
    line = re.sub(r"\s+", " ", line).strip()
    line = repair_broken_words(line)
    return line

def is_noise(line):
    return any(p.search(line) for p in noise_patterns)

def is_prompt(line):
    return any(p.search(line) for p in prompt_patterns)

def looks_like_table(line):
    if "%" in line:
        return True
    if len(re.findall(r"\d+", line)) >= 2:
        return True
    return False

def flush_atomic_or_comment(candidate, records,
                            current_course, current_section,
                            current_instructor, current_semester,
                            current_type):
    tokens = candidate.split()

    def append_record(text):
        records.append({
            "Course Code": current_course,
            "Section Code": current_section,
            "Instructor": current_instructor,
            "Semester": current_semester,
            "Comment Type": current_type,
            "Comment Text": text
        })

    # If the whole candidate is atomic, save directly
    if candidate.lower() in atomic_responses:
        append_record(candidate)
        return

    # Otherwise, split tokens and save atomics separately
    buffer_tokens = []
    for tok in tokens:
        if tok.lower() in atomic_responses:
            if buffer_tokens:
                append_record(" ".join(buffer_tokens))
                buffer_tokens = []
            append_record(tok)
        else:
            buffer_tokens.append(tok)
    if buffer_tokens:
        append_record(" ".join(buffer_tokens))

records = []
current_course = current_section = current_instructor = current_semester = current_type = None
buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        lines = text.split("\n")
        for raw_line in lines:
            line = clean_line(raw_line)
            if not line:
                continue

            # Detect course/section: reset instructor & semester on new section header
            cs_match = section_header_pattern.search(line)
            if cs_match:
                current_course, current_section = cs_match.groups()
                current_instructor = None
                current_semester = None
                continue

            # Detect instructor (strip any inline semester)
            instr_match = instructor_pattern.search(line)
            if instr_match:
                instructor_raw = instr_match.group(1).strip()
                instructor_clean = strip_semester_in_line.sub("", instructor_raw).strip().rstrip(",")
                current_instructor = instructor_clean
                # Also capture semester if it appears inline with instructor
                sem_inline = semester_pattern.search(instructor_raw)
                if sem_inline:
                    current_semester = sem_inline.group(0).strip()
                continue

            # Detect semester (on its own line)
            sem_match = semester_pattern.search(line)
            if sem_match:
                current_semester = sem_match.group(0).strip()
                continue

            # Detect question type
            for key, label in question_map.items():
                if key in line.lower():
                    current_type = label
                    buffer = ""
                    break

            # Skip noise/prompt/table lines
            if is_prompt(line) or is_noise(line) or looks_like_table(line):
                buffer = ""
                continue

            # If atomic response, save immediately (only if metadata present)
            if line.lower() in atomic_responses:
                if current_course and current_section and current_instructor and current_semester and current_type:
                    flush_atomic_or_comment(line, records, current_course, current_section, current_instructor, current_semester, current_type)
                buffer = ""
                continue

            # Accumulate partial sentences
            buffer += " " + line
            if re.search(r"[.?!]$", line):
                candidate = buffer.strip()
                buffer = ""
                # Only write if all metadata present
                if current_course and current_section and current_instructor and current_semester and current_type:
                    flush_atomic_or_comment(candidate, records, current_course, current_section, current_instructor, current_semester, current_type)

# Convert to DataFrame and save to Excel
df_comments = pd.DataFrame(records)
print(f"Extracted {len(df_comments)} comments")
print(df_comments.head(20).to_string(index=False))

df_comments.to_excel(output_excel, index=False)

Extracted 11349 comments
Course Code Section Code      Instructor                    Semester Comment Type  Comment Text
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths           N/A
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths            ok
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths           n/a
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths          Good
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths          Good
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths          good
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths           N/A
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths             .
    IST3005            A Afundi, Patrick Summer Semester 2019 (UNDG)    Strengths PowerPoints .
    IST3005    

Finally its done. i CAN REST for a bit. The data is as it should, even the meaningless ellipse have been maintained, theres no bleed through and things are just great.

I HATE THIS LIFE SOMETIMES. i made the change in an earliet chunk that i should have cause i was confirming the code chunk. then i begun scrolling downa and relasied i was like 2 code chunks to early. im not going to change it.Youll understand the work flow from these comments. I apologise if it doesnt make sense. But but works on the app so this doesnt really matter

there is trully no rest for the wicked.