# Aspect Classification for Educational Feedback
Aspect-focused notebook with baselines, prompting, cross-domain checks, and explainability.

## 1. Setup
Load the data, define helper utilities, and prepare the aspect glossary for prompts and interpretability.

In [None]:
import json
from pathlib import Path
from typing import List

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import shap
from lime.lime_text import LimeTextExplainer

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    SentenceTransformer = None

In [None]:
def load_aspect_data(path=Path("data_feedback.xlsx")):
    """Load the feedback dataset, falling back to a tiny built-in sample.

    Args:
        path: Location of the Excel workbook. May be a ``pathlib.Path`` or
            anything ``Path`` accepts (for example a plain string).

    Returns:
        A DataFrame whose ``comments`` and ``teacher/course`` columns have
        been renamed to ``text`` and ``topic``, with rows missing a ``text``
        or ``aspect`` value dropped.
    """
    # Coerce first so string paths work; ``str`` has no ``.exists()`` method.
    path = Path(path)
    if path.exists():
        df = pd.read_excel(path)
    else:
        # Two-row placeholder used when the workbook is not present.
        df = pd.DataFrame(
            {
                "teacher/course": ["teacher", "course"],
                "comments": ["great teacher", "great course"],
                "aspect": ["general", "relevancy"],
            }
        )
    df = df.rename(columns={"comments": "text", "teacher/course": "topic"})
    return df.dropna(subset=["text", "aspect"])

## 2. Glossary and preprocessing helpers

In [None]:
# Maps each aspect label to a one-sentence description of what it covers.
aspect_glossary = {
    "teaching skills": "Pedagogical clarity, examples, pacing, interaction.",
    "behaviour": "Politeness, respect, supportiveness, responsiveness.",
    "knowledge": "Depth and breadth of subject knowledge.",
    "relevancy": "Alignment of content and practice with course goals.",
    # Catch-all label for feedback that does not single out one aspect.
    "general": "General praise or criticism without a specific aspect.",
}


def prepend_topic(texts: List[str], topics: List[str]):
    """Prefix every text with a ``[TOPIC=...]`` tag built from its paired topic.

    Texts and topics are paired by position; pairing stops at the end of the
    shorter of the two sequences.
    """
    tagged = []
    for text, topic in zip(texts, topics):
        tagged.append(f"[TOPIC={topic}] {text}")
    return tagged

## 3. TF–IDF baselines (word + char) with explainability

In [None]:
def run_aspect_tfidf(train_df, val_df, analyzer="word", prompt_topic=False):
    """Fit a TF-IDF + logistic-regression baseline and explain one validation text.

    Prints the classification report and confusion matrix for the validation
    split as a side effect.

    Args:
        train_df: Training frame with ``text`` and ``aspect`` columns (and
            ``topic`` when ``prompt_topic`` is true).
        val_df: Validation frame with the same columns.
        analyzer: Forwarded to ``TfidfVectorizer`` (e.g. ``"word"``, ``"char"``).
        prompt_topic: When true, each text is prefixed with its topic tag
            before vectorization.

    Returns:
        ``(pipe, explanation)``: the fitted pipeline and the LIME explanation
        for the first validation text.
    """
    # Use plain lists on both branches so the first validation text can be
    # taken positionally; the topic-prefixed inputs are lists and have no ``.iloc``.
    if prompt_topic:
        X_train = prepend_topic(train_df["text"].tolist(), train_df["topic"].tolist())
        X_val = prepend_topic(val_df["text"].tolist(), val_df["topic"].tolist())
    else:
        X_train = train_df["text"].tolist()
        X_val = val_df["text"].tolist()
    y_train, y_val = train_df["aspect"], val_df["aspect"]

    pipe = Pipeline(
        [
            ("tfidf", TfidfVectorizer(analyzer=analyzer, ngram_range=(1, 2))),
            ("clf", LogisticRegression(max_iter=200, class_weight="balanced")),
        ]
    )
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    print(classification_report(y_val, preds, zero_division=0))
    print(confusion_matrix(y_val, preds))

    explainer = LimeTextExplainer(class_names=pipe.classes_)
    # The explained text must be in the same form the pipeline was trained on
    # (i.e. already topic-prefixed when ``prompt_topic`` is true).
    explanation = explainer.explain_instance(X_val[0], pipe.predict_proba, num_features=8)
    return pipe, explanation

## 4. Sentence embedding baseline (SBERT)

In [None]:
def run_aspect_sbert(train_df, val_df, model_name="all-MiniLM-L6-v2", prompt_topic=True):
    """Train a logistic-regression classifier on sentence embeddings.

    Prints the classification report for the validation split.

    Args:
        train_df: Training frame with ``text`` and ``aspect`` columns (and
            ``topic`` when ``prompt_topic`` is true).
        val_df: Validation frame with the same columns.
        model_name: Name handed to ``SentenceTransformer``.
        prompt_topic: When true, texts are topic-prefixed before encoding.

    Returns:
        ``(clf, model)``: the fitted classifier and the embedding model.

    Raises:
        ImportError: If ``sentence_transformers`` could not be imported.
    """
    if SentenceTransformer is None:
        raise ImportError("sentence-transformers not installed")

    model = SentenceTransformer(model_name)

    def _inputs(frame):
        # Raw text column, or topic-prefixed strings when requested.
        if prompt_topic:
            return prepend_topic(frame["text"].tolist(), frame["topic"].tolist())
        return frame["text"]

    train_texts = _inputs(train_df)
    val_texts = _inputs(val_df)
    y_train = train_df["aspect"]
    y_val = val_df["aspect"]

    encode_kwargs = {"batch_size": 16, "show_progress_bar": True}
    train_emb = model.encode(list(train_texts), **encode_kwargs)
    val_emb = model.encode(list(val_texts), **encode_kwargs)

    clf = LogisticRegression(max_iter=200, class_weight="balanced")
    clf.fit(train_emb, y_train)
    print(classification_report(y_val, clf.predict(val_emb), zero_division=0))
    return clf, model

## 5. Cross-domain / cross-topic evaluation
Train on teacher-only feedback and evaluate on course-only feedback (and vice versa) to measure cross-topic transfer.

In [None]:
def cross_domain(train_df, test_df, model_fn):
    """Run ``model_fn`` with one split as training data and another as test data.

    Returns whatever ``model_fn(train_df, test_df)`` returns.
    """
    return model_fn(train_df, test_df)

## 6. Prompting baseline
Zero/low-shot LLM baseline using aspect definitions.

In [None]:
def prompt_aspect(texts: List[str], model_name="gpt-4o-mini"):
    """Ask a chat-completion model for the aspect label of each text.

    One request is sent per text, with temperature 0. The system prompt
    carries the instruction plus the definitions from ``aspect_glossary``.

    Args:
        texts: Feedback comments to label.
        model_name: Chat model identifier passed to the API.

    Returns:
        A list with the model's reply content for each input, in order.
    """
    import openai

    # The instruction refers to "the glossary", so the definitions have to be
    # part of the prompt for the model to be able to use them.
    glossary_lines = "\n".join(
        f"- {label}: {definition}" for label, definition in aspect_glossary.items()
    )
    system_msg = (
        "Identify the aspect label using the glossary and respond with the label only."
        f"\n\nGlossary:\n{glossary_lines}"
    )
    # No arguments are passed, so credentials are presumably picked up from
    # the environment -- confirm against the deployment setup.
    client = openai.OpenAI()
    outputs = []
    for t in texts:
        res = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "system", "content": system_msg}, {"role": "user", "content": t}],
            temperature=0,
        )
        # The v1 client returns typed objects: the message exposes ``content``
        # as an attribute and does not support ``["content"]`` indexing.
        outputs.append(res.choices[0].message.content)
    return outputs

## 7. Explainability utilities

In [None]:
def shap_for_aspect(model, X_samples: List[str]):
    """Compute SHAP values for text samples using ``model.predict_proba``.

    Args:
        model: Object exposing ``predict_proba`` over raw strings.
        X_samples: Texts to explain.

    Returns:
        The SHAP values object produced by the explainer.
    """
    text_masker = shap.maskers.Text()
    explainer = shap.Explainer(model.predict_proba, masker=text_masker)
    values = explainer(X_samples)
    # Called with display=False and the result is not kept; presumably this
    # only builds the plot without rendering it -- confirm if output is wanted.
    shap.plots.text(values, display=False)
    return values

## 8. Error inspection helpers

In [None]:
def error_breakdown(model, val_df, prompt_topic=False):
    """Attach predictions to a copy of ``val_df`` and list the mistakes first.

    Args:
        model: Object exposing ``predict``.
        val_df: Frame with ``text`` and ``aspect`` columns (and ``topic``
            when ``prompt_topic`` is true).
        prompt_topic: When true, texts are topic-prefixed before prediction.

    Returns:
        A copy of ``val_df`` with added ``pred`` and ``correct`` columns,
        sorted by ``correct`` so incorrect rows come before correct ones.
    """
    if prompt_topic:
        inputs = prepend_topic(val_df["text"].tolist(), val_df["topic"].tolist())
    else:
        inputs = val_df["text"]
    predicted = model.predict(inputs)

    # Work on a copy so the caller's frame is left untouched.
    report = val_df.copy()
    report["pred"] = predicted
    report["correct"] = report["pred"].eq(report["aspect"])
    return report.sort_values(by="correct")

## 9. CLI entry point

In [None]:
def main_cli():
    """Parse ``--model`` from the command line and run the chosen baseline.

    The data is loaded with ``load_aspect_data`` and split 75/25 with
    stratification on ``aspect``. ``tfidf`` and ``char`` run the TF-IDF
    baseline with word and character analyzers respectively (both with topic
    prefixes); ``sbert`` runs the sentence-embedding baseline.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Aspect classification experiments")
    cli.add_argument("--model", default="tfidf", choices=["tfidf", "char", "sbert"])
    chosen = cli.parse_args().model

    data = load_aspect_data()
    train_df, val_df = train_test_split(
        data,
        test_size=0.25,
        random_state=42,
        stratify=data["aspect"],
    )

    if chosen == "sbert":
        run_aspect_sbert(train_df, val_df)
        return

    # The two TF-IDF variants differ only in the analyzer they use.
    analyzers = {"tfidf": "word", "char": "char"}
    if chosen in analyzers:
        run_aspect_tfidf(train_df, val_df, analyzer=analyzers[chosen], prompt_topic=True)


# Run the CLI only when this module is the entry point, not when it is imported.
if __name__ == "__main__":
    main_cli()