# Aspect Classification for Educational Feedback

This notebook covers TF-IDF baselines, a sentence-embedding classifier, transformer fine-tuning, cross-domain checks, a prompting baseline, error analysis, and a small CLI wrapper.

## 1. Setup

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Excel workbook with the feedback rows; the path is relative, so it is
# resolved against the current working directory.
DATA_PATH: Path = Path("data_feedback.xlsx")
# Seed handed to train_test_split as random_state so the split is repeatable.
RANDOM_SEED: int = 13

## 2. Load data and glossary

In [None]:
# One-sentence definition per aspect label; the prompting baseline joins
# these into its prompt text.
ASPECT_GLOSSARY = {
    "teaching skills": "Ability to deliver content clearly and effectively.",
    "behaviour": "Professionalism, politeness, and supportiveness.",
    "knowledge": "Depth of subject-matter expertise.",
    "relevancy": "Fit between course content and learner needs.",
    "general": "General praise/critique without specific focus.",
}

# Load the workbook and give the "teacher/course" column the shorter name
# "topic" that the rest of the notebook uses.
df = pd.read_excel(DATA_PATH).rename(columns={"teacher/course": "topic"})
print(df.head())
# The line breaks must be written as "\n" escapes: a raw newline inside a
# single-line string literal is a SyntaxError.
print("\nAspect counts:\n", df["aspect"].value_counts())

## 3. Split and helpers

In [None]:
# Hold out 25% of the rows for validation; stratifying on the aspect label
# keeps the class proportions the same in both parts.
train_df, val_df = train_test_split(
    df,
    test_size=0.25,
    stratify=df["aspect"],
    random_state=RANDOM_SEED,
)

def evaluate(model, x_train, y_train, x_val, y_val, label):
    """Fit ``model`` on the training data and report validation quality.

    Prints a banner containing ``label`` followed by the classification
    report, then shows a confusion-matrix heatmap (rows: true class,
    columns: predicted class).

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in
            place, so any earlier fit is overwritten.
        x_train: Training inputs.
        y_train: Training labels.
        x_val: Validation inputs.
        y_val: Validation labels; must provide ``.unique()``.
        label: Text used for the printed banner and the plot title.

    Returns:
        The predictions produced for ``x_val``.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_val)

    print(f"=== {label} ===")
    print(classification_report(y_val, predictions))

    # A single sorted class list drives both the matrix layout and the ticks,
    # so rows, columns and labels cannot drift apart.
    class_names = sorted(y_val.unique())
    matrix = confusion_matrix(y_val, predictions, labels=class_names)
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(label)
    plt.xlabel("Pred")
    plt.ylabel("True")
    plt.show()
    return predictions

## 4. TF-IDF baselines

In [None]:
# Word-level baseline: unigram + bigram TF-IDF into a class-balanced
# logistic regression.
word_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)
_ = evaluate(
    word_clf,
    train_df["comments"],
    train_df["aspect"],
    val_df["comments"],
    val_df["aspect"],
    "Word TF-IDF",
)

# Character-level baseline: 3- to 5-character n-grams instead of words.
char_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(3, 5))),
        ("clf", LogisticRegression(max_iter=500, class_weight="balanced")),
    ]
)
_ = evaluate(
    char_clf,
    train_df["comments"],
    train_df["aspect"],
    val_df["comments"],
    val_df["aspect"],
    "Char TF-IDF",
)

## 5. Topic-prefixed variant

In [None]:
def add_topic_prefix(texts, topics):
    """Return every text with a ``[TOPIC=<topic>]`` tag in front of it.

    Texts and topics are paired positionally; pairing stops at the end of
    the shorter input.

    Args:
        texts: Iterable of comment strings.
        topics: Iterable of topic values, one per text.

    Returns:
        A list of the tagged strings.
    """
    tagged = []
    for topic, text in zip(topics, texts):
        tagged.append(f"[TOPIC={topic}] {text}")
    return tagged
# Build the topic-tagged versions of both splits.
pref_train = add_topic_prefix(train_df["comments"], train_df["topic"])
pref_val = add_topic_prefix(val_df["comments"], val_df["topic"])

# evaluate() fits its estimator in place. Passing word_clf here would
# overwrite its fit on raw comments with a fit on tagged text, and the error
# analysis further down calls word_clf on untagged comments. A separate
# pipeline with the same configuration keeps the two experiments independent.
topic_clf = Pipeline(
    [
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)
_ = evaluate(
    topic_clf,
    pref_train,
    train_df["aspect"],
    pref_val,
    val_df["aspect"],
    "Topic-prefixed TF-IDF",
)

## 6. Sentence-embedding classifier

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
# Embed the comments of both splits with a pretrained sentence-transformer.
embedder = SentenceTransformer("all-mpnet-base-v2")
train_vecs = embedder.encode(
    train_df["comments"].tolist(),
    show_progress_bar=False,
)
val_vecs = embedder.encode(
    val_df["comments"].tolist(),
    show_progress_bar=False,
)

# Class-balanced logistic regression trained directly on the embeddings.
embed_clf = LogisticRegression(class_weight="balanced", max_iter=1000)
embed_clf.fit(train_vecs, train_df["aspect"])

preds = embed_clf.predict(val_vecs)
print(classification_report(val_df["aspect"], preds))

## 7. Transformer fine-tuning

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Aspect names in sorted order receive consecutive integer ids from 0; the
# two dictionaries are exact inverses of each other.
id2label = dict(enumerate(sorted(df["aspect"].unique())))
label2id = {name: idx for idx, name in id2label.items()}
def tok(batch):
    """Tokenize the ``"text"`` entry of ``batch`` with the module tokenizer.

    Sequences are truncated and padded to a length of 128.
    """
    return tokenizer(
        batch["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
# Wrap each split as a Dataset of raw text plus integer-encoded labels,
# tokenize it in batches, and expose the columns as torch tensors.
train_ds = Dataset.from_dict(
    {
        "text": train_df["comments"].tolist(),
        "label": [label2id[aspect] for aspect in train_df["aspect"]],
    }
)
val_ds = Dataset.from_dict(
    {
        "text": val_df["comments"].tolist(),
        "label": [label2id[aspect] for aspect in val_df["aspect"]],
    }
)
train_ds = train_ds.map(tok, batched=True)
val_ds = val_ds.map(tok, batched=True)
train_ds.set_format("torch")
val_ds.set_format("torch")

# Classification head sized to the number of aspects, with readable label
# names stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` to `processing_class` - confirm
# against the installed version before running.
args = TrainingArguments(
    output_dir="./aspect_distilbert",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
)
# Training is left disabled; uncomment to run it.
# trainer.train()
# trainer.evaluate()

## 8. Cross-domain/generalization

In [None]:
# Split the full dataset by topic: fit on teacher feedback only, then
# predict on course feedback to see how well the model transfers.
teacher_df = df.loc[df["topic"] == "teacher"]
course_df = df.loc[df["topic"] == "course"]

sub_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)
sub_clf.fit(teacher_df["comments"], teacher_df["aspect"])

transfer_preds = sub_clf.predict(course_df["comments"])
print(classification_report(course_df["aspect"], transfer_preds))

## 9. Prompting baseline

In [None]:
from transformers import pipeline

def build_prompt(row):
    """Build the classification prompt for one feedback row.

    The prompt lists every ``aspect: definition`` pair from
    ``ASPECT_GLOSSARY`` separated by single spaces, followed by the row's
    ``"comments"`` value.

    Args:
        row: Mapping-like object with a ``"comments"`` entry.

    Returns:
        The prompt string.
    """
    definitions = [f"{aspect}: {meaning}" for aspect, meaning in ASPECT_GLOSSARY.items()]
    glossary = ' '.join(definitions)
    comment = row['comments']
    return f"Given the following aspect definitions: {glossary}. Choose the best aspect for: {comment}"

# Glossary-based prompts, kept available for generative-model experiments.
prompts = val_df.apply(build_prompt, axis=1)

# facebook/bart-large-mnli is an NLI checkpoint. Under the generic
# 'text-classification' task it can only answer entailment / neutral /
# contradiction, never an aspect name. The 'zero-shot-classification' task
# scores the comment against each candidate label, so the raw comment and
# the aspect names are passed instead of the long glossary prompt.
zero_shot = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
print(zero_shot(val_df["comments"].iloc[0], candidate_labels=list(ASPECT_GLOSSARY)))

## 10. Error analysis

In [None]:
def collect_errors(model, x_val, y_val):
    """Return the validation rows on which ``model`` is wrong.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_val: Validation inputs.
        y_val: True labels for ``x_val``.

    Returns:
        A DataFrame with columns ``text``, ``true`` and ``pred`` holding
        only the rows where the prediction differs from the true label.
    """
    predicted = model.predict(x_val)
    frame = pd.DataFrame({"text": x_val, "true": y_val, "pred": predicted})
    is_wrong = frame["true"] != frame["pred"]
    return frame[is_wrong]
# Misclassified validation comments for the word-level TF-IDF pipeline;
# the trailing expression shows the first rows as the cell output.
err_df = collect_errors(
    model=word_clf,
    x_val=val_df["comments"],
    y_val=val_df["aspect"],
)
err_df.head()

## 11. CLI

In [None]:
import argparse

def run_cli(argv=None):
    """Train and evaluate the word-level TF-IDF baseline from CLI-style flags.

    Args:
        argv: Optional sequence of argument strings, e.g. ``["--use_topic"]``.
            When omitted, an empty argument list is parsed, which matches the
            previous hard-coded behaviour; ``sys.argv`` is never consulted.
            Without this parameter the ``--use_topic`` flag could never be
            switched on.
    """
    parser = argparse.ArgumentParser(description="Aspect classification baseline")
    parser.add_argument("--use_topic", action="store_true", help="prepend topic tags")
    args = parser.parse_args(args=[] if argv is None else list(argv))

    if args.use_topic:
        x_train = add_topic_prefix(train_df["comments"], train_df["topic"])
        x_val = add_topic_prefix(val_df["comments"], val_df["topic"])
    else:
        x_train = train_df["comments"]
        x_val = val_df["comments"]

    # A fresh pipeline per call, so no fitted state is shared with other cells.
    model = Pipeline(
        [
            ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
            ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
        ]
    )
    evaluate(model, x_train, train_df["aspect"], x_val, val_df["aspect"], "CLI run")

# run_cli()

---