# Teacher vs Course Classification
Notebook for topic discrimination with explainable baselines, prompting, and robustness slices.

## 1. Setup

In [None]:
from pathlib import Path
from typing import List

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import shap
from lime.lime_text import LimeTextExplainer

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    SentenceTransformer = None

## 2. Load data

In [None]:
def load_topic_data(path=Path("data_feedback.xlsx")):
    """Load the teacher-vs-course feedback table with normalized column names.

    Reads the Excel workbook at ``path`` when it exists. When it does not, a
    two-row placeholder frame (one "teacher" comment, one "course" comment)
    is returned instead so the remaining cells can still be exercised.

    Args:
        path: Location of the workbook. May be a ``pathlib.Path`` or any
            string / ``os.PathLike`` value.

    Returns:
        A ``pandas.DataFrame`` in which the source columns ``"comments"`` and
        ``"teacher/course"`` are renamed to ``"text"`` and ``"topic"``. Any
        other columns are passed through unchanged.
    """
    # Coerce first: a plain string has no ``.exists()`` method, so string
    # paths would otherwise fail before the file is even looked up.
    path = Path(path)
    if path.exists():
        df = pd.read_excel(path)
    else:
        # Placeholder data using the same raw column names as the workbook,
        # so the rename below applies to both branches.
        df = pd.DataFrame(
            {
                "teacher/course": ["teacher", "course"],
                "comments": ["great teacher", "great course"],
            }
        )
    return df.rename(columns={"comments": "text", "teacher/course": "topic"})

## 3. Heuristic baseline

In [None]:
# Single-word keyword stems per label. Insertion order matters: labels are
# tried in this order, so "teacher" wins when a comment mentions both.
KEYWORDS = {
    "teacher": ["teacher", "sir", "madam", "prof"],
    "course": ["course", "syllabus", "lecture", "practical", "module"],
}


def keyword_rule(text: str):
    """Classify a comment as "teacher" or "course" from keyword stems.

    The comment is lower-cased and split into alphanumeric words. A label
    matches when any word *starts with* one of its stems, so "professor",
    "teachers" and "lectures" still hit, while an incidental substring such
    as the "sir" inside "desire" does not.

    Args:
        text: The comment to classify. Non-string values (for example the
            NaN pandas produces for an empty spreadsheet cell) are treated
            as containing no keywords.

    Returns:
        The first label in ``KEYWORDS`` with a matching stem, or ``"course"``
        when nothing matches.
    """
    if not isinstance(text, str):
        return "course"

    # Turn every non-alphanumeric character into a separator so punctuation
    # ("Sir!", "teacher's") cannot hide a word start.
    words = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()

    for label, stems in KEYWORDS.items():
        prefixes = tuple(stems)  # str.startswith accepts a tuple of options
        if any(word.startswith(prefixes) for word in words):
            return label
    return "course"

## 4. TF–IDF baselines with explainability

In [None]:
def run_topic_tfidf(train_df, val_df, analyzer="word"):
    """Fit a TF-IDF + logistic-regression topic model and explain one prediction.

    The pipeline vectorizes the ``"text"`` column with 1- to 2-gram TF-IDF
    features (word or character level, per ``analyzer``) and fits a
    class-balanced logistic regression on the ``"topic"`` column. A
    classification report and a confusion matrix for the validation frame
    are printed.

    Args:
        train_df: Frame with ``"text"`` and ``"topic"`` columns used for fitting.
        val_df: Frame with the same columns used for evaluation; its first
            row is the instance handed to LIME.
        analyzer: Passed through to ``TfidfVectorizer(analyzer=...)``.

    Returns:
        A ``(pipeline, explanation)`` tuple: the fitted pipeline and the LIME
        explanation (up to 8 features) for the first validation text.
    """
    vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=(1, 2))
    classifier = LogisticRegression(max_iter=200, class_weight="balanced")
    pipe = Pipeline([("tfidf", vectorizer), ("clf", classifier)])
    pipe.fit(train_df["text"], train_df["topic"])

    y_true = val_df["topic"]
    y_pred = pipe.predict(val_df["text"])
    print(classification_report(y_true, y_pred, zero_division=0))
    print(confusion_matrix(y_true, y_pred))

    # LIME perturbs the raw text and queries the whole pipeline, so it is
    # given predict_proba of the pipeline rather than of the bare classifier.
    lime_explainer = LimeTextExplainer(class_names=pipe.classes_)
    sample_text = val_df["text"].iloc[0]
    explanation = lime_explainer.explain_instance(
        sample_text,
        pipe.predict_proba,
        num_features=8,
    )
    return pipe, explanation

## 5. SBERT baseline

In [None]:
def run_topic_sbert(train_df, val_df, model_name="all-MiniLM-L6-v2"):
    """Fit a logistic regression on sentence embeddings and report validation scores.

    Both frames' ``"text"`` columns are encoded with the named
    sentence-transformers model; a class-balanced logistic regression is
    then fitted on the training embeddings against ``"topic"`` and a
    classification report for the validation frame is printed.

    Args:
        train_df: Frame with ``"text"`` and ``"topic"`` columns used for fitting.
        val_df: Frame with the same columns used for evaluation.
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        A ``(classifier, encoder)`` tuple: the fitted logistic regression and
        the sentence-transformers model used to embed the text.

    Raises:
        ImportError: If sentence-transformers could not be imported.
    """
    if SentenceTransformer is None:
        raise ImportError("sentence-transformers not installed")

    encoder = SentenceTransformer(model_name)
    # Same encoding settings for both splits.
    encode_opts = {"batch_size": 16, "show_progress_bar": True}
    train_vectors = encoder.encode(train_df["text"].tolist(), **encode_opts)
    val_vectors = encoder.encode(val_df["text"].tolist(), **encode_opts)

    classifier = LogisticRegression(max_iter=200, class_weight="balanced")
    classifier.fit(train_vectors, train_df["topic"])

    val_predictions = classifier.predict(val_vectors)
    print(classification_report(val_df["topic"], val_predictions, zero_division=0))
    return classifier, encoder

## 6. Prompting baseline with rationale

In [None]:
def prompt_topic(texts: List[str], model_name="gpt-4o-mini"):
    """Ask a chat model to label each comment as teacher- or course-related.

    One chat-completion request is sent per comment, with a fixed system
    instruction and ``temperature=0``.

    Args:
        texts: Comments to classify, one request each.
        model_name: Chat model identifier passed to the API.

    Returns:
        A list with the model's reply text for each comment (label plus a
        short justification, as requested by the system prompt), in the
        same order as ``texts``.

    Note:
        The client is constructed with no arguments, so credentials come
        from its default configuration (presumably the ``OPENAI_API_KEY``
        environment variable -- confirm for your deployment).
    """
    import openai

    client = openai.OpenAI()
    outputs = []
    for t in texts:
        res = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "Decide if the comment is about a teacher or a course and respond with the label and a short justification."},
                {"role": "user", "content": t},
            ],
            temperature=0,
        )
        # The ``openai.OpenAI`` client returns typed response objects, not
        # dicts: the reply text is the ``content`` attribute of the message,
        # and subscripting it with ["content"] raises TypeError.
        outputs.append(res.choices[0].message.content)
    return outputs

## 7. Robustness slice: sentiment-stratified

In [None]:
def slice_by_sentiment(df: pd.DataFrame, sentiment_col="sentiment"):
    """Collect comment texts per sentiment value for slice-wise evaluation.

    Args:
        df: Frame with a ``"text"`` column and, optionally, a sentiment column.
        sentiment_col: Name of the column to group by.

    Returns:
        When ``sentiment_col`` is a column of ``df``: a Series indexed by the
        distinct sentiment values whose entries are lists of the matching
        ``"text"`` values. Otherwise ``df`` itself, unchanged.
    """
    if sentiment_col in df.columns:
        texts_by_sentiment = df.groupby(sentiment_col)["text"]
        return texts_by_sentiment.apply(list)
    # No sentiment annotation available: hand the frame back untouched.
    return df

## 8. CLI entry point

In [None]:
def main_cli(argv=None):
    """Command-line entry point: load the data, split it, and run one baseline.

    The ``--model`` option selects the baseline: ``rule`` (keyword
    heuristic), ``tfidf`` (word-level TF-IDF), ``char`` (character-level
    TF-IDF) or ``sbert`` (sentence embeddings). Evaluation output is printed
    by the selected baseline.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            behaviour of a plain ``main_cli()`` call.

    Note:
        A 75/25 split stratified on ``"topic"`` is attempted first. When the
        data is too small for that (for example the two-row placeholder that
        ``load_topic_data`` returns when the workbook is missing), the split
        falls back to an unstratified one so the training-free ``rule``
        baseline can still run. The learned baselines may still fail on such
        tiny data because the training split can contain a single class.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Teacher vs course classification")
    parser.add_argument("--model", choices=["tfidf", "char", "sbert", "rule"], default="tfidf")
    args = parser.parse_args(argv)

    df = load_topic_data()
    try:
        train_df, val_df = train_test_split(df, test_size=0.25, random_state=42, stratify=df["topic"])
    except ValueError as err:
        # Stratification needs enough rows per class in both splits; report
        # why it failed and continue with a plain random split instead.
        print(
            f"Stratified split not possible ({err}); using an unstratified split.",
            file=sys.stderr,
        )
        train_df, val_df = train_test_split(df, test_size=0.25, random_state=42)

    if args.model == "rule":
        # The keyword heuristic needs no fitting; only the validation rows are scored.
        preds = [keyword_rule(t) for t in val_df["text"]]
        print(classification_report(val_df["topic"], preds, zero_division=0))
    elif args.model == "tfidf":
        run_topic_tfidf(train_df, val_df, analyzer="word")
    elif args.model == "char":
        run_topic_tfidf(train_df, val_df, analyzer="char")
    elif args.model == "sbert":
        run_topic_sbert(train_df, val_df)


if __name__ == "__main__":
    main_cli()