# Aspect Classification with Explainable and Cross-Domain Experiments

In [None]:
import warnings
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
import shap
from lime.lime_text import LimeTextExplainer
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# sentence-transformers is optional: when it cannot be imported the name is
# set to None and run_sbert() raises ImportError on use.
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    SentenceTransformer = None

In [None]:
@dataclass
class Config:
    """Settings shared by the notebook: data location, column names, seed."""

    # Spreadsheet that load_data() reads with pd.read_excel when it exists.
    data_path: Path = Path("data_feedback.xlsx")
    # Name of the raw free-text column in the spreadsheet.
    text_col: str = "comments"
    # Name of the aspect-label column in the spreadsheet.
    label_col: str = "aspect"
    # Seed handed to train_test_split so splits are reproducible.
    random_state: int = 42


# Default configuration instance used throughout the notebook.
CFG = Config()

In [None]:
def load_data(cfg=CFG):
    """Load the feedback table and normalise its column names.

    Reads ``cfg.data_path`` with ``pd.read_excel`` when that file exists.
    Otherwise a warning is emitted and a three-row built-in sample is used so
    the rest of the notebook can still be exercised.

    Args:
        cfg: Configuration object providing ``data_path``, ``text_col`` and
            ``label_col``.

    Returns:
        A DataFrame in which the configured text column is named ``'text'``
        and the configured label column is named ``'aspect'`` (the names the
        rest of the notebook uses).
    """
    if cfg.data_path.exists():
        df = pd.read_excel(cfg.data_path)
    else:
        # Make the fallback visible: results on three toy rows are meaningless.
        warnings.warn(
            f"{cfg.data_path} not found; using the built-in 3-row sample instead",
            stacklevel=2,
        )
        # Key the sample by the configured column names so the rename below
        # behaves the same as it does for a real file.
        df = pd.DataFrame({
            cfg.text_col: ["helpful person", "good knowledge", "practical should be by our theory books"],
            cfg.label_col: ["general", "knowledge", "relevancy"],
            'teacher/course': ['teacher', 'teacher', 'course'],
            'sentiment': ['positive', 'positive', 'neutral'],
        })
    # Downstream code hard-codes 'text' and 'aspect', so map both configured
    # columns (previously only the text column was renamed).
    return df.rename(columns={cfg.text_col: 'text', cfg.label_col: 'aspect'})


df = load_data()
try:
    train_df, val_df = train_test_split(
        df, test_size=0.2, random_state=CFG.random_state, stratify=df['aspect']
    )
except ValueError:
    # A stratified split is impossible when a class has a single row (as in the
    # built-in sample) or the split is smaller than the number of classes;
    # fall back to a plain shuffled split instead of crashing.
    train_df, val_df = train_test_split(
        df, test_size=0.2, random_state=CFG.random_state
    )
print(train_df.head())

## TF–IDF + Logistic baseline (word and character)

In [None]:
def run_tfidf(train_df, val_df, analyzer='word', ngram_range=(1,2)):
    """Fit a TF-IDF + logistic-regression pipeline and report validation scores.

    Args:
        train_df: Frame with ``'text'`` and ``'aspect'`` columns used for fitting.
        val_df: Frame with the same columns used for evaluation.
        analyzer: ``analyzer`` argument forwarded to ``TfidfVectorizer``.
        ngram_range: ``ngram_range`` argument forwarded to ``TfidfVectorizer``.

    Returns:
        The fitted ``Pipeline`` with steps named ``'tfidf'`` and ``'clf'``.
    """
    vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range)
    classifier = LogisticRegression(max_iter=1000, class_weight='balanced')
    pipe = Pipeline(steps=[('tfidf', vectorizer), ('clf', classifier)])
    pipe.fit(train_df['text'], train_df['aspect'])

    # Print the per-class report followed by the confusion matrix.
    y_true = val_df['aspect']
    y_pred = pipe.predict(val_df['text'])
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    return pipe


# Word-level (1-2 gram) and character-level (3-5 gram) baselines.
word_model = run_tfidf(train_df, val_df)
char_model = run_tfidf(train_df, val_df, analyzer='char', ngram_range=(3, 5))

## SHAP and LIME explanations for the baseline

In [None]:
def explain_example(model: Pipeline, text: str, background_texts=None):
    """Explain one prediction of a TF-IDF + logistic pipeline with SHAP and LIME.

    Draws a SHAP waterfall plot (without showing it) and returns LIME's
    word-level weights, both for the class the pipeline predicts for ``text``.

    Args:
        model: Fitted pipeline with steps named ``'tfidf'`` and ``'clf'``.
        text: The single document to explain.
        background_texts: Documents used as SHAP background data. Defaults to
            the notebook-level ``train_df['text']``, as before.

    Returns:
        LIME's explanation as a list of ``(token, weight)`` pairs (at most 8).
    """
    vect = model.named_steps['tfidf']
    clf = model.named_steps['clf']
    if background_texts is None:
        background_texts = train_df['text']

    # Explain the class the pipeline actually predicts, for both SHAP and LIME.
    class_idx = int(np.argmax(model.predict_proba([text])[0]))
    n_classes = len(clf.classes_)

    explainer = shap.LinearExplainer(clf, vect.transform(background_texts))
    # A single dense row is cheap and is reused as the plot's feature data.
    row = vect.transform([text]).toarray()
    explanation = explainer(row)

    # `explanation` is a shap.Explanation (it has no .toarray()); pull out the
    # attributions for the one row and, for multiclass models, the one class.
    values = np.asarray(explanation.values)[0]
    if values.ndim == 2:
        # NOTE(review): assumes the class axis is last, i.e. (n_features,
        # n_classes); the other orientation is handled as a fallback — confirm
        # against the installed shap version.
        values = values[:, class_idx] if values.shape[-1] == n_classes else values[class_idx]
    base = np.ravel(explainer.expected_value)
    # Binary models expose a single expected value; multiclass one per class.
    base_value = base[class_idx] if base.size == n_classes else base[0]

    shap.plots.waterfall(
        shap.Explanation(
            values=values,
            base_values=base_value,
            data=row[0],
            feature_names=vect.get_feature_names_out(),
        ),
        show=False,
    )

    lime_explainer = LimeTextExplainer(class_names=[str(c) for c in clf.classes_])
    # LIME explains label index 1 unless told otherwise; request the predicted
    # class explicitly and read back the same label.
    lime_exp = lime_explainer.explain_instance(
        text, model.predict_proba, labels=(class_idx,), num_features=8
    )
    return lime_exp.as_list(label=class_idx)


# explain_example(word_model, val_df.iloc[0]['text'])

## Sentence-embedding classifier with cross-domain (teacher→course) checks

In [None]:
def run_sbert(train_df, val_df, model_name='all-MiniLM-L6-v2'):
    """Train a logistic-regression classifier on sentence embeddings.

    Args:
        train_df: Frame with ``'text'`` and ``'aspect'`` columns used for fitting.
        val_df: Frame with the same columns used for the printed report.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        Tuple ``(encoder, clf)``: the sentence encoder and the fitted classifier.

    Raises:
        ImportError: If sentence-transformers could not be imported.
    """
    if SentenceTransformer is None:
        raise ImportError("sentence-transformers not installed")
    encoder = SentenceTransformer(model_name)
    emb_train = encoder.encode(train_df['text'].tolist(), show_progress_bar=False)
    emb_val = encoder.encode(val_df['text'].tolist(), show_progress_bar=False)
    clf = LogisticRegression(max_iter=1000, class_weight='balanced')
    clf.fit(emb_train, train_df['aspect'])
    preds = clf.predict(emb_val)
    print(classification_report(val_df['aspect'], preds))
    return encoder, clf


# encoder, clf = run_sbert(train_df, val_df)


def cross_domain(train_df, val_df, encoder, clf):
    """Check teacher→course transfer: fit on teacher rows, score on course rows.

    A fresh copy of ``clf`` (same hyper-parameters) is fitted on the
    ``'teacher'`` rows of ``train_df`` and evaluated on the ``'course'`` rows
    of ``val_df``. The classifier passed in is left untouched.

    Args:
        train_df: Training frame with ``'text'``, ``'aspect'`` and
            ``'teacher/course'`` columns.
        val_df: Validation frame with the same columns.
        encoder: Object whose ``encode`` turns a list of texts into embeddings.
        clf: scikit-learn estimator used as the template for the transfer model.

    Returns:
        The classifier fitted on teacher rows, or ``None`` when the check is
        skipped because one of the two slices is empty.
    """
    tr = train_df[train_df['teacher/course'] == 'teacher']
    te = val_df[val_df['teacher/course'] == 'course']
    # The evaluation slice comes from val_df, so that is where course rows must
    # exist; predicting on an empty slice would raise.
    if tr.empty or te.empty:
        print("Teacher→Course transfer skipped: need teacher rows in train_df and course rows in val_df")
        return None

    emb_tr = encoder.encode(tr['text'].tolist(), show_progress_bar=False)
    emb_te = encoder.encode(te['text'].tolist(), show_progress_bar=False)
    # Refit an unfitted clone so the caller's model (e.g. the one returned by
    # run_sbert) is not overwritten by the teacher-only fit.
    transfer_clf = clone(clf)
    transfer_clf.fit(emb_tr, tr['aspect'])
    preds = transfer_clf.predict(emb_te)
    print("Teacher→Course transfer")
    print(classification_report(te['aspect'], preds))
    return transfer_clf

## Prompting baseline

In [None]:
def prompt_aspect(texts, model_name="gpt-4o-mini"):
    """Classify the aspect of each feedback text by prompting an OpenAI model.

    One request is sent per text through ``client.responses.create`` with the
    same system instruction, asking for a JSON object in return.

    Args:
        texts: Iterable of feedback strings.
        model_name: Model identifier passed to the API.

    Returns:
        List with one raw output string (``resp.output_text``) per input text,
        in input order. The strings are not parsed here.
    """
    # Imported lazily so the rest of the notebook does not require the SDK.
    import openai

    client = openai.OpenAI()
    # The instruction is identical for every text, so build it once.
    system_message = {
        "role": "system",
        "content": "Classify aspect of feedback (teaching skills, behaviour, knowledge, relevancy, general). Return JSON {aspect, rationale}.",
    }
    outputs = []
    for t in texts:
        resp = client.responses.create(
            model=model_name,
            input=[system_message, {"role": "user", "content": t}],
            # The Responses API takes the output format under `text.format`;
            # `response_format` is a Chat Completions parameter and is rejected here.
            text={"format": {"type": "json_object"}},
        )
        outputs.append(resp.output_text)
    return outputs

## Error analysis helper

In [None]:
def error_table(model, val_df):
    """Return the validation rows that the model misclassified.

    Args:
        model: Object whose ``predict`` accepts the ``'text'`` column and
            returns one label per row.
        val_df: Frame with ``'text'`` and ``'aspect'`` columns. The
            ``'teacher/course'`` and ``'sentiment'`` columns are included in
            the result when present.

    Returns:
        A new DataFrame (``val_df`` is not modified) holding only the rows
        where the prediction differs from ``'aspect'``, with columns
        ``text, aspect, pred`` followed by the available context columns.
    """
    errors = val_df.assign(pred=model.predict(val_df['text']))
    # Context columns are optional: keep whichever this dataset provides
    # instead of failing with KeyError when one is missing.
    context_cols = [c for c in ('teacher/course', 'sentiment') if c in errors.columns]
    wrong = errors['pred'] != errors['aspect']
    return errors.loc[wrong, ['text', 'aspect', 'pred', *context_cols]]


# error_table(word_model, val_df)

## CLI entry point

In [None]:
def main_cli(argv=None):
    """Command-line entry point: load the data, split it, and run one model.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]`` as before; pass an explicit list such as
            ``['--model', 'char']`` to call this from a notebook, where
            ``sys.argv`` does not hold this script's options.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--model', choices=['tfidf', 'char', 'sbert'], default='tfidf')
    args = parser.parse_args(argv)

    df = load_data()
    try:
        train_df, val_df = train_test_split(
            df, test_size=0.2, random_state=CFG.random_state, stratify=df['aspect']
        )
    except ValueError:
        # Stratification fails when a class has too few rows (e.g. the
        # built-in fallback sample); use a plain shuffled split instead.
        train_df, val_df = train_test_split(
            df, test_size=0.2, random_state=CFG.random_state
        )

    if args.model == 'tfidf':
        run_tfidf(train_df, val_df)
    elif args.model == 'char':
        run_tfidf(train_df, val_df, analyzer='char', ngram_range=(3, 5))
    elif args.model == 'sbert':
        run_sbert(train_df, val_df)


if __name__ == '__main__':
    # main_cli()
    pass