In [None]:
%%bash
pip install -q pandas scikit-learn numpy matplotlib seaborn


### Cross-Domain Generalization
Train on one aspect (e.g., teacher) and evaluate on the other (course) to measure transfer.


In [None]:
import pandas as pd
from pathlib import Path

# Location of the comments CSV, relative to the notebook's working directory.
data_path = Path('../../data/comments.csv')
df = pd.read_csv(data_path)

# Fail fast with a readable message if the columns that the rest of the
# notebook indexes ('aspect', 'label', 'comment') are not all present.
required_columns = {'aspect', 'label', 'comment'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"{data_path} is missing required columns: {sorted(missing_columns)}")

# Row count per aspect value, to show how much data each domain has.
print(df['aspect'].value_counts())


In [None]:
from sklearn.model_selection import train_test_split

def prepare_split(df, aspect, test_size=0.2, random_state=42):
    """Build a label-stratified train/test split of the rows for one aspect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the 'aspect', 'label' and 'comment' columns.
    aspect : str
        Value of the 'aspect' column whose rows are kept.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : int, default 42
        Forwarded unchanged to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(train, test)`` frames, each carrying the extra 'stratify_key' and
        'text_with_aspect' columns, or ``(None, None)`` when no row matches
        ``aspect``.
    """
    # Work on an owned copy so the caller's frame is never modified.
    subset = df[df['aspect'] == aspect].copy()
    if subset.empty:
        return None, None
    subset['stratify_key'] = subset['label']
    # Build the model input text on the owned copy *before* splitting:
    # assigning a column onto the frames returned by train_test_split can
    # trigger pandas' SettingWithCopyWarning.
    subset['text_with_aspect'] = 'Aspect: ' + subset['aspect'] + ' | ' + subset['comment']
    train, test = train_test_split(
        subset,
        test_size=test_size,
        random_state=random_state,
        stratify=subset['stratify_key'],
    )
    return train, test

# One train/test pair per aspect. prepare_split hands back (None, None) for an
# aspect that has no rows in df, so either pair may be None.
teacher_train, teacher_test = prepare_split(df, aspect='teacher')
course_train, course_test = prepare_split(df, aspect='course')


In [None]:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import classification_report

    def train_eval(train_df, eval_df, title):
        """Fit a TF-IDF + logistic-regression pipeline and print its report.

        Parameters
        ----------
        train_df : pandas.DataFrame
            Training rows; the 'text_with_aspect' and 'label' columns are used.
        eval_df : pandas.DataFrame
            Evaluation rows; the same two columns are used.
        title : str
            Heading printed above the classification report.

        Returns
        -------
        sklearn.pipeline.Pipeline
            The fitted pipeline, so it can be reused on other data.
        """
        pipe = Pipeline([
            # Unigram and bigram TF-IDF features.
            ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
            ('clf', LogisticRegression(max_iter=400))
        ])
        pipe.fit(train_df['text_with_aspect'], train_df['label'])
        preds = pipe.predict(eval_df['text_with_aspect'])
        # The heading literal was split across two physical lines, which is a
        # SyntaxError for a single-quoted f-string; use the \n escape instead.
        print(f"\n{title}")
        print(classification_report(eval_df['label'], preds))
        return pipe

    # prepare_split returns (None, None) for an aspect with no rows. Stop with
    # a clear message here rather than a TypeError from inside train_eval.
    if teacher_train is None or teacher_test is None:
        raise ValueError("No 'teacher' rows found; cannot train the source-domain model")

    # In-domain baseline: train on teacher comments, evaluate on the teacher holdout.
    teacher_model = train_eval(teacher_train, teacher_test, 'Teacher -> Teacher holdout')

    if course_train is not None and course_test is not None:
        # Cross-domain check: reuse the teacher-trained pipeline on course comments.
        cross_preds = teacher_model.predict(course_test['text_with_aspect'])
        # The literal was split across two physical lines (a SyntaxError);
        # the leading blank line is restored with the \n escape.
        print('\nTeacher-trained model evaluated on course comments')
        print(classification_report(course_test['label'], cross_preds))
    else:
        # Make the skipped evaluation visible instead of silently doing nothing.
        print("\nNo 'course' rows found; skipping cross-domain evaluation")
