In [None]:
%%bash
pip install -q pandas scikit-learn numpy matplotlib seaborn


### Baseline: TF-IDF + Logistic Regression
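This notebook loads the teacher/course sentiment data, builds aspect-aware TF-IDF features (the aspect is prepended to each comment), trains a logistic-regression classifier, evaluates it on a held-out test split, and saves the fitted pipeline.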


In [None]:
    import pandas as pd
    from pathlib import Path

    # Load the comments CSV; the path is relative to the notebook's working directory.
    data_path = Path('../../data/comments.csv')
    df = pd.read_csv(data_path)

    # Quick look at the first rows.
    print(df.head())

    # Row counts per (aspect, label) pair. The newlines are written as `\n`
    # escapes: a single-quoted string literal cannot span physical lines.
    print('\nLabel distribution:\n', df.groupby(['aspect','label']).size())


In [None]:
from sklearn.model_selection import train_test_split

def add_aspect_text(frame):
    """Return a copy of ``frame`` with a ``text_with_aspect`` column.

    The new column is the comment prefixed with its aspect, formatted as
    ``'Aspect: <aspect> | <comment>'``. ``DataFrame.assign`` builds a new frame
    instead of writing into ``frame``, so no column is set on a frame that was
    derived from another one (which can trigger pandas' SettingWithCopyWarning).
    """
    return frame.assign(
        text_with_aspect='Aspect: ' + frame['aspect'] + ' | ' + frame['comment']
    )


# Stratify on the (aspect, label) combination so both splits keep the same mix.
df['stratify_key'] = df['aspect'] + '_' + df['label']

# 80/20 split with a fixed seed; each part then gets its model-input text column.
train_df, test_df = [
    add_aspect_text(part)
    for part in train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['stratify_key']
    )
]

for split, split_df in [('train', train_df), ('test', test_df)]:
    print(f"{split} size: {len(split_df)}")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# --- Model: TF-IDF over unigrams + bigrams feeding a logistic regression ---
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1)
classifier = LogisticRegression(max_iter=500, n_jobs=-1)
pipe = Pipeline(steps=[('tfidf', vectorizer), ('clf', classifier)])
pipe.fit(train_df['text_with_aspect'], train_df['label'])

# --- Evaluation on the held-out split ---
y_true = test_df['label']
preds = pipe.predict(test_df['text_with_aspect'])
print(classification_report(y_true, preds))

# Confusion matrix with rows/columns in the classifier's own class order.
class_names = pipe.classes_
cm = confusion_matrix(y_true, preds, labels=class_names)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Baseline confusion matrix')
plt.show()


In [None]:
import joblib
from pathlib import Path

# Persist the fitted pipeline (vectorizer + classifier) as a single joblib file.
# `model_path` is the output directory; it is created if it does not exist yet.
model_path = Path('../../outputs/baseline')
model_path.mkdir(parents=True, exist_ok=True)

artifact_file = model_path / 'tfidf_logreg.joblib'
joblib.dump(pipe, artifact_file)
print('Saved model to', artifact_file)
