In [None]:
%%bash
pip install -q pandas scikit-learn numpy matplotlib seaborn


### Explainability for Linear Models
Train a TF-IDF + Logistic Regression model and inspect the most influential n-grams per sentiment label.


In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Load the comments and derive the helper columns on the full frame.
# Building `text_with_aspect` before the split (instead of assigning into the
# frames returned by train_test_split) avoids the SettingWithCopyWarning that
# pandas can emit for such assignments, and guarantees both splits carry the
# column.
df = pd.read_csv(Path('../../data/comments.csv')).assign(
    # Joint aspect+label key, used only to stratify the split below.
    stratify_key=lambda d: d['aspect'] + '_' + d['label'],
    # Model input: the aspect is prepended to the comment text.
    text_with_aspect=lambda d: 'Aspect: ' + d['aspect'] + ' | ' + d['comment'],
)
# 80/20 split with a fixed seed, stratified on the aspect+label key.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['stratify_key'])


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

# Unigram + bigram TF-IDF features, fitted on the training text only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_df['text_with_aspect'])

# Linear classifier over the TF-IDF features; `fit` returns the estimator.
clf = LogisticRegression(max_iter=400).fit(X_train, train_df['label'])

# Column index -> n-gram lookup, used to name the coefficients.
feature_names = vectorizer.get_feature_names_out()

def top_features(label, k=10, model=None, names=None):
    """Return the k features with the largest coefficients for `label`.

    Args:
        label: A class label present in ``model.classes_``.
        k: Maximum number of features to return. Values <= 0 yield ``[]``.
        model: Fitted linear model exposing ``classes_`` and ``coef_``.
            Defaults to the notebook-level ``clf``.
        names: Sequence mapping a coefficient column index to a feature name.
            Defaults to the notebook-level ``feature_names``.

    Returns:
        A list of ``(feature_name, coefficient)`` tuples, largest coefficient
        first.

    Raises:
        ValueError: If `label` is not one of ``model.classes_``.
    """
    model = clf if model is None else model
    names = feature_names if names is None else names

    label_idx = list(model.classes_).index(label)
    coef_matrix = np.asarray(model.coef_)

    if coef_matrix.shape[0] == 1 and len(model.classes_) == 2:
        # Binary case: the single coefficient row describes classes_[1].
        # Evidence for classes_[0] is the same weights with the sign flipped.
        coefs = coef_matrix[0] if label_idx == 1 else -coef_matrix[0]
    else:
        coefs = coef_matrix[label_idx]

    # Reverse first, then truncate: same order as `[-k:][::-1]` for k >= 1,
    # but k == 0 correctly yields nothing (`[-0:]` would return everything).
    topk = np.argsort(coefs)[::-1][:max(k, 0)]
    return [(names[i], coefs[i]) for i in topk]

# Print one block per class: a header, the ranked n-grams with their weights
# to three decimals, then a blank separator line.
for label in clf.classes_:
    report_lines = [f"Top features for label={label}"]
    report_lines.extend(
        f"{feat}: {weight:.3f}" for feat, weight in top_features(label)
    )
    print("\n".join(report_lines))
    print()


In [None]:
from sklearn.metrics import classification_report

# Vectorize the held-out text with the already-fitted vectorizer (transform
# only, no refit), predict, and print per-class precision/recall/F1.
X_test = vectorizer.transform(test_df['text_with_aspect'])
preds = clf.predict(X_test)
report = classification_report(test_df['label'], preds)
print(report)
