In [8]:
import pandas as pd
from joblib import load

# Load the persisted baseline pipeline and pull out the two steps inspected
# below. joblib files are pickle-based, so only load artifacts from a trusted
# source.
pipe = load("../models/baseline_tfidf_logreg.joblib")
vec = pipe.named_steps['tfidf']
clf = pipe.named_steps['clf']
feature_names = vec.get_feature_names_out()

# Row 0 is only "the positive-class weights" when the classifier exposes a
# single coefficient row (the binary case for scikit-learn linear models).
# Fail loudly otherwise instead of silently reporting one arbitrary class.
assert clf.coef_.shape[0] == 1, (
    f"expected one coefficient row (binary classifier), got {clf.coef_.shape[0]}"
)
coefs = clf.coef_[0]  # positive class coefficients

# Every coefficient must correspond to exactly one vectorizer feature; a
# length mismatch (e.g. an extra step between 'tfidf' and 'clf') would pair
# n-grams with the wrong weights in the cells that follow.
assert len(feature_names) == len(coefs), (
    f"{len(feature_names)} feature names vs {len(coefs)} coefficients"
)


In [9]:
import numpy as np

# Rank all features by coefficient once (ascending) and read both ends of
# the ranking: the last 30 reversed are the largest weights, the first 30
# are the most negative ones.
coef_order = np.argsort(coefs)
top_pos_idx = coef_order[-30:][::-1]
top_neg_idx = coef_order[:30]


def _ngram_frame(idx):
    """Return a two-column frame (n-gram, coefficient) for the given feature indices."""
    return pd.DataFrame({
        'ngram': feature_names[idx],
        'coef': coefs[idx],
    })


top_pos = _ngram_frame(top_pos_idx)
top_neg = _ngram_frame(top_neg_idx)

# NOTE(review): the recorded output lists fragments such as 'reat', 'ice' and
# 'orks' alongside 'great' and 'nice' -- possibly character n-grams or an
# upstream tokenisation issue; confirm against the vectorizer configuration.
top_pos.head(10), top_neg.head(10)

(     ngram      coef
 0    great  8.023406
 1     reat  6.970111
 2   little  6.766305
 3      bit  5.884272
 4     nice  5.753139
 5      ice  5.692471
 6     orks  5.674732
 7  perfect  5.666218
 8    price  5.402466
 9     well  5.235521,
           ngram      coef
 0        return -6.528087
 1      returned -6.473920
 2  disappointed -6.249882
 3          poor -5.736172
 4       useless -5.673207
 5         waste -5.541893
 6          even -5.086355
 7     returning -5.082953
 8          junk -4.641397
 9      horrible -4.394413)

In [10]:
# Persist both coefficient tables as CSV, without the integer row index.
for frame, csv_path in (
    (top_pos, "../reports/figures/model_top_positive_ngrams.csv"),
    (top_neg, "../reports/figures/model_top_negative_ngrams.csv"),
):
    frame.to_csv(csv_path, index=False)

In [11]:
# quick bar plot
import matplotlib.pyplot as plt

def barh(df, title, fname):
    """Save a horizontal bar chart of n-gram coefficients to ``fname``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'ngram' column (bar labels) and a 'coef' column
        (bar lengths). Rows are sorted by ascending 'coef' before plotting.
    title : str
        Title placed on the axes.
    fname : str or path-like
        Destination of the image, written at 150 dpi.

    Returns
    -------
    None
        The figure is closed before returning, whether or not saving succeeded.
    """
    ax = df.sort_values('coef').plot(
        kind='barh', x='ngram', y='coef', figsize=(8, 6), legend=False
    )
    # Work on the figure that owns these axes rather than pyplot's implicit
    # "current figure", so the right figure is always saved and closed.
    fig = ax.figure
    try:
        ax.set_title(title)
        # bbox_inches="tight" fits the saved area to the drawn content so long
        # n-gram tick labels are not cut off at the figure edge.
        fig.savefig(fname, dpi=150, bbox_inches="tight")
    finally:
        # Close even if saving fails, so the figure does not stay open.
        plt.close(fig)
    
# Render the first 20 rows of each coefficient table as a bar chart PNG.
for frame, chart_title, png_path in (
    (top_pos, "Top positive ngrams", "../reports/figures/model_top_pos_bars.png"),
    (top_neg, "Top negative ngrams", "../reports/figures/model_top_neg_bars.png"),
):
    barh(frame.head(20), chart_title, png_path)