In [13]:
import numpy as np
import pandas as pd
from joblib import load
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment

In [20]:
# =====================================
# Verify clustering results (Hungarian Accuracy)
# =====================================

# pick the dataset and model you want to test
CSV_PATH  = "cleaned_tweets.csv"
X_PATH    = "model/X_glove_full.npy"
PKL_PATH  = "model/glove_pipeline.pkl"  # change for sbert/bertweet/cardiff etc.
TEXT_COL  = "text"
LABEL_COL = "airline_sentiment"
# --------------------------------------------

# Load dataset and encode labels
df = pd.read_csv(CSV_PATH)
y_true_raw = df[LABEL_COL].astype(str).tolist()
le = LabelEncoder()
y_true = le.fit_transform(y_true_raw)
print(f"Loaded {len(df)} rows | classes: {list(le.classes_)}")

# Load embeddings (to feed into pipeline)
X = np.load(X_PATH)
print("Embeddings shape:", X.shape, "from:", X_PATH)

# Load trained pipeline (fitted reducer + clusterer)
pipe = load(PKL_PATH)
print(f"Loaded trained pipeline: {PKL_PATH}")
print("Available attributes:", [a for a in dir(pipe) if a.endswith('_')])

# Use stored cluster labels from the trained model
y_pred = pipe.labels_
print("Using stored cluster labels from trained pipeline.")

# Compute Hungarian Accuracy
def hungarian_accuracy(y_true_int, y_pred_int):
    cm = confusion_matrix(y_true_int, y_pred_int)
    r, c = linear_sum_assignment(cm.max() - cm)
    matched = cm[r, c].sum()
    return matched / cm.sum(), cm, r, c

ha, cm, r, c = hungarian_accuracy(y_true, y_pred)
mapping = {int(col): le.classes_[int(row)] for row, col in zip(r, c)}

# Display results
print(f"\nHungarian Accuracy: {ha:.3f}")
print("Optimal cluster → label mapping:", mapping)
print("Confusion matrix:\n", cm)

Loaded 14639 rows | classes: [np.str_('negative'), np.str_('neutral'), np.str_('positive')]
Embeddings shape: (14639, 200) from: model/X_glove_full.npy
Loaded trained pipeline: model/glove_pipeline.pkl
Available attributes: ['X_use_', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'clusterer_', 'labels_', 'reducer_', 'score_']
Using stored cluster labels from trained pipeline.

Hungarian Accuracy: 0.653
Optimal cluster → label mapping: {2: np.str_('negative'), 0: np.str_('neutral'), 1: np.str_('positive')}
Confusion matrix:
 [[  62    1 9115]
 [ 186   64 2849]
 [  95  257 2010]]
