In [None]:
# ===============================================================
# SBERT SEMANTIC MODEL – EVALUATION ON SAME DATA
# CONFUSION MATRIX + PRECISION + RECALL + F1
# ===============================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix
)
from sentence_transformers import SentenceTransformer

# -----------------------------------------------------------
# 1. Load Data
# -----------------------------------------------------------
DESC_COL = "What are your most important reasons for giving us that score?"
THEME_COL = "CSAT Theme"
SCORE_COL = "CSAT score"

# Work on a copy of just the three needed columns so `df` is never mutated.
data = df[[DESC_COL, THEME_COL, SCORE_COL]].copy()
# Blank out missing descriptions BEFORE the string cast: astype(str) alone can
# turn NaN into the literal text "nan", which would pass the empty-text filter
# in step 3 and be embedded as if it were a real customer comment.
data[DESC_COL] = data[DESC_COL].fillna("").astype(str)

# -----------------------------------------------------------
# 2. Clean Themes
# -----------------------------------------------------------
# Placeholder values (numeric/text zero, blanks, stringified nulls) are all
# treated as "no theme" and mapped to "Others". The text "0" is checked again
# AFTER stripping so whitespace-padded zeros such as " 0 " are caught as well.
theme = data[THEME_COL].astype(object).replace([0, "0"], np.nan)
theme = theme.astype(str).str.strip()
data[THEME_COL] = theme.replace(["", "0", "nan", "NaN", "None"], np.nan)
data["CSAT Theme Cleaned"] = data[THEME_COL].fillna("Others")

# -----------------------------------------------------------
# 3. Filter CSAT Score < 6
# -----------------------------------------------------------
# Unparseable scores are coerced to NaN; NaN < 6 is False, so they drop out.
data[SCORE_COL] = pd.to_numeric(data[SCORE_COL], errors="coerce")
is_low_score = data[SCORE_COL] < 6
has_text = data[DESC_COL].str.strip() != ""
# One boolean selection followed by an explicit copy: later steps add columns
# to `filtered`, and assigning into an un-copied slice triggers pandas'
# SettingWithCopyWarning.
filtered = data[is_low_score & has_text].copy()

print("Rows with CSAT Score < 6:", filtered.shape[0])

# -----------------------------------------------------------
# 4. Label Encoding
# -----------------------------------------------------------
# Fit the encoder on the cleaned theme names, then map every row to its
# integer class id. `labels` keeps the class names in encoder order so the
# reports further down can show names instead of ids.
label_encoder = LabelEncoder().fit(filtered["CSAT Theme Cleaned"])
labels = label_encoder.classes_

filtered["label"] = label_encoder.transform(filtered["CSAT Theme Cleaned"])
y_true = filtered["label"].to_numpy()

print("Themes:", list(labels))

# -----------------------------------------------------------
# 5. Load SBERT (NO API KEY REQUIRED)
# -----------------------------------------------------------
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# -----------------------------------------------------------
# 6. Convert Text to Embeddings
# -----------------------------------------------------------
# Encode every remaining description, one embedding row per description.
descriptions = filtered[DESC_COL].tolist()
X_embeddings = model.encode(descriptions, batch_size=32, show_progress_bar=True)

print("Embedding shape:", X_embeddings.shape)

# -----------------------------------------------------------
# 7. Train Classifier on Entire Dataset
# -----------------------------------------------------------
# class_weight="balanced" re-weights classes by inverse frequency.
clf = LogisticRegression(class_weight="balanced", max_iter=4000).fit(
    X_embeddings, y_true
)

# -----------------------------------------------------------
# 8. Predict on SAME DATA
# -----------------------------------------------------------
# Predictions are made on the very rows the classifier was fitted on.
y_pred = clf.predict(X_embeddings)
predicted_names = label_encoder.inverse_transform(y_pred)
filtered["Predicted Theme"] = predicted_names

# -----------------------------------------------------------
# 9. EVALUATION METRICS (ON SAME DATA)
# -----------------------------------------------------------
# NOTE: y_pred was produced on the rows the classifier was trained on, so
# these are in-sample (training) scores, not an estimate of performance on
# unseen feedback.

# Compute everything first, then print in a fixed order.
accuracy = accuracy_score(y_true, y_pred)
report_text = classification_report(y_true, y_pred, target_names=labels)
cm = confusion_matrix(y_true, y_pred)
# Rows are actual themes, columns are predicted themes.
cm_df = pd.DataFrame(data=cm, index=labels, columns=labels)

# Accuracy
print("\n===============================")
print("ACCURACY =", round(accuracy, 4))
print("===============================\n")

# Classification Report
print("=== PRECISION • RECALL • F1 ===")
print(report_text)

# Confusion Matrix
print("\n=== CONFUSION MATRIX ===")
print(cm_df)

# -----------------------------------------------------------
# 10. Save Output
# -----------------------------------------------------------
output_path = "csat_semantic_sbert_predictions.xlsx"

# The confirmation message below promises predictions *with evaluation
# metrics*, so write the metrics as extra sheets next to the predictions.
# Predictions stay on the first sheet under the default name "Sheet1", so
# anything that reads the workbook's first sheet still gets the same table.
report_df = pd.DataFrame(
    classification_report(y_true, y_pred, target_names=labels, output_dict=True)
).transpose()

with pd.ExcelWriter(output_path) as writer:
    filtered.to_excel(writer, sheet_name="Sheet1", index=False)
    # Index is kept on the metric sheets: it carries the theme names.
    report_df.to_excel(writer, sheet_name="Classification Report")
    cm_df.to_excel(writer, sheet_name="Confusion Matrix")

print(f"\nSaved predictions with evaluation metrics to {output_path}")


In [None]:
# ===============================================================
# OPTIMIZED SBERT CLASSIFICATION + FULL DATA OUTPUT
# ===============================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer

# ==============================
# 1. Load Required Columns
# ==============================
DESC_COL = "What are your most important reasons for giving us that score?"
THEME_COL = "CSAT Theme"
SCORE_COL = "CSAT score"

df_work = df.copy()  # keep original safe
# Blank out missing descriptions BEFORE the string cast: astype(str) alone can
# turn NaN into the literal text "nan", which would pass the empty-text filter
# in step 3 and be embedded as if it were a real customer comment.
df_work[DESC_COL] = df_work[DESC_COL].fillna("").astype(str)

# ==============================
# 2. Clean Themes
# ==============================
# Placeholder values (numeric/text zero, blanks, stringified nulls) are all
# treated as "no theme" and mapped to "Others". The text "0" is checked again
# AFTER stripping so whitespace-padded zeros such as " 0 " are caught as well.
theme = df_work[THEME_COL].astype(object).replace([0, "0"], np.nan)
theme = theme.astype(str).str.strip()
df_work[THEME_COL] = theme.replace(["", "0", "nan", "NaN", "None"], np.nan)
df_work["CSAT Theme Cleaned"] = df_work[THEME_COL].fillna("Others")

# ==============================
# 3. Filter for CSAT Score < 6
# ==============================
# Unparseable scores are coerced to NaN; NaN < 6 is False, so they drop out.
df_work[SCORE_COL] = pd.to_numeric(df_work[SCORE_COL], errors="coerce")
is_low_score = df_work[SCORE_COL] < 6
has_text = df_work[DESC_COL].str.strip() != ""
# One boolean selection followed by an explicit copy: later steps add columns
# to `final_df`, and assigning into an un-copied slice triggers pandas'
# SettingWithCopyWarning.
final_df = df_work[is_low_score & has_text].copy()

print("Total rows used for training + evaluation:", final_df.shape[0])

# ==============================
# 4. Label Encoding
# ==============================
# Fit on the cleaned theme names, then store each row's integer class id.
# `labels` holds the class names in encoder order for the reports below.
label_encoder = LabelEncoder().fit(final_df["CSAT Theme Cleaned"])
labels = label_encoder.classes_
final_df["Actual_Label"] = label_encoder.transform(final_df["CSAT Theme Cleaned"])

# ==============================
# 5. SBERT Embeddings
# ==============================
# Target vector: the integer class ids written in the label-encoding step.
y = final_df["Actual_Label"].to_numpy()

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Feature matrix: one embedding row per remaining description.
descriptions = final_df[DESC_COL].tolist()
X = model.encode(descriptions, batch_size=32, show_progress_bar=True)

# ==============================
# 6. Train Classifier
# ==============================
# class_weight="balanced" re-weights classes by inverse frequency.
clf = LogisticRegression(class_weight="balanced", max_iter=4000).fit(X, y)

# ==============================
# 7. Predictions
# ==============================
# Predict on the same rows the classifier was fitted on, then store both the
# integer id and the readable theme name for every row.
y_pred = clf.predict(X)
predicted_names = label_encoder.inverse_transform(y_pred)

final_df["Predicted_Label"] = y_pred
final_df["Predicted Theme"] = predicted_names

# ==============================
# 8. Metrics (On Same Data)
# ==============================
# NOTE: y_pred comes from the rows used for fitting, so these are in-sample
# (training) scores rather than an estimate for unseen feedback.
accuracy = accuracy_score(y, y_pred)
cls_report = classification_report(y, y_pred, target_names=labels)
# Rows are actual themes, columns are predicted themes.
cm_counts = confusion_matrix(y, y_pred)
cm = pd.DataFrame(cm_counts, index=labels, columns=labels)

# Print each metric under its own heading, in a fixed order.
for heading, payload in (
    ("\n========= ACCURACY =========", round(accuracy, 4)),
    ("\n===== CLASSIFICATION REPORT =====", cls_report),
    ("\n===== CONFUSION MATRIX =====", cm),
):
    print(heading)
    print(payload)

# ==============================
# 9. Save Full Final Output File
# ==============================
# Write every column of the filtered frame (original columns plus the label
# and prediction columns added above) to one workbook, without the row index.
output_path = "SBERT_CSAT_Theme_Predictions.xlsx"
final_df.to_excel(excel_writer=output_path, index=False)

print(f"\nFile saved successfully with ALL columns → {output_path}")
