In [None]:
# ===============================================================
# SBERT MODEL TRAINING (NO CLEANING OF THEMES)
# ONLY CSAT Score <= 6 FILTER + BASIC STATS + MODEL TRAINING
# ===============================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer

# ---------------------------------------------
# 1. Column Names
# ---------------------------------------------
DESC_COL = "What are your most important reasons for giving us that score?"
THEME_COL = "CSAT Theme"
SCORE_COL = "CSAT score"

# ---------------------------------------------
# 2. Copy Data
# ---------------------------------------------
# NOTE(review): `df` is not defined in this cell; it is assumed to be the raw
# survey DataFrame loaded by an earlier cell -- confirm before running.
df_work = df.copy()

# Remember which descriptions are genuinely present BEFORE casting to str:
# the cast can turn a missing value into the literal text "nan", which would
# otherwise be embedded and trained on as if a customer had written it.
desc_present = df_work[DESC_COL].notna()
df_work[DESC_COL] = df_work[DESC_COL].astype(str)

# ---------------------------------------------
# 3. Theme Column Stats (NO CLEANING)
# ---------------------------------------------
# Stats are reported on the raw column, so a missing theme counts as one of
# the "unique" values here.
unique_themes = df_work[THEME_COL].unique()
null_themes = df_work[THEME_COL].isna().sum()

print("\n========== UNIQUE THEMES (NO CLEANING APPLIED) ==========")
print(unique_themes)
print("Total unique themes:", len(unique_themes))

print("\n========== NULL THEME COUNT ==========")
print(null_themes)

# ---------------------------------------------
# 4. Filter for CSAT Score <= 6
# ---------------------------------------------
# Non-numeric scores become NaN, and NaN <= 6 is False, so they are excluded.
df_work[SCORE_COL] = pd.to_numeric(df_work[SCORE_COL], errors="coerce")
low_score = df_work[SCORE_COL] <= 6
filtered = df_work[low_score].copy()

print("\n========== TOTAL RECORDS WITH CSAT SCORE <= 6 ==========")
print(filtered.shape[0])

# Keep only rows that can serve as supervised examples: a row with no text
# has nothing to embed, and a row with no theme has no label. Theme VALUES are
# still left exactly as they are (no cleaning); unlabeled rows are just skipped
# so that "nan" never becomes a class the classifier can predict.
has_text = desc_present & (df_work[DESC_COL].str.strip() != "")
has_theme = df_work[THEME_COL].notna()
filtered = df_work[low_score & has_text & has_theme].copy()

print("\n========== RECORDS USED FOR TRAINING (TEXT AND THEME PRESENT) ==========")
print(filtered.shape[0])

# Fail fast, before loading the SBERT model, when there is nothing to learn:
# a classifier needs at least two distinct target classes.
if filtered[THEME_COL].astype(str).nunique() < 2:
    raise ValueError(
        "Training needs at least two distinct themes among rows with "
        "CSAT score <= 6 that have both a description and a theme."
    )

# ---------------------------------------------
# 5. Label Encoding for Themes (NO CLEANING)
# ---------------------------------------------
# Themes are cast to str so mixed-type values encode consistently; the fitted
# encoder is reused at inference time to map predictions back to theme names.
label_encoder = LabelEncoder()
filtered["label"] = label_encoder.fit_transform(filtered[THEME_COL].astype(str))

print("\n========== NUMBER OF CLASSES ==========")
print(len(label_encoder.classes_))
print(label_encoder.classes_)

# ---------------------------------------------
# 6. SBERT Embeddings
# ---------------------------------------------
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

X_embeddings = model.encode(
    filtered[DESC_COL].tolist(),
    batch_size=32,
    show_progress_bar=True
)

y = filtered["label"].values

print("\nEmbedding shape:", X_embeddings.shape)

# ---------------------------------------------
# 7. Train Logistic Regression (Multiclass)
# ---------------------------------------------
# class_weight="balanced" reweights classes inversely to their frequency so
# rare themes are not drowned out by common ones.
clf = LogisticRegression(max_iter=4000, class_weight="balanced")
clf.fit(X_embeddings, y)

print("\n========== MODEL TRAINING COMPLETED ==========")


In [None]:
# ===============================================================
# INFERENCE BLOCK – REPLACE ONLY THEME COLUMN WITH PREDICTIONS
# ===============================================================

import pandas as pd
import numpy as np

# -----------------------------
# 1. Load New File
# -----------------------------
input_path = r"your_file_path_here.xlsx"   # <-- replace with actual path
df_new = pd.read_excel(input_path)

DESC_COL = "What are your most important reasons for giving us that score?"
SCORE_COL = "CSAT score"
THEME_COL = "CSAT Theme"   # this is what we will REPLACE

# Work on derived Series instead of overwriting df_new's columns: df_new is
# written back to disk below, and only the theme column is supposed to change.
# Casting in place would save blank descriptions as the text "nan" and erase
# any non-numeric score.
scores = pd.to_numeric(df_new[SCORE_COL], errors="coerce")
desc_present = df_new[DESC_COL].notna()
desc_text = df_new[DESC_COL].astype(str)

# -----------------------------
# 2. Filter Score <= 6 (Same Logic)
# -----------------------------
# A row is eligible when its score is numeric and <= 6 and it has real text.
# Missing descriptions are excluded via desc_present, because after the str
# cast they may read "nan" and would slip past the blank-string check.
eligible = (scores <= 6) & desc_present & (desc_text.str.strip() != "")

df_infer = df_new[eligible].copy()
df_infer[DESC_COL] = desc_text[eligible]
df_infer[SCORE_COL] = scores[eligible]

print("Rows used for inference:", df_infer.shape[0])

if df_infer.empty:
    # Nothing to classify: skip encoding/prediction (an empty batch cannot be
    # scored) and fall through to saving the file unchanged.
    predicted_themes = np.array([], dtype=object)
else:
    # -----------------------------
    # 3. Convert Text → SBERT Embeddings
    # -----------------------------
    # `model`, `clf` and `label_encoder` come from the training cell.
    X_new = model.encode(
        df_infer[DESC_COL].tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    # -----------------------------
    # 4. Predict Themes
    # -----------------------------
    y_new_pred = clf.predict(X_new)
    predicted_themes = label_encoder.inverse_transform(y_new_pred)

    # -----------------------------
    # 5. Replace the CSAT Theme Column IN ORIGINAL DATAFRAME
    # -----------------------------
    # An all-empty theme column is read as float; cast to object first so that
    # writing string labels into it is not an incompatible-dtype assignment.
    if THEME_COL in df_new.columns:
        df_new[THEME_COL] = df_new[THEME_COL].astype(object)

    # Assign through the boolean mask: it selects exactly the rows that were
    # embedded, in the same order as predicted_themes.
    df_new.loc[eligible, THEME_COL] = predicted_themes

# -----------------------------
# 6. Save Updated File
# -----------------------------
output_path = "CSAT_Updated_With_Predicted_Themes.xlsx"
df_new.to_excel(output_path, index=False)

print(f"\nFile saved successfully → {output_path}")
print("Only the Theme column was replaced. All other columns remain unchanged.")
