In [None]:
# ===============================================================
# SBERT MODEL TRAINING (NO CLEANING OF THEMES)
# ONLY CSAT Score <= 6 FILTER + BASIC STATS + MODEL TRAINING
# ===============================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer

# ---------------------------------------------
# 1. Column Names
# ---------------------------------------------
DESC_COL = "What are your most important reasons for giving us that score?"
THEME_COL = "CSAT Theme"
SCORE_COL = "CSAT score"

# ---------------------------------------------
# 2. Copy Data
# ---------------------------------------------
# NOTE: `df` is not created in this cell; it must already be loaded in the
# kernel before this cell is run.
df_work = df.copy()
# Fill missing answers BEFORE casting to str: astype(str) on its own turns NaN
# into the literal text "nan", which would then be embedded and trained on.
df_work[DESC_COL] = df_work[DESC_COL].fillna("").astype(str)

# ---------------------------------------------
# 3. Theme Column Stats (NO CLEANING)
# ---------------------------------------------
unique_themes = df_work[THEME_COL].unique()
null_themes = df_work[THEME_COL].isna().sum()

print("\n========== UNIQUE THEMES (NO CLEANING APPLIED) ==========")
print(unique_themes)
print("Total unique themes:", len(unique_themes))

print("\n========== NULL THEME COUNT ==========")
print(null_themes)

# ---------------------------------------------
# 4. Filter for CSAT Score <= 6
# ---------------------------------------------
# Non-numeric scores become NaN and therefore fail the <= 6 comparison.
df_work[SCORE_COL] = pd.to_numeric(df_work[SCORE_COL], errors="coerce")
filtered = df_work[df_work[SCORE_COL] <= 6].copy()

print("\n========== TOTAL RECORDS WITH CSAT SCORE <= 6 ==========")
print(filtered.shape[0])

# Rows without a theme or without any answer text cannot be used as training
# examples. Theme VALUES are still left untouched (no cleaning); only rows
# that would otherwise be trained as a literal "nan" class or on empty text
# are removed.
usable_rows = filtered[THEME_COL].notna() & (filtered[DESC_COL].str.strip() != "")
print("Rows dropped (missing theme or blank text):", int((~usable_rows).sum()))
filtered = filtered[usable_rows].copy()

if filtered.empty:
    raise ValueError("No rows with score <= 6, a theme and answer text; nothing to train on.")

# ---------------------------------------------
# 5. Label Encoding for Themes (NO CLEANING)
# ---------------------------------------------
# astype(str) keeps mixed-type theme values comparable for the encoder.
label_encoder = LabelEncoder()
filtered["label"] = label_encoder.fit_transform(filtered[THEME_COL].astype(str))

print("\n========== NUMBER OF CLASSES ==========")
print(len(label_encoder.classes_))
print(label_encoder.classes_)

# ---------------------------------------------
# 6. SBERT Embeddings
# ---------------------------------------------
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

X_embeddings = model.encode(
    filtered[DESC_COL].tolist(),
    batch_size=32,
    show_progress_bar=True
)

y = filtered["label"].values

print("\nEmbedding shape:", X_embeddings.shape)

# ---------------------------------------------
# 7. Train Logistic Regression (Multiclass)
# ---------------------------------------------
# `model`, `clf` and `label_encoder` are read by the CSAT inference cell.
clf = LogisticRegression(max_iter=4000, class_weight="balanced")
clf.fit(X_embeddings, y)

print("\n========== MODEL TRAINING COMPLETED ==========")


In [None]:
#FINAL CSAT ASAT ALL COLUMNS

In [None]:
# ===============================================================
# TRAINING BLOCK — CSAT THEME + ASAT THEME
# SCORE <= 6 FOR BOTH MODELS
# ===============================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer

# -----------------------------
# 1. COLUMN NAMES
# -----------------------------
CSAT_DESC  = "What are your most important reasons for giving us that score?"
ASAT_DESC  = "How could the consultant improve how they handled your enquiry?"
ASAT_GOOD  = "We’d love to know what the consultant did to earn such a rating?"

CSAT_SCORE = "CSAT score"
ASAT_SCORE = "How would you rate the service you received from the consultant handling your enquiry?"

CSAT_THEME = "CSAT Theme"
ASAT_THEME = "ASAT Theme"

# -----------------------------
# 2. LOAD TRAINING FILE
# -----------------------------
train_path = r"training_file.xlsx"   # <-- change this
df = pd.read_excel(train_path)

# Convert all text to string. Missing cells are filled with "" FIRST:
# astype(str) alone would turn NaN into the literal text "nan", which defeats
# the blank-text filters below and puts "nan" into the training text.
for text_col in (CSAT_DESC, ASAT_DESC, ASAT_GOOD):
    df[text_col] = df[text_col].fillna("").astype(str)

# Non-numeric scores become NaN and therefore fail the <= 6 comparisons.
df[CSAT_SCORE] = pd.to_numeric(df[CSAT_SCORE], errors="coerce")
df[ASAT_SCORE] = pd.to_numeric(df[ASAT_SCORE], errors="coerce")

# -----------------------------
# 3. FILTER SCORE <= 6 (CSAT)
# -----------------------------
df_csat = df[df[CSAT_SCORE] <= 6].copy()
df_csat = df_csat[df_csat[CSAT_DESC].str.strip() != ""]

print("\n======== UNIQUE CSAT THEMES (<= 6) =========")
print(df_csat[CSAT_THEME].unique())
print("Total:", len(df_csat[CSAT_THEME].unique()))

# -----------------------------
# 4. FILTER SCORE <= 6 (ASAT)
# -----------------------------
df_asat = df[df[ASAT_SCORE] <= 6].copy()
# Both columns are already NaN-free strings (see section 2).
df_asat["ASAT_TEXT"] = (
    df_asat[ASAT_GOOD] + " " +
    df_asat[ASAT_DESC]
).str.strip()
df_asat = df_asat[df_asat["ASAT_TEXT"] != ""]

print("\n======== UNIQUE ASAT THEMES (<= 6) =========")
print(df_asat[ASAT_THEME].unique())
print("Total:", len(df_asat[ASAT_THEME].unique()))

# -----------------------------
# 5. LABEL ENCODERS
# -----------------------------
# Rows with no theme are dropped; otherwise astype(str) would turn the missing
# label into a "nan" class that the classifier could later predict.
csat_has_theme = df_csat[CSAT_THEME].notna()
asat_has_theme = df_asat[ASAT_THEME].notna()
print(
    "Rows dropped for missing theme -> CSAT:", int((~csat_has_theme).sum()),
    "| ASAT:", int((~asat_has_theme).sum()),
)
df_csat = df_csat[csat_has_theme].copy()
df_asat = df_asat[asat_has_theme].copy()

le_csat = LabelEncoder()
df_csat["label"] = le_csat.fit_transform(df_csat[CSAT_THEME].astype(str))

le_asat = LabelEncoder()
df_asat["label"] = le_asat.fit_transform(df_asat[ASAT_THEME].astype(str))

# -----------------------------
# 6. SBERT EMBEDDINGS
# -----------------------------
sbert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

X_csat = sbert.encode(df_csat[CSAT_DESC].tolist(),
                      batch_size=32, show_progress_bar=True)
y_csat = df_csat["label"].values

X_asat = sbert.encode(df_asat["ASAT_TEXT"].tolist(),
                      batch_size=32, show_progress_bar=True)
y_asat = df_asat["label"].values

# -----------------------------
# 7. TRAIN TWO MODELS
# -----------------------------
clf_csat = LogisticRegression(max_iter=4000, class_weight="balanced")
clf_csat.fit(X_csat, y_csat)

clf_asat = LogisticRegression(max_iter=4000, class_weight="balanced")
clf_asat.fit(X_asat, y_asat)

print("\n======== TRAINING COMPLETED FOR BOTH MODELS ========")

# Save these five items externally if needed:
# sbert, clf_csat, clf_asat, le_csat, le_asat


In [None]:
# ===============================================================
# INFERENCE BLOCK – REPLACE ONLY THEME COLUMN WITH PREDICTIONS
# ===============================================================

import pandas as pd
import numpy as np

# -----------------------------
# 1. Load New File
# -----------------------------
input_path = r"your_file_path_here.xlsx"   # <-- replace with actual path
df_new = pd.read_excel(input_path)

DESC_COL = "What are your most important reasons for giving us that score?"
SCORE_COL = "CSAT score"
THEME_COL = "CSAT Theme"   # this is what we will REPLACE

# Derive the model inputs as separate Series instead of overwriting columns of
# df_new: df_new is written back to disk, and overwriting would save missing
# answers as the text "nan" and replace non-numeric scores with NaN.
desc_text = df_new[DESC_COL].fillna("").astype(str)
score_num = pd.to_numeric(df_new[SCORE_COL], errors="coerce")

# -----------------------------
# 2. Filter Score <= 6 (Same Logic)
# -----------------------------
infer_mask = (score_num <= 6) & (desc_text.str.strip() != "")
df_infer = df_new[infer_mask].copy()

print("Rows used for inference:", df_infer.shape[0])

# Nothing is predicted when no row qualifies; the file is still saved below.
if df_infer.shape[0] > 0:
    # `model`, `clf` and `label_encoder` come from the CSAT training cell.
    # Other cells in this notebook rebind the global name `clf`, so verify the
    # classifier and the encoder describe the same number of classes.
    if len(clf.classes_) != len(label_encoder.classes_):
        raise RuntimeError(
            "`clf` and `label_encoder` do not match; re-run the CSAT training cell."
        )

    # -----------------------------
    # 3. Convert Text → SBERT Embeddings
    # -----------------------------
    X_new = model.encode(
        desc_text[infer_mask].tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    # -----------------------------
    # 4. Predict Themes
    # -----------------------------
    y_new_pred = clf.predict(X_new)
    predicted_themes = label_encoder.inverse_transform(y_new_pred)

    # -----------------------------
    # 5. Replace the CSAT Theme Column IN ORIGINAL DATAFRAME
    # -----------------------------
    if THEME_COL not in df_new.columns:
        df_new[THEME_COL] = None
    # An empty theme column is read from Excel as float64; cast to object so
    # the string labels can be stored without a dtype conflict.
    df_new[THEME_COL] = df_new[THEME_COL].astype(object)
    df_new.loc[infer_mask, THEME_COL] = predicted_themes

# -----------------------------
# 6. Save Updated File
# -----------------------------
output_path = "CSAT_Updated_With_Predicted_Themes.xlsx"
df_new.to_excel(output_path, index=False)

print(f"\nFile saved successfully → {output_path}")
print("Only the Theme column was replaced. All other columns remain unchanged.")


In [None]:
#ALL_SUBCATEGORY THEME SUBPRODUCT WORKTYPE

In [None]:
# ===============================================================
# FULL PIPELINE — TRAIN + INFERENCE FOR:
# SUBCATEGORY, WORK TYPE, SUB-PRODUCT
# ===============================================================

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# ---------------------------------------------------------------
# 1. COMMON SETUP
# ---------------------------------------------------------------
# Label columns: one classifier is trained and applied per entry.
TARGETS = ["SUBCATEGORY", "Work Type", "Sub-Product"]

# Text columns that the train / inference functions below concatenate into a
# single input string per row.
DESC1 = "What are your most important reasons for giving us that score?"
ASAT_POS = "We’d love to know what the consultant did to earn such a rating?"
ASAT_IMP = "How could the consultant improve how they handled your enquiry?"

# Sentence-embedding model shared by the train / inference functions below.
model = SentenceTransformer("all-MiniLM-L6-v2")


# ==================================================================
# 2. TRAINING FUNCTION (Reusable for all 3 models)
# ==================================================================
def train_text_classifier(df, target_col):
    """Train an elastic-net logistic regression on SBERT embeddings.

    The three text columns (DESC1, ASAT_POS, ASAT_IMP) are concatenated per
    row, embedded with the module-level `model`, and used to predict
    `target_col`.

    Args:
        df: DataFrame containing the three text columns and `target_col`.
            It is not modified.
        target_col: Name of the label column to learn.

    Returns:
        (clf, le): the fitted LogisticRegression and the LabelEncoder that
        maps its integer classes back to the original labels.

    Raises:
        ValueError: if no row has both non-blank text and a label.
    """
    # Build the text in a local Series so the caller's frame does not gain a
    # "TEXT_ALL" column as a side effect. astype(str) keeps the concatenation
    # working when a cell holds a number instead of text.
    text_all = (
        df[DESC1].fillna("").astype(str) + " " +
        df[ASAT_POS].fillna("").astype(str) + " " +
        df[ASAT_IMP].fillna("").astype(str)
    ).str.strip()

    # After fillna the text is never NaN, so blank rows must be tested
    # explicitly; the inference function skips blank rows as well.
    usable = (text_all != "") & df[target_col].notna()
    if not usable.any():
        raise ValueError(f"No rows with text and a '{target_col}' label to train on.")

    # Label encode
    le = LabelEncoder()
    y = le.fit_transform(df.loc[usable, target_col].astype(str))

    # Embeddings
    X_emb = model.encode(
        text_all[usable].tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    # Train elasticnet logistic regression. The saga solver shuffles the data,
    # so a fixed random_state is needed for repeatable fits.
    clf = LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        l1_ratio=0.5,
        max_iter=600,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    )
    clf.fit(X_emb, y)

    print(f"\nModel trained for → {target_col}")
    return clf, le



# ==================================================================
# 3. INFERENCE FUNCTION (Reusable)
# ==================================================================
def predict_and_update(df_new, clf, le, target_col):
    """Predict `target_col` for every row that has text and write it back.

    The three text columns (DESC1, ASAT_POS, ASAT_IMP) are concatenated per
    row and embedded with the module-level `model`. Rows whose combined text
    is blank keep their existing `target_col` value.

    Args:
        df_new: DataFrame to update. `target_col` is modified in place (and
            created if missing); no other column is added or changed.
        clf: Fitted classifier exposing `predict`.
        le: Fitted label encoder exposing `inverse_transform`.
        target_col: Name of the column to overwrite with predictions.

    Returns:
        The same `df_new` object, for call-site convenience.
    """
    # Keep the combined text local: storing it as df_new["TEXT_ALL"] would
    # write that helper column into the saved output file. astype(str) keeps
    # the concatenation working when a cell holds a number instead of text.
    text_all = (
        df_new[DESC1].fillna("").astype(str) + " " +
        df_new[ASAT_POS].fillna("").astype(str) + " " +
        df_new[ASAT_IMP].fillna("").astype(str)
    ).str.strip()

    has_text = text_all != ""
    n_rows = int(has_text.sum())

    print(f"Rows used for inference ({target_col}):", n_rows)

    if target_col not in df_new.columns:
        df_new[target_col] = None

    # The classifier cannot predict on zero samples; leave the frame as is.
    if n_rows == 0:
        return df_new

    # Encode text
    X_new = model.encode(
        text_all[has_text].tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    # Predict
    y_pred = clf.predict(X_new)
    pred_labels = le.inverse_transform(y_pred)

    # Update only that column. An empty column is read from Excel as float64,
    # so cast to object before storing string labels.
    df_new[target_col] = df_new[target_col].astype(object)
    df_new.loc[has_text, target_col] = pred_labels

    print(f"{target_col} updated successfully.")

    return df_new



# ==================================================================
# 4. TRAIN ALL 3 MODELS (SUBCATEGORY, WORK TYPE, SUB-PRODUCT)
# ==================================================================
train_path = r"training_file.xlsx"     # <-- your May–Oct file
df_train_all = pd.read_excel(train_path)

# Maps each target column name to its (classifier, label encoder) pair.
trained_models = {}

for tgt in TARGETS:
    clf, le = train_text_classifier(df_train_all, tgt)
    trained_models[tgt] = (clf, le)


# ==================================================================
# 5. RUN INFERENCE ON NEW DATA (NOVEMBER FILE)
# ==================================================================
input_path  = r"your_november_file.xlsx"
output_path = r"Updated_All_3_Columns.xlsx"

df_new = pd.read_excel(input_path)

# Apply each trained model in the same order as TARGETS (dicts keep
# insertion order), overwriting one column per pass.
for tgt, (clf, le) in trained_models.items():
    df_new = predict_and_update(df_new, clf, le, tgt)

# Write the frame with all three columns updated.
df_new.to_excel(output_path, index=False)

# Summary banner, emitted as a single write.
banner_lines = [
    "\n===================================================",
    "ALL 3 COLUMNS UPDATED SUCCESSFULLY:",
    " → SUBCATEGORY",
    " → WORK TYPE",
    " → SUB-PRODUCT",
    f"File saved as: {output_path}",
    "===================================================\n",
]
print("\n".join(banner_lines))


In [None]:
# ===============================================================
# INFERENCE BLOCK — UPDATE CSAT THEME + ASAT THEME
# ONLY SCORE <= 6 ROWS ARE UPDATED
# ===============================================================

import pandas as pd
import numpy as np

# Requires objects created by the CSAT/ASAT training cell:
#   sbert, clf_csat, clf_asat, le_csat, le_asat
# and its column-name constants:
#   CSAT_DESC, ASAT_DESC, ASAT_GOOD, CSAT_SCORE, ASAT_SCORE, CSAT_THEME, ASAT_THEME

# -----------------------------
# 1. LOAD NEW FILE
# -----------------------------
input_path = r"your_new_file.xlsx"   # <-- change this
output_path = r"Updated_CSAT_ASAT.xlsx"

df_new = pd.read_excel(input_path)

# Derive the model inputs as separate Series instead of overwriting columns of
# df_new: df_new is saved at the end, and overwriting would write missing
# answers as the text "nan" and replace non-numeric scores with NaN.
# fillna("") comes before astype(str) so missing answers become "" (and are
# filtered out) rather than the literal "nan".
csat_text = df_new[CSAT_DESC].fillna("").astype(str)
asat_text = (
    df_new[ASAT_GOOD].fillna("").astype(str) + " " +
    df_new[ASAT_DESC].fillna("").astype(str)
).str.strip()

csat_score = pd.to_numeric(df_new[CSAT_SCORE], errors="coerce")
asat_score = pd.to_numeric(df_new[ASAT_SCORE], errors="coerce")

# -----------------------------
# 2. CSAT INFERENCE (<= 6)
# -----------------------------
csat_mask = (csat_score <= 6) & (csat_text.str.strip() != "")
df_new_csat = df_new[csat_mask].copy()

print("Rows used for CSAT inference:", df_new_csat.shape[0])

if df_new_csat.shape[0] > 0:
    X_new_csat = sbert.encode(
        csat_text[csat_mask].tolist(),
        batch_size=32,
        show_progress_bar=True
    )
    preds_csat = clf_csat.predict(X_new_csat)
    if CSAT_THEME not in df_new.columns:
        df_new[CSAT_THEME] = None
    # An empty theme column is read from Excel as float64; cast to object so
    # string labels can be stored without a dtype conflict.
    df_new[CSAT_THEME] = df_new[CSAT_THEME].astype(object)
    df_new.loc[csat_mask, CSAT_THEME] = le_csat.inverse_transform(preds_csat)

# -----------------------------
# 3. ASAT INFERENCE (<= 6)
# -----------------------------
asat_mask = (asat_score <= 6) & (asat_text != "")
df_new_asat = df_new[asat_mask].copy()
df_new_asat["ASAT_TEXT"] = asat_text[asat_mask]

print("Rows used for ASAT inference:", df_new_asat.shape[0])

if df_new_asat.shape[0] > 0:
    X_new_asat = sbert.encode(
        df_new_asat["ASAT_TEXT"].tolist(),
        batch_size=32,
        show_progress_bar=True
    )
    preds_asat = clf_asat.predict(X_new_asat)
    if ASAT_THEME not in df_new.columns:
        df_new[ASAT_THEME] = None
    df_new[ASAT_THEME] = df_new[ASAT_THEME].astype(object)
    df_new.loc[asat_mask, ASAT_THEME] = le_asat.inverse_transform(preds_asat)

# -----------------------------
# 4. SAVE UPDATED FILE
# -----------------------------
df_new.to_excel(output_path, index=False)

print("\n====================================")
print("UPDATED FILE SAVED SUCCESSFULLY:")
print(" → CSAT Theme (Only score ≤ 6 rows)")
print(" → ASAT Theme (Only score ≤ 6 rows)")
print("Path:", output_path)
print("====================================\n")


In [None]:
# SUBCATEGORY AND OTHER COLUMNS

In [None]:
# ===============================================================
# FINAL PIPELINE — TRAIN + INFERENCE FOR:
# SUBCATEGORY, WORK TYPE, SUB-PRODUCT
# (Independent of CSAT/ASAT, No Score Filters)
# ===============================================================

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# ---------------------------------------------------------------
# 1. COMMON SETUP
# ---------------------------------------------------------------
# Label columns: one classifier is trained and applied per entry.
TARGETS = ["SUBCATEGORY", "Work Type", "Sub-Product"]

# Text columns that the train / inference functions below concatenate into a
# single input string per row.
DESC1 = "What are your most important reasons for giving us that score?"
ASAT_POS = "We’d love to know what the consultant did to earn such a rating?"
ASAT_IMP = "How could the consultant improve how they handled your enquiry?"

# Sentence-embedding model, instantiated once and shared by both functions.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")


# ==================================================================
# 2. TRAINING FUNCTION (Reusable for all 3 models)
# ==================================================================
def train_text_classifier(df, target_col):
    """Train an elastic-net logistic regression on SBERT embeddings.

    The three text columns (DESC1, ASAT_POS, ASAT_IMP) are concatenated per
    row, embedded with the module-level `sbert_model`, and used to predict
    `target_col`.

    Args:
        df: DataFrame containing the three text columns and `target_col`.
            It is not modified.
        target_col: Name of the label column to learn.

    Returns:
        (clf, le): the fitted LogisticRegression and the LabelEncoder that
        maps its integer classes back to the original labels.

    Raises:
        ValueError: if no row has both non-blank text and a label.
    """
    # Build combined text in a local Series so the caller's frame does not
    # gain a "TEXT_ALL" column as a side effect. astype(str) keeps the
    # concatenation working when a cell holds a number instead of text.
    text_all = (
        df[DESC1].fillna("").astype(str) + " " +
        df[ASAT_POS].fillna("").astype(str) + " " +
        df[ASAT_IMP].fillna("").astype(str)
    ).str.strip()

    # Use only rows where the target is available AND there is text. After
    # fillna the text is never NaN, so blank rows must be tested explicitly;
    # the inference function skips blank rows as well.
    usable = (text_all != "") & df[target_col].notna()
    if not usable.any():
        raise ValueError(f"No rows with text and a '{target_col}' label to train on.")

    # Create label encoder
    le = LabelEncoder()
    y = le.fit_transform(df.loc[usable, target_col].astype(str))

    # SBERT embeddings
    X_emb = sbert_model.encode(
        text_all[usable].tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    # Model. The saga solver shuffles the data, so a fixed random_state is
    # needed for repeatable fits.
    clf = LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        l1_ratio=0.5,
        max_iter=600,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    )
    clf.fit(X_emb, y)

    print(f"[TRAINED] {target_col}")

    return clf, le



# ==================================================================
# 3. INFERENCE FUNCTION (Reusable)
# ==================================================================
def predict_and_update(df_new, clf, le, target_col):
    """Predict `target_col` for every row that has text and write it back.

    The three text columns (DESC1, ASAT_POS, ASAT_IMP) are concatenated per
    row and embedded with the module-level `sbert_model`. Rows whose combined
    text is blank keep their existing `target_col` value.

    Args:
        df_new: DataFrame to update. `target_col` is modified in place (and
            created if missing); no other column is added or changed.
        clf: Fitted classifier exposing `predict`.
        le: Fitted label encoder exposing `inverse_transform`.
        target_col: Name of the column to overwrite with predictions.

    Returns:
        The same `df_new` object, for call-site convenience.
    """
    # Build text for inference in a local Series: storing it as
    # df_new["TEXT_ALL"] would write that helper column into the saved output
    # file. astype(str) keeps the concatenation working when a cell holds a
    # number instead of text.
    text_all = (
        df_new[DESC1].fillna("").astype(str) + " " +
        df_new[ASAT_POS].fillna("").astype(str) + " " +
        df_new[ASAT_IMP].fillna("").astype(str)
    ).str.strip()

    has_text = text_all != ""
    n_rows = int(has_text.sum())

    print(f"Rows used for inference ({target_col}): {n_rows}")

    # Ensure column exists --> CREATE if missing
    if target_col not in df_new.columns:
        df_new[target_col] = None

    # The classifier cannot predict on zero samples; leave the frame as is.
    if n_rows == 0:
        return df_new

    # Embeddings
    X_new = sbert_model.encode(
        text_all[has_text].tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    # Predictions
    preds = le.inverse_transform(clf.predict(X_new))

    # Update predictions. An empty column is read from Excel as float64, so
    # cast to object before storing string labels.
    df_new[target_col] = df_new[target_col].astype(object)
    df_new.loc[has_text, target_col] = preds

    print(f"[UPDATED] {target_col}")

    return df_new



# ==================================================================
# 4. TRAIN ALL 3 MODELS (Using Master Training File)
# ==================================================================
train_path = r"training_file.xlsx"       # <-- Your May–Oct training file
df_train_all = pd.read_excel(train_path)

# Maps each target column name to its (classifier, label encoder) pair.
trained_models = {}

for tgt in TARGETS:
    clf, le = train_text_classifier(df_train_all, tgt)
    trained_models[tgt] = (clf, le)



# ==================================================================
# 5. INFERENCE ON UPDATED FILE (Generated Earlier by CSAT/ASAT)
# ==================================================================
# Must be the file written by the CSAT/ASAT inference block, which saves to
# "Updated_CSAT_ASAT.xlsx"; the previous value "CSAT_ASAT_Updated.xlsx" did
# not match that output name.
updated_csat_asat_file = r"Updated_CSAT_ASAT.xlsx"   # <-- Output of previous block
final_output_path      = r"FINAL_All_Columns_Updated.xlsx"

df_final = pd.read_excel(updated_csat_asat_file)

# Predict + Update 3 columns, one trained model per target.
for tgt in TARGETS:
    clf, le = trained_models[tgt]
    df_final = predict_and_update(df_final, clf, le, tgt)

# Save final file
df_final.to_excel(final_output_path, index=False)

print("\n===================================================")
print("FINAL OUTPUT FILE CREATED SUCCESSFULLY")
print(" → SUBCATEGORY")
print(" → WORK TYPE")
print(" → SUB-PRODUCT")
print("CSAT + ASAT THEMES already updated earlier.")
print("Saved File:", final_output_path)
print("===================================================\n")
