In [3]:
# callsense_pipeline_tfidf_fixed.py
import os
import json
import yaml
import h5py
import joblib
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb

from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download("vader_lexicon", quiet=True)

# -------- CONFIG --------
# Root folder for the input CSV and every generated artifact.
# The original machine-specific location remains the default; set the
# CALLSENSE_BASE_DIR environment variable to run the pipeline elsewhere
# without editing this file.
_DEFAULT_BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Customer Support Call Sentiment & Escalation Predictor"
BASE_DIR = os.environ.get("CALLSENSE_BASE_DIR") or _DEFAULT_BASE_DIR

# Input: raw transcripts with their sentiment labels.
DATA_CSV = os.path.join(BASE_DIR, "archive", "customer_call_transcriptions.csv")

# Outputs: serialized models, feature dump, reports and plots.
OUT_SENTIMENT_MODEL = os.path.join(BASE_DIR, "sentiment_model.pkl")
OUT_ESCALATION_MODEL = os.path.join(BASE_DIR, "escalation_model.pkl")
OUT_H5 = os.path.join(BASE_DIR, "processed_calls.h5")
OUT_JSON = os.path.join(BASE_DIR, "insights.json")
OUT_YAML = os.path.join(BASE_DIR, "build_metadata.yaml")
OUT_PRED = os.path.join(BASE_DIR, "predictions.csv")
VISUAL_DIR = os.path.join(BASE_DIR, "visuals")

# Creating the plot folder also creates BASE_DIR itself when it is missing.
os.makedirs(VISUAL_DIR, exist_ok=True)

# -------- LOAD DATA --------
# Read the raw transcript CSV and echo its header so a schema mismatch is
# visible before any processing starts.
print("[INFO] Loading dataset...")
df = pd.read_csv(DATA_CSV)
raw_columns = df.columns.tolist()
print("[INFO] Raw columns:", raw_columns)

# -------- SYNTHETIC COLUMNS --------
# The raw CSV only carries the transcript text and a sentiment label; the
# remaining call metadata is fabricated here so downstream steps have
# something to group and report on.
df = df.rename(columns={"text": "transcript", "sentiment_label": "sentiment"})

# rename() silently ignores absent source columns, so fail early with a clear
# message instead of a bare KeyError further down.
_missing_cols = [c for c in ("transcript", "sentiment") if c not in df.columns]
if _missing_cols:
    raise KeyError(
        f"Input CSV is missing required column(s) {_missing_cols}; "
        f"found {list(df.columns)}"
    )

n_calls = len(df)
df["call_id"] = [f"call_{i}" for i in range(n_calls)]
df["customer_id"] = [f"cust_{i % 50}" for i in range(n_calls)]   # 50 synthetic customers, round-robin
df["agent_id"] = [f"agent_{i % 10}" for i in range(n_calls)]     # 10 synthetic agents, round-robin

# Seeded generator so the fabricated durations (written to predictions.csv)
# are reproducible, matching the random_state=42 used for the splits.
_rng = np.random.default_rng(42)
df["duration"] = _rng.integers(60, 600, size=n_calls)  # seconds, upper bound exclusive

# Escalation is defined purely from the sentiment label: every "negative"
# call is marked escalated, everything else resolved.
df["resolution_status"] = np.where(df["sentiment"] == "negative", "escalated", "resolved")
df["escalation"] = (df["resolution_status"] == "escalated").astype("int64")

# -------- FEATURES (TF-IDF + Lexicon) --------
# Missing or non-string transcripts would make both the vectorizer and the
# VADER analyzer raise, so coerce them to (possibly empty) strings once and
# feed the same cleaned text to both feature extractors.
texts = df["transcript"].fillna("").astype(str)

print("[INFO] Building TF-IDF features...")
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
X_tfidf = tfidf.fit_transform(texts).toarray()  # dense: concatenated with lexicon scores below

print("[INFO] Adding lexicon sentiment features...")
sia = SentimentIntensityAnalyzer()
# Pick the VADER scores by name so the column order of the feature matrix
# does not depend on the iteration order of the returned dict.
LEXICON_KEYS = ("neg", "neu", "pos", "compound")
lex = np.array(
    [[scores[key] for key in LEXICON_KEYS] for scores in map(sia.polarity_scores, texts)],
    dtype=float,
).reshape(-1, len(LEXICON_KEYS))

# Final design matrix: TF-IDF columns followed by the four lexicon scores.
X = np.concatenate([X_tfidf, lex], axis=1)

# -------- LABELS --------
# Sentiment strings become integer class ids (the encoder is kept so
# predictions can be mapped back); escalation is already a 0/1 flag.
le_sent = LabelEncoder().fit(df["sentiment"])
y_sent = le_sent.transform(df["sentiment"])
y_esc = df["escalation"].to_numpy(dtype=int)

# -------- TRAIN SENTIMENT MODEL --------
print("[INFO] Training Sentiment Classifier...")

# Stratified 80/20 hold-out with a fixed seed.
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y_sent,
    test_size=0.2,
    random_state=42,
    stratify=y_sent,
)

clf_sent = LogisticRegression(max_iter=2000)
clf_sent.fit(Xtr, ytr)
y_pred = clf_sent.predict(Xte)

# Report only the classes that occur in the hold-out set, using their
# original string names.
labels = sorted(np.unique(yte))
sentiment_report = classification_report(
    yte,
    y_pred,
    labels=labels,
    target_names=le_sent.classes_[labels],
)
print("[RESULT] Sentiment Report:\n", sentiment_report)

# Persist classifier, label encoder and vectorizer together in one file.
joblib.dump((clf_sent, le_sent, tfidf), OUT_SENTIMENT_MODEL)

# -------- TRAIN ESCALATION MODEL --------
print("[INFO] Training Escalation Risk Model...")

# Same stratified 80/20 split recipe as above, now against the 0/1
# escalation flag. The split variables are rebound on purpose: the
# confusion-matrix plot later reads yte / y_pred from this section.
esc_split = train_test_split(X, y_esc, test_size=0.2, random_state=42, stratify=y_esc)
Xtr, Xte, ytr, yte = esc_split

clf_esc = lgb.LGBMClassifier()
clf_esc.fit(Xtr, ytr)
y_pred = clf_esc.predict(Xte)

escalation_report = classification_report(yte, y_pred)
print("[RESULT] Escalation Report:\n", escalation_report)

joblib.dump(clf_esc, OUT_ESCALATION_MODEL)

# -------- PREDICTIONS --------
# Score every row (training rows included) with both fitted models and
# export the enriched table.
print("[INFO] Running inference on full dataset...")
sent_preds = clf_sent.predict(X)
sent_probs = clf_sent.predict_proba(X).max(axis=1)  # probability of the winning class
esc_probs = clf_esc.predict_proba(X)[:, 1]          # probability of class 1 (escalated)

# assign() works on a copy, so the source frame is left untouched.
df_out = df.assign(
    sentiment_pred=le_sent.inverse_transform(sent_preds),
    sentiment_conf=sent_probs,
    escalation_prob=esc_probs,
)
df_out.to_csv(OUT_PRED, index=False)

# -------- INSIGHTS --------
# Aggregate the per-call predictions into a small JSON summary.
print("[INFO] Generating insights...")

customer_risk = df_out.groupby("customer_id")["escalation_prob"].mean()
top_customers = customer_risk.sort_values(ascending=False).head(5)  # five highest mean risks
agent_risk = df_out.groupby("agent_id")["escalation_prob"].mean()
sentiment_counts = df_out["sentiment_pred"].value_counts()

insights = {
    "top_escalation_customers": top_customers.to_dict(),
    "agent_escalation_rates": agent_risk.to_dict(),
    "sentiment_distribution": sentiment_counts.to_dict(),
}

with open(OUT_JSON, "w") as fh:
    json.dump(insights, fh, indent=2)

# -------- VISUALS --------
# Each plot is drawn on its own figure, saved into VISUAL_DIR and closed.
print("[INFO] Plotting visuals...")

# 1) Confusion matrix of the escalation model on its hold-out split
#    (yte / y_pred are the values bound in the escalation training section).
cm = confusion_matrix(yte, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Escalation Confusion Matrix")
fig.savefig(os.path.join(VISUAL_DIR, "escalation_confusion_matrix.png"))
plt.close(fig)

# 2) Mean predicted escalation probability per agent.
pivot = df_out.pivot_table(index="agent_id", values="escalation_prob", aggfunc="mean")
fig, ax = plt.subplots()
sns.heatmap(pivot, annot=True, cmap="Reds", ax=ax)
ax.set_title("Escalation Risk Heatmap (by agent)")
fig.savefig(os.path.join(VISUAL_DIR, "escalation_risk_heatmap.png"))
plt.close(fig)

# 3) Share of calls predicted "negative" within consecutive buckets of ten
#    rows (bucket id = row index // 10).
is_negative = df_out["sentiment_pred"].eq("negative")
sent_trend = is_negative.groupby(df_out.index // 10).mean()
fig, ax = plt.subplots()
sent_trend.plot(ax=ax)
ax.set_title("Sentiment Trend (chunks of 10 calls)")
ax.set_ylabel("Fraction Negative")
fig.savefig(os.path.join(VISUAL_DIR, "sentiment_trends.png"))
plt.close(fig)

# -------- SAVE H5 --------
# Dump the feature matrix and both label vectors into one HDF5 file,
# overwriting any previous run's output.
print("[INFO] Saving H5...")
h5_payload = {
    "features": X,
    "sentiment_labels": y_sent,
    "escalation_labels": y_esc,
}
with h5py.File(OUT_H5, "w") as h5_file:
    for dataset_name, values in h5_payload.items():
        h5_file.create_dataset(dataset_name, data=values)

# -------- SAVE YAML --------
# Record where the run read its data from and where each artifact landed.
print("[INFO] Saving metadata YAML...")

model_paths = {
    "sentiment_model": OUT_SENTIMENT_MODEL,
    "escalation_model": OUT_ESCALATION_MODEL,
}
artifact_paths = {
    "predictions": OUT_PRED,
    "insights": OUT_JSON,
    "visuals": VISUAL_DIR,
}
meta = {
    "dataset": DATA_CSV,
    "records": len(df),
    "features_dim": X.shape[1],
    "models": model_paths,
    "artifacts": artifact_paths,
}

with open(OUT_YAML, "w") as fh:
    yaml.dump(meta, fh)

print("[DONE] All artifacts saved to", BASE_DIR)


[INFO] Loading dataset...
[INFO] Raw columns: ['text', 'sentiment_label']
[INFO] Building TF-IDF features...
[INFO] Adding lexicon sentiment features...
[INFO] Training Sentiment Classifier...
[RESULT] Sentiment Report:
               precision    recall  f1-score   support

    negative       1.00      0.78      0.88         9
     neutral       0.86      1.00      0.92        12

    accuracy                           0.90        21
   macro avg       0.93      0.89      0.90        21
weighted avg       0.92      0.90      0.90        21

[INFO] Training Escalation Risk Model...
[LightGBM] [Info] Number of positive: 34, number of negative: 47
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000325 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 81, number of used features: 