In [19]:
!pip install -q pandas numpy sentence-transformers faiss-cpu nltk scikit-learn

In [20]:
import json
import numpy as np
import pandas as pd
import faiss
import nltk
from sentence_transformers import SentenceTransformer
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [21]:
with open("Conversational_Transcript_Dataset.json") as f:
    data = json.load(f)

print("Total transcripts:", len(data["transcripts"]))

Total transcripts: 5037


In [22]:
records = []

for t in data["transcripts"]:
    text = ""
    for turn in t["conversation"]:
        text += f"{turn['speaker']}: {turn['text']} "

    records.append({
        "transcript_id": t["transcript_id"],
        "intent": t["intent"],
        "text": text
    })

df = pd.DataFrame(records)
df.head()

Unnamed: 0,transcript_id,intent,text
0,6794-8660-4606-3216,Delivery Investigation,"Agent: Hello, thank you for contacting BuyNow...."
1,7034-5430-2980-5483,Escalation - Repeated Service Failures,Agent: Thank you for calling MedicalGroup. Thi...
2,1846-5500-2990-8975,Fraud Alert Investigation,Agent: Thank you for calling ProtectPlus Fraud...
3,1616-8531-3291-5075,Fraud Alert Investigation,Agent: Thank you for calling SecureBank Fraud ...
4,7441-4348-3458-2384,Account Access Issues,Agent: Thank you for calling DataLink. This is...


In [23]:
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df["text"].tolist(), show_progress_bar=True)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batches:   0%|          | 0/158 [00:00<?, ?it/s]

In [24]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

In [25]:
def retrieve(query, k=4):
    q_emb = model.encode([query])
    _, idx = index.search(q_emb, k)
    return df.iloc[idx[0]]

In [26]:
complaint_keywords = [
    "delay", "problem", "issue", "again", "still",
    "not received", "waiting", "angry", "frustrated"
]

def extract_causal_signals(text):
    signals = {"repetition":0, "frustration":0, "agent_delay":0}

    for kw in complaint_keywords:
        signals["repetition"] += text.lower().count(kw)

    for segment in text.split("Customer:")[1:]:
        if sia.polarity_scores(segment)["compound"] < -0.3:
            signals["frustration"] += 1

    signals["agent_delay"] = max(0, text.count("Agent:") - text.count("Customer:"))
    return signals

In [27]:
def extract_evidence(text):
    evidence = []
    turns = text.split("Agent:")

    for t in turns:
        if "Customer:" in t:
            if sia.polarity_scores(t)["compound"] < -0.3:
                evidence.append("Customer:" + t.strip()[:200])

    for t in turns[:2]:
        if t.strip():
            evidence.append("Agent:" + t.strip()[:200])

    return evidence[:5]

In [28]:
def causal_explanation(query):
    results = retrieve(query)

    aggregate = {"repetition":0, "frustration":0, "agent_delay":0}

    for _, r in results.iterrows():
        sig = extract_causal_signals(r["text"])
        for k in aggregate:
            aggregate[k] += sig[k]

    explanation = []
    explanation.append("WHY THIS EVENT OCCURS:\n")

    if aggregate["repetition"] > 3:
        explanation.append("• Repeated unresolved customer issues")
    if aggregate["frustration"] > 2:
        explanation.append("• Escalating customer frustration")
    if aggregate["agent_delay"] > 2:
        explanation.append("• Delays or lack of resolution by agents")

    explanation.append("\nCAUSAL TRACE:")
    explanation.append("Repeated issue → Frustration → Escalation\n")

    explanation.append("EVIDENCE:\n")
    for _, r in results.iterrows():
        explanation.append(f"Transcript ID: {r['transcript_id']}")
        for e in extract_evidence(r["text"]):
            explanation.append(e)
        explanation.append("-"*40)

    explanation.append(f"\nConfidence: Supported by {len(results)} similar transcripts")

    return "\n".join(explanation)

In [29]:
print("\n" + "="*80)
print("TASK 1 OUTPUT — CAUSAL EXPLANATION")
print("="*80)

task1_query = "Why do customers escalate complaints?"
print(causal_explanation(task1_query))

print("\n" + "="*80)
print("END OF TASK 1")
print("="*80)


TASK 1 OUTPUT — CAUSAL EXPLANATION
WHY THIS EVENT OCCURS:

• Repeated unresolved customer issues
• Escalating customer frustration
• Delays or lack of resolution by agents

CAUSAL TRACE:
Repeated issue → Frustration → Escalation

EVIDENCE:

Transcript ID: 7461-4984-1291-5153
Customer:Thank you for calling ShopEasy. This is Olivia. How may I assist you today? Customer: I need to speak with a manager immediately. This is the three time you've sent me a damaged product. This is compl
Customer:I'm very sorry to hear about this issue. I'd be happy to help you resolve this right away. Can you tell me more about what happened? Customer: What happened is that your company keeps sending damaged 
Agent:Thank you for calling ShopEasy. This is Olivia. How may I assist you today? Customer: I need to speak with a manager immediately. This is the three time you've sent me a damaged product. This is compl
----------------------------------------
Transcript ID: 2506-1813-2516-9541
Customer:Thank you f

In [30]:
memory = {
    "last_query": None,
    "last_results": None
}

In [31]:
def is_followup(query):

    if memory["last_query"] is None:
        return False

    v1 = model.encode([query])
    v2 = model.encode([memory["last_query"]])

    similarity = float(np.dot(v1, v2.T))

    return similarity > 0.7

In [32]:
import warnings
warnings.filterwarnings("ignore")

def format_chat_response(query, explanation, is_followup=False):

    lines = explanation.split("\n")

    print("\n" + "="*80)

    if not is_followup:
        print("ASSISTANT:")
        print(f"I understand your question:\n→ \"{query}\"\n")
        print("Here’s what’s happening based on similar conversations:\n")
    else:
        print("ASSISTANT:")
        print("Good follow-up question.\n")
        print("Looking at similar past cases, this issue keeps happening because:\n")

    for line in lines:
        if line.startswith("•") or "WHY THIS EVENT OCCURS" in line:
            print(line)

    print("\nCAUSAL FLOW:")
    for line in lines:
        if "→" in line:
            print(line)

    print("\nSUPPORTING EXAMPLES:")
    shown = 0

    for line in lines:
        if line.startswith("Transcript ID"):
            print(line)

        if line.startswith("Customer:") or line.startswith("Agent:"):
            print(line)
            shown += 1

        if is_followup and shown >= 3:
            break

    print("\n" + "="*80)

In [33]:
def chat(query):

    followup = is_followup(query)

    if followup:
        results = memory["last_results"]
    else:
        results = retrieve(query)
        memory["last_query"] = query
        memory["last_results"] = results

    explanation = causal_explanation(query)

    format_chat_response(
        query=query,
        explanation=explanation,
        is_followup=followup
    )

In [34]:
query_df = pd.read_csv("query_dataset.csv")

print("\nTotal Queries Loaded:", len(query_df))
query_df.head()


Total Queries Loaded: 10


Unnamed: 0,query_id,query_text,expected_intent,task_type,is_followup,notes
0,Q1,Why do customers escalate complaints?,complaint_escalation,task1,no,Primary causal explanation query
1,Q2,What causes repeated customer complaints?,complaint_escalation,task1,no,Checks repetition signal
2,Q3,Why are customers asking for managers so often?,complaint_escalation,task1,no,Escalation trigger
3,Q4,Why do customers escalate complaints?,complaint_escalation,task2,no,Initial chat query
4,Q5,Why does this happen so frequently?,complaint_escalation,task2,yes,Follow-up relies on memory


In [35]:
for _, row in query_df.iterrows():

    print("\n" + "="*80)
    print(f"QUERY ID: {row['query_id']}")
    print(f"QUERY: {row['query_text']}")

    if row["task_type"] == "task1":

        print("\n[TASK 1 OUTPUT]")
        print(causal_explanation(row["query_text"]))

    elif row["task_type"] == "task2":

        print("\n[TASK 2 OUTPUT]")
        chat(row["query_text"])


QUERY ID: Q1
QUERY: Why do customers escalate complaints?

[TASK 1 OUTPUT]
WHY THIS EVENT OCCURS:

• Repeated unresolved customer issues
• Escalating customer frustration
• Delays or lack of resolution by agents

CAUSAL TRACE:
Repeated issue → Frustration → Escalation

EVIDENCE:

Transcript ID: 7461-4984-1291-5153
Customer:Thank you for calling ShopEasy. This is Olivia. How may I assist you today? Customer: I need to speak with a manager immediately. This is the three time you've sent me a damaged product. This is compl
Customer:I'm very sorry to hear about this issue. I'd be happy to help you resolve this right away. Can you tell me more about what happened? Customer: What happened is that your company keeps sending damaged 
Agent:Thank you for calling ShopEasy. This is Olivia. How may I assist you today? Customer: I need to speak with a manager immediately. This is the three time you've sent me a damaged product. This is compl
----------------------------------------
Transcript ID: 