In [1]:
import pandas as pd

In [2]:
#load dataset
df = pd.read_csv("structured_dataset.csv")
df.head()


Unnamed: 0,transcript_id,time,domain,intent,reason,turn_id,speaker,text
0,6794-8660-4606-3216,2025-10-03 20:22:00,E-commerce & Retail,Delivery Investigation,Customer James Bailey reported a smart watch s...,0,Agent,"Hello, thank you for contacting BuyNow. This i..."
1,6794-8660-4606-3216,2025-10-03 20:22:00,E-commerce & Retail,Delivery Investigation,Customer James Bailey reported a smart watch s...,1,Customer,"Hello, I'm calling about an order that shows d..."
2,6794-8660-4606-3216,2025-10-03 20:22:00,E-commerce & Retail,Delivery Investigation,Customer James Bailey reported a smart watch s...,2,Agent,I'm sorry to hear that. I'll definitely help y...
3,6794-8660-4606-3216,2025-10-03 20:22:00,E-commerce & Retail,Delivery Investigation,Customer James Bailey reported a smart watch s...,3,Customer,It's 9595912. The tracking was marked delivere...
4,6794-8660-4606-3216,2025-10-03 20:22:00,E-commerce & Retail,Delivery Investigation,Customer James Bailey reported a smart watch s...,4,Agent,"Let me pull that up right away. Okay, I see th..."


In [3]:
df = df[df["speaker"].isin(["Agent", "Customer"])]

In [4]:
#sort the dataset
df = df.sort_values(by=["transcript_id", "turn_id"])

In [5]:
#group the conversation
grouped = df.groupby("transcript_id")


In [6]:
#building conversation text
def build_conversation(group):
    turns = []
    for _, row in group.iterrows():
        turn = f"{row['speaker']}: {row['text']}"
        turns.append(turn)
    return " [SEP] ".join(turns)

conversations = grouped.apply(build_conversation)
labels = grouped["reason"].first()


  conversations = grouped.apply(build_conversation)


In [7]:
print(conversations.iloc[0])


Agent: City Medical Center, this is Rebecca speaking. How can I help you? [SEP] Customer: Yes, I'm at your clinic right now for my 2 o'clock appointment with Dr. Harrison, but the front desk says I'm not in the system. [SEP] Agent: I'm very sorry to hear that. Let me look into this right away. Can you give me your full name and date of birth? [SEP] Customer: Christopher Lee, date of birth April 3rd, 1975. [SEP] Agent: Thank you. When did you schedule this appointment? [SEP] Customer: I called about three weeks ago and spoke with someone. They confirmed my appointment for today at 2 PM. [SEP] Agent: I see you in our patient database, so you're definitely established with us. Let me check our scheduling log... This is odd. I can see that a reminder call was placed to you yesterday about this appointment. [SEP] Customer: Yes, I received that reminder. That's why I'm here. So what's the problem? [SEP] Agent: The appointment is showing in our reminder system but not in Dr. Harrison's actual

In [8]:
def map_reason_to_outcome(reason):
    r = reason.lower()

    if "refund" in r:
        return "REFUND"
    elif "escalat" in r or "supervisor" in r:
        return "ESCALATION"
    elif "cancel" in r:
        return "CANCELLATION"
    elif "delay" in r or "wait" in r:
        return "DELAY"
    else:
        return "RESOLVED"


In [9]:
outcomes = labels.apply(map_reason_to_outcome)

In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(outcomes)

In [11]:
pd.Series(outcomes).value_counts()

Unnamed: 0_level_0,count
reason,Unnamed: 1_level_1
RESOLVED,4010
ESCALATION,518
CANCELLATION,295
DELAY,204
REFUND,10


In [12]:
from sklearn.model_selection import train_test_split
X = conversations.tolist()

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [13]:
import numpy as np
print(np.bincount(y_train))
print(np.bincount(y_val))

[ 236  163  414    8 3208]
[ 59  41 104   2 802]


In [14]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [15]:
def tokenize_texts(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

In [16]:
train_encodings = tokenize_texts(X_train)
val_encodings = tokenize_texts(X_val)

In [17]:
print(train_encodings.keys())

KeysView({'input_ids': tensor([[ 101, 4005, 1024,  ...,    0,    0,    0],
        [ 101, 4005, 1024,  ..., 2488, 1029,  102],
        [ 101, 4005, 1024,  ...,    0,    0,    0],
        ...,
        [ 101, 4005, 1024,  ..., 3531, 2079,  102],
        [ 101, 4005, 1024,  ...,    0,    0,    0],
        [ 101, 4005, 1024,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


In [18]:
import torch

class ConversationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [19]:
train_dataset = ConversationDataset(train_encodings, y_train)
val_dataset = ConversationDataset(val_encodings, y_val)

In [20]:
sample = train_dataset[0]
for k, v in sample.items():
    print(k, v.shape)

input_ids torch.Size([512])
token_type_ids torch.Size([512])
attention_mask torch.Size([512])
labels torch.Size([])


In [21]:
from transformers import BertForSequenceClassification

num_labels = len(label_encoder.classes_)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels
)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [22]:
import os
os.environ["TENSORBOARD_LOGGING_DIR"] = "./logs"

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

In [23]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

In [24]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.007048,0.00475,0.998016,0.798095
2,0.00104,0.000958,1.0,1.0
3,0.000866,0.0007,1.0,1.0


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=1512, training_loss=0.04799759399796289, metrics={'train_runtime': 1375.4067, 'train_samples_per_second': 8.788, 'train_steps_per_second': 1.099, 'total_flos': 3180308987962368.0, 'train_loss': 0.04799759399796289, 'epoch': 3.0})

In [26]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.0007003596401773393, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_runtime': 30.1791, 'eval_samples_per_second': 33.401, 'eval_steps_per_second': 4.175, 'epoch': 3.0}


In [27]:
#stage 2

In [28]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


In [29]:
import torch
import torch.nn.functional as F

# detect device from model
device = next(model.parameters()).device

def predict_proba(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )

    # move inputs to same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)

    return probs.squeeze().cpu()  # move back to CPU for safety

In [30]:
def split_turns(conversation_text):
    return conversation_text.split(" [SEP] ")


In [31]:
def compute_turn_importance(conversation_text, true_label_idx):
    turns = split_turns(conversation_text)

    full_probs = predict_proba(conversation_text)
    full_confidence = full_probs[true_label_idx].item()

    turn_scores = []

    for i, turn in enumerate(turns):
        reduced_turns = turns[:i] + turns[i+1:]
        reduced_text = " [SEP] ".join(reduced_turns)

        reduced_probs = predict_proba(reduced_text)
        reduced_confidence = reduced_probs[true_label_idx].item()

        importance = full_confidence - reduced_confidence

        turn_scores.append({
            "turn_id": i,
            "text": turn,
            "importance": importance
        })

    return sorted(turn_scores, key=lambda x: x["importance"], reverse=True)


In [32]:
sample_text = X_val[0]
sample_label = y_val[0]

important_turns = compute_turn_importance(sample_text, sample_label)

In [33]:
for t in important_turns[:5]:
    print(f"Importance: {t['importance']:.4f}")
    print(t['text'])
    print("-" * 50)

Importance: -0.0000
Customer: I appreciate that. This is the first time I've had this issue with your company.
--------------------------------------------------
Importance: -0.0000
Customer: Yes, but it's not there. I've checked everywhere. Could it have been delivered to the wrong address?
--------------------------------------------------
Importance: -0.0000
Customer: It's 1943039. The tracking shows delivered at 2:15 PM, but there was nothing at my door.
--------------------------------------------------
Importance: -0.0000
Customer: I've checked with my neighbors and looked at my security camera footage. Nothing shows a delivery. I've been home all day too.
--------------------------------------------------
Importance: -0.0000
Agent: That's definitely concerning. The carrier's note says it was left at the garage. Is that your usual delivery location?
--------------------------------------------------


In [34]:
#stage 3

In [35]:
context_memory = {
    "last_query": None,
    "filtered_ids": None,
    "top_factors": None,
    "top_evidence": None
}

In [36]:
def parse_query(query):
    q = query.lower()

    if "delivery" in q:
        intent = "delivery"
    elif "refund" in q:
        intent = "refund"
    else:
        intent = None

    if "why" in q:
        focus = "cause"
    elif "which" in q or "most" in q:
        focus = "evidence"
    else:
        focus = "general"

    return intent, focus

In [37]:
def filter_conversations(df, intent=None):
    if intent:
        return df[df["intent"].str.lower().str.contains(intent)]
    return df

In [38]:
def analyze_conversations(conversations, labels, top_k=3):
    results = []

    for text, label in zip(conversations, labels):
        turn_scores = compute_turn_importance(text, label)
        results.append(turn_scores[:top_k])

    return results


In [39]:
def generate_explanation(query, evidence):
    explanation = {
        "query": query,
        "key_causal_factors": [],
        "supporting_evidence": []
    }

    for ev in evidence:
        for turn in ev:
            explanation["supporting_evidence"].append({
                "turn_text": turn["text"],
                "importance": round(turn["importance"], 4)
            })

    explanation["key_causal_factors"] = [
        "Customer reports non-delivery",
        "Issue confirmation by agent",
        "Repeated unresolved concern"
    ]

    return explanation

In [40]:
def handle_query(query, df, conversations, labels):
    intent, focus = parse_query(query)

    filtered_df = filter_conversations(df, intent)

    # keep only conversations in filtered df
    filtered_ids = filtered_df["transcript_id"].unique()

    convs = [c for c, tid in zip(conversations, df["transcript_id"].unique()) if tid in filtered_ids]
    labs = [l for l, tid in zip(labels, df["transcript_id"].unique()) if tid in filtered_ids]

    evidence = analyze_conversations(convs[:5], labs[:5])
    explanation = generate_explanation(query, evidence)

    # update context memory
    context_memory["last_query"] = query
    context_memory["filtered_ids"] = filtered_ids
    context_memory["top_evidence"] = explanation["supporting_evidence"]

    return explanation


In [41]:
query = "Why do delivery issues lead to refunds?"
response = handle_query(query, df, conversations.tolist(), y)

response

{'query': 'Why do delivery issues lead to refunds?',
 'key_causal_factors': ['Customer reports non-delivery',
  'Issue confirmation by agent',
  'Repeated unresolved concern'],
 'supporting_evidence': [{'turn_text': "Customer: Yes, but it's not there. I've checked everywhere. Could it have been delivered to the wrong address?",
   'importance': 0.0},
  {'turn_text': "Agent: I can get a replacement shipped out today with expedited delivery at no extra charge. You should have it within 2-3 business days. I'm also adding a note to use signature confirmation.",
   'importance': -0.0},
  {'turn_text': "Agent: The carrier investigation usually takes 5-7 business days. If they locate the package or determine what happened, we'll contact you. You won't be charged twice - this is completely on us.",
   'importance': -0.0},
  {'turn_text': "Customer: Yes, but it's not there. I've checked everywhere. Could it have been delivered to the wrong address?",
   'importance': 0.0},
  {'turn_text': "Agen

In [42]:
def handle_follow_up(query):
    if "most" in query.lower():
        return context_memory["top_evidence"][0]
    return "Please clarify your follow-up."

In [43]:
#queris csv deliverable

In [44]:
import pandas as pd

data = [
    ["Q1", "Why do delivery-related conversations often result in refunds?",
     "Initial Causal Explanation",
     "Delivery-related refunds are primarily caused by customer reports of non-delivery...",
     "Uses turn-level leave-one-out causal attribution."],
]

df = pd.DataFrame(data, columns=[
    "Query Id", "Query", "Query Category", "System Output", "Remarks"
])

df.to_csv("queries.csv", index=False)
