In [1]:
import pandas as pd
import re

df = pd.read_excel("data/Chat_Dataset.xlsx")

# -------------------------------
# 1. Extract Topic Into New Column
# -------------------------------
def extract_topic(text):
    text = str(text)
    match = re.search(r'Topic:\s*(.*)', text, flags=re.IGNORECASE)
    if match:
        return match.group(1).strip()
    return None

df["topic"] = df["Input (Informal Chat & Topic)"].apply(extract_topic)

# -------------------------------
# 2. Extract Chat Only (Remove Chat:, Remove Topic Section)
# -------------------------------
def extract_chat(text):
    text = str(text)

    # Remove Chat:
    text = re.sub(r'^Chat:\s*', '', text, flags=re.IGNORECASE)

    # Remove Topic and everything after
    text = re.sub(r'Topic:.*', '', text, flags=re.IGNORECASE)

    # Remove quotes
    text = text.replace('"', '').replace("'", "")

    # Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df["input"] = df["Input (Informal Chat & Topic)"].apply(extract_chat)

# -------------------------------
# 3. Clean Output Column (Remove Resolution:)
# -------------------------------
def clean_output(text):
    text = str(text)

    # Remove Resolution:
    text = re.sub(r'^Resolution:\s*', '', text, flags=re.IGNORECASE)

    # Remove quotes
    text = text.replace('"', '').replace("'", "")

    # Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df["output"] = df["Expected Output (Formal, Summarized Resolution Note)"].apply(clean_output)

# -------------------------------
# 4. Keep Only Required Columns
# -------------------------------
df = df[["input", "output", "topic"]]

# -------------------------------
# 5. Save Cleaned Dataset
# -------------------------------
df.to_json("data/metricon_clean_dataset.jsonl", orient="records", lines=True)


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Load JSONL dataset
df = pd.read_json("data/metricon_clean_dataset.jsonl", lines=True)

# 3. Train-test split
train, test = train_test_split(df, test_size=0.15, random_state=42)

# 4. Save output JSONL files
train.to_json("data/train.jsonl", orient="records", lines=True)
test.to_json("data/test.jsonl", orient="records", lines=True)

# Topic Detection

In [3]:
import pandas as pd

df = pd.read_json("data/metricon_clean_dataset.jsonl", lines=True)

df_topic = df[["input", "topic"]].dropna()
df_topic.head()

Unnamed: 0,input,topic
0,"My Wi-Fi is acting up again, keeps dropping th...",Technical Issue - Connectivity
1,I got charged twice this month! Check my accou...,Billing - Overcharge
2,Wheres my order? It was supposed to be here ye...,Delivery - Late Shipment
3,"Hey, I need to change my email address on file...",Account Management - Personal Data Change
4,My new laptop screen is flickerin. Its only 3 ...,Technical Issue - Hardware Fault


In [4]:
from sentence_transformers import SentenceTransformer, util

# Load small model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Define topic list
topics = [
    "Billing",
    "Delivery - Order Modification",
    "Account Management - Communication Preferences",
    "Technical Issue - WiFi",
    "Warranty / Repairs",
    "Order Status",
    "Refund Request",
    "Product Issue",
    "Customer Account Update",
    "Others"
]

topic_embeddings = model.encode(topics, convert_to_tensor=True)

def predict_topic(text):
    text_embedding = model.encode(text, convert_to_tensor=True)
    scores = util.cos_sim(text_embedding, topic_embeddings)[0]
    best_idx = scores.argmax().item()
    return topics[best_idx]


  from .autonotebook import tqdm as notebook_tqdm


In [90]:
sample = "I want to change the product I ordered yesterday"
print(predict_topic(sample))

Delivery - Order Modification


# Sentiment Analysis

In [5]:
from transformers import pipeline

sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

def get_sentiment(text):
    result = sentiment_model(text)[0]
    return result["label"], result["score"]

# Test
print(get_sentiment("I am really disappointed with the service!"))

Device set to use cpu


('NEGATIVE', 0.9997710585594177)


# Text Summarization

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load SML summarizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def summarize_issue_and_solution(issue, solution):
    text = f"summarize: Issue: {issue} Solution: {solution}"

    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)

    summary_ids = model.generate(
        inputs,
        max_length=80,
        min_length=20,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [11]:
import re

issue = "The customer reports the roof is leaking again after last week's repair. They are very upset."
solution = "Agent apologized and scheduled an urgent inspection within 24 hours and escalated to senior technicians."



def fix_capitalization(text):
    text = text.strip()

    # Split by sentence-ending punctuation
    sentences = re.split(r'(?<=[.!?])\s+', text)

    fixed_sentences = []
    for s in sentences:
        s = s.strip()
        if not s:
            continue
        # Capitalize first letter of each sentence
        s = s[0].upper() + s[1:]
        fixed_sentences.append(s)

    return " ".join(fixed_sentences)

raw_summary = summarize_issue_and_solution(issue, solution)
clean_summary = fix_capitalization(raw_summary)

print(clean_summary)


The customer reports the roof is leaking again after last week's repair. Agent apologized and scheduled an urgent inspection within 24 hours.
