### Customer Support Dataset Cleaning Script

This notebook cleans and prepares the Bitext customer support dataset for prompt engineering experiments.

In [11]:
import pandas as pd
from textblob import TextBlob
import re
import os

In [14]:
# Load the raw Bitext export from its relative path into a DataFrame.
raw_csv_path = "../data/raw_bitext_customer_support_data.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [15]:
# Columns kept for the experiments, and the two text fields that must be present.
kept_columns = ["customer_query", "expected_response", "category", "intent"]
text_columns = ["customer_query", "expected_response"]

# Rename the raw fields, keep only the needed columns, then drop incomplete and
# repeated query/response pairs in a single chain (no in-place mutation).
df = (
    df.rename(columns={"instruction": "customer_query", "response": "expected_response"})[kept_columns]
    .dropna(subset=text_columns)           # discard rows missing either text field
    .drop_duplicates(subset=text_columns)  # keep the first of each repeated pair
)

print("After cleanup:", df.shape)
df.head()

After cleanup: (26872, 4)


Unnamed: 0,customer_query,expected_response,category,intent
0,question about cancelling order {{Order Number}},I've understood you have a question regarding ...,ORDER,cancel_order
1,i have a question about cancelling oorder {{Or...,I've been informed that you have a question ab...,ORDER,cancel_order
2,i need help cancelling puchase {{Order Number}},I can sense that you're seeking assistance wit...,ORDER,cancel_order
3,I need to cancel purchase {{Order Number}},I understood that you need assistance with can...,ORDER,cancel_order
4,"I cannot afford this order, cancel purchase {{...",I'm sensitive to the fact that you're facing f...,ORDER,cancel_order


In [16]:
def clean_text(text):
    """Return ``text`` as a string with URLs removed and whitespace normalized.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string: every ``http://`` / ``https://`` URL is deleted, each
        run of whitespace is collapsed to a single space, and leading/trailing
        whitespace is stripped.
    """
    text = str(text)
    # Require the "://" separator so ordinary words that merely start with "http"
    # (e.g. "https", "httpd") are kept; a bare "http" prefix match would delete them.
    text = re.sub(r"https?://\S+", "", text)
    # Collapse whitespace after URL removal so a deleted URL leaves no double space.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run both free-text columns through the same cleaning routine.
for text_column in ("customer_query", "expected_response"):
    df[text_column] = df[text_column].apply(clean_text)

In [17]:
# Derive the per-row features in one step instead of assigning columns one by one.
df = df.assign(
    query_id=range(1, len(df) + 1),  # 1-based sequential identifier
    # Number of whitespace-separated tokens in the expected response.
    response_length=df["expected_response"].map(lambda resp: len(str(resp).split())),
    # TextBlob polarity score of the customer query.
    sentiment=df["customer_query"].map(lambda query: TextBlob(query).sentiment.polarity),
)

# Fix the column order of the exported file.
final_columns = [
    "query_id",
    "customer_query",
    "expected_response",
    "category",
    "intent",
    "sentiment",
    "response_length",
]
df = df[final_columns]

# Write the cleaned frame, creating the target directory if it does not exist yet.
output_path = "../data/cleaned/customer_queries_cleaned.csv"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"Cleaned dataset saved to: {output_path}")
print("Final shape:", df.shape)

Cleaned dataset saved to: ../data/cleaned/customer_queries_cleaned.csv
Final shape: (26872, 7)


In [27]:
# Stratified sampling to ensure a variety of query types.
ROWS_PER_CATEGORY = 5   # upper bound on rows drawn from each category
MAX_SAMPLE_ROWS = 50    # upper bound on the size of the final sample
SAMPLE_SEED = 42        # fixed seed so the sample is reproducible

# Draw up to ROWS_PER_CATEGORY rows from each category. Iterating over the groups
# (instead of groupby.apply) keeps the "category" column in every piece and avoids
# the pandas >= 2.2 deprecation warning about apply operating on grouping columns.
per_category = [
    group.sample(n=min(ROWS_PER_CATEGORY, len(group)), random_state=SAMPLE_SEED)
    for _, group in df.groupby("category")
]
stratified = pd.concat(per_category) if per_category else df.head(0)

# Cap the final size at the rows actually available: a fixed n=50 raises ValueError
# whenever the per-category draw yields fewer than 50 rows.
sample = stratified.sample(
    n=min(MAX_SAMPLE_ROWS, len(stratified)),
    random_state=SAMPLE_SEED,
)

sample_path = "../data/cleaned/sample_queries.csv"
# Create the output directory here too, so this cell does not depend on an
# earlier cell having already created it.
os.makedirs(os.path.dirname(sample_path), exist_ok=True)
sample.to_csv(sample_path, index=False)

print(sample["category"].value_counts(normalize=True).round(2))


category
ORDER           0.10
CANCEL          0.10
DELIVERY        0.10
SHIPPING        0.10
ACCOUNT         0.10
FEEDBACK        0.10
CONTACT         0.08
REFUND          0.08
INVOICE         0.08
SUBSCRIPTION    0.08
PAYMENT         0.08
Name: proportion, dtype: float64


  .apply(lambda x: x.sample(n=min(5, len(x)), random_state=42))
