In [3]:
# Libraries
import json
import requests
from tqdm import tqdm
import os
import pandas as pd

In [39]:
# Download the Bitext insurance chatbot dataset (CSV) once, cache it under
# ./data, and load it into a DataFrame named `df`.

# URL of the dataset (CSV version)
url = "https://huggingface.co/datasets/bitext/Bitext-insurance-llm-chatbot-training-dataset/resolve/main/bitext-insurance-llm-chatbot-training-dataset.csv"

# Create "data" folder if not exists
os.makedirs("data", exist_ok=True)

# File path to save
file_path = "data/insurance_llm_chatbot_training_dataset.csv"

# Download the dataset if not already downloaded
if not os.path.exists(file_path):
    # Stream into a temporary sibling file and rename only on success, so an
    # interrupted download can never leave a truncated CSV at `file_path`
    # that the exists-check above would accept on the next run.
    partial_path = file_path + ".part"
    # The timeout keeps a stalled connection from hanging the kernel forever.
    with requests.get(url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            # Fail here with a clear message instead of letting
            # pd.read_csv raise a confusing FileNotFoundError below.
            raise RuntimeError(
                f"❌ Failed to download dataset. Status code: {response.status_code}"
            )
        with open(partial_path, "wb") as f:
            for chunk in tqdm(response.iter_content(chunk_size=8192), desc="Downloading"):
                f.write(chunk)
    os.replace(partial_path, file_path)
    print("✅ Dataset downloaded successfully!")
else:
    print("⚡ Dataset already exists locally.")

# Load the dataset with pandas
df = pd.read_csv(file_path)

# Display first few rows
print(df.head())


⚡ Dataset already exists locally.
                                         instruction  \
0  I'd like to see my fucking auto insurance , I ...   
1  wanna know more about my auto insurance giev m...   
2  I'd like to sde my fucking auto insurance coul...   
3  wanna see my fucking auto insurance where coul...   
4  I need information about my fucking auto insur...   

                       intent        category      tags  \
0  information_auto_insurance  AUTO_INSURANCE    BCLPWZ   
1  information_auto_insurance  AUTO_INSURANCE      BCQZ   
2  information_auto_insurance  AUTO_INSURANCE  BCILPQWZ   
3  information_auto_insurance  AUTO_INSURANCE   BCILPQW   
4  information_auto_insurance  AUTO_INSURANCE      BCIW   

                                            response  
0  To retrieve the details of your auto insurance...  
1  To retrieve your auto insurance details, pleas...  
2  To obtain your auto insurance information, kin...  
3  To retrieve your auto insurance details, pleas...  

In [40]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,instruction,intent,category,tags,response
0,"I'd like to see my fucking auto insurance , I ...",information_auto_insurance,AUTO_INSURANCE,BCLPWZ,To retrieve the details of your auto insurance...
1,wanna know more about my auto insurance giev m...,information_auto_insurance,AUTO_INSURANCE,BCQZ,"To retrieve your auto insurance details, pleas..."
2,I'd like to sde my fucking auto insurance coul...,information_auto_insurance,AUTO_INSURANCE,BCILPQWZ,"To obtain your auto insurance information, kin..."
3,wanna see my fucking auto insurance where coul...,information_auto_insurance,AUTO_INSURANCE,BCILPQW,"To retrieve your auto insurance details, pleas..."
4,I need information about my fucking auto insur...,information_auto_insurance,AUTO_INSURANCE,BCIW,"To retrieve your auto insurance details, pleas..."
...,...,...,...,...,...
38995,I would like to know about my travel insurance...,information_travel_insurance,TRAVEL_INSURANCE,BCIP,To obtain detailed information regarding your ...
38996,im looking for information about my travel in...,information_travel_insurance,TRAVEL_INSURANCE,BCIQZ,To obtain the necessary information regarding ...
38997,is it possible to see my fucking travel insura...,information_travel_insurance,TRAVEL_INSURANCE,BILW,"To access your travel insurance details, pleas..."
38998,would it be possible to see my travel insurance?,information_travel_insurance,TRAVEL_INSURANCE,BILP,"To access your travel insurance details, pleas..."


In [41]:
# Count how many rows fall into each value of the 'category' column.
df['category'].value_counts()

category
CLAIMS                 6000
PAYMENT                5000
CONTACT                4000
COVERAGE               4000
ENROLLMENT             4000
COMPLAINTS             3000
QUOTE                  2000
INCIDENTS              2000
AUTO_INSURANCE         1000
PET_INSURANCE          1000
RENEW                  1000
POLICY                 1000
HOME_INSURANCE         1000
LIFE_INSURANCE         1000
HEALTH_INSURANCE       1000
GENERAL_INFORMATION    1000
TRAVEL_INSURANCE       1000
Name: count, dtype: int64

In [42]:
# Preview the first three CLAIMS rows as plain dicts so the long text
# columns are shown instead of being cut off by the table display.
is_claims = df['category'] == 'CLAIMS'
df.loc[is_claims].iloc[:3].to_dict(orient='records')

[{'instruction': 'could i accept a fucking offer',
  'intent': 'accept_settlement',
  'category': 'CLAIMS',
  'tags': 'BILPQW',
  'response': 'To proceed with accepting a settlement offer, please follow these detailed instructions:\n\n1. Carefully examine the settlement offer details provided by your insurance company. It is essential that you fully comprehend the terms and conditions specified.\n2. Access your account by visiting our insurance portal at {{WEBSITE_URL}}.\n3. Go to the {{CLAIM_SECTION}} section to locate the settlement offer that you intend to accept.\n4. Choose the accept option by clicking on the {{ACCEPT_OFFER_BUTTON}}. \n5. Validate your acceptance by finishing any mandatory verification procedures, which may require your electronic signature or additional identifying information.\n6. After completing these actions, you will receive a confirmation message indicating that the settlement offer has been successfully accepted.\n\nIf you face any difficulties during this

In [43]:
# (rows, columns) of the loaded dataset.
df.shape


(39000, 5)

In [60]:
# Keep only the CLAIMS rows, then draw a reproducible random 6.5% sample of
# them (frac=0.065 with a fixed random_state).
# The filtered frame gets its own name so `df` keeps holding the full
# dataset and this cell gives the same result however often it is re-run.
claims_df = df[df['category'] == 'CLAIMS']
insurance_df = claims_df.sample(frac=0.065, random_state=1)  # 6.5% sample


In [61]:
# (rows, columns) of the sampled subset.
insurance_df.shape

(390, 5)

In [None]:
# # Filter: rows whose "category" column contains "INSURANCE"
# insurance_df = df[df["category"].str.contains("INSURANCE", case=False)]

# # Let's see how many rows there are
# print("Number of rows:", len(insurance_df))

# # Which categories are present
# print(insurance_df["category"].value_counts())

# # Show the first rows (and the last ones)
# print(insurance_df.head(),"\n",insurance_df.tail())

Number of rows: 6000
category
AUTO_INSURANCE      1000
HEALTH_INSURANCE    1000
HOME_INSURANCE      1000
LIFE_INSURANCE      1000
PET_INSURANCE       1000
TRAVEL_INSURANCE    1000
Name: count, dtype: int64
                                         instruction  \
0  I'd like to see my fucking auto insurance , I ...   
1  wanna know more about my auto insurance giev m...   
2  I'd like to sde my fucking auto insurance coul...   
3  wanna see my fucking auto insurance where coul...   
4  I need information about my fucking auto insur...   

                       intent        category      tags  \
0  information_auto_insurance  AUTO_INSURANCE    BCLPWZ   
1  information_auto_insurance  AUTO_INSURANCE      BCQZ   
2  information_auto_insurance  AUTO_INSURANCE  BCILPQWZ   
3  information_auto_insurance  AUTO_INSURANCE   BCILPQW   
4  information_auto_insurance  AUTO_INSURANCE      BCIW   

                                            response  
0  To retrieve the details of your auto insuran

In [62]:
# Export the sampled rows (insurance_df) as nested JSON: one entry per
# category, each holding its intent/question/response documents.
output_dir = "../Data"
os.makedirs(output_dir, exist_ok=True)

data = [
    {
        "category": category_name,
        "documents": [
            {"intent": intent, "question": question, "response": answer}
            for intent, question, answer in zip(
                rows['intent'], rows['instruction'], rows['response']
            )
        ],
    }
    for category_name, rows in insurance_df.groupby('category')
]

# Save into JSON file inside ../Data/ (UTF-8, non-ASCII characters kept as-is)
output_path = os.path.join(output_dir, "insurance_documents.json")
with open(output_path, "w", encoding="utf-8") as jsonf:
    jsonf.write(json.dumps(data, indent=2, ensure_ascii=False))

print(f"✅ JSON file saved at {output_path}")


✅ JSON file saved at ../Data/insurance_documents.json


In [63]:
# Reload the grouped documents from disk.
# The file is written as UTF-8 with ensure_ascii=False, so it must be read
# back as UTF-8 explicitly; the platform default encoding (e.g. cp1252 on
# Windows) could mis-decode or reject non-ASCII characters.
with open('../Data/insurance_documents.json', 'rt', encoding='utf-8') as f_in:
    documents_raw = json.load(f_in)

In [64]:
# Flatten the per-category groups into one list, tagging every document
# (in place) with the name of the category it came from.
documents = []

for group in documents_raw:
    label = group['category']
    for entry in group['documents']:
        entry['category'] = label
    documents.extend(group['documents'])

In [65]:
import hashlib

def generate_document_id(doc):
    """Return a short, deterministic ID for a document.

    The ID is the first 8 hex characters of the MD5 digest of
    "<category>-<question>-<first 10 characters of response>".

    Args:
        doc: mapping with 'category', 'question' and 'response' keys.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    parts = (doc['category'], doc['question'], doc['response'][:10])
    fingerprint = "{}-{}-{}".format(*parts)
    # Only a prefix of the digest is kept, so the ID is short but not
    # guaranteed to be collision-free.
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [66]:
# Attach a deterministic short ID to every flattened document (in place).
for entry in documents:
    entry.update(id=generate_document_id(entry))

In [67]:
# Spot-check one document to confirm the 'category' and 'id' fields were added.
documents[3]

{'intent': 'receive_payment',
 'question': 'i got to receive fucking compensations can i get some ehlp',
 'response': 'To process your compensation payment for the insurance claim, please adhere to the following steps:\n\n1. Access your account by visiting {{WEBSITE_URL}}.\n2. Locate the {{CLAIM_SECTION}} for your specific claim.\n3. Identify the claim that you wish to be compensated for.\n4. Opt for the {{RECEIVE_PAYMENT_OPTION}} that best suits your preference.\n5. Follow the provided instructions to finalize your preferred method of payment and validate the information.\n\nShould you face any complications during this process, do not hesitate to reach out to our customer support team for immediate assistance.',
 'category': 'CLAIMS',
 'id': 'da0592e6'}

In [68]:
# Total number of flattened documents.
len(documents)

390

In [69]:
# Persist the flattened, ID-tagged documents as pretty-printed JSON.
with open('../Data/documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))