In [1]:
#import libraries
import json
import unicodedata

import pandas as pd

# Load the tabular datasets.
# pandas has built-in readers for CSV and Excel, so no custom loader is needed.
df_judgements = pd.read_csv(
    "/kaggle/input/indian-supreme-court-judgments/judgments.csv"
)

# The three Excel workbooks are read in order and bound to their own names.
df_acts, df_faq, df_ip = (
    pd.read_excel(xlsx_path)
    for xlsx_path in (
        "/kaggle/input/git-hub-dataset/MILPaC_Acts_dataset.xlsx",
        "/kaggle/input/git-hub-dataset/MILPaC_CCI_FAQ_dataset.xlsx",
        "/kaggle/input/git-hub-dataset/MILPaC_IP_dataset.xlsx",
    )
)

# The LLM fine-tuning dataset of Indian legal texts is stored as JSON files,
# so the json module is needed to read it.
# load_json parses one JSON file and converts the parsed content to a
# DataFrame using the DataFrame constructor from the pandas library.
def load_json(path):
    """Read a UTF-8 encoded JSON file and return its content as a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The result of passing the parsed JSON document to ``pd.DataFrame``.
    """
    with open(path, mode="r", encoding="utf-8") as json_file:
        parsed = json.loads(json_file.read())
    return pd.DataFrame(parsed)

# Load every JSON source with load_json; the files are read in the listed
# order and bound to the matching names on the left.
df_constitution, df_crpc, df_ipc, df_articles, df_legal_dataset = map(
    load_json,
    (
        "/kaggle/input/llm-fine-tuning-dataset-of-indian-legal-texts/constitution_qa.json",
        "/kaggle/input/llm-fine-tuning-dataset-of-indian-legal-texts/crpc_qa.json",
        "/kaggle/input/llm-fine-tuning-dataset-of-indian-legal-texts/ipc_qa.json",
        "/kaggle/input/git-hub-dataset/constitution_of_india.json",
        "/kaggle/input/git-hub-dataset/IndicLegalQA Dataset_10K_Revised.json",
    ),
)

In [2]:
# Standardizes datasets that already have question and answer columns
def standardize(df, q_col, a_col, source_name, intent="law_info"):
    """Build a uniform question/answer frame from a dataset with Q&A columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataset; must contain the columns named by ``q_col`` and
        ``a_col``. It is not modified.
    q_col : str
        Name of the column holding the questions.
    a_col : str
        Name of the column holding the answers.
    source_name : str
        Label written to the ``source`` column of every row.
    intent : str, optional
        Label written to the ``intent`` column of every row. Defaults to
        ``"law_info"``, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``question``, ``answer``, ``source`` and
        ``intent``. Rows whose question or answer is missing are dropped; the
        remaining rows keep their original index labels.

    Raises
    ------
    KeyError
        If ``q_col`` or ``a_col`` is not a column of ``df``.
    """
    qa_pairs = df[[q_col, a_col]].dropna()
    # set_axis renames the two columns by position; assign() returns a new
    # frame, so the label columns are never written into the dropna() result.
    return qa_pairs.set_axis(["question", "answer"], axis=1).assign(
        source=source_name, intent=intent
    )
    
# Standardizes datasets specifying articles, their titles and description 
def article_standardize(df):
    """Turn constitution articles into "What does Article ... state?" Q&A rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``article``, ``title`` and ``description``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed like ``df`` with the columns ``question`` (built
        from ``article`` and ``title``), ``answer`` (the ``description``
        column), ``source`` (always ``"Constitution"``) and ``intent``
        (always ``"law_info"``).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Build the questions column-wise instead of with a row-wise apply():
    # no Series is created per row, and an empty input frame is handled too
    # (apply(axis=1) on an empty frame may hand back a DataFrame instead of a
    # Series, which cannot be stored in a single column).
    questions = [
        f"What does Article {article} ({title}) state?"
        for article, title in zip(df["article"], df["title"])
    ]
    df_clean = pd.DataFrame(index=df.index)
    df_clean["question"] = pd.Series(questions, index=df.index, dtype=object)
    df_clean["answer"] = df["description"]
    df_clean["source"] = "Constitution"
    df_clean["intent"] = "law_info"
    return df_clean
    
#Standardizes translation dataset in ques and answer pairs
def standardize_translation(df):
    """Reshape a translation dataset into the common question/answer layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``src``, ``tgt`` and ``dataset``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed like ``df`` with ``question`` (from ``src``),
        ``answer`` (from ``tgt``), ``source`` (from ``dataset``) and an
        ``intent`` column that is ``"translation"`` on every row.
    """
    renamed = df[["src", "tgt", "dataset"]].rename(
        columns={"src": "question", "tgt": "answer", "dataset": "source"}
    )
    # Every row gets the "translation" intent; adjust the label here if the
    # frame is used for a different purpose.
    return renamed.assign(intent="translation")
    
#Standardizes the court cases in the form of QnA pairs to maintain consistency 
def standardize_cases(df):
    """Reshape court-case Q&A records into the common question/answer layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``question``, ``answer``, ``case_name`` and
        ``judgement_date``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed like ``df`` with ``question``, ``answer``, a
        ``source`` column of the form ``"<case_name> (<judgement_date>)"`` and
        an ``intent`` column that is ``"case_info"`` on every row.
    """
    qa_pairs = df[["question", "answer"]]
    # NOTE(review): the concatenation assumes case_name and judgement_date
    # both hold strings -- TODO confirm against the dataset.
    case_label = df["case_name"] + " (" + df["judgement_date"] + ")"
    return qa_pairs.assign(source=case_label, intent="case_info")

# Generic LLM Q&A data: each frame already has "question" and "answer"
# columns and only needs its source label.
df_constitution_clean, df_crpc_clean, df_ipc_clean = (
    standardize(qa_frame, "question", "answer", source_label)
    for qa_frame, source_label in (
        (df_constitution, "Constitution"),
        (df_crpc, "CrPC"),
        (df_ipc, "IPC"),
    )
)

# Articles of the constitution converted to question/answer format.
df_articles_clean = article_standardize(df_articles)

# Translation data: the source text becomes the question and its
# regional-language translation becomes the answer.
df_acts_clean, df_faq_clean, df_ip_clean = map(
    standardize_translation, (df_acts, df_faq, df_ip)
)

# Standardized court judgements.
df_legal_dataset_clean = standardize_cases(df_legal_dataset)

In [3]:
import re

def _drop_unless_combining_mark(match):
    """Regex callback: keep a matched character only if it is a combining mark."""
    char = match.group(0)
    return char if unicodedata.category(char).startswith("M") else ""


def clean_text(text):
    r"""Normalise one text value for the chatbot dataset.

    The value is lower-cased, special characters are removed, runs of
    whitespace are collapsed to a single space and both ends are stripped.
    A character is kept when it is a word character (``\w``), whitespace,
    one of ``. , ?`` or a Unicode combining mark.

    Combining marks (Unicode categories ``Mn``/``Mc``/``Me``, e.g. Devanagari
    vowel signs and virama) are kept on purpose: ``\w`` does not match them,
    so removing everything outside ``[\w\s.,?]`` would delete them from text
    written in Indic scripts and change the words.

    Whitespace is collapsed after the special characters are removed, so a
    removed character between two spaces (``"a - b"``) does not leave a
    double space behind.

    Parameters
    ----------
    text : object
        Usually a ``str``. Missing scalars (``None``, ``NaN``, ``pd.NA``)
        give ``""``; any other non-string value is converted with ``str()``
        instead of raising.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        # is_scalar guards pd.isna, which returns an array (not a bool) for
        # list-like input and would make the ``if`` ambiguous.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r"[^\w\s.,?]", _drop_unless_combining_mark, text)  # remove special chars
    text = re.sub(r"\s+", " ", text)  # remove extra spaces
    return text.strip()

# Clean the question and answer text of every standardized frame in place.
# The loop variable keeps the name `df`, so the notebook namespace after this
# cell is the same as before the rewrite.
for df in (
    df_constitution_clean,
    df_crpc_clean,
    df_ipc_clean,
    df_acts_clean,
    df_faq_clean,
    df_ip_clean,
    df_articles_clean,
    df_legal_dataset_clean,
):
    for text_column in ("question", "answer"):
        df[text_column] = df[text_column].apply(clean_text)


In [4]:
# Fixed seed so that "Restart & Run All" writes the same row order every time.
SHUFFLE_SEED = 42

# Stack every standardized frame into one dataset with a fresh 0..n-1 index.
df_final = pd.concat(
    [
        df_constitution_clean, df_crpc_clean, df_ipc_clean,
        df_acts_clean, df_faq_clean, df_ip_clean,
        df_articles_clean, df_legal_dataset_clean,
    ],
    ignore_index=True,
)

# Shuffle & reset index (seeded, so the shuffle is reproducible)
df_final = df_final.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Save for chatbot
df_final.to_csv("legal_chatbot_dataset.csv", index=False)
