# Dataset Preparation

This notebook handles the initial loading and preparation of datasets from three domains:
- Law
- Finance
- Healthcare

In [None]:
# Import required libraries
import os

from prompt_classifier.modeling.datasets_config import load_datasets


## Load Raw Datasets
Load the raw datasets from their respective sources

In [None]:
# Load the three domain datasets with the project's configured loader.
# NOTE(review): the unpacking assumes load_datasets() returns exactly three
# frames in (law, finance, healthcare) order; confirm against datasets_config.
law_df, finance_df, healthcare_df = load_datasets()

Downloading readme:   0%|          | 0.00/410 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24343 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/53.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/53937 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19999 [00:00<?, ? examples/s]

## Helper Functions
Define utility functions for data processing

In [3]:
def extract_user_content(messages: list) -> str:
    """Concatenate the text of every user-authored message in a conversation.

    Args:
        messages: Sequence of message dicts, each providing "role" and
            "content" keys.

    Returns:
        The "content" values of all messages whose role is "user", in their
        original order, separated by single spaces. An empty string when no
        message has the "user" role.
    """
    user_turns = []
    for message in messages:
        # Only user turns contribute; other roles are skipped.
        if message["role"] == "user":
            user_turns.append(message["content"])
    return " ".join(user_turns)

## Process Finance Dataset
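Copy the `instruction` column into `prompt`, drop rows whose prompt is missing or empty, remove newline characters, label every row `1`, and write the `prompt`/`label` columns to `data/interim/finance_prompts.csv`.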

In [4]:
# Expose the finance 'instruction' text under the shared 'prompt' column name.
finance_df['prompt'] = finance_df['instruction']

# Create the output folders up front; exist_ok makes re-running this cell harmless.
for output_dir in ('data/interim', 'data/processed'):
    os.makedirs(output_dir, exist_ok=True)

In [5]:
# Clean the finance prompts and export the (prompt, label) pairs.
# The steps are chained so that each one returns a new frame; the label column is
# added with .assign() instead of item assignment on a filtered frame, which can
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
finance_df = (
    finance_df
    # Rows without a prompt cannot be used.
    .dropna(subset=['prompt'])
    # NOTE(review): newlines are deleted rather than replaced with a space, so the
    # words on either side of a line break are fused ("a\nb" -> "ab"). This runs
    # over every string column, not only 'prompt'. Confirm this is intended.
    .replace('\n', '', regex=True)
    # Drop prompts that are empty (including ones that were only newlines).
    .loc[lambda df: df['prompt'] != '']
    .assign(label=1)
)
finance_df[['prompt', 'label']].to_csv('data/interim/finance_prompts.csv', index=False)

## Process Healthcare Dataset
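Copy the `input` column into `prompt`, drop rows whose prompt is missing or empty, remove newline characters, label every row `1`, and write the `prompt`/`label` columns to `data/interim/healthcare_prompts.csv`.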

In [6]:
# Build the healthcare (prompt, label) table and export it.
# Everything is chained so the loaded frame is not mutated in place and the label
# column is added with .assign() instead of item assignment on a filtered frame
# (which can raise SettingWithCopyWarning on pandas versions without copy-on-write).
healthcare_df = (
    healthcare_df
    # The healthcare prompt text is taken from the 'input' column.
    .assign(prompt=lambda df: df['input'])
    # Rows without a prompt cannot be used.
    .dropna(subset=['prompt'])
    # NOTE(review): newlines are deleted rather than replaced with a space, so the
    # words on either side of a line break are fused ("a\nb" -> "ab"). This runs
    # over every string column, not only 'prompt'. Confirm this is intended.
    .replace('\n', '', regex=True)
    # Drop prompts that are empty (including ones that were only newlines).
    .loc[lambda df: df['prompt'] != '']
    .assign(label=1)
)
healthcare_df[['prompt', 'label']].to_csv('data/interim/healthcare_prompts.csv', index=False)

## Process Law Dataset
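Using the dataset's existing `prompt` column, drop rows whose prompt is missing or empty, remove newline characters, label every row `1`, and write the `prompt`/`label` columns to `data/interim/law_prompts.csv`.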

In [7]:
# Clean the law prompts and export the (prompt, label) pairs.
# The law frame is used with its existing 'prompt' column; no renaming happens here.
# The label is assigned once, after filtering, via .assign(): the original cell set
# it twice (the first time on the raw frame) and then by item assignment on a
# filtered frame, which can raise SettingWithCopyWarning on pandas versions
# without copy-on-write.
law_df = (
    law_df
    # Rows without a prompt cannot be used.
    .dropna(subset=['prompt'])
    # NOTE(review): newlines are deleted rather than replaced with a space, so the
    # words on either side of a line break are fused ("a\nb" -> "ab"). This runs
    # over every string column, not only 'prompt'. Confirm this is intended.
    .replace('\n', '', regex=True)
    # Drop prompts that are empty (including ones that were only newlines).
    .loc[lambda df: df['prompt'] != '']
    .assign(label=1)
)
law_df[['prompt', 'label']].to_csv('data/interim/law_prompts.csv', index=False)