# Text Processing Pipeline

This notebook implements the text preprocessing pipeline for cleaning and standardizing the prompt data.

## Setup
Import required libraries and configure environment

In [1]:
import re
import warnings
from functools import lru_cache
from typing import Collection, Optional

import nltk
import pandas as pd
from nltk.corpus import stopwords

from prompt_classifier.config import DATASET_SIZE

# Suppress every warning category so cell output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the imported libraries — consider narrowing it to a category or module.
warnings.filterwarnings('ignore')

In [2]:
# Fetch the NLTK stopword corpus; the preprocessing step reads it through
# stopwords.words('english').
nltk.download('stopwords')

## Load Data
Load the interim datasets for processing

In [3]:
# Read each domain's interim prompt file, keyed by the file stem that is
# reused as the output file name when the processed data is written.
dataframes = {
    "finance_prompts": pd.read_csv('data/interim/finance_prompts.csv'),
    "healthcare_prompts": pd.read_csv('data/interim/healthcare_prompts.csv'),
    "law_prompts": pd.read_csv('data/interim/law_prompts.csv'),
}

# Per-domain handles onto the same frames held in `dataframes`.
finance_df = dataframes["finance_prompts"]
healthcare_df = dataframes["healthcare_prompts"]
law_df = dataframes["law_prompts"]

## Text Preprocessing
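Define the text cleaning function: lowercase the prompt, strip every non-letter character, and drop English stopwords and words of two characters or fewer.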

In [4]:
@lru_cache(maxsize=1)
def _english_stopwords() -> frozenset:
    """Return NLTK's English stopword list as a frozenset, built once.

    The list is loaded lazily on first use and cached, so applying
    ``preprocess_text`` row by row does not rebuild the set for every row.
    """
    return frozenset(stopwords.words('english'))


def preprocess_text(text: str, stop_words: Optional[Collection[str]] = None) -> Optional[str]:
    """Normalize a prompt into a space-separated string of lowercase words.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, split on whitespace, then drop stopwords and words of two
    characters or fewer.

    Args:
        text: The raw prompt. Non-string values are stringified, except
            missing values (``None`` or a float NaN), which yield ``None``.
        stop_words: Words to remove. Defaults to NLTK's English stopword
            list; pass a set (or other container with fast membership
            tests) to override it.

    Returns:
        The cleaned text, or ``None`` when the input is missing or no words
        survive the filtering.
    """
    # Missing cells would otherwise be stringified to 'none' / 'nan', pass the
    # length and stopword filters, and end up in the output as real prompts.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return None

    if stop_words is None:
        stop_words = _english_stopwords()

    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove stopwords and short words
    words = [word for word in text.split()
             if len(word) > 2 and word not in stop_words]

    # An empty result is reported as None so callers can filter it out.
    return ' '.join(words) or None

## Apply Processing
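Apply the preprocessing to every dataset, drop rows whose prompt is empty after cleaning, keep the first `DATASET_SIZE` rows, and write each result to `data/processed/`.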

In [5]:
# NOTE: the 'prompt' column of each loaded frame is overwritten in place, so
# re-running this cell re-processes already-cleaned text. Re-run the load
# cell first for a clean result.
for dataset_name, frame in dataframes.items():
    frame['prompt'] = frame['prompt'].apply(preprocess_text)

    # Keep only rows whose cleaned prompt is a string (preprocess_text
    # returns None when nothing is left after cleaning).
    is_text = frame['prompt'].apply(lambda value: isinstance(value, str))
    kept = frame[is_text].head(DATASET_SIZE)

    kept.to_csv(f'data/processed/{dataset_name}.csv', index=False)

## Load Processed Data
Load and verify the processed datasets

In [6]:
# Read the processed files back from disk, keyed by the same names used
# when they were written.
dataframes_cleaned = {
    "finance_prompts": pd.read_csv('data/processed/finance_prompts.csv'),
    "healthcare_prompts": pd.read_csv('data/processed/healthcare_prompts.csv'),
    "law_prompts": pd.read_csv('data/processed/law_prompts.csv'),
}

# Per-domain handles onto the same frames held in `dataframes_cleaned`.
finance_df_cleaned = dataframes_cleaned["finance_prompts"]
healthcare_df_cleaned = dataframes_cleaned["healthcare_prompts"]
law_df_cleaned = dataframes_cleaned["law_prompts"]