# Phase 3: Preprocessing Pipeline

In this notebook, we will implement a text preprocessing pipeline to clean and normalize the AG News dataset. This includes:
1. **HTML Removal**: Cleaning any leftover HTML tags.
2. **Noise Reduction**: Removing press agency markers (e.g., Reuters, AP).
3. **Normalization**: Lowercasing, removing special characters and digits, and dropping English stopwords.
4. **Lemmatization**: Reducing words to their base form.

In [None]:
import html
import os
import re
import sys

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Add src to path to use data_loader
sys.path.append(os.path.abspath("../"))
from src.data_loader import load_data, get_class_labels

## 1. Initialize NLTK Resources

In [None]:
# Fetch the NLTK resources this notebook needs, in a fixed order.
for resource_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# Module-level helpers shared by the cleaning function defined later on.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

## 2. Load Data

In [None]:
# load_data() (from src.data_loader) hands back the train and test frames as a pair.
train_df, test_df = load_data()

# Report both shapes in one call; each message still lands on its own line.
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    sep="\n",
)

# Preview the first rows of the training split as the cell output.
train_df.head()

## 3. Implement `clean_text` Function

In [None]:
# Patterns used by clean_text; compiled once and reused for every row.
# They are applied to text that has already been lowercased.
_HTML_TAG_RE = re.compile(r'<!--.*?-->|</?[a-z][^<>]*>', re.DOTALL)
_LEADING_SOURCE_RE = re.compile(r'^\s*[\w.]+\s+-\s+')
_PAREN_SOURCE_RE = re.compile(r'\(\w+\)')
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def clean_text(text, stopword_set=None, lemmatize_fn=None):
    """Normalize one news snippet into a space-separated string of lemmas.

    Steps, in order:
      1. Decode HTML entities and lowercase.
      2. Strip HTML tags and comments.
      3. Strip source markers: a single leading token followed by a spaced
         hyphen (e.g. "reuters - ", "space.com - ") and any single word in
         parentheses (e.g. "(ap)").
      4. Replace everything that is not an ASCII letter or whitespace with a
         space, so adjacent words are never glued together.
      5. Split on whitespace and drop stopwords.
      6. Lemmatize the remaining tokens.

    Args:
        text: The raw snippet. Anything that is not a ``str`` (e.g. ``None``
            or a NaN coming out of pandas) is treated as missing.
        stopword_set: Optional container of lowercase stopwords. Defaults to
            the notebook-level ``stop_words``.
        lemmatize_fn: Optional callable mapping a token to its lemma. Defaults
            to ``lemmatizer.lemmatize`` from the notebook-level ``lemmatizer``.

    Returns:
        The cleaned text, or an empty string for missing / non-string input.
    """
    if not isinstance(text, str):
        return ""

    # The notebook globals are only looked up when no override is supplied,
    # which keeps the function usable outside the notebook.
    active_stopwords = stop_words if stopword_set is None else stopword_set
    active_lemmatize = lemmatizer.lemmatize if lemmatize_fn is None else lemmatize_fn

    # 1. Decode entities first so escaped markup ("&lt;b&gt;") becomes a real
    #    tag and "&amp;" does not survive as the junk token "amp".
    text = html.unescape(text).lower()

    # 2. Remove tags. The pattern requires a letter right after "<" so a
    #    plain comparison such as "x < y" is left alone.
    text = _HTML_TAG_RE.sub(' ', text)

    # 3. Remove source markers. The hyphen form is anchored to the start of
    #    the snippet so ordinary words before a mid-sentence " - " are kept.
    text = _LEADING_SOURCE_RE.sub(' ', text, count=1)
    text = _PAREN_SOURCE_RE.sub(' ', text)

    # 4. Punctuation and digits become spaces, not empty strings:
    #    "short-sellers" -> "short sellers", "don't" -> "don t".
    text = _NON_LETTER_RE.sub(' ', text)

    # 5 + 6. Tokenize, filter stopwords, lemmatize.
    kept_tokens = [token for token in text.split() if token not in active_stopwords]
    return " ".join(active_lemmatize(token) for token in kept_tokens)

## 4. Apply Preprocessing

In [None]:
# Run clean_text over the Description column of each split, announcing each
# pass before it starts, and store the result in a new Clean_Description column.
for progress_message, frame in (
    ("Preprocessing training data...", train_df),
    ("Preprocessing test data...", test_df),
):
    print(progress_message)
    frame['Clean_Description'] = frame['Description'].apply(clean_text)

# Show raw vs. cleaned text side by side for the first training rows.
train_df[['Description', 'Clean_Description']].head()

## 5. Save Processed Data

In [None]:
# Output folder for the cleaned splits; created on first run, reused afterwards.
processed_dir = os.path.join("..", "data", "processed")
os.makedirs(processed_dir, exist_ok=True)

# Write each split to its own CSV, omitting the DataFrame index.
for frame, csv_name in ((train_df, "clean_train.csv"), (test_df, "clean_test.csv")):
    csv_path = os.path.join(processed_dir, csv_name)
    frame.to_csv(csv_path, index=False)

print(f"Processed data saved to {processed_dir}")