# Notebook 02: Preprocessing Pipeline

This notebook demonstrates the preprocessing pipeline implemented in `src/preprocess.py`.

**Pipeline Steps**:
1. Text cleaning (HTML removal, URL removal, lowercasing, normalization)
2. Tokenization (NLTK word_tokenize)
3. POS tagging (NLTK pos_tag)
4. Feature extraction (length, punctuation, negation, type-token ratio)

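**Output**: `data/processed_sample.csv`, a 200-row sample with enriched metadata columns. The full `data/processed.csv` is produced by running `src/preprocess.py` from the command line (see the Summary).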

In [None]:
# Put the sibling `src/` directory on the import path so that `preprocess`
# (src/preprocess.py) can be imported below.
# NOTE(review): the relative path assumes the kernel's working directory is the
# notebook's own folder — confirm when running from elsewhere.
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import json
from pathlib import Path
# Individual pipeline steps, exercised one at a time in section 1.
from preprocess import clean_text, tokenize_text, pos_tag_tokens, punctuation_count, negation_count, type_token_ratio

# Allow up to 80 characters per cell when pandas prints a column.
pd.set_option('display.max_colwidth', 80)

## 1. Test Individual Preprocessing Functions

Let's test each preprocessing step on sample text.

In [None]:
# A deliberately messy review: HTML line breaks, a URL, upper-case emphasis,
# contractions and several negations.
sample_text = """
<br /><br />This movie was NOT good at all! I can't believe I wasted my time. 
Visit http://example.com for more terrible reviews. The acting wasn't great either...
"""

# Visual divider printed between the stages of the walkthrough.
separator = "\n" + "=" * 80 + "\n"

print("ORIGINAL TEXT:", sample_text, separator, sep="\n")

# Stage 1: cleaning (punctuation is kept).
cleaned = clean_text(sample_text, keep_punct=True)
print("CLEANED TEXT:", cleaned, separator, sep="\n")

# Stage 2: tokenization; only the first 20 tokens are shown.
tokens = tokenize_text(cleaned)
print(f"TOKENS ({len(tokens)} total):", tokens[:20], separator, sep="\n")

# Stage 3: POS tagging; the first 15 tokens are shown next to their tags.
pos_tags = pos_tag_tokens(tokens)
tagged_preview = list(zip(tokens[:15], pos_tags[:15]))
print(f"POS TAGS ({len(pos_tags)} total):", tagged_preview, separator, sep="\n")

# Stage 4: scalar features. Punctuation is counted on the raw `sample_text`
# (not on `cleaned`); the remaining features are computed from `tokens`.
feature_report = [
    "EXTRACTED FEATURES:",
    f"  Punctuation count: {punctuation_count(sample_text)}",
    f"  Negation count: {negation_count(tokens)}",
    f"  Type-Token Ratio: {type_token_ratio(tokens):.3f}",
    f"  Avg word length: {np.mean([len(token) for token in tokens]):.2f}",
]
print("\n".join(feature_report))

## 2. Run Preprocessing on Sample Dataset

Run the full preprocessing pipeline on a small subset to verify it works correctly.

In [None]:
from preprocess import preprocess_dataframe

# Raw IMDB reviews; the file has to be downloaded separately.
data_path = Path("../data/IMDB_Dataset.csv")

if not data_path.exists():
    print(f"ERROR: {data_path} not found. Please download dataset first.")
else:
    df_raw = pd.read_csv(data_path)
    print(f"Loaded {len(df_raw):,} reviews")

    # Take an independent copy of the first 200 rows so the raw frame is
    # never touched by the pipeline.
    df_sample = df_raw.iloc[:200].copy()
    print(f"Processing sample of {len(df_sample)} reviews...")

    # Same settings as a full run, with the POS-tagging limit set to the
    # sample size.
    pipeline_options = {
        'text_col': 'review',
        'label_col': 'sentiment',
        'keep_punct': True,
        'pos_sample_limit': 200,
    }
    df_processed = preprocess_dataframe(df_sample, **pipeline_options)

    print(f"\nProcessed shape: {df_processed.shape}")
    print(f"Columns: {df_processed.columns.tolist()}")

## 3. Inspect Processed Data

In [None]:
# Runs only when the pipeline cell above actually produced `df_processed`.
if 'df_processed' in dir():
    # Scalar columns only; the tokens/pos_tags columns are previewed
    # separately below.
    preview_columns = [
        'id', 'text_clean', 'text_length', 'avg_word_len',
        'punct_count', 'negation_count', 'ttr', 'label',
    ]
    print("First 3 rows (excluding tokens/pos_tags):")
    print(df_processed.loc[:, preview_columns].head(3))

    print("\n" + "=" * 80 + "\n")

    # Show the start of the token and tag sequences of the first review.
    first_review = df_processed.iloc[0]
    print("Sample tokens and POS tags for first review:")
    print(f"Tokens (first 15): {first_review['tokens'][:15]}")
    print(f"POS tags (first 15): {first_review['pos_tags'][:15]}")

## 4. Feature Distribution Analysis

In [None]:
# Runs only when the pipeline cell above actually produced `df_processed`.
if 'df_processed' in dir():
    import matplotlib.pyplot as plt
    # NOTE(review): seaborn is imported here but not referenced in this cell.
    import seaborn as sns

    feature_cols = ['text_length', 'avg_word_len', 'punct_count', 'negation_count', 'ttr']

    print("Feature Statistics:")
    print(df_processed[feature_cols].describe())

    # Derive the grid from the feature list instead of hard-coding 2x3, so
    # adding or removing a feature cannot leave a stray empty panel or index
    # past the end of `axes`. With five features this is the same 2x3 grid
    # and 15x8 figure as before.
    n_grid_cols = 3
    n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))  # ceiling division
    fig, axes = plt.subplots(
        n_grid_rows,
        n_grid_cols,
        figsize=(5 * n_grid_cols, 4 * n_grid_rows),
        squeeze=False,  # always a 2-D array, even for a single row
    )
    axes = axes.flatten()

    # One histogram per feature.
    for ax, col in zip(axes, feature_cols):
        ax.hist(df_processed[col], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
        ax.set_title(f'{col} Distribution', fontsize=11)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide every panel that did not receive a histogram.
    for unused_ax in axes[len(feature_cols):]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()

## 5. Validation Checks

Ensure data integrity after preprocessing.

In [None]:
def run_validation_checks(processed, original, label_col='sentiment'):
    """Evaluate the post-preprocessing integrity checks.

    Parameters
    ----------
    processed : pandas.DataFrame
        Output of the preprocessing pipeline.
    original : pandas.DataFrame
        The frame that was fed into the pipeline.
    label_col : str
        Name of the label column in ``original``.

    Returns
    -------
    dict
        Maps each check's description to ``True``/``False``, in display order.
        A check that cannot be evaluated because a column is missing or holds
        values of the wrong type is reported as ``False`` rather than raising.
    """
    required_columns = {
        'id', 'text_raw', 'text_clean', 'tokens', 'pos_tags', 'text_length',
        'avg_word_len', 'punct_count', 'negation_count', 'ttr', 'label',
    }

    def class_sizes(labels):
        # Compare class sizes only, so the check is independent of how the
        # pipeline names or encodes the labels.
        return sorted(labels.value_counts().tolist())

    checks = [
        ("Row count preserved",
         lambda: len(processed) == len(original)),
        ("All required columns present",
         lambda: required_columns.issubset(processed.columns)),
        ("No null values in text_clean",
         lambda: processed['text_clean'].notna().all()),
        ("No null values in tokens",
         lambda: processed['tokens'].notna().all()),
        ("Label distribution preserved",
         lambda: class_sizes(original[label_col]) == class_sizes(processed['label'])),
        ("Text length matches token count",
         lambda: (processed['text_length'] == processed['tokens'].apply(len)).all()),
        ("TTR in valid range [0,1]",
         lambda: processed['ttr'].between(0, 1).all()),
    ]

    results = {}
    for description, check in checks:
        try:
            results[description] = bool(check())
        except (KeyError, TypeError):
            # Missing column, or a value `len` cannot handle (e.g. a null
            # token list): the check cannot hold.
            results[description] = False
    return results


# Runs only when the pipeline cell above actually produced `df_processed`.
if 'df_processed' in dir():
    validation_results = run_validation_checks(df_processed, df_sample)

    print("Validation Checks:")
    for check_name, passed in validation_results.items():
        # The marker reflects the real outcome instead of always showing a tick.
        print(f"{'✓' if passed else '✗'} {check_name}: {passed}")

    # Show the label counts behind the distribution check for eyeballing.
    if 'label' in df_processed.columns:
        print("Label distribution:")
        print(f"    Original: {df_sample['sentiment'].value_counts().to_dict()}")
        print(f"    Processed: {df_processed['label'].value_counts().to_dict()}")

    failed_checks = [name for name, passed in validation_results.items() if not passed]
    if failed_checks:
        # Stop here so later cells do not save data that failed validation.
        raise AssertionError("Validation failed: " + "; ".join(failed_checks))

    print("\n✅ All validation checks passed!")

## 6. Save Sample Processed Data

Save the 200-row processed sample to `data/processed_sample.csv` so it can be committed to the repository (the full `processed.csv` is gitignored).

In [None]:
# Runs only when the pipeline cell above actually produced `df_processed`.
if 'df_processed' in dir():
    output_path = Path("../data/processed_sample.csv")

    # Build the frame to save without touching `df_processed`: the tokens and
    # pos_tags columns are serialized to JSON strings before writing the CSV.
    # NOTE(review): presumably these columns hold lists — confirm against
    # preprocess_dataframe.
    df_save = df_processed.assign(
        tokens=df_processed['tokens'].apply(json.dumps),
        pos_tags=df_processed['pos_tags'].apply(json.dumps),
    )
    df_save.to_csv(output_path, index=False, encoding='utf-8')

    print(f"✅ Saved {len(df_save)} processed samples to {output_path}")
    # Report the on-disk size in kibibytes.
    size_in_bytes = output_path.stat().st_size
    print(f"File size: {size_in_bytes / 1024:.1f} KB")

## Summary

✅ **Phase 2 Preprocessing Complete**

**What was done:**
- Tested individual preprocessing functions (cleaning, tokenization, POS tagging)
- Ran full pipeline on 200-row sample
- Validated data integrity (row count, columns, label preservation)
- Extracted metadata features: text_length, avg_word_len, punct_count, negation_count, ttr
- Saved `processed_sample.csv` for git repository

**To process the full dataset** (run from the repository root):
```bash
python src/preprocess.py --input data/IMDB_Dataset.csv --output data/processed.csv --pos-limit 5000
```

Note: `--pos-limit N` speeds up processing by POS-tagging only the first N rows. Omit the flag to tag every row.