# 3. Text Preprocessing

This notebook handles advanced text cleaning and preprocessing to prepare tweets for the DistilBERT model.

## Preprocessing Steps:
1. URL and email removal
2. User mention cleaning
3. Hashtag processing (#word → word)
4. Special character and number handling
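5. Text normalization and cleaning (lowercasing, retweet-marker removal, punctuation and stop-word removal, repeated-character collapsing, whitespace cleanup)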
6. Optional spell correction

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import string
import json
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

# Load configuration
# JSON is UTF-8 by specification; opening the file with an explicit encoding
# avoids decode errors on platforms whose default text encoding is not UTF-8.
# NOTE(review): this path is relative to the parent directory, while the data
# files in later cells are read from 'Data/' relative to the working
# directory — confirm both resolve from the same working directory.
with open('../config/hyperparameters.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Load preprocessing config (the boolean switches consumed by the cleaning functions)
preprocess_config = config['feature_config']['text_cleaning']

print("✅ Libraries imported successfully!")
print(f"Preprocessing config: {preprocess_config}")

## 3.1 Load Data with Meta-Features

In [None]:
# Read the train/test tables written by the previous notebook
df_train = pd.read_csv('Data/train_with_meta.csv')
df_test = pd.read_csv('Data/test_with_meta.csv')

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")

# Meta-features are identified purely by the '_meta' column-name suffix
meta_cols = list(filter(lambda name: name.endswith('_meta'), df_train.columns))
print(f"\nMeta-features found: {len(meta_cols)}")
print(f"First few meta-features: {meta_cols[:5]}")

## 3.2 Text Preprocessing Functions

In [None]:
# Initialize stop words
# The NLTK stop-word corpus is a separate download. On an environment where it
# has not been fetched yet the lookup raises LookupError, so download it once
# and retry instead of failing the whole notebook run.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

def clean_text(text, config):
    """
    Clean tweet text based on configuration.

    Steps, in order: optional lowercasing, optional URL / e-mail / @mention
    removal, hashtag unwrapping (#word → word), optional removal of standalone
    numbers (the exact token 911 is kept), removal of the retweet marker
    "RT", and whitespace normalisation.

    Args:
        text: Raw tweet text. Missing values (None / NaN) yield "".
        config: Mapping with the boolean keys 'lowercase', 'remove_urls',
            'remove_emails', 'remove_mentions' and 'remove_numbers'.

    Returns:
        The cleaned string, with runs of whitespace collapsed to single
        spaces and no leading/trailing whitespace.

    Raises:
        KeyError: If one of the expected keys is missing from ``config``.
    """
    if pd.isna(text):
        return ""

    # Convert to string and lowercase
    text = str(text)
    if config['lowercase']:
        text = text.lower()

    # Remove URLs
    if config['remove_urls']:
        text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)

    # Remove email addresses (must run before mention removal, otherwise
    # the "@domain" part would be stripped and leave a fragment behind)
    if config['remove_emails']:
        text = re.sub(r'\S+@\S+', '', text)

    # Remove user mentions (@username)
    if config['remove_mentions']:
        text = re.sub(r'@\w+', '', text)

    # Convert hashtags to regular words (#word → word)
    text = re.sub(r'#(\w+)', r'\1', text)

    # Remove numbers (except emergency codes like 911).
    # The negative look-ahead rejects only the exact token "911"; a trailing
    # look-behind would also spare any number that merely ends in 911
    # (e.g. 1911 or 20911).
    if config['remove_numbers']:
        text = re.sub(r'\b(?!911\b)\d+\b', '', text)

    # Remove 'RT' indicators; case-insensitive so the marker is also removed
    # when lowercasing is disabled in the config
    text = re.sub(r'\brt\b', '', text, flags=re.IGNORECASE)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def advanced_clean_text(text, config):
    """
    Advanced text cleaning with punctuation and stop word removal.

    Runs ``clean_text`` first, then strips every character that is not a word
    character, whitespace, '!' or '?', optionally drops stop words and
    single-character tokens, collapses runs of three or more identical
    non-digit characters to one, and normalises whitespace.

    Args:
        text: Raw tweet text (missing values are handled by ``clean_text``).
        config: Mapping with the keys required by ``clean_text`` plus the
            boolean key 'remove_stopwords'.

    Returns:
        The cleaned string; "" when basic cleaning leaves nothing.

    Raises:
        KeyError: If an expected key is missing from ``config``.
    """
    # Basic cleaning
    text = clean_text(text, config)

    if not text:
        return text

    # Remove punctuation (keep basic sentence structure)
    text = re.sub(r'[^\w\s!?]', '', text)

    # Remove stop words (exact, case-sensitive membership in stop_words) and
    # single-character tokens.
    # NOTE(review): apostrophes were stripped above, so contractions such as
    # "don't" reach this point as "dont" — confirm whether stop_words is
    # expected to match those forms.
    if config['remove_stopwords']:
        words = [w for w in text.split() if w not in stop_words and len(w) > 1]
        text = ' '.join(words)

    # Remove character repetition (sooooo → so). Digits are excluded so that
    # numbers kept by the config are not corrupted (1000 must not become 10);
    # whitespace runs are handled by the final cleanup below.
    text = re.sub(r'([^\d\s])\1{2,}', r'\1', text)

    # Final cleanup
    text = re.sub(r'\s+', ' ', text).strip()

    return text

print("✅ Text preprocessing functions defined!")

## 3.3 Spell Correction Setup

In [None]:
# Optional spell-correction setup: spell_corrector stays None unless the
# feature is enabled in the config, symspellpy imports, and the frequency
# dictionary file is present in the working directory.
spell_corrector = None

if not preprocess_config['spell_correction']:
    print("ℹ️  Spell correction disabled in config")
else:
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity

        # Build the SymSpell index
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

        # The dictionary is looked up relative to the working directory
        dictionary_path = "frequency_dictionary_en_82_765.txt"
        import os

        if not os.path.exists(dictionary_path):
            print("⚠️  Spell correction dictionary not found, continuing without it")
        else:
            sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
            spell_corrector = sym_spell
            print("✅ Spell correction enabled")

    except ImportError:
        print("⚠️  SymSpell not available, continuing without spell correction")
    except Exception as e:
        # Best effort: any other setup failure just leaves the feature off
        print(f"⚠️  Spell correction initialization failed: {e}")

def spell_correct(text, corrector):
    """
    Spell-correct ``text`` token by token when a corrector is available.

    Args:
        text: Whitespace-separated text to correct.
        corrector: Object exposing ``lookup(word, verbosity, max_edit_distance=...)``,
            or None to skip correction.

    Returns:
        The tokens joined by single spaces, each replaced by the first
        suggestion when one exists. ``text`` is returned unchanged when
        ``corrector`` is None or when any error occurs (the error is printed).
    """
    if corrector is None:
        return text

    try:
        fixed_tokens = []
        for token in text.split():
            candidates = corrector.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
            # Keep the token as-is when there is no suggestion
            fixed_tokens.append(candidates[0].term if candidates else token)
        return ' '.join(fixed_tokens)
    except Exception as e:
        print(f"Spell correction error: {e}")
        return text

## 3.4 Apply Text Preprocessing

In [None]:
# Build the 'text_clean' column for both splits from the raw 'text' column
print("Applying text preprocessing to training data...")
df_train['text_clean'] = df_train['text'].apply(
    advanced_clean_text, args=(preprocess_config,)
)

print("Applying text preprocessing to test data...")
df_test['text_clean'] = df_test['text'].apply(
    advanced_clean_text, args=(preprocess_config,)
)

# Spell correction runs on the already-cleaned text, and only when the
# corrector was successfully initialised
if spell_corrector is not None:
    print("\nApplying spell correction...")
    df_train['text_clean'] = df_train['text_clean'].apply(
        spell_correct, args=(spell_corrector,)
    )
    df_test['text_clean'] = df_test['text_clean'].apply(
        spell_correct, args=(spell_corrector,)
    )

print(f"\nText preprocessing completed!")
print(f"Train shape after preprocessing: {df_train.shape}")
print(f"Test shape after preprocessing: {df_test.shape}")

## 3.5 Preprocessing Results Analysis

In [None]:
# Summarise how much the cleaning shortened the text
print("Preprocessing analysis:")
print("="*50)

# Per-row character counts before/after cleaning, plus their difference
for frame in (df_train, df_test):
    frame['original_length'] = frame['text'].str.len()
    frame['cleaned_length'] = frame['text_clean'].str.len()
    frame['length_reduction'] = frame['original_length'] - frame['cleaned_length']

print("Training data:")
print(f"  Average original length: {df_train['original_length'].mean():.1f}")
print(f"  Average cleaned length: {df_train['cleaned_length'].mean():.1f}")
print(f"  Average reduction: {df_train['length_reduction'].mean():.1f} characters")

print("\nTest data:")
print(f"  Average original length: {df_test['original_length'].mean():.1f}")
print(f"  Average cleaned length: {df_test['cleaned_length'].mean():.1f}")
print(f"  Average reduction: {df_test['length_reduction'].mean():.1f} characters")

# Rows whose cleaned text is empty or whitespace-only
empty_train = df_train['text_clean'].str.strip().eq('').sum()
empty_test = df_test['text_clean'].str.strip().eq('').sum()

print(f"\nEmpty texts after cleaning:")
print(f"  Training: {empty_train} ({empty_train/len(df_train)*100:.2f}%)")
print(f"  Test: {empty_test} ({empty_test/len(df_test)*100:.2f}%)")

## 3.6 Sample Text Comparison

In [None]:
# Show the first three tweets of each class, raw vs. cleaned
print("Sample texts before and after cleaning:")
print("="*80)

for target in [0, 1]:
    if target == 0:
        label = 'Non-Disaster'
    else:
        label = 'Disaster'
    print(f"\n{label} Tweets:")
    print("-"*40)

    samples = df_train.loc[df_train['target'] == target].head(3)
    for _, row in samples.iterrows():
        print(f"\nOriginal:  {row['text']}")
        print(f"Cleaned:   {row['text_clean']}")
        print(f"Reduction: {row['length_reduction']} characters")

## 3.7 Handle Empty Texts

In [None]:
# Handle empty texts after cleaning
def handle_empty_text(df):
    """
    Refill rows whose cleaned text ended up empty.

    Rows where 'text_clean' is empty or whitespace-only get the original
    'text' run through the basic ``clean_text`` pass instead (using the
    notebook-level ``preprocess_config``). The frame is modified in place
    and also returned.
    """
    needs_fallback = df['text_clean'].str.strip() == ''

    # Nothing to repair: hand the frame back untouched
    if not needs_fallback.any():
        return df

    print(f"Found {needs_fallback.sum()} empty texts, using original text as fallback")

    # Basic cleaning only, so punctuation and stop words are kept this time
    df.loc[needs_fallback, 'text_clean'] = df.loc[needs_fallback, 'text'].apply(
        lambda raw: clean_text(raw, preprocess_config)
    )

    return df

print("Handling empty texts...")
df_train = handle_empty_text(df_train)
df_test = handle_empty_text(df_test)

# Re-count: rows can still be empty if even the basic cleaning leaves nothing
final_empty_train = df_train['text_clean'].str.strip().eq('').sum()
final_empty_test = df_test['text_clean'].str.strip().eq('').sum()

print(f"Final empty texts - Train: {final_empty_train}, Test: {final_empty_test}")

## 3.8 Text Length Distribution After Preprocessing

In [None]:
# 2x2 overview of text lengths after preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-v0_8')

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_train, ax_test), (ax_reduction, ax_box) = axes

# Top-left: cleaned length per class (training data)
# NOTE(review): the legend labels below are assigned positionally to whatever
# handles matplotlib collects — confirm the order matches the hue levels.
sns.histplot(data=df_train, x='cleaned_length', hue='target', kde=True, ax=ax_train)
ax_train.set_title('Training Data - Cleaned Text Length Distribution')
ax_train.set_xlabel('Character Count')
ax_train.legend(['Disaster', 'Non-Disaster'])

# Top-right: cleaned length (test data, no labels available to split on)
sns.histplot(data=df_test, x='cleaned_length', kde=True, ax=ax_test)
ax_test.set_title('Test Data - Cleaned Text Length Distribution')
ax_test.set_xlabel('Character Count')

# Bottom-left: characters removed per class (training data)
sns.histplot(data=df_train, x='length_reduction', hue='target', kde=True, ax=ax_reduction)
ax_reduction.set_title('Character Reduction Distribution')
ax_reduction.set_xlabel('Characters Removed')
ax_reduction.legend(['Disaster', 'Non-Disaster'])

# Bottom-right: original vs. cleaned length, reshaped to long form for the box plot
length_data = df_train[['original_length', 'cleaned_length', 'target']].rename(
    columns={
        'original_length': 'Original',
        'cleaned_length': 'Cleaned',
        'target': 'Target',
    }
)
length_melted = length_data.melt(id_vars=['Target'], var_name='Type', value_name='Length')

sns.boxplot(data=length_melted, x='Type', y='Length', hue='Target', ax=ax_box)
ax_box.set_title('Text Length Comparison')
ax_box.set_xlabel('Text Type')
ax_box.set_ylabel('Character Count')

plt.tight_layout()
plt.show()

## 3.9 Prepare Final Datasets

In [None]:
# Columns carried forward to model training
essential_cols = ['id', 'text', 'text_clean', 'target', *meta_cols]

# The test split only has a 'target' column in some setups; keep it last if present
test_essential_cols = ['id', 'text', 'text_clean', *meta_cols]
if 'target' in df_test.columns:
    test_essential_cols = test_essential_cols + ['target']

# Analysis-only helper columns must not leak into the saved datasets
temp_cols = ['original_length', 'cleaned_length', 'length_reduction']

# Create final datasets (independent copies, helper columns dropped if selected)
train_final = df_train[essential_cols].copy().drop(columns=temp_cols, errors='ignore')
test_final = df_test[test_essential_cols].copy().drop(columns=temp_cols, errors='ignore')

print(f"Final training dataset shape: {train_final.shape}")
print(f"Final test dataset shape: {test_final.shape}")

print("\nFinal columns:")
print(f"Train: {list(train_final.columns)}")
print(f"Test:  {list(test_final.columns)}")

## 3.10 Save Processed Data

In [None]:
# Save cleaned datasets
train_final.to_csv('Data/train_cleaned.csv', index=False)
test_final.to_csv('Data/test_cleaned.csv', index=False)

# Recompute the length statistics from the frame that is actually saved.
# The helper length columns on df_train were computed before the empty-text
# fallback rewrote some 'text_clean' values, so they can be stale here.
final_original_length = train_final['text'].str.len()
final_cleaned_length = train_final['text_clean'].str.len()

# Save preprocessing statistics (plain Python floats so the JSON encoder
# never depends on the numeric type pandas returns)
preprocessing_stats = {
    'train_samples': len(train_final),
    'test_samples': len(test_final),
    'avg_original_length': float(final_original_length.mean()),
    'avg_cleaned_length': float(final_cleaned_length.mean()),
    'avg_reduction': float((final_original_length - final_cleaned_length).mean()),
    'spell_correction_enabled': spell_corrector is not None,
    'preprocessing_config': preprocess_config
}

import json
import os

# The metrics directory may not exist on a fresh checkout; create it so the
# final write does not fail after all the processing has already been done.
stats_path = 'results/metrics/preprocessing_stats.json'
os.makedirs(os.path.dirname(stats_path), exist_ok=True)
with open(stats_path, 'w') as f:
    json.dump(preprocessing_stats, f, indent=2)

print("✅ Text preprocessing completed!")
print("\nFiles saved:")
print("- Data/train_cleaned.csv")
print("- Data/test_cleaned.csv")
print("- results/metrics/preprocessing_stats.json")

# Display final sample
print("\nFinal sample of cleaned training data:")
display(train_final[['text', 'text_clean'] + meta_cols[:3]].head(3))

print("\n" + "="*60)
print("🎉 Text preprocessing completed successfully!")
print("Next: Model Training (04_model_training.ipynb)")
print("="*60)