# Preprocessing demo
This notebook demonstrates the cleaning and tokenization steps used in the demo project. It imports the local library in `src/`, so start the kernel from the project root.

In [None]:
# Generate a small dataset (if missing), load modules, and read the data.
# Imports are grouped stdlib -> third-party -> local.
from pathlib import Path

import pandas as pd

from generate_data import generate
from src.preprocess import clean_text, tokenize_and_lemmatize

DATA = Path('data') / 'synthetic_texts.csv'
if not DATA.exists():
    # NOTE(review): assumes generate() writes its rows to DATA — confirm in generate_data
    generate(200)
print('Data available at', DATA)

# Later cells index into `df`; load it here so the notebook survives
# Restart Kernel -> Run All instead of failing with NameError.
df = pd.read_csv(DATA)

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Preprocessing Demo\n",
    "\n",
    "This notebook demonstrates comprehensive text preprocessing techniques including cleaning, tokenization, lemmatization, and feature engineering."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup and Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from pathlib import Path\n",
    "\n",
    "from src.preprocess import clean_text, tokenize_and_lemmatize, preprocess_df\n",
    "from generate_data import generate\n",
    "\n",
    "# Generate data if needed\n",
    "DATA = Path('data') / 'synthetic_texts.csv'\n",
    "if not DATA.exists():\n",
    "    generate(200)\n",
    "\n",
    "print('Data available at', DATA)\n",
    "df = pd.read_csv(DATA)\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "print(\"Label distribution:\")\n",
    "print(df['label'].value_counts())\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text Cleaning Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test cleaning on various types of text\n",
    "samples = [\n",
    "    '<p>Hello!!! Visit https://example.com for more 😊</p>',\n",
    "    'Contact us at info@example.com or call +1 (555) 123-4567',\n",
    "    'Repeated words words words words',\n",
    "    'SPECIAL characters: !@#$%^&*()_+',\n",
    "    'Mixed CASE Text with Numbers 123',\n",
    "    'HTML tags: <div>content</div> and <script>alert()</script>',\n",
    "    'Emoji test: 😊 👍 🚀',\n",
    "    'URLs: http://example.com and https://secure.site/path?query=value'\n",
    "]\n",
    "\n",
    "print(\"Text Cleaning Demonstration:\")\n",
    "print(\"=\" * 50)\n",
    "for i, s in enumerate(samples, 1):\n",
    "    print(f\"\\nExample {i}:\")\n",
    "    print(f\"RAW   : {s}\")\n",
    "    print(f\"CLEAN : {clean_text(s)}\")\n",
    "    print(\"-\" * 30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenization and Lemmatization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show detailed tokenization process\n",
    "print(\"Tokenization and Lemmatization Process:\")\n",
    "print(\"=\" * 60)\n",
    "\n",
    "# Select sample texts from different domains\n",
    "sample_rows = df.sample(5, random_state=42)\n",
    "\n",
    "for idx, row in sample_rows.iterrows():\n",
    "    print(f\"\\nSample {idx} (Domain: {row['domain']}, Label: {row['label']}):\")\n",
    "    print(f\"Original: {row['text']}\")\n",
    "    \n",
    "    cleaned = clean_text(row['text'])\n",
    "    print(f\"Cleaned : {cleaned}\")\n",
    "    \n",
    "    tokens = tokenize_and_lemmatize(cleaned)\n",
    "    print(f\"Tokens  : {tokens}\")\n",
    "    print(f\"Token count: {len(tokens)}\")\n",
    "    print(\"-\" * 50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Batch Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply preprocessing to entire dataset\n",
    "print(\"Batch Preprocessing Results:\")\n",
    "print(\"=\" * 40)\n",
    "\n",
    "preprocessed_df = preprocess_df(df)\n",
    "print(f\"Preprocessed dataset shape: {preprocessed_df.shape}\")\n",
    "print(\"\\nFirst 5 rows of preprocessed data:\")\n",
    "preprocessed_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze preprocessing results\n",
    "print(\"Preprocessing Statistics:\")\n",
    "print(\"=\" * 30)\n",
    "\n",
    "# Calculate text length statistics\n",
    "preprocessed_df['original_length'] = df['text'].str.split().str.len()\n",
    "preprocessed_df['processed_length'] = preprocessed_df['tokens'].str.len()\n",
    "\n",
    "print(f\"Average original text length: {preprocessed_df['original_length'].mean():.1f} words\")\n",
    "print(f\"Average processed text length: {preprocessed_df['processed_length'].mean():.1f} tokens\")\n",
    "print(f\"Reduction ratio: {(1 - preprocessed_df['processed_length'].mean() / preprocessed_df['original_length'].mean()) * 100:.1f}%\")\n",
    "\n",
    "# Show length distribution\n",
    "plt.figure(figsize=(12, 5))\n",
    "\n",
    "plt.subplot(1, 2, 1)\n",
    "plt.hist(preprocessed_df['original_length'], bins=20, alpha=0.7, label='Original')\n",
    "plt.hist(preprocessed_df['processed_length'], bins=20, alpha=0.7, label='Processed')\n",
    "plt.xlabel('Text Length (words/tokens)')\n",
    "plt.ylabel('Frequency')\n",
    "plt.title('Text Length Distribution')\n",
    "plt.legend()\n",
    "\n",
    "plt.subplot(1, 2, 2)\n",
    "reduction_ratio = (preprocessed_df['original_length'] - preprocessed_df['processed_length']) / preprocessed_df['original_length']\n",
    "plt.hist(reduction_ratio, bins=20, alpha=0.7)\n",
    "plt.xlabel('Reduction Ratio')\n",
    "plt.ylabel('Frequency')\n",
    "plt.title('Text Reduction Ratio Distribution')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vocabulary Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze vocabulary before and after preprocessing\n",
    "print(\"Vocabulary Analysis:\")\n",
    "print(\"=\" * 25)\n",
    "\n",
    "# Original vocabulary\n",
    "original_words = []\n",
    "for text in df['text']:\n",
    "    words = text.lower().split()\n",
    "    original_words.extend(words)\n",
    "\n",
    "# Processed vocabulary\n",
    "processed_words = []\n",
    "for tokens in preprocessed_df['tokens']:\n",
    "    processed_words.extend(tokens)\n",
    "\n",
    "print(f\"Original vocabulary size: {len(set(original_words))}\")\n",
    "print(f\"Processed vocabulary size: {len(set(processed_words))}\")\n",
    "print(f\"Vocabulary reduction: {(1 - len(set(processed_words)) / len(set(original_words))) * 100:.1f}%\")\n",
    "\n",
    "# Show most common words\n",
    "from collections import Counter\n",
    "\n",
    "original_counter = Counter(original_words)\n",
    "processed_counter = Counter(processed_words)\n",
    "\n",
    "print(\"\\nTop 10 original words:\")\n",
    "for word, count in original_counter.most_common(10):\n",
    "    print(f\"  {word}: {count}\")\n",
    "\n",
    "print(\"\\nTop 10 processed words:\")\n",
    "for word, count in processed_counter.most_common(10):\n",
    "    print(f\"  {word}: {count}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Domain-Specific Preprocessing Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze preprocessing effects by domain\n",
    "print(\"Domain-Specific Preprocessing Analysis:\")\n",
    "print(\"=\" * 45)\n",
    "\n",
    "domain_stats = preprocessed_df.groupby('domain').agg({\n",
    "    'original_length': 'mean',\n",
    "    'processed_length': 'mean',\n",
    "    'tokens': 'count'\n",
    "}).round(1)\n",
    "\n",
    "domain_stats['reduction_ratio'] = ((domain_stats['original_length'] - domain_stats['processed_length']) / \n",
    "                                 domain_stats['original_length'] * 100).round(1)\n",
    "\n",
    "print(\"\\nPreprocessing statistics by domain:\")\n",
    "print(domain_stats)\n",
    "\n",
    "# Visualize domain differences\n",
    "plt.figure(figsize=(10, 6))\n",
    "domains = domain_stats.index\n",
    "x = np.arange(len(domains))\n",
    "width = 0.35\n",
    "\n",
    "plt.bar(x - width/2, domain_stats['original_length'], width, label='Original')\n",
    "plt.bar(x + width/2, domain_stats['processed_length'], width, label='Processed')\n",
    "plt.xlabel('Domain')\n",
    "plt.ylabel('Average Text Length')\n",
    "plt.title('Text Length by Domain (Before/After Preprocessing)')\n",
    "plt.xticks(x, domains)\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom Preprocessing Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Demonstrate custom preprocessing options\n",
    "print(\"Custom Preprocessing Options:\")\n",
    "print(\"=\" * 35)\n",
    "\n",
    "# Example of custom cleaning function\n",
    "def custom_clean_text(text, remove_numbers=True, remove_special_chars=True):\n",
    "    \"\"\"Run clean_text on `text`, then optionally strip digits and punctuation.\n",
    "\n",
    "    Args:\n",
    "        text: Raw input, passed through clean_text first.\n",
    "        remove_numbers: If True, delete every run of digits.\n",
    "        remove_special_chars: If True, delete every character that is\n",
    "            neither a word character nor whitespace.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned text with leading/trailing whitespace stripped.\n",
    "    \"\"\"\n",
    "    # Import unconditionally: when this import lived inside the\n",
    "    # remove_numbers branch, calling with remove_numbers=False left `re`\n",
    "    # unbound and the remove_special_chars branch raised UnboundLocalError.\n",
    "    import re\n",
    "\n",
    "    cleaned = clean_text(text)\n",
    "    \n",
    "    if remove_numbers:\n",
    "        # Remove numbers\n",
    "        cleaned = re.sub(r'\\d+', '', cleaned)\n",
    "    \n",
    "    if remove_special_chars:\n",
    "        # Remove special characters\n",
    "        cleaned = re.sub(r'[^\\w\\s]', '', cleaned)\n",
    "    \n",
    "    return cleaned.strip()\n",
    "\n",
    "# Test custom preprocessing\n",
    "test_texts = [\n",
    "    \"Product ID: ABC123, Price: $99.99, Rating: 4.5/5\",\n",
    "    \"Special! Buy now & save 50%!!!\",\n",
    "    \"Contact: john.doe@example.com, Phone: (555) 123-4567\"\n",
    "]\n",
    "\n",
    "for text in test_texts:\n",
    "    print(f\"\\nOriginal: {text}\")\n",
    "    print(f\"Standard: {clean_text(text)}\")\n",
    "    print(f\"Custom  : {custom_clean_text(text, remove_numbers=True, remove_special_chars=True)}\")\n",
    "    print(\"-\" * 50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing Pipeline Summary\n",
    "\n",
    "This notebook demonstrates:\n",
    "1. Text cleaning (HTML removal, URL/email removal, special character handling)\n",
    "2. Tokenization and lemmatization\n",
    "3. Batch processing of datasets\n",
    "4. Statistical analysis of preprocessing effects\n",
    "5. Vocabulary reduction analysis\n",
    "6. Domain-specific preprocessing characteristics\n",
    "7. Custom preprocessing options\n",
    "\n",
    "Key insights:\n",
    "- Preprocessing significantly reduces text length and vocabulary size\n",
    "- Different domains may require different preprocessing strategies\n",
    "- Custom preprocessing can be tailored to specific use cases\n",
    "- Proper preprocessing improves downstream NLP task performance"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.13.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


## Cleaning examples
Try a few example strings and compare before/after cleaning.

In [None]:
# Deliberately messy inputs: HTML + URL + emoji, contact details, repeated words.
samples = [
    '<p>Hello!!! Visit https://example.com for more 😊</p>',
    'Contact us at info@example.com or call +1 (555) 123-4567',
    'Repeated words words words words',
]
for raw_text in samples:
    print('RAW :', raw_text)
    # end='\n\n' supplies the blank separator line after each before/after pair
    print('CLEAN:', clean_text(raw_text), end='\n\n')

## Tokenization and lemmatization
Show the cleaned text and the lemmatized tokens for one row (index label 0) of the generated dataset.

In [None]:
# Trace the row labelled 0 through the pipeline: raw text -> cleaned text -> tokens.
# NOTE(review): `df` must already be defined by an earlier cell — confirm it is loaded.
row = df.loc[0, 'text']
print('Original:', row)

cleaned = clean_text(row)
print('Cleaned:', cleaned)

tokens = tokenize_and_lemmatize(cleaned)
print('Tokens:', tokens)