# LLM Training Data Pipeline - Data Exploration

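This notebook explores the output of the pipeline: document length distributions, token statistics, a few sample documents, and the recorded pipeline metrics. It reads from `../data/output/`, so run the pipeline first.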

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Pipeline imports
from src.ingestion import WikipediaParser
from src.processing import TextCleaner, MinHashDeduplicator, QualityFilter

## 1. Load Processed Data

In [None]:
# Load processed data if available.
# `df` is set to None when the file is missing so that the later cells can
# skip themselves with `if df is not None`.
try:
    df = pd.read_parquet('../data/output/processed_data.parquet')
except FileNotFoundError:
    print("No processed data found. Run the pipeline first!")
    df = None
else:
    print(f"Loaded {len(df)} documents")
    # Only the last top-level expression of a cell is rendered automatically,
    # so a bare `df.head()` nested in a block shows nothing. `display` is
    # provided by the IPython kernel without an import.
    display(df.head())

## 2. Document Length Distribution

In [None]:
if df is not None:
    # Per-document size measures. These columns are added to `df` itself
    # because the later cells read `text_length` and `word_count`.
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Character length (missing texts yield NaN lengths, which a histogram
    # cannot bin, so they are left out of the plot)
    axes[0].hist(df['text_length'].dropna(), bins=50, edgecolor='black')
    axes[0].set_xlabel('Character Count')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Document Length Distribution (Characters)')

    # Word count
    axes[1].hist(df['word_count'].dropna(), bins=50, edgecolor='black', color='orange')
    axes[1].set_xlabel('Word Count')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Document Length Distribution (Words)')

    plt.tight_layout()
    plt.show()

    # `token_count` is treated as optional elsewhere in this notebook, so only
    # summarise the columns that are actually present instead of raising
    # KeyError when it is absent.
    stat_columns = [
        col for col in ('text_length', 'word_count', 'token_count')
        if col in df.columns
    ]
    print("\nStatistics:")
    print(df[stat_columns].describe())

## 3. Token Analysis

In [None]:
if df is not None and 'token_count' in df.columns:
    # Compression ratio = characters per token.
    # Rows whose token count is zero (or negative) would produce an infinite
    # ratio, which makes the histogram raise and the mean meaningless, so the
    # denominator is masked to NaN for those rows.
    token_counts = df['token_count']
    df['compression_ratio'] = df['text_length'] / token_counts.where(token_counts > 0)

    valid_ratios = df['compression_ratio'].dropna()

    if valid_ratios.empty:
        print("No documents with a positive token count; skipping compression ratio plot.")
    else:
        mean_ratio = valid_ratios.mean()

        plt.figure(figsize=(10, 5))
        plt.hist(valid_ratios, bins=50, edgecolor='black', color='green')
        plt.xlabel('Characters per Token')
        plt.ylabel('Frequency')
        plt.title('Compression Ratio Distribution')
        plt.axvline(mean_ratio, color='red', linestyle='--', label=f'Mean: {mean_ratio:.2f}')
        plt.legend()
        plt.show()

## 4. Sample Documents

In [None]:
if df is not None:
    # Show up to three randomly chosen documents. DataFrame.sample raises
    # ValueError when asked for more rows than exist, so cap the request.
    sample_size = min(3, len(df))
    # `token_count` is optional (the token analysis cell checks for it too).
    has_token_count = 'token_count' in df.columns

    for idx, row in df.sample(sample_size).iterrows():
        text = row['text']

        print("="*60)
        print(f"Title: {row['title']}")
        if has_token_count:
            print(f"Words: {row['word_count']}, Tokens: {row['token_count']}")
        else:
            print(f"Words: {row['word_count']}")
        print("-"*60)
        # Preview only the first 500 characters of long documents.
        if len(text) > 500:
            print(text[:500] + "...")
        else:
            print(text)
        print()

## 5. Pipeline Metrics

In [None]:
import json

# Load and pretty-print ../data/output/pipeline_metrics.json.
# `metrics` is set to None when the file is missing or unreadable, mirroring
# how the data-loading cell falls back to `df = None`.
try:
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open('../data/output/pipeline_metrics.json', encoding='utf-8') as f:
        metrics = json.load(f)
except FileNotFoundError:
    metrics = None
    print("No metrics file found.")
except json.JSONDecodeError as exc:
    # A truncated or hand-edited file should not abort the rest of the notebook.
    metrics = None
    print(f"Metrics file is not valid JSON: {exc}")
else:
    print("Pipeline Metrics:")
    print(json.dumps(metrics, indent=2))