# Experiments: TF‑IDF hyperparameter sweeps and diagnostics

This notebook contains runnable experiments for sampling documents, sweeping TF‑IDF hyperparameters, and producing similarity diagnostics.

In [None]:
# Cell 1: Setup - Import Libraries and Load Sample Documents
import sys
from pathlib import Path
sys.path.insert(0, str(Path('..') / 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)

# Load sample documents from data/internal/
# MAX_DOCS caps the sample size; raise it to run the experiments on more files.
MAX_DOCS = 10
data_dir = Path('../data/internal')
# Sorted so the same files are picked in the same order on every run.
files = sorted(glob.glob(str(data_dir / '*.txt')))[:MAX_DOCS]
docs = []
doc_names = []

for f in files:
    doc_path = Path(f)
    docs.append(doc_path.read_text(encoding='utf-8'))
    doc_names.append(doc_path.stem)

# Stop here with an actionable message: with zero documents the vectorizers in
# the following cells fail with a much less obvious error.
if not docs:
    raise FileNotFoundError(f"No .txt documents found in {data_dir.resolve()}")

print(f'✓ Loaded {len(docs)} documents')
print(f'Sample documents: {doc_names[:3]}...')

# Define preprocess_text() locally for flexibility
# Default stopword list and cleaning pattern, built once instead of on every call.
_DEFAULT_STOPWORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'is', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from'
})
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')


def preprocess_text(text, stopwords=None, min_token_len=2):
    """Basic preprocessing for experiments.

    Lowercases the text, replaces every character other than a-z, 0-9 and
    whitespace with a space, then drops stopwords and short tokens.

    Args:
        text: Raw document text.
        stopwords: Collection of tokens to remove. ``None`` (the default)
            uses the notebook's built-in English stopword list.
        min_token_len: Minimum number of characters a token needs in order
            to be kept. The default of 2 drops single-character tokens.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if stopwords is None:
        stopwords = _DEFAULT_STOPWORDS
    # Lowercase first: the pattern only whitelists lowercase ASCII letters.
    cleaned = _NON_ALNUM_RE.sub(' ', text.lower())
    # split() already collapses whitespace runs and trims both ends, so no
    # separate whitespace-normalisation pass is needed.
    tokens = [t for t in cleaned.split() if t not in stopwords and len(t) >= min_token_len]
    return ' '.join(tokens)

# Run every loaded document through preprocess_text, keeping corpus order
docs_clean = list(map(preprocess_text, docs))
print(f'✓ Preprocessed {len(docs_clean)} documents')

## Cell 2: Baseline TF-IDF Analysis

Create a baseline TfidfVectorizer with default parameters and analyze the resulting matrix properties.

In [None]:
# Cell 2: Baseline TF-IDF Analysis
# Fit a TfidfVectorizer with its default parameters on the cleaned corpus
vectorizer_baseline = TfidfVectorizer()
tfidf_matrix = vectorizer_baseline.fit_transform(docs_clean)

# Sparsity = share of (document, term) cells with no stored value.
# nnz is the number of stored entries of the sparse matrix.
sparsity = 1 - (tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1]))

print("=" * 60)
print("BASELINE TF-IDF ANALYSIS")
print("=" * 60)
print(f"Matrix shape: {tfidf_matrix.shape[0]} documents × {tfidf_matrix.shape[1]} features")
print(f"Non-zero elements: {tfidf_matrix.nnz}")
print(f"Sparsity: {sparsity:.2%}")
print(f"Matrix density: {(1 - sparsity):.2%}")

# Rank features by their TF-IDF weight summed over all documents
# (a weight total, not a raw occurrence count) and keep the 10 largest.
feature_names = np.array(vectorizer_baseline.get_feature_names_out())
tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
top_indices = tfidf_sum.argsort()[-10:][::-1]  # argsort is ascending: take the tail, then reverse
top_features = feature_names[top_indices]
top_scores = tfidf_sum[top_indices]

print("\nTop 10 Features by TF-IDF Sum:")
for i, (feat, score) in enumerate(zip(top_features, top_scores), 1):
    print(f"  {i:2d}. {feat:20s} → {score:.4f}")

print("\n" + "=" * 60)

## Cell 3: Hyperparameter Sensitivity Sweep

Test combinations of `min_df` and `max_df` to understand their impact on feature extraction.
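- **min_df**: Minimum document frequency (filters out rare terms). A float value is interpreted as a proportion of the documents in the corpus, so on a small sample (about 10 documents here) several of the small values tested below translate to the same effective cutoff.
- **max_df**: Maximum document frequency (filters out common terms). A float value is likewise a proportion of the documents in the corpus.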


In [None]:
# Cell 3: Hyperparameter Sensitivity Sweep
min_df_values = [0.01, 0.05, 0.1, 0.2]
max_df_values = [0.8, 0.9, 0.95, 1.0]
# Fixed column list so df_sweep has the expected columns even if every combination fails.
sweep_columns = ['min_df', 'max_df', 'n_features', 'nnz', 'sparsity', 'density']

sweep_results = []

print("Running hyperparameter sweep...")
print(f"Testing {len(min_df_values)} × {len(max_df_values)} = {len(min_df_values) * len(max_df_values)} combinations\n")

for min_df in min_df_values:
    for max_df in max_df_values:
        try:
            vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, norm='l2')
            matrix = vectorizer.fit_transform(docs_clean)
        except ValueError as e:
            # scikit-learn raises ValueError when the thresholds are incompatible
            # or prune every term. Any other exception is a real bug and is
            # allowed to propagate instead of being hidden in the log.
            print(f"✗ min_df={min_df:.2f}, max_df={max_df:.2f} → ERROR: {e}")
            continue

        n_features = matrix.shape[1]
        nnz = matrix.nnz
        sparsity = 1 - (nnz / (matrix.shape[0] * matrix.shape[1]))

        sweep_results.append({
            'min_df': min_df,
            'max_df': max_df,
            'n_features': n_features,
            'nnz': nnz,
            'sparsity': sparsity,
            'density': 1 - sparsity
        })
        print(f"✓ min_df={min_df:.2f}, max_df={max_df:.2f} → {n_features:4d} features, sparsity={sparsity:.2%}")

# Convert to DataFrame for analysis
df_sweep = pd.DataFrame(sweep_results, columns=sweep_columns)
print(f"\n✓ Completed {len(df_sweep)} combinations successfully\n")
if df_sweep.empty:
    # Nothing to aggregate; say so rather than failing inside groupby.
    print("⚠ No combination produced a vocabulary; summary statistics skipped")
else:
    print("Summary Statistics:")
    print(df_sweep.groupby('min_df')[['n_features', 'sparsity']].agg(['min', 'mean', 'max']).round(4))

## Cell 4: Hyperparameter Visualization

Visualize the impact of min_df and max_df on feature extraction and sparsity.

In [None]:
# Cell 4: Hyperparameter Visualization
def _plot_metric_vs_min_df(ax, y_of, marker, ylabel, title):
    """Plot one line per max_df value: min_df on x, the metric returned by `y_of` on y."""
    for max_df_value in max_df_values:
        rows = df_sweep[df_sweep['max_df'] == max_df_value]
        ax.plot(rows['min_df'], y_of(rows),
                marker=marker, linewidth=2, markersize=8, label=f'max_df={max_df_value}')
    ax.set_xlabel('min_df (Minimum Document Frequency)', fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.set_xscale('log')


fig, axes = plt.subplots(1, 2, figsize=(16, 5))
ax1, ax2 = axes

# Left panel: vocabulary size as a function of min_df
_plot_metric_vs_min_df(
    ax1,
    lambda rows: rows['n_features'],
    marker='o',
    ylabel='Number of Features',
    title='Impact of min_df on Feature Count',
)

# Right panel: sparsity (as a percentage) as a function of min_df
_plot_metric_vs_min_df(
    ax2,
    lambda rows: rows['sparsity'] * 100,
    marker='s',
    ylabel='Sparsity (%)',
    title='Impact of min_df on Sparsity',
)

plt.tight_layout()
# Write the figure to ../results, creating the directory on first use
output_path = Path('../results/hyperparameter_sweep.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"✓ Saved visualization to {output_path}")
plt.show()

## Cell 5: TF Variant Comparison

Compare all 5 TF variants for the word "compliance" across sample documents.

**TF Variants:**
1. **Raw count**: Simple term frequency
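2. **Log-normalized**: 1 + log(count), defined as 0 when count = 0
3. **Double normalized**: 0.5 + 0.5 × (count / max_count). In the information-retrieval literature this form is also called *augmented frequency*.
4. **Augmented**: count / max_count. In this notebook the label denotes plain max-normalization (no 0.5 offset), where max_count is the count of the most frequent token in the document.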
5. **Boolean**: 1 if present, 0 otherwise


In [None]:
# Cell 5: TF Variant Comparison
# Term under study and the documents it is measured in
sample_word = "compliance"
sample_docs = docs_clean[:5]

banner = "=" * 70
print(banner)
print(f"TF VARIANT COMPARISON FOR WORD: '{sample_word}'")
print(banner)


def _tf_variant_row(position, document):
    """Return the five TF weightings of `sample_word` in one document as a table row."""
    words = document.split()
    hits = words.count(sample_word)
    # Count of the document's most frequent token; 1 for an empty document
    peak = max(Counter(words).values()) if words else 1
    present = hits > 0
    return {
        'Document': f'Doc {position + 1}',
        # Variant 1: raw count
        'Raw Count': hits,
        # Variant 2: log-normalized, 0 for an absent term
        'Log-normalized': 1 + np.log(hits) if present else 0,
        # Variant 3: double normalized, floor of 0.5 for an absent term
        'Double-normalized': 0.5 + 0.5 * (hits / peak) if present else 0.5,
        # Variant 4: count relative to the most frequent token
        'Augmented': hits / peak if present else 0,
        # Variant 5: presence flag
        'Boolean': 1.0 if present else 0.0,
    }


tf_variants = [_tf_variant_row(idx, text) for idx, text in enumerate(sample_docs)]

df_tf = pd.DataFrame(tf_variants)
print("\nTF Variants Comparison Table:")
print(df_tf.to_string(index=False))

# Visualization: one group of five bars per document
fig, ax = plt.subplots(figsize=(14, 6))
x = np.arange(len(sample_docs))
width = 0.15
columns_to_plot = ['Raw Count', 'Log-normalized', 'Double-normalized', 'Augmented', 'Boolean']

for offset, column in enumerate(columns_to_plot):
    ax.bar(x + offset * width, df_tf[column], width, label=column, alpha=0.8)

ax.set_xlabel('Document', fontsize=12, fontweight='bold')
ax.set_ylabel('TF Value', fontsize=12, fontweight='bold')
ax.set_title(f'TF Variants Comparison for "{sample_word}"', fontsize=14, fontweight='bold')
# Centre each tick under the third of the five bars in its group
ax.set_xticks(x + width * 2)
ax.set_xticklabels([f'Doc {n+1}' for n in range(len(sample_docs))])
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Cell 6: IDF Variant Comparison

Compare all 4 IDF variants for the word "compliance" across the corpus.

**IDF Variants:**
1. **Standard**: log(N / df) where N = total docs, df = docs containing term
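2. **Smooth IDF**: log((N + 1) / (df + 1)) + 1 (adds 1 to both N and df so the ratio stays defined even when df = 0; the trailing + 1 keeps terms that occur in every document from being weighted to zero)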
3. **Max IDF**: log(max_df / df) (uses max document frequency)
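4. **Probabilistic**: log((N - df) / df) (negative when the term appears in more than half of the documents, and undefined when df = 0 or df = N)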


In [None]:
# Cell 6: IDF Variant Comparison
# Compares four IDF formulas for a single term over the whole cleaned corpus.
sample_word = "compliance"
N = len(docs_clean)
if N == 0:
    # Every formula below divides by N or df; stop with a clear message
    # instead of a ZeroDivisionError further down.
    raise ValueError("docs_clean is empty - run Cell 1 to load documents first")

# Document frequency of every term: each document counts once per distinct term.
doc_freq = Counter(term for doc in docs_clean for term in set(doc.split()))
df = doc_freq[sample_word]  # 0 when the term never occurs

print("=" * 70)
print(f"IDF VARIANT COMPARISON FOR WORD: '{sample_word}'")
print("=" * 70)
print(f"Total documents (N): {N}")
print(f"Documents containing '{sample_word}': {df}")
print()

# Calculate all 4 IDF variants
idf_data = {}

# Variant 1: Standard IDF (reported as 0 when df = 0, where N/df is undefined)
idf_standard = np.log(N / df) if df > 0 else 0
idf_data['Standard: log(N/df)'] = idf_standard

# Variant 2: Smooth IDF (sklearn default); df + 1 is never 0, so no guard is needed
idf_smooth = np.log((N + 1) / (df + 1)) + 1
idf_data['Smooth: log((N+1)/(df+1)) + 1'] = idf_smooth

# Variant 3: Max IDF. max_df is the largest document frequency actually observed
# for any term. Using N here would make this variant identical to the standard one.
max_df = max(doc_freq.values(), default=0)
idf_max = np.log(max_df / df) if df > 0 else 0
idf_data['Max IDF: log(max_df/df)'] = idf_max

# Variant 4: Probabilistic IDF (reported as 0 at df = 0 or df = N, where it is undefined)
idf_prob = np.log((N - df) / df) if 0 < df < N else 0
idf_data['Probabilistic: log((N-df)/df)'] = idf_prob

# Display as table
df_idf = pd.DataFrame(list(idf_data.items()), columns=['IDF Variant', 'Value'])
print("IDF Variants Comparison Table:")
print(df_idf.to_string(index=False))

print("\n" + "=" * 70)
print("INTERPRETATION:")
print("=" * 70)
print("• Standard IDF is the most common and simplest formula")
print("• Smooth IDF adds 1 to avoid log(0) and division by zero")
print("• Max IDF normalizes by the maximum observed document frequency")
print("• Probabilistic IDF emphasizes rarity (N-df) in the numerator")
print()
print(f"For '{sample_word}' appearing in {df}/{N} documents:")
print(f"  → Appears in {100*df/N:.1f}% of corpus")
print(f"  → Standard IDF: {idf_standard:.4f}")
print(f"  → Smooth IDF (sklearn): {idf_smooth:.4f}")

# Visualization
fig, ax = plt.subplots(figsize=(12, 6))
variants = list(idf_data.keys())
values = list(idf_data.values())
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
bars = ax.bar(range(len(variants)), values, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    # Probabilistic IDF can be negative: put the label below such bars so it
    # does not overlap the bar itself.
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.4f}', ha='center', va='bottom' if height >= 0 else 'top',
            fontsize=11, fontweight='bold')

ax.set_xlabel('IDF Variant', fontsize=12, fontweight='bold')
ax.set_ylabel('IDF Value', fontsize=12, fontweight='bold')
ax.set_title(f'IDF Variants for "{sample_word}" (df={df}/{N})', fontsize=14, fontweight='bold')
ax.set_xticks(range(len(variants)))
ax.set_xticklabels(variants, rotation=15, ha='right', fontsize=10)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Cell 7: Manual vs Sklearn TF-IDF Validation

Validate our manual TF-IDF implementation against sklearn's TfidfVectorizer by comparing computed values for sample words.

In [None]:
# Cell 7: Manual vs Sklearn TF-IDF Validation

# Optional import of the project's manual implementations from src.
# manual_available records whether the import worked.
manual_available = True
try:
    from manual_tfidf_math import (
        compute_term_frequency,
        compute_idf_smooth,
        compute_tfidf_l2_norm
    )
except ImportError:
    manual_available = False
    print("⚠ Manual TF-IDF functions not available, will create simplified versions")

# Create simplified manual implementations for validation
def manual_tf_idf(corpus, sample_words):
    """Compute un-normalized manual TF-IDF scores for validation.

    TF is log-normalized (1 + ln(count), 0 for an absent term) and IDF is the
    smooth form ln((N + 1) / (df + 1)) + 1, with IDF set to 0 for a word that
    occurs in no document. Documents are tokenized with ``str.split()``.

    Args:
        corpus: Sequence of whitespace-tokenizable document strings.
        sample_words: Words to score.

    Returns:
        A list with one entry per word in ``sample_words``; each entry is a
        list with that word's TF-IDF score in every document, in corpus order.
        No length normalization is applied.
    """
    n_docs = len(corpus)
    # Tokenize and count each document once; these do not depend on the word.
    doc_counts = [Counter(doc.split()) for doc in corpus]

    results = []
    for word in sample_words:
        # Document frequency and IDF depend only on the word, so compute them
        # once per word rather than once per (word, document) pair.
        doc_freq = sum(1 for counts in doc_counts if word in counts)
        idf = np.log((n_docs + 1) / (doc_freq + 1)) + 1 if doc_freq > 0 else 0

        word_results = []
        for counts in doc_counts:
            raw_count = counts[word]  # Counter yields 0 for a missing token
            tf = 1 + np.log(raw_count) if raw_count > 0 else 0
            word_results.append(tf * idf)
        results.append(word_results)

    return results

print("=" * 70)
print("VALIDATION: Manual vs Sklearn TF-IDF")
print("=" * 70)

# Select 3 sample words for validation
sample_words = ["compliance", "procedure", "policy"]
test_corpus = docs_clean[:5]
# Absolute difference below which a manual value and a sklearn value count as equal.
TOLERANCE = 1e-6

print(f"\nTest corpus: {len(test_corpus)} documents")
print(f"Sample words: {sample_words}\n")

# Get sklearn TF-IDF: sublinear TF, smooth IDF (default), L2 norm per document
vectorizer_val = TfidfVectorizer(norm='l2', sublinear_tf=True)
sklearn_matrix = vectorizer_val.fit_transform(test_corpus)

# Get feature names and indices
feature_names = vectorizer_val.get_feature_names_out()
sklearn_tfidf = {}

for word in sample_words:
    # vocabulary_ maps term -> column index; a dict lookup avoids a linear scan
    word_idx = vectorizer_val.vocabulary_.get(word)
    if word_idx is None:
        sklearn_tfidf[word] = np.zeros(len(test_corpus))
    else:
        sklearn_tfidf[word] = sklearn_matrix[:, word_idx].toarray().flatten()

# Get manual (un-normalized) TF-IDF for the sample words
manual_results = manual_tf_idf(test_corpus, sample_words)

# sklearn's norm='l2' scales each DOCUMENT vector to unit length over the whole
# vocabulary. To be comparable, the manual scores must be divided by the same
# per-document norm, not by a norm taken across documents for one word.
# Assumes whitespace tokens of the cleaned text equal the vectorizer's tokens
# (holds for preprocess_text output: lowercase alphanumerics of length >= 2).
n_docs = len(test_corpus)
doc_term_counts = [Counter(doc.split()) for doc in test_corpus]
doc_freq = Counter(term for counts in doc_term_counts for term in counts)
doc_norms = np.array([
    np.sqrt(sum(
        ((1 + np.log(count)) * (np.log((n_docs + 1) / (doc_freq[term] + 1)) + 1)) ** 2
        for term, count in counts.items()
    ))
    for counts in doc_term_counts
], dtype=float)

# Normalize manual results (L2 norm per document)
manual_tfidf_norm = {}
for word_idx, word in enumerate(sample_words):
    manual_vals = np.asarray(manual_results[word_idx], dtype=float)
    # An empty document has norm 0; leave its scores at 0 instead of dividing by zero
    manual_tfidf_norm[word] = np.divide(
        manual_vals, doc_norms, out=np.zeros_like(manual_vals), where=doc_norms > 0
    )

# Compare results
print("COMPARISON TABLE:")
print("-" * 70)

comparison_data = []
matches = 0
total_comparisons = 0
nonzero_comparisons = 0

for word in sample_words:
    print(f"\nWord: '{word}'")
    print(f"  Document | Manual TF-IDF | Sklearn TF-IDF | Diff      | Match")
    print(f"  " + "-" * 65)

    for doc_idx in range(len(test_corpus)):
        manual_val = manual_tfidf_norm[word][doc_idx]
        sklearn_val = sklearn_tfidf[word][doc_idx]
        diff = abs(manual_val - sklearn_val)
        match = diff < TOLERANCE
        match_str = "✓" if match else "✗"

        print(f"  {doc_idx + 1:8d} | {manual_val:13.6f} | {sklearn_val:14.6f} | {diff:.6f} | {match_str}")

        comparison_data.append({
            'word': word,
            'document': doc_idx + 1,
            'manual': manual_val,
            'sklearn': sklearn_val,
            'diff': diff,
            'match': bool(match),
        })
        if match:
            matches += 1
        # 0-vs-0 pairs (term absent from the document) match trivially, so
        # count separately the comparisons that actually exercise the formula.
        if manual_val != 0 or sklearn_val != 0:
            nonzero_comparisons += 1
        total_comparisons += 1

# Calculate match percentage
match_percentage = (matches / total_comparisons * 100) if total_comparisons > 0 else 0

print("\n" + "=" * 70)
print(f"VALIDATION SUMMARY:")
print("=" * 70)
print(f"Total comparisons: {total_comparisons}")
print(f"Comparisons with a non-zero value: {nonzero_comparisons}")
print(f"Matches (|diff| < {TOLERANCE:g}): {matches}")
print(f"Match percentage: {match_percentage:.1f}%")
print()

# Verdict (reported, not asserted, so the notebook keeps running either way)
if match_percentage >= 95:
    print(f"✅ VALIDATION PASSED: {match_percentage:.1f}% match (threshold: 95%)")
else:
    print(f"⚠ VALIDATION WARNING: {match_percentage:.1f}% match (threshold: 95%)")

print("=" * 70)