# NYT 500K Articles - Embeddings Generation

This notebook:
1. Preprocesses 500K NYT articles from CSV
2. Generates BERTweet embeddings (768 dimensions)
3. Saves and downloads the embeddings, the ID-to-index mapping, and the preprocessed articles for local use

**Estimated time:** 2-3 hours with GPU

## Step 1: Install Dependencies

In [None]:
!pip install transformers torch pandas pyarrow tqdm scikit-learn

## Step 2: Import Libraries

In [None]:
import re
import unicodedata
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from google.colab import files
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

print("All libraries imported successfully!")

## Step 3: Upload Your Data

Upload the `nyt_articles_500K.csv` file (or point `input_file` at a copy on Google Drive using Option 2 in the cell below).

In [None]:
# Option 1: Upload file directly
print("Please upload your nyt_articles_500K.csv file:")
uploaded = files.upload()

# `uploaded` is keyed by file name. Fail with an actionable message instead of
# a bare IndexError when nothing was selected (e.g. the dialog was cancelled).
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select nyt_articles_500K.csv, "
        "or use Option 2 (Google Drive) below."
    )
if len(uploaded) > 1:
    print(f"⚠ {len(uploaded)} files uploaded; only the first one will be used")

input_file = next(iter(uploaded))
print(f"Uploaded: {input_file}")

# Option 2: If using Google Drive (uncomment below)
# from google.colab import drive
# drive.mount('/content/drive')
# input_file = '/content/drive/MyDrive/nyt_articles_500K.csv'

## Step 4: Load and Preprocess Data

In [None]:
# Read the raw CSV chosen in the upload step and print a short summary.
print("Loading data...")
df = pd.read_csv(input_file)
print(f"Loaded {len(df):,} articles")
print(f"Columns: {list(df.columns)}")

# The date range is informational only, so a CSV without 'pub_date' should not
# abort the cell with a KeyError.
if 'pub_date' in df.columns:
    print(f"\nDate range: {df['pub_date'].min()} to {df['pub_date'].max()}")
else:
    print("\nDate range: unavailable (no 'pub_date' column)")

In [None]:
# Typographic punctuation that has a plain-ASCII equivalent kept by clean_text.
# Without this mapping the character whitelist below deletes them outright
# (e.g. "don’t" would become "dont").
_ASCII_PUNCT = str.maketrans({
    '\u2018': "'",  # left single quotation mark
    '\u2019': "'",  # right single quotation mark / typographic apostrophe
    '\u2013': '-',  # en dash
    '\u2014': '-',  # em dash
})


def clean_text(text):
    """Clean and normalize a piece of article text.

    Steps, in order:
      1. Map curly apostrophes and en/em dashes to their ASCII equivalents.
      2. Fold accented letters to their base letter (``é`` -> ``e``) so the
         ASCII whitelist in step 5 does not drop them entirely.
      3. Remove URLs (anything starting with ``http`` or ``www``).
      4. Remove email-like tokens (non-space runs containing ``@``).
      5. Drop every character that is not an ASCII letter, digit, whitespace
         or one of ``. , ! ? ; : ' -``.
      6. Collapse whitespace runs to a single space and strip the ends.

    Whitespace is collapsed last so that gaps left behind by removed
    characters (e.g. ``"a & b"``) do not survive as double spaces.

    Args:
        text: Value to clean. Anything that is not a non-empty ``str``
            (``None``, ``NaN``, numbers, ``''``) yields ``''``.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str) or not text:
        return ''

    # Normalize typographic punctuation and strip diacritics.
    # NFD splits "é" into "e" + a combining accent; the accent is then dropped.
    text = text.translate(_ASCII_PUNCT)
    text = unicodedata.normalize('NFD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?;:\'-]', '', text)

    # Remove extra whitespace (last, so removals above cannot leave double spaces)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

print("Text cleaning function defined")

In [None]:
banner = "=" * 60
print("\n" + banner)
print("PREPROCESSING 500K ARTICLES")
print(banner)

# Handle missing values: replace NaN in each raw text column with ''
print("\n1. Handling missing values...")
for raw_col in ('headline', 'abstract', 'lead_paragraph'):
    df[raw_col] = df[raw_col].fillna('')

# Clean text fields: one progress bar per (label, source column, target column)
print("\n2. Cleaning text fields...")
cleaning_plan = [
    ("Cleaning headlines", 'headline', 'headline_cleaned'),
    ("Cleaning abstracts", 'abstract', 'abstract_cleaned'),
    ("Cleaning body", 'lead_paragraph', 'body_cleaned'),
]
for progress_label, raw_col, cleaned_col in cleaning_plan:
    # tqdm.pandas must be re-registered to change the progress-bar description
    tqdm.pandas(desc=progress_label)
    df[cleaned_col] = df[raw_col].progress_apply(clean_text)

# Create combined text field
print("\n3. Creating combined text field...")
def combine_text(row, headline_repeat=2, body_max_chars=500):
    """Combine headline, abstract, and body into one weighted text string.

    The headline is emphasised by repeating it, the abstract is included once,
    and only the leading part of the body is kept. Parts are joined with a
    single space; empty or missing parts are skipped.

    Args:
        row: Mapping-like record (e.g. a DataFrame row or a dict) that supports
            ``.get`` and may contain ``'headline_cleaned'``,
            ``'abstract_cleaned'`` and ``'body_cleaned'``.
        headline_repeat: How many times the headline is included (default 2).
        body_max_chars: Maximum number of body characters kept (default 500).

    Returns:
        The combined string; ``''`` when every field is empty or missing.
    """
    def _field(name):
        # Missing values (None / NaN / pd.NA) and falsy values count as empty.
        # Everything else is coerced to str so ' '.join never sees a non-string.
        value = row.get(name)
        if pd.isna(value) or not value:
            return ''
        return str(value)

    parts = []

    # Headline (most important, repeated)
    headline = _field('headline_cleaned')
    if headline:
        parts.extend([headline] * headline_repeat)

    # Abstract
    abstract = _field('abstract_cleaned')
    if abstract:
        parts.append(abstract)

    # Body (leading characters only)
    body = _field('body_cleaned')[:body_max_chars]
    if body:
        parts.append(body)

    return ' '.join(parts)

# Build the combined text row by row
tqdm.pandas(desc="Combining text")
df['combined_text'] = df.progress_apply(combine_text, axis=1)

# Clean combined text
print("\n4. Cleaning combined text...")
tqdm.pandas(desc="Final cleaning")
df['cleaned_text'] = df['combined_text'].progress_apply(clean_text)

# Compute statistics: whitespace-separated word count and character count
print("\n5. Computing text statistics...")
cleaned_series = df['cleaned_text']
df['word_count'] = cleaned_series.str.split().str.len()
df['char_count'] = cleaned_series.str.len()

# Filter out very short articles (fewer than 10 words) and renumber the rows
before_filter = len(df)
has_enough_words = df['word_count'] >= 10
df = df[has_enough_words].reset_index(drop=True)
after_filter = len(df)

print(f"\n6. Filtered out {before_filter - after_filter:,} articles with < 10 words")
print(f"   Remaining: {after_filter:,} articles")

summary_banner = "=" * 60
print("\n" + summary_banner)
print("PREPROCESSING COMPLETE")
print(summary_banner)
print(f"\nFinal dataset: {len(df):,} articles")
print(f"Avg word count: {df['word_count'].mean():.2f}")
print(f"Avg char count: {df['char_count'].mean():.2f}")

## Step 5: Generate Embeddings

In [None]:
# Check GPU availability
def get_device():
    """Pick the torch device to run on and report the choice.

    Returns:
        ``torch.device('cuda')`` when CUDA is available (after printing the
        name and total memory of GPU 0), otherwise ``torch.device('cpu')``.
    """
    if not torch.cuda.is_available():
        print("⚠ Using CPU (this will be slower)")
        return torch.device('cpu')

    print(f"✓ Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"  GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    return torch.device('cuda')

device = get_device()

In [None]:
def extract_embeddings_batch(texts, tokenizer, model, device, max_length=128):
    """Embed a batch of texts using the model's first-token hidden state.

    Args:
        texts: Batch of strings handed to ``tokenizer``.
        tokenizer: Callable invoked with padding, truncation, ``max_length``
            and ``return_tensors='pt'``; must return a mapping of tensors.
        model: Callable taking the tokenized tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        device: Torch device the input tensors are moved to.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        NumPy array holding, for each text, the hidden state at position 0.
    """
    # Tokenize
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    batch = tokenizer(texts, **tokenizer_options)

    # Move every input tensor to the target device
    on_device = {}
    for field_name, tensor in batch.items():
        on_device[field_name] = tensor.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        model_output = model(**on_device)

    # First token (CLS position) of the final layer, copied back to host memory
    first_token_states = model_output.last_hidden_state[:, 0, :]
    return first_token_states.cpu().numpy()

print("Embedding extraction function defined")

In [None]:
model_banner = "=" * 60
print("\n" + model_banner)
print("LOADING BERTWEET MODEL")
print(model_banner)

model_name = 'vinai/bertweet-base'
print(f"\nModel: {model_name}")
print("This may take a few minutes...\n")

# Load the tokenizer and encoder, place the encoder on the selected device,
# and switch it to evaluation mode for inference.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

# Width of one embedding vector, read from the model configuration
embedding_dim = model.config.hidden_size

print(f"\n✓ Model loaded successfully")
print(f"  Embedding dimension: {embedding_dim}")

In [None]:
print("\n" + "="*60)
print("GENERATING EMBEDDINGS")
print("="*60)

# Configuration
batch_size = 64  # Use 32 if you get OOM errors
max_length = 128

print(f"\nProcessing {len(df):,} articles")
print(f"Batch size: {batch_size}")
print(f"Max length: {max_length}")
print(f"Device: {device}")

# Estimated size (4 bytes per value, i.e. float32)
estimated_size_mb = len(df) * embedding_dim * 4 / (1024**2)
print(f"\nEstimated output size: {estimated_size_mb:.2f} MB")

# Extract embeddings
texts = df['cleaned_text'].fillna('').astype(str).tolist()
n_batches = (len(texts) + batch_size - 1) // batch_size

# Preallocate the full output matrix as float32 and fill it batch by batch.
# Compared with collecting per-batch arrays and stacking them at the end, this
#   * avoids holding two copies of the matrix at peak,
#   * keeps the dtype at float32 even when a batch fails (a float64 zero
#     placeholder would upcast the entire stacked result and double its size),
#   * works when there are no texts at all (result has shape (0, embedding_dim)).
embeddings = np.zeros((len(texts), embedding_dim), dtype=np.float32)
failed_batches = []   # indices of batches whose rows were left as zeros
failed_rows = 0

print(f"\nProcessing {n_batches} batches...\n")

for i in tqdm(range(n_batches), desc="Generating embeddings"):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, len(texts))
    batch_texts = texts[start_idx:end_idx]

    try:
        embeddings[start_idx:end_idx] = extract_embeddings_batch(
            batch_texts,
            tokenizer,
            model,
            device,
            max_length=max_length
        )
    except Exception as e:
        # Best effort: keep going, leave this batch's rows as zeros, and
        # remember the failure so it is reported instead of silently ignored.
        print(f"\n⚠ Error in batch {i}: {e}")
        failed_batches.append(i)
        failed_rows += len(batch_texts)

if failed_batches:
    print(f"\n⚠ Embeddings generated with {len(failed_batches)} failed batch(es); "
          f"{failed_rows:,} rows are all-zero placeholders")
    print(f"  Failed batch indices (first 20): {failed_batches[:20]}")
else:
    print(f"\n✓ Embeddings generated successfully")
print(f"  Shape: {embeddings.shape}")
print(f"  Size: {embeddings.nbytes / (1024**2):.2f} MB")

In [None]:
print("\nCreating ID-to-index mapping...")

# Create mapping DataFrame: use the dataset's '_id' column when it exists,
# otherwise fall back to the DataFrame's own index values.
article_ids = df['_id'].values if '_id' in df.columns else df.index.values
mapping = pd.DataFrame({
    '_id': article_ids,
    'index': np.arange(len(df)),
})

print(f"✓ Mapping created: {len(mapping):,} entries")

## Step 6: Save and Download Results

In [None]:
save_banner = "=" * 60

print("\n" + save_banner)
print("SAVING FILES")
print(save_banner)

# 1) Embedding matrix -> NumPy binary file
print("\n1. Saving embeddings...")
np.save('embeddings_500k.npy', embeddings)
print(f"   ✓ Saved: embeddings_500k.npy ({embeddings.nbytes / (1024**2):.2f} MB)")

# 2) Article-ID-to-row mapping -> CSV without the pandas index column
print("\n2. Saving mapping...")
mapping.to_csv('embeddings_500k_mapping.csv', index=False)
print(f"   ✓ Saved: embeddings_500k_mapping.csv ({len(mapping):,} rows)")

# 3) Preprocessed article table -> Parquet without the pandas index
print("\n3. Saving preprocessed data...")
df.to_parquet('preprocessed_500K.parquet', index=False)
print(f"   ✓ Saved: preprocessed_500K.parquet ({len(df):,} articles)")

print("\n" + save_banner)
print("ALL FILES SAVED SUCCESSFULLY")
print(save_banner)

In [None]:
print("\nDownloading files to your computer...\n")

# Each entry pairs the progress message with the file handed to the browser:
# embeddings, then the mapping, then the preprocessed data.
download_steps = [
    ("1. Downloading embeddings_500k.npy...", 'embeddings_500k.npy'),
    ("2. Downloading embeddings_500k_mapping.csv...", 'embeddings_500k_mapping.csv'),
    ("3. Downloading preprocessed_500K.parquet...", 'preprocessed_500K.parquet'),
]
for step_message, file_name in download_steps:
    print(step_message)
    files.download(file_name)

# Closing instructions for where the downloaded files belong locally
closing_lines = [
    "\n✓ All files downloaded!",
    "\nPlace these files in your local 'data/' directory:",
    "  - embeddings_500k.npy",
    "  - embeddings_500k_mapping.csv",
    "  - preprocessed_500K.parquet",
]
for closing_line in closing_lines:
    print(closing_line)

## Step 7: Verification & Quick Test

In [None]:
print("\n" + "="*60)
print("VERIFICATION")
print("="*60)

# Structural checks: row i of `embeddings` must describe row i of `mapping`
# and of `df`, so all three need exactly the same number of rows.
assert embeddings.ndim == 2, f"expected a 2-D embedding matrix, got shape {embeddings.shape}"
assert embeddings.shape[0] == len(mapping) == len(df), (
    f"row mismatch: embeddings={embeddings.shape[0]}, mapping={len(mapping)}, df={len(df)}"
)

# Rows that are entirely zero are the placeholder used for batches that failed
# during generation; report how many there are.
zero_rows = int((~embeddings.any(axis=1)).sum())

print(f"\nEmbeddings shape: {embeddings.shape}")
print(f"Mapping shape: {mapping.shape}")
print(f"All-zero embedding rows: {zero_rows:,}")
print(f"\nFirst 5 article IDs:")
print(mapping.head())
if len(embeddings):
    print(f"\nSample embedding (first 10 dimensions):")
    print(embeddings[0][:10])

In [None]:
# Quick similarity test
from sklearn.metrics.pairwise import cosine_similarity

print("\n" + "="*60)
print("QUICK SIMILARITY TEST")
print("="*60)

# Find articles similar to the first article
query_idx = 0
top_k = 10

# Cosine similarity of the query row against every row (2-D slice keeps the
# query as a 1 x dim matrix, which is what cosine_similarity expects).
similarities = cosine_similarity(embeddings[query_idx:query_idx + 1], embeddings)[0]

# Rank all rows by descending similarity, then drop the query itself: it would
# otherwise always occupy the first slot with a similarity of 1.0.
ranked_indices = similarities.argsort()[::-1]
top_k_indices = ranked_indices[ranked_indices != query_idx][:top_k]

print(f"\nTop {top_k} articles similar to article at index {query_idx}:")
print(f"Query article ID: {mapping.iloc[query_idx]['_id']}")
print(f"\nSimilar articles:")
for i, idx in enumerate(top_k_indices, 1):
    article_id = mapping.iloc[idx]['_id']
    score = similarities[idx]
    print(f"  {i}. {article_id} (similarity: {score:.4f})")

## Summary

### What You Generated:
1. **embeddings_500k.npy** - BERTweet embeddings (up to 500K × 768; articles with fewer than 10 words are filtered out before embedding)
2. **embeddings_500k_mapping.csv** - Article ID to index mapping
3. **preprocessed_500K.parquet** - Cleaned article data

### Next Steps:
1. Download the three files above
2. Place them in your local `data/` directory
3. Update your application to use the new embeddings
4. Run your analysis with 25x more data!

### Local Usage:
```python
import numpy as np
import pandas as pd

# Load embeddings
embeddings = np.load('data/embeddings_500k.npy')
mapping = pd.read_csv('data/embeddings_500k_mapping.csv')
articles = pd.read_parquet('data/preprocessed_500K.parquet')

print(f"Loaded {embeddings.shape[0]:,} embeddings")
```