In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import logging
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Use the GPU when torch can see one, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the DistilBERT tokenizer and encoder used for headline embeddings.
# A load failure is reported and then re-raised: every later step needs
# `tokenizer` and `bert_model`, so continuing would only surface as a
# confusing NameError in the smoke test below.
BERT_MODEL_NAME = "distilbert-base-uncased"
try:
    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
    bert_model = AutoModel.from_pretrained(BERT_MODEL_NAME)
    bert_model = bert_model.to(device)
    print("Loaded DistilBERT model successfully")
except Exception as e:
    print(f"Failed to load DistilBERT model: {e}")
    raise

# Smoke test: run one short headline through the model without tracking
# gradients and print the shape of the resulting hidden states.
test_input = tokenizer("Test headline", return_tensors="pt").to(device)
with torch.no_grad():
    output = bert_model(**test_input)
print("Model test passed. Output shape:", output.last_hidden_state.shape)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu
Loaded DistilBERT model successfully
Model test passed. Output shape: torch.Size([1, 4, 768])


## Load Train Data

In [6]:
# Define data directory
processed_data_dir = 'agentic_news_editor/processed_data'

def load_training_data(data_dir=processed_data_dir):
    """Load the processed headlines CSV that carries CTR data.

    Reads ``news_with_engagement.csv`` from ``data_dir`` and prints the row
    count, a preview of the first rows, and the missing-value counts of the
    ``title`` and ``ctr`` columns.

    Args:
        data_dir: Directory containing ``news_with_engagement.csv``. The
            default is the value ``processed_data_dir`` had when this
            function was defined.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file does not
        exist or an error occurs while reading or inspecting it (the error
        is printed rather than raised).
    """
    # `display` is only defined when running under IPython/Jupyter. Fall back
    # to `print` elsewhere so a successfully read file is not discarded just
    # because the rich preview helper is unavailable.
    try:
        show = display
    except NameError:
        show = print

    try:
        headlines_path = os.path.join(data_dir, 'news_with_engagement.csv')
        if not os.path.exists(headlines_path):
            print(f"Training data not found at {headlines_path}")
            return None

        headline_data = pd.read_csv(headlines_path)
        print(f"Loaded {len(headline_data)} headlines with CTR data")

        # Preview the data
        print("\nData preview:")
        show(headline_data.head())

        # Check for missing values in the two columns the model depends on
        print("\nMissing values:")
        print(headline_data[['title', 'ctr']].isna().sum())

        return headline_data
    except Exception as e:
        # Best-effort loader: report the problem and let the caller test for None
        print(f"Error loading training data: {e}")
        return None
# Run the loader and keep its result for the cells that follow
print("About to call load_training_data()")
headline_data = load_training_data()

# Report the outcome; a None result means the loader could not produce a frame
if headline_data is None:
    print("Failed to load data")
else:
    print("Data loaded successfully")
    print(headline_data.head())  # plain-text preview via print rather than display

About to call load_training_data()
Loaded 95492 headlines with CTR data

Data preview:


Unnamed: 0,newsID,category,subcategory,title,abstract,url,title_entities,abstract_entities,title_length,abstract_length,title_reading_ease,news_id,total_clicks,total_impressions,ctr
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],70,73,77.23,,0.0,0.0,0.0
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...",47,64,6.17,,0.0,0.0,0.0
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",29,116,90.77,,0.0,0.0,0.0
3,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",63,196,101.6,,0.0,0.0,0.0
4,N75236,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",59,99,82.31,,0.0,0.0,0.0



Missing values:
title    0
ctr      0
dtype: int64
Data loaded successfully
   newsID   category               subcategory  \
0  N88753  lifestyle           lifestyleroyals   
1  N45436       news  newsscienceandtechnology   
2  N23144     health                weightloss   
3  N93187       news                 newsworld   
4  N75236     health                    voices   

                                               title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1    Walmart Slashes Prices on Last-Generation iPads   
2                      50 Worst Habits For Belly Fat   
3  The Cost of Trump's Aid Freeze in the Trenches...   
4  I Was An NBA Wife. Here's How It Affected My M...   

                                            abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  Apple's new iPad releases bring big deals on l...   
2  These seemingly harmless habits are holding yo...   
3  Lt. Ivan Molchanets peeked over a parapet of s...   
4  I 

## Feature Extraction

In [7]:
def extract_features(headlines, tokenizer=tokenizer, bert_model=bert_model, device=device,
                     max_headlines=100):
    """Extract surface and embedding features from headlines for model training.

    Only the first ``max_headlines`` entries are processed, so the returned
    frame can have fewer rows than ``headlines``. Row ``i`` of the result
    always corresponds to ``headlines[i]``; callers must take the matching
    leading slice of their targets, e.g. ``targets[:len(features_df)]``.

    Args:
        headlines: Sliceable sequence of headline strings.
        tokenizer: Tokenizer called on each headline to build model inputs.
        bert_model: Model whose ``last_hidden_state`` provides the embedding.
        device: Torch device the tokenized inputs are moved to.
        max_headlines: Maximum number of leading headlines to process
            (non-negative int). ``None`` processes every headline. Defaults
            to 100, the previous fixed notebook-testing cap.

    Returns:
        ``pandas.DataFrame`` with one row per processed headline and columns
        ``length``, ``word_count``, ``has_number``, ``num_count``,
        ``is_question``, ``has_colon``, ``has_quote``, ``has_how_to`` and
        ``emb_0`` .. ``emb_9``. Empty when no headlines were processed.
    """
    print(f"Extracting features from {len(headlines)} headlines")
    features_list = []

    # Batches only group the progress messages; headlines are still encoded
    # one at a time.
    batch_size = 20  # Smaller for notebook testing
    n_embedding_dims = 10

    # Optionally cap the amount of work for quick notebook runs
    if max_headlines is not None and len(headlines) > max_headlines:
        test_headlines = headlines[:max_headlines]
        print(f"Using first {max_headlines} headlines for testing")
    else:
        test_headlines = headlines

    total_batches = (len(test_headlines) - 1) // batch_size + 1

    for i in range(0, len(test_headlines), batch_size):
        batch = test_headlines[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1}/{total_batches}")

        for headline in batch:
            features = {}
            lowered = headline.lower()

            # Basic features based on EDA findings
            features['length'] = len(headline)
            features['word_count'] = len(headline.split())
            features['has_number'] = int(bool(re.search(r'\d', headline)))
            features['num_count'] = len(re.findall(r'\d+', headline))
            # NOTE(review): the prefix test is not word-bounded, so a headline
            # starting with e.g. "However" or "Whatever" also counts as a
            # question. Left as-is so feature values stay unchanged.
            features['is_question'] = int(headline.endswith('?') or
                                          lowered.startswith(('how', 'what', 'why', 'where', 'when', 'is ')))
            features['has_colon'] = int(':' in headline)
            features['has_quote'] = int('"' in headline or "'" in headline)
            features['has_how_to'] = int('how to' in lowered)

            # Get embedding for semantic features
            try:
                inputs = tokenizer(headline, return_tensors="pt", padding=True, truncation=True, max_length=128)
                inputs = {k: v.to(device) for k, v in inputs.items()}

                with torch.no_grad():
                    outputs = bert_model(**inputs)

                # Use the [CLS] token embedding (first position of the last hidden state)
                embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]

                # Add first 10 embedding dimensions as features
                for j in range(n_embedding_dims):
                    features[f'emb_{j}'] = embedding[j]
            except Exception as e:
                print(f"Error extracting embedding for '{headline}': {e}")
                # Add zero embeddings if failed, so every row has the same columns
                for j in range(n_embedding_dims):
                    features[f'emb_{j}'] = 0.0

            features_list.append(features)

    features_df = pd.DataFrame(features_list)

    # Display feature statistics; describe() raises on a frame with no
    # columns, which is what an empty input produces.
    if features_df.empty:
        print("\nNo headlines to extract features from")
    else:
        print("\nFeature statistics:")
        print(features_df.describe())

    return features_df

# Drop incomplete rows, then turn the surviving titles into model features
if headline_data is not None:
    # Rows missing either the headline text or its CTR cannot be used for training
    clean_headline_data = headline_data.dropna(axis=0, how='any', subset=['title', 'ctr'])
    print(f"Clean data shape: {clean_headline_data.shape}")

    # Feature extraction over the cleaned titles
    features_df = extract_features(headlines=clean_headline_data['title'].values)

Clean data shape: (95492, 15)
Extracting features from 95492 headlines
Using first 100 headlines for testing
Processing batch 1/5
Processing batch 2/5
Processing batch 3/5
Processing batch 4/5
Processing batch 5/5

Feature statistics:
          length  word_count  has_number   num_count  is_question   has_colon  \
count  100.00000  100.000000  100.000000  100.000000    100.00000  100.000000   
mean    61.61000   10.470000    0.410000    0.540000      0.15000    0.130000   
std     17.09681    3.056554    0.494311    0.757721      0.35887    0.337998   
min     16.00000    3.000000    0.000000    0.000000      0.00000    0.000000   
25%     52.00000    9.000000    0.000000    0.000000      0.00000    0.000000   
50%     60.00000   10.000000    0.000000    0.000000      0.00000    0.000000   
75%     71.00000   12.000000    1.000000    1.000000      0.00000    0.000000   
max    112.00000   20.000000    1.000000    3.000000      1.00000    1.000000   

        has_quote  has_how_to      

## Model Training

def train_model(features_df, ctr_values, output_file='headline_ctr_model.pkl'):
    """Fit and evaluate a random-forest regressor that predicts headline CTR.

    The rows are split 80/20 into train and test sets, a
    ``RandomForestRegressor`` is fitted on the training part, and MSE and R²
    are computed on the held-out part. The ten largest feature importances
    are printed and drawn as a bar chart.

    Args:
        features_df: Feature frame; its columns name the features.
        ctr_values: CTR targets, one per row of ``features_df``.
        output_file: Currently unused; this function does not write the
            model to disk.

    Returns:
        dict with keys ``'model'`` (the fitted regressor), ``'mse'``,
        ``'r2'`` and ``'feature_importances'`` (a DataFrame with columns
        ``feature`` and ``importance``, sorted by descending importance).
    """
    print("Training headline CTR prediction model")

    # Hold out 20% of the rows for evaluation, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        features_df, ctr_values, test_size=0.2, random_state=42
    )
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    print(f"Training set: {n_train} samples")
    print(f"Test set: {n_test} samples")

    # Build the forest from a single parameter mapping and fit it
    forest_params = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 10,
        'random_state': 42,
    }
    model = RandomForestRegressor(**forest_params)
    model.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Model evaluation - MSE: {mse:.4f}, R²: {r2:.4f}")

    # Rank features from most to least important
    importance_table = pd.DataFrame({
        'feature': features_df.columns,
        'importance': model.feature_importances_
    })
    feature_importances = importance_table.sort_values('importance', ascending=False)
    top_features = feature_importances.head(10)

    print("\nTop 10 important features:")
    for name, weight in zip(top_features['feature'], top_features['importance']):
        print(f"  {name}: {weight:.4f}")

    # Horizontal bar chart of the same top-10 table
    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=top_features)
    plt.title('Top 10 Feature Importances for CTR Prediction')
    plt.tight_layout()
    plt.show()

    # The model is not written to disk here; run_training_pipeline pickles it
    # when asked to save.

    return {
        'model': model,
        'mse': mse,
        'r2': r2,
        'feature_importances': feature_importances
    }

# Train model if features are available
if 'features_df' in locals() and len(features_df) > 0:
    # extract_features may process only the leading part of the titles (it
    # caps the input for notebook testing), so take the matching leading CTR
    # values. Passing the full column would give train_test_split inputs of
    # different lengths, which raises a ValueError.
    result = train_model(features_df, clean_headline_data['ctr'].values[:len(features_df)])

## Model Report

def create_model_report(result, headline_data, save_to_file=False,
                        output_path='headline_model_report.md'):
    """Print, and optionally save, a markdown report about model performance.

    The report contains the generation time, the model's MSE and R², a
    summary of the ``ctr`` column of ``headline_data`` (row count, min, max,
    mean), the ten largest feature importances, and two fixed sections of
    explanatory text that are not derived from ``result``.

    Args:
        result: dict with keys ``'mse'``, ``'r2'`` and
            ``'feature_importances'`` (a DataFrame with ``feature`` and
            ``importance`` columns), or ``None``.
        headline_data: DataFrame with a ``ctr`` column used for the summary.
        save_to_file: When true, also write the report to ``output_path``.
        output_path: Destination of the saved report. Defaults to
            ``'headline_model_report.md'`` in the working directory.

    Returns:
        None. When ``result`` is ``None`` a message is printed and nothing
        else happens.
    """
    if result is None:
        print("No model result to report")
        return

    report = f"""# Headline CTR Prediction Model Report
Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}

## Model Performance
- Mean Squared Error: {result['mse']:.4f}
- R-squared: {result['r2']:.4f}

## Dataset Summary
- Total headlines analyzed: {len(headline_data)}
- CTR range: {headline_data['ctr'].min():.2f} to {headline_data['ctr'].max():.2f}
- Mean CTR: {headline_data['ctr'].mean():.2f}

## Key Feature Importances
"""

    # One bullet per feature, in the order the importance table provides
    top_features = result['feature_importances'].head(10)
    report += "".join(
        f"- {name}: {importance:.4f}\n"
        for name, importance in zip(top_features['feature'], top_features['importance'])
    )

    report += """
## Usage Guidelines
This model can be used to predict the expected CTR of news headlines.
It's integrated with the HeadlineMetrics class for headline evaluation
and the HeadlineLearningLoop for continuous improvement.

## Features Based on EDA
The model uses features derived from EDA findings:
- Questions in headlines significantly reduce CTR
- Numbers in headlines can reduce CTR if used inappropriately
- Headline length and structure matter for engagement
- Category-specific patterns influence performance
"""

    # Print report for notebook review
    print(report)

    # Save to file if requested; the encoding is explicit so the file does not
    # depend on the platform default.
    if save_to_file:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"Model report saved to {output_path}")

# Produce the report only when an earlier cell left a non-None training result
if locals().get('result') is not None:
    create_model_report(result, clean_headline_data, save_to_file=False)

## Full Pipeline

def run_training_pipeline(save_model=False):
    """Run the complete model training pipeline.

    Steps: load the training data, drop rows without a title or CTR, extract
    features, train and evaluate the model, and print the model report.

    Args:
        save_model: When true, the report is also written to disk and the
            fitted model is pickled to ``headline_ctr_model.pkl``.

    Returns:
        The result dict from ``train_model``, or ``None`` when the data could
        not be loaded or no usable rows remain after cleaning.
    """
    # Load data
    headline_data = load_training_data()
    if headline_data is None:
        print("Could not load training data. Aborting.")
        return None

    # Handle NaN values
    clean_data = headline_data.dropna(subset=['title', 'ctr'])
    if clean_data.empty:
        print("No rows with both title and ctr. Aborting.")
        return None

    # Extract features
    features_df = extract_features(clean_data['title'].values)

    # extract_features may process only the leading part of the titles, so
    # take the matching leading CTR values. Passing the full column would give
    # train_test_split inputs of different lengths, which raises a ValueError.
    ctr_values = clean_data['ctr'].values[:len(features_df)]

    # Train model
    result = train_model(features_df, ctr_values)

    # Create a report from the cleaned frame, matching the notebook cell that
    # calls create_model_report directly.
    create_model_report(result, clean_data, save_to_file=save_model)

    if save_model:
        # Save model
        with open('headline_ctr_model.pkl', 'wb') as f:
            pickle.dump(result['model'], f)
        print("Model saved to headline_ctr_model.pkl")

    return result

# Uncomment to run full pipeline
# final_result = run_training_pipeline(save_model=True)
# if final_result is not None:
#     print(f"Model training complete. R-squared: {final_result['r2']:.4f}")
# else:
#     print("Model training failed.")