In [None]:
import os
import pandas as pd
import numpy as np
import pickle
import logging
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize device: use the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load models
# Bind both names up front so that a failed load leaves them as None instead
# of undefined (or half-defined, if only the tokenizer loaded). Later cells
# can then test `bert_model is None` rather than hitting a NameError.
tokenizer = None
bert_model = None
try:
    loaded_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    loaded_model = AutoModel.from_pretrained("distilbert-base-uncased")
    loaded_model = loaded_model.to(device)
except Exception as e:
    # Best-effort load: report the failure and continue without embeddings.
    print(f"Failed to load DistilBERT model: {e}")
else:
    # Publish the pair only once every step (download, load, device move) succeeded.
    tokenizer, bert_model = loaded_tokenizer, loaded_model
    print("Loaded DistilBERT model successfully")

In [None]:
# Define data directory
processed_data_dir = 'agentic_news_editor/processed_data'

def load_training_data(data_dir=processed_data_dir):
    """Load processed headlines with CTR data.

    Reads ``headline_ctr_data.csv`` from ``data_dir``, checks that the
    ``title`` and ``ctr`` columns are present, and prints a short preview
    plus a per-column count of missing values.

    Args:
        data_dir: Directory containing ``headline_ctr_data.csv``. Defaults
            to ``processed_data_dir`` as it was when this function was defined.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file does not
        exist, a required column is absent, or reading fails for any reason.
        This function reports problems via ``print`` and does not raise.
    """
    required_columns = ['title', 'ctr']
    # The whole body stays inside the try so the "never raises, returns None"
    # contract holds even for bad arguments (e.g. a non-string data_dir).
    try:
        headlines_path = os.path.join(data_dir, 'headline_ctr_data.csv')
        if not os.path.exists(headlines_path):
            print(f"Training data not found at {headlines_path}")
            return None

        headline_data = pd.read_csv(headlines_path)

        # Validate the schema explicitly; otherwise the missing-values report
        # below would raise a KeyError that surfaces as a cryptic load error.
        missing_columns = [
            column for column in required_columns
            if column not in headline_data.columns
        ]
        if missing_columns:
            print(
                f"Training data at {headlines_path} is missing required columns: "
                f"{missing_columns}. Available columns: {list(headline_data.columns)}"
            )
            return None

        print(f"Loaded {len(headline_data)} headlines with CTR data")

        # Preview the data
        print("\nData preview:")
        print(headline_data.head())

        # Check for missing values
        print("\nMissing values:")
        print(headline_data[required_columns].isna().sum())

        return headline_data
    except Exception as e:
        # Notebook-level boundary: report and signal failure with None.
        print(f"Error loading training data: {e}")
        return None

# Load the data
# load_training_data() returns None when the CSV is missing or an error occurs
# while loading it, so headline_data should be checked for None before use.
headline_data = load_training_data()