# In [None]:
# Agentic AI News Editor: Comprehensive EDA Pipeline
# -----------------------------------------------
# This pipeline analyzes the Microsoft MIND dataset to prepare data
# for an agentic AI system that selects, ranks, and rewrites news headlines.

import csv
import json
import os
import re
import warnings
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from textstat import flesch_reading_ease
warnings.filterwarnings('ignore')

# Plot styling shared by every figure: the whitegrid theme, a (12, 6) default
# figure size, and fixed font sizes for axis labels, titles and tick labels.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

# Create the output directory tree for results.
# exist_ok=True makes each call idempotent: on a re-run, or when `output_dir`
# already exists but one of its sub-directories is missing, both
# sub-directories are still guaranteed to exist afterwards (os.makedirs also
# creates the parent directory when needed).
output_dir = 'agentic_news_editor'
os.makedirs(f'{output_dir}/plots', exist_ok=True)
os.makedirs(f'{output_dir}/processed_data', exist_ok=True)

# Announce the start of the run together with a wall-clock timestamp.
print("# Agentic AI News Editor - EDA Pipeline")
print("Starting EDA pipeline... Time:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print("-" * 80)

# ===============================================================
# PART 1: DATA LOADING AND INITIAL EXPLORATION
# ===============================================================
print("\n## PART 1: Data Loading and Initial Exploration")

# Both files are read as header-less, tab-separated text; column names are
# supplied explicitly below.
#
# quoting=csv.QUOTE_NONE tells pandas to treat '"' as an ordinary character.
# With pandas' default quoting, a field that *starts* with a double quote is
# parsed as a quoted field: the quote characters are dropped from the value,
# and an unbalanced quote makes the parser read across tabs and newlines,
# silently merging rows.
# NOTE(review): this assumes the TSV files use no CSV-style quoting convention
# (as in the published MIND format) — confirm against the actual data files.

# Load news data
news_cols = ["newsID", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]
print("Loading news data...")
news_df = pd.read_csv(
    "train_data/news.tsv",
    sep="\t",
    header=None,
    names=news_cols,
    quoting=csv.QUOTE_NONE,
)
print(f"News data loaded: {news_df.shape[0]} rows, {news_df.shape[1]} columns")

# Loading behaviors data
print("Loading behaviors data...")
behaviors_cols = ["impression_id", "user_id", "time", "history", "impressions"]
behaviors_df = pd.read_csv(
    "train_data/behaviors.tsv",
    sep="\t",
    header=None,
    names=behaviors_cols,
    quoting=csv.QUOTE_NONE,
)
print(f"Behaviors data loaded: {behaviors_df.shape[0]} rows, {behaviors_df.shape[1]} columns")

# Set of all news IDs present in the news file, kept for membership lookups.
news_ids = set(news_df["newsID"])

# Show the first three rows of each table as a quick sanity check.
print("\nSample news data:")
print(news_df.head(3))

print("\nSample behaviors data:")
print(behaviors_df.head(3))

# ===============================================================
# PART 5: PREPARE DATA FOR THE AGENTIC NEWS EDITOR
# ===============================================================
print("\n## PART 5: Data Preparation for Agentic News Editor")

# 1. Clean and process news data
print("Preparing news data...")

# Work on a copy so the raw `news_df` is left untouched.
news_df_cleaned = news_df.copy()

# Keep only rows whose title_length is at least 10 and whose abstract_length
# is at least 20; a row failing either test is dropped.
# NOTE(review): neither column is among the names assigned when the news file
# is loaded — presumably an earlier analysis step adds them; confirm.
has_long_title = news_df_cleaned['title_length'] >= 10
has_long_abstract = news_df_cleaned['abstract_length'] >= 20
news_df_cleaned = news_df_cleaned[has_long_title & has_long_abstract]

# Compute reading-ease scores only when the column is not already present.
if 'title_reading_ease' not in news_df_cleaned:
    news_df_cleaned = calculate_reading_scores(news_df_cleaned)

# 2. Prepare behavior data
print("Preparing behavior data...")

# Per-article engagement from the impression-level table:
#   total_clicks      = sum of `clicked`
#   total_impressions = number of non-null `clicked` rows
#   ctr               = total_clicks / total_impressions
clicks_by_article = impressions_df.groupby('news_id')['clicked']
article_ctr_data = clicks_by_article.agg(total_clicks='sum', total_impressions='count')
article_ctr_data['ctr'] = article_ctr_data['total_clicks'].div(article_ctr_data['total_impressions'])

# Left join: every cleaned article is kept, whether or not it has impressions.
news_with_engagement = pd.merge(
    news_df_cleaned,
    article_ctr_data.reset_index(),
    how='left',
    left_on='newsID',
    right_on='news_id',
)

# Articles without a match in the engagement table get 0 for all three metrics.
engagement_columns = ['total_clicks', 'total_impressions', 'ctr']
news_with_engagement[engagement_columns] = news_with_engagement[engagement_columns].fillna(0)

# 3. Create dataset for title rewriting task
print("Creating dataset for headline rewriting task...")

# Filter to articles with sufficient impressions for reliable CTR data
rewriting_candidates = news_with_engagement[news_with_engagement['total_impressions'] >= 5].copy()

# Split titles into five quantile-based readability bins. The labels are
# listed from the lowest reading-ease scores to the highest. Rows whose
# title_reading_ease is missing receive no bin (NaN).
readability_labels = ['Very Hard', 'Hard', 'Medium', 'Easy', 'Very Easy']
rewriting_candidates['reading_ease_bin'] = pd.qcut(
    rewriting_candidates['title_reading_ease'],
    q=5,
    labels=readability_labels,
)

# Collect, per readability level, the five highest- and five lowest-CTR titles.
# Iterating over the fixed label list (instead of the values observed in the
# column) keeps NaN out of the result — a NaN key would otherwise be written
# to the JSON file as a spurious "NaN" entry with empty lists — and makes the
# key order deterministic (hardest to easiest).
example_columns = ['newsID', 'title', 'category', 'ctr', 'title_reading_ease']
rewriting_examples = {}
for readability_level in readability_labels:
    group_df = rewriting_candidates[rewriting_candidates['reading_ease_bin'] == readability_level]
    if group_df.empty:
        # No article fell into this bin; emit nothing rather than empty lists.
        continue

    # Get high and low CTR examples from this readability group
    high_ctr = group_df.nlargest(5, 'ctr')
    low_ctr = group_df.nsmallest(5, 'ctr')

    rewriting_examples[readability_level] = {
        'high_ctr': high_ctr[example_columns].to_dict('records'),
        'low_ctr': low_ctr[example_columns].to_dict('records'),
    }

# 4. Save processed data for the news editor system
print("Saving processed data...")

# Cleaned news table with engagement metrics, written as CSV without the index.
news_with_engagement.to_csv(f'{output_dir}/processed_data/news_with_engagement.csv', index=False)

# Article count per category, taken from the unfiltered `news_df`
# (intended for editorial diversity goals).
category_distribution = news_df['category'].value_counts().to_dict()

# Each JSON artifact is written the same way: pretty-printed with indent=2.
json_exports = [
    (f'{output_dir}/processed_data/headline_rewriting_examples.json', rewriting_examples),
    (f'{output_dir}/processed_data/category_distribution.json', category_distribution),
]
for json_path, payload in json_exports:
    with open(json_path, 'w') as f:
        json.dump(payload, f, indent=2)

# Assemble an editorial guideline summary from results of the earlier analysis
# (`category_ctr`, `correlations`, `high_ctr_patterns`, `low_ctr_patterns`,
# `clicks` and `total` are all produced outside this section).
category_ctr_summary = category_ctr.reset_index().to_dict('records')

headline_insights = {}
headline_insights['reading_ease_correlation'] = correlations['ctr']['title_reading_ease']
headline_insights['headline_patterns'] = {
    'high_engagement': high_ctr_patterns,
    'low_engagement': low_ctr_patterns,
}

editorial_guidelines = {
    'category_performance': category_ctr_summary,
    'headline_insights': headline_insights,
}
# NOTE(review): presumably dataset-wide click and impression counts — verify
# where `clicks` and `total` are computed.
editorial_guidelines['overall_ctr_benchmark'] = clicks / total

guidelines_path = f'{output_dir}/processed_data/editorial_guidelines.json'
with open(guidelines_path, 'w') as f:
    json.dump(editorial_guidelines, f, indent=2)

# Closing banner: completion timestamp plus the output locations.
print("-" * 80)
print(f"EDA Pipeline completed! Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Processed data saved to '{output_dir}/processed_data/'")
print(f"Visualizations saved to '{output_dir}/plots/'")
# Plain string literal: the message has no placeholders, so no f-prefix is needed.
print("Ready for Agentic AI News Editor development!")