# Comprehensive Publication Analysis
## OpenAlex Data: Supply Chain, Agency & Temporal Keyword Evolution

**Complete Workflow:**
1. Load and explore OpenAlex data structure
2. Filter for supply chain relevance
3. Identify agency-related articles
4. **Temporal Analysis**: Track 'agent' term evolution and AI/LLM connections
5. Generate insights and visualizations

In [None]:
import pandas as pd
import numpy as np
import re
import json
from pathlib import Path
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud

# Plot defaults and the data directory shared by the cells below
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
data_dir = Path('../data')

# Report the working location and list every regular file in the data directory
print("=== ENVIRONMENT SETUP ===")
print(f"Working directory: {Path.cwd()}")
print(f"Data directory: {data_dir}")
print(f"Available data files:")
for file in data_dir.glob('*'):
    if not file.is_file():
        continue  # sub-directories and other non-file entries are not listed
    size_mb = file.stat().st_size / 1024 / 1024
    print(f"  {file.name} ({size_mb:.1f} MB)")

## 1. Data Loading and Structure Analysis

In [None]:
# Load the specific OpenAlex dataset: agent_scm_30year_yearly.csv
print("=== LOADING AGENT SCM 30-YEAR DATASET ===")

# Load the main dataset
target_file = data_dir / 'agent_scm_30year_yearly.csv'

# Bind df before anything can fail so the `if df is not None` guards in
# later cells keep working even when the file is missing or unreadable.
df = None
try:
    # stat() lives inside the try: a missing file must be reported like any
    # other load error instead of aborting the cell before df exists.
    print(f"Loading: {target_file.name} ({target_file.stat().st_size / 1024 / 1024:.1f} MB)")
    # Load with low_memory=False to handle mixed types
    df = pd.read_csv(target_file, low_memory=False)
    print(f"✅ Successfully loaded {len(df):,} records")
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")
except Exception as e:
    # Deliberately broad: this is the notebook's top-level load boundary and
    # the failure is reported, not hidden.
    print(f"❌ Error loading {target_file.name}: {e}")
    df = None

if df is not None:
    print(f"\n=== DATASET OVERVIEW ===")
    print(f"Columns: {len(df.columns)}")
    print(f"Records: {len(df):,}")

    # Show basic info
    print(f"\nFirst 5 column names:")
    for i, col in enumerate(df.columns[:5]):
        print(f"  {i+1}. {col}")

    if len(df.columns) > 5:
        print(f"  ... and {len(df.columns) - 5} more columns")

In [None]:
# Per-column profile: dtype, share of missing values, cardinality and sample values
if df is not None:
    print("=== DETAILED COLUMN ANALYSIS ===")
    print(f"Total columns: {len(df.columns)}")

    # One summary dict per column, in column order
    column_info = []
    for position, col in enumerate(df.columns, start=1):
        series = df[col]
        dtype = str(series.dtype)
        null_count = series.isnull().sum()
        null_pct = (null_count / len(df)) * 100
        unique_count = series.nunique()

        # Up to three non-null examples, each clipped to 30 characters
        sample_values = series.dropna().head(3).tolist()
        clipped = []
        for value in sample_values:
            rendered = str(value)
            suffix = '...' if len(rendered) > 30 else ''
            clipped.append(rendered[:30] + suffix)
        sample_str = ', '.join(clipped)

        column_info.append({
            'position': position,
            'name': col,
            'dtype': dtype,
            'null_count': null_count,
            'null_pct': null_pct,
            'unique_count': unique_count,
            'sample_values': sample_str
        })

        print(f"  {position:2d}. {col:30} | {dtype:12} | {null_pct:5.1f}% null | {unique_count:8,} unique")

    # Publish the profile under a global name for later cells
    globals()['column_info'] = column_info

    print(f"\nSample values from first few columns:")
    for col_info in column_info[:5]:
        print(f"  {col_info['name']:30}: {col_info['sample_values']}")

In [None]:
# Map columns to OpenAlex standard fields
if df is not None:
    print("\n=== OPENALEX FIELD MAPPING ===")

    # Candidate column names per logical field, most preferred first
    field_patterns = {
        'id': ['id', 'openalex_id', 'work_id'],
        'title': ['title', 'display_name', 'work_title'],
        'abstract': ['abstract', 'abstract_inverted_index'],
        'authors': ['authors', 'authorships', 'author_names'],
        'year': ['publication_year', 'year', 'publication_date', 'published_date'],
        'venue': ['primary_location', 'host_venue', 'journal', 'source'],
        'keywords': ['keywords', 'concepts', 'topics', 'mesh_terms'],
        'citations': ['cited_by_count', 'citation_count', 'citations'],
        'doi': ['doi', 'ids'],
        'type': ['type', 'work_type', 'publication_type'],
        'open_access': ['open_access', 'is_oa'],
        'language': ['language'],
        'institutions': ['institutions', 'affiliations']
    }

    identified_fields = {}

    for field_type, patterns in field_patterns.items():
        lowered_patterns = [pattern.lower() for pattern in patterns]

        # Rank every candidate column by: exact name match before substring
        # match, then pattern priority, then shortest name, then alphabetical.
        # Ranking only by length let 'publication_date' beat
        # 'publication_year' (same length, earlier alphabetically), which
        # made the year column non-numeric.
        ranked = []
        for col in df.columns:
            col_lower = str(col).lower()
            hits = [
                (col_lower != pattern, priority)
                for priority, pattern in enumerate(lowered_patterns)
                if pattern in col_lower
            ]
            if hits:
                ranked.append((min(hits) + (len(col_lower), col_lower), col))

        if ranked:
            best_match = min(ranked, key=lambda item: item[0])[1]
            identified_fields[field_type] = best_match
            print(f"  {field_type:15}: ✅ {best_match}")
        else:
            print(f"  {field_type:15}: ❌ NOT FOUND")

    # Store field mapping globally
    globals()['field_map'] = identified_fields

    print(f"\nMapped {len(identified_fields)} standard fields from {len(df.columns)} total columns")

In [None]:
# Preview a few records and print headline statistics for the dataset
if df is not None and 'field_map' in globals():
    print("\n=== SAMPLE DATA EXPLORATION ===")

    print("Sample records (first 3 rows):")

    # Mapped priority fields come first, followed by the first three
    # columns that were not already selected.
    priority_fields = ['title', 'year', 'authors', 'citations']
    display_cols = [field_map[field] for field in priority_fields if field in field_map]
    remaining_cols = [col for col in df.columns if col not in display_cols][:3]
    display_cols += remaining_cols

    if display_cols:
        sample_df = df[display_cols].head(3).copy()

        # Clip text cells to 60 characters so the printed table stays readable
        for col in sample_df.columns:
            if sample_df[col].dtype != 'object':
                continue
            as_text = sample_df[col].astype(str)
            sample_df[col] = as_text.apply(
                lambda x: x[:60] + '...' if len(x) > 60 else x
            )

        print(sample_df.to_string(index=False))

    print(f"\n=== BASIC STATISTICS ===")

    # Publication-year range and the share published from 2020 onwards
    if 'year' in field_map:
        year_col = field_map['year']
        years = pd.to_numeric(df[year_col], errors='coerce').dropna()
        if len(years) > 0:
            earliest, latest = int(years.min()), int(years.max())
            print(f"Publication years: {earliest} - {latest}")
            print(f"Total with valid years: {len(years):,} ({len(years)/len(df)*100:.1f}%)")

            recent_years = years[years >= 2020]
            if len(recent_years) > 0:
                print(f"Publications 2020+: {len(recent_years):,} ({len(recent_years)/len(years)*100:.1f}% of dated articles)")

    # Citation-count summary over rows with a numeric citation value
    if 'citations' in field_map:
        cite_col = field_map['citations']
        citations = pd.to_numeric(df[cite_col], errors='coerce').dropna()
        if len(citations) > 0:
            print(f"Citations: mean={citations.mean():.1f}, median={citations.median():.0f}, max={citations.max():.0f}")

    print(f"Data quality: {df.isnull().sum().sum():,} total null values across all columns")

## 2. Comprehensive Keyword Taxonomy Definition

Defining our keyword categories for multi-dimensional analysis:
- **Supply Chain**: Core SCM, operations, and technology terms
- **Agency**: Theory, modeling, and governance concepts
- **AI/LLM**: General AI, specific LLM terms, and applications
- **Agent Terms**: General and compound agent terminology

In [None]:
# Comprehensive keyword taxonomy for analysis.
# Structure: {category: {subcategory: [keyword, ...]}}. Each keyword list is
# turned into a regex by create_regex_pattern() below and matched
# case-insensitively by fast_keyword_match().
# NOTE(review): because matching ignores case, short acronyms such as 'AI',
# 'ML', 'DL', 'MAS', 'ERP' also match the same letters written as an ordinary
# lowercase word — confirm this is acceptable for the corpus.
KEYWORD_TAXONOMY = {
    # Supply chain management vocabulary
    'supply_chain': {
        'core': [
            'supply chain', 'supply-chain', 'supply chains',
            'logistics', 'procurement', 'sourcing',
            'inventory management', 'distribution', 'warehousing',
            'supplier', 'vendor management', 'supply network',
            'supply chain management', 'SCM'
        ],
        'operations': [
            'operations management', 'production planning', 'demand forecasting',
            'capacity planning', 'lean manufacturing', 'just-in-time',
            'value chain', 'operations research', 'supply planning'
        ],
        'technology': [
            'supply chain technology', 'supply chain digitalization',
            'supply chain analytics', 'blockchain supply chain',
            'IoT supply chain', 'AI supply chain', 'digital supply chain',
            'ERP', 'WMS', 'TMS', 'supply chain automation'
        ]
    },
    # Agency theory, agent-based modeling and governance vocabulary
    'agency': {
        'theory': [
            'agency theory', 'principal-agent', 'principal agent',
            'moral hazard', 'adverse selection', 'information asymmetry',
            'agency costs', 'agency problems', 'agency relationship'
        ],
        'modeling': [
            'agent-based modeling', 'agent based modeling', 'ABM',
            'multi-agent system', 'multi agent system', 'MAS',
            'autonomous agents', 'intelligent agents', 'software agents',
            'agent-based simulation', 'agent based simulation'
        ],
        'governance': [
            'governance mechanisms', 'monitoring', 'incentive alignment',
            'contract theory', 'agency relationships', 'corporate governance'
        ]
    },
    # General AI terms, LLM-specific terms and AI applications
    'ai_llm': {
        'ai_general': [
            'artificial intelligence', 'machine learning', 'deep learning',
            'neural networks', 'AI', 'ML', 'DL',
            'computer vision', 'natural language processing', 'NLP',
            'reinforcement learning', 'supervised learning'
        ],
        'llm_specific': [
            'large language model', 'large language models', 'LLM', 'LLMs',
            'transformer', 'transformers', 'BERT', 'GPT',
            'language model', 'foundation model', 'foundation models',
            'generative AI', 'ChatGPT', 'OpenAI', 'pretrained model'
        ],
        'applications': [
            'conversational AI', 'dialogue system', 'chatbot',
            'text generation', 'language generation',
            'prompt engineering', 'fine-tuning', 'AI assistant'
        ]
    },
    # Bare and compound forms of the word "agent"
    'agent_terms': {
        'general': [
            'agent', 'agents', 'agented', 'agenting',
            'agency', 'agencies'
        ],
        'compound': [
            'AI agent', 'AI agents', 'artificial agent',
            'digital agent', 'virtual agent', 'cognitive agent',
            'conversational agent', 'autonomous agent', 'smart agent'
        ]
    }
}

# Create efficient regex patterns for matching
def create_regex_pattern(keywords):
    """Build one word-bounded alternation regex from a keyword list.

    Every keyword is regex-escaped. A hyphenated keyword also contributes a
    variant with the hyphens replaced by spaces, so 'just-in-time' matches
    both 'just-in-time' and 'just in time'. The pattern carries no flags;
    callers pass ``re.IGNORECASE`` for case-insensitive matching.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A regex string of the form ``\\b(?:kw1|kw2|...)\\b``. When no usable
        keyword is supplied the result is ``(?!)``, which never matches.
    """
    variants = []
    seen = set()
    for kw in keywords:
        forms = [kw]
        if '-' in kw:
            # Allow both hyphenated and space versions
            forms.append(kw.replace('-', ' '))
        for form in forms:
            # Skip empty strings (they would match at every word boundary)
            # and repeats of an alternative that is already present.
            if form and form not in seen:
                seen.add(form)
                variants.append(form)

    if not variants:
        # An empty alternation, \b(?:)\b, matches almost any text; return a
        # pattern that can never match instead.
        return r'(?!)'

    # Longest first so a phrase wins over its own prefix when the matched
    # text is inspected ('supply chain management' before 'supply chain').
    variants.sort(key=len, reverse=True)

    return r'\b(?:' + '|'.join(re.escape(form) for form in variants) + r')\b'

# Compile the taxonomy into regex strings: one per subcategory, plus a
# combined 'all' pattern per category built from every keyword in it.
PATTERNS = {}
for category, subcategories in KEYWORD_TAXONOMY.items():
    category_patterns = {
        subcat: create_regex_pattern(keywords)
        for subcat, keywords in subcategories.items()
    }
    all_keywords = [kw for keywords in subcategories.values() for kw in keywords]
    category_patterns['all'] = create_regex_pattern(all_keywords)
    PATTERNS[category] = category_patterns

# Summarise how many keywords each category contributes
print("=== KEYWORD TAXONOMY CREATED ===")
for category, subcats in KEYWORD_TAXONOMY.items():
    total_keywords = sum(map(len, subcats.values()))
    print(f"{category:15}: {total_keywords:3d} keywords across {len(subcats)} subcategories")

print(f"\nRegex patterns generated for efficient text matching")

## 3. Efficient Text Analysis Functions

Implementing fast keyword matching and publication categorization functions.

In [None]:
# Efficient text analysis functions
def fast_keyword_match(text, pattern):
    """Return True when ``pattern`` is found in ``text``, ignoring case.

    Args:
        text: Cell value to search. ``None`` and scalar missing values
            (NaN, ``pd.NA``, ``NaT``) never match; any other value is
            searched through its ``str()`` form, including list-like cells.
        pattern: Regular expression string or compiled pattern.

    Returns:
        bool. An invalid or non-pattern ``pattern`` yields ``False`` rather
        than raising, so one bad pattern cannot abort a column-wide scan.
    """
    if text is None:
        return False
    # pd.isna() returns an array for list-like input, whose truth value is
    # ambiguous; only scalars are tested for missingness.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return False
    try:
        return re.search(pattern, str(text), re.IGNORECASE) is not None
    except (re.error, TypeError):
        return False

def extract_matching_terms(text, original_keywords):
    """List the keywords that occur in ``text`` as whole words.

    Matching ignores case. A hyphen in a keyword matches either a hyphen or
    a single space in the text, mirroring the hyphen/space variants that
    create_regex_pattern() generates.

    Args:
        text: Cell value to search. ``None`` and scalar missing values give
            an empty result; other values are searched via ``str()``.
        original_keywords: Iterable of keyword strings.

    Returns:
        The matching keywords in their original spelling and input order.
    """
    if text is None:
        return []
    # pd.isna() returns an array for list-like input; only test scalars.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    text_lower = str(text).lower()
    matches = []

    for keyword in original_keywords:
        if not keyword:
            continue
        # Escape each hyphen-separated part and rejoin with "hyphen or space"
        body = '[- ]'.join(re.escape(part) for part in keyword.lower().split('-'))
        # Use word boundaries for more precise matching
        if re.search(r'\b' + body + r'\b', text_lower):
            matches.append(keyword)

    return matches

def analyze_publications_comprehensive(df, field_map, year_bounds=(1990, 2024)):
    """Count publications matching each keyword category and subcategory.

    Searches the title, abstract and keywords columns named in ``field_map``
    with the regexes in ``PATTERNS`` and records, per category: match counts
    and percentages, a sample of matched terms, the index labels of matching
    rows and, when a year column is mapped, publications per year.

    Args:
        df: Publications DataFrame. Any index is supported; masks are built
            on ``df.index`` so filtered or re-indexed frames align correctly.
        field_map: Mapping of logical field name ('title', 'abstract',
            'keywords', 'year') to the column name in ``df``.
        year_bounds: Inclusive ``(first, last)`` publication years kept in
            the temporal breakdown.

    Returns:
        ``None`` when ``df`` is ``None`` or empty. Otherwise a dict with the
        keys 'total_publications', 'analysis_timestamp', 'by_category',
        'temporal_data', 'matching_articles' and 'field_coverage'. When no
        searchable text column is found, only 'total_publications' and
        'analysis_timestamp' are populated.
    """

    if df is None or len(df) == 0:
        return None

    n_records = len(df)
    results = {
        'total_publications': n_records,
        'analysis_timestamp': datetime.now().isoformat(),
        'by_category': {},
        'temporal_data': {},
        'matching_articles': {},
        'field_coverage': {}
    }

    # Determine text fields to search
    search_fields = []
    for field_type in ('title', 'abstract', 'keywords'):
        col_name = field_map.get(field_type)
        if col_name is None or col_name not in df.columns:
            continue

        # Coverage is reported so sparse columns are visible in the output
        non_null_count = int(df[col_name].notna().sum())
        coverage = non_null_count / n_records * 100

        search_fields.append((field_type, col_name))
        results['field_coverage'][field_type] = {
            'column': col_name,
            'coverage_pct': coverage,
            'non_null_count': non_null_count
        }
        print(f"Will search {field_type:10}: {col_name:30} ({coverage:5.1f}% coverage)")

    if not search_fields:
        print("❌ No searchable text fields found!")
        return results

    # Year field for temporal analysis
    year_field = field_map.get('year')
    first_year, last_year = year_bounds

    print(f"\nAnalyzing {len(KEYWORD_TAXONOMY)} categories across {len(search_fields)} text fields...")

    # Analyze each category
    for category, subcategories in KEYWORD_TAXONOMY.items():
        print(f"\n  Analyzing {category}...")

        # Masks share df's index; a default RangeIndex would misalign with
        # the per-column results whenever df has been filtered or re-indexed.
        category_mask = pd.Series(False, index=df.index)
        subcategory_results = {}

        # Check each subcategory
        for subcat, subcat_keywords in subcategories.items():
            subcat_pattern = PATTERNS[category][subcat]
            subcat_mask = pd.Series(False, index=df.index)
            subcat_matching_terms = []

            # Search in all specified fields
            for field_name, col_name in search_fields:
                try:
                    field_matches = df[col_name].apply(
                        lambda x: fast_keyword_match(x, subcat_pattern)
                    )
                    subcat_mask |= field_matches

                    # Track which terms were found (first five matching rows only)
                    for cell in df.loc[field_matches, col_name].head(5):
                        subcat_matching_terms.extend(
                            extract_matching_terms(cell, subcat_keywords)
                        )
                except Exception as e:
                    # Best effort: report the failing field and keep going
                    print(f"    ⚠️  Error searching {field_name}: {e}")

            subcat_count = int(subcat_mask.sum())
            subcategory_results[subcat] = {
                'count': subcat_count,
                'percentage': (subcat_count / n_records) * 100,
                # Up to 10 unique terms in first-seen order (deterministic,
                # unlike iterating a set of strings)
                'sample_terms': list(dict.fromkeys(subcat_matching_terms))[:10]
            }
            category_mask |= subcat_mask

        # Store category results
        total_matches = int(category_mask.sum())
        results['by_category'][category] = {
            'total': total_matches,
            'percentage': (total_matches / n_records) * 100,
            'subcategories': subcategory_results
        }

        # Store matching article indices
        results['matching_articles'][category] = df.index[category_mask].tolist()

        # Temporal analysis if year data available
        if year_field and year_field in df.columns and total_matches > 0:
            years = pd.to_numeric(df.loc[category_mask, year_field], errors='coerce').dropna()

            # Keep only years inside the requested window
            valid_years = years[(years >= first_year) & (years <= last_year)]
            if len(valid_years) > 0:
                year_counts = valid_years.value_counts().sort_index()
                results['temporal_data'][category] = {
                    'year_counts': year_counts.to_dict(),
                    'year_range': (int(valid_years.min()), int(valid_years.max())),
                    'peak_year': int(year_counts.idxmax()),
                    'articles_with_years': len(valid_years),
                    'coverage_pct': len(valid_years) / total_matches * 100
                }

        print(f"    Found {total_matches:,} articles ({(total_matches/n_records*100):.1f}%)")

    return results

# Status line printed once the function definitions in this cell have run
print("Text analysis functions ready for execution")

## 4. Execute Comprehensive Analysis

Running the complete analysis on the agent SCM dataset.

In [None]:
# Run the keyword analysis and print a per-category summary; without a
# loaded dataset and a field mapping there is nothing to analyse.
if df is None or 'field_map' not in globals():
    print("❌ Cannot run analysis - data or field mapping not available")
    analysis_results = None
else:
    print("=== EXECUTING COMPREHENSIVE ANALYSIS ===")
    print(f"Dataset: {len(df):,} publications")
    print(f"Fields mapped: {list(field_map.keys())}")
    print(f"Analysis categories: {list(KEYWORD_TAXONOMY.keys())}")

    # Run the analysis
    analysis_results = analyze_publications_comprehensive(df, field_map)

    if analysis_results:
        divider = "=" * 60
        print("\n" + divider)
        print("ANALYSIS SUMMARY")
        print(divider)

        total_pubs = analysis_results['total_publications']
        print(f"Total Publications: {total_pubs:,}\n")

        # One headline row per category, then its two largest subcategories
        for category, data in analysis_results['by_category'].items():
            label = category.upper().replace('_', ' ')
            print(f"{label:20}: {data['total']:6,} articles ({data['percentage']:5.1f}%)")

            sorted_subcats = sorted(
                data['subcategories'].items(),
                key=lambda x: x[1]['count'],
                reverse=True,
            )

            for subcat, subdata in sorted_subcats[:2]:
                print(f"  └─ {subcat:15}: {subdata['count']:6,} articles ({subdata['percentage']:5.1f}%)")
                if subdata['sample_terms']:
                    terms_str = ', '.join(subdata['sample_terms'][:3])
                    print(f"     Sample terms: {terms_str}")
            print()

        # How much of each searched text column is populated
        print("FIELD COVERAGE:")
        for field, coverage_data in analysis_results['field_coverage'].items():
            print(f"  {field:10}: {coverage_data['coverage_pct']:5.1f}% ({coverage_data['non_null_count']:,} records)")

        print("\n" + divider)

In [None]:
# Analyze data structure and identify OpenAlex fields
# NOTE: this cell rebinds the global `field_map` with the mapping built here.
if df is not None:
    print("=== COLUMN ANALYSIS ===")
    print(f"Total columns: {len(df.columns)}")

    # Candidate column names per logical field, most preferred first
    field_mapping = {
        'title': ['title', 'display_name', 'work_title'],
        'abstract': ['abstract', 'abstract_inverted_index'],
        'keywords': ['keywords', 'concepts', 'topics', 'mesh_terms'],
        'authors': ['authors', 'authorships', 'author_names'],
        'year': ['publication_year', 'year', 'publication_date', 'published_date'],
        'venue': ['primary_location', 'host_venue', 'journal', 'source'],
        'doi': ['doi', 'ids'],
        'citations': ['cited_by_count', 'citation_count', 'citations'],
        'type': ['type', 'work_type', 'publication_type']
    }

    identified_fields = {}

    print("\nIdentified fields:")
    for field_type, possible_names in field_mapping.items():
        lowered_names = [name.lower() for name in possible_names]

        # Rank every candidate column by: exact name match before substring
        # match, then candidate priority, then shortest name, then
        # alphabetical. Ranking by length alone let column order decide
        # between equally long names such as 'publication_date' and
        # 'publication_year'.
        ranked = []
        for col in df.columns:
            col_lower = str(col).lower()
            hits = [
                (col_lower != name, priority)
                for priority, name in enumerate(lowered_names)
                if name in col_lower
            ]
            if hits:
                ranked.append((min(hits) + (len(col_lower), col_lower), col))

        if ranked:
            best_match = min(ranked, key=lambda item: item[0])[1]
            identified_fields[field_type] = best_match
            print(f"  {field_type:12}: {best_match}")
        else:
            print(f"  {field_type:12}: ❌ NOT FOUND")

    print(f"\nAll columns:")
    for i, col in enumerate(df.columns, 1):
        dtype = str(df[col].dtype)
        null_pct = (df[col].isnull().sum() / len(df)) * 100
        print(f"  {i:2d}. {col:30} | {dtype:10} | {null_pct:.1f}% null")

    # Store field mapping for later use
    globals()['field_map'] = identified_fields

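## 2. Supply Chain & Agency Keyword Definitions

*Note: this section redefines `KEYWORD_TAXONOMY` and `PATTERNS` from the earlier Section 2. When the notebook is run top to bottom, the definitions below are the ones in effect for all later cells.*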

In [None]:
# Comprehensive keyword definitions for multi-level analysis.
# Structure: {category: {subcategory: [keyword, ...]}}. Each keyword list is
# turned into a regex by create_pattern() below, which lowercases the
# keywords; fast_keyword_match() lowercases the text before searching.
KEYWORD_TAXONOMY = {
    # Supply chain management vocabulary
    'supply_chain': {
        'core': [
            'supply chain', 'supply-chain', 'supply chains',
            'logistics', 'procurement', 'sourcing',
            'inventory management', 'distribution', 'warehousing',
            'supplier', 'vendor management', 'supply network'
        ],
        'operations': [
            'operations management', 'production planning', 'demand forecasting',
            'capacity planning', 'lean manufacturing', 'just-in-time',
            'value chain', 'supply chain management', 'SCM'
        ],
        'technology': [
            'supply chain technology', 'supply chain digitalization',
            'supply chain analytics', 'blockchain supply chain',
            'IoT supply chain', 'AI supply chain',
            'ERP', 'WMS', 'TMS'
        ]
    },
    # Agency theory, agent-based modeling and governance vocabulary
    'agency': {
        'theory': [
            'agency theory', 'principal-agent', 'principal agent',
            'moral hazard', 'adverse selection', 'information asymmetry',
            'agency costs', 'agency problems'
        ],
        'modeling': [
            'agent-based modeling', 'agent based modeling', 'ABM',
            'multi-agent system', 'multi agent system', 'MAS',
            'autonomous agents', 'intelligent agents', 'software agents'
        ],
        'governance': [
            'governance mechanisms', 'monitoring', 'incentive alignment',
            'contract theory', 'agency relationships'
        ]
    },
    # General AI terms, LLM-specific terms and AI applications
    'ai_llm': {
        'ai_general': [
            'artificial intelligence', 'machine learning', 'deep learning',
            'neural networks', 'AI', 'ML', 'DL',
            'computer vision', 'natural language processing', 'NLP'
        ],
        'llm_specific': [
            'large language model', 'large language models', 'LLM', 'LLMs',
            'transformer', 'transformers', 'BERT', 'GPT',
            'language model', 'foundation model', 'foundation models',
            'generative AI', 'ChatGPT', 'OpenAI'
        ],
        'applications': [
            'conversational AI', 'dialogue system', 'chatbot',
            'text generation', 'language generation',
            'prompt engineering', 'fine-tuning'
        ]
    },
    # Bare and compound forms of the word "agent"
    'agent_terms': {
        'general': [
            'agent', 'agents', 'agented', 'agenting',
            'agency', 'agencies'
        ],
        'compound': [
            'AI agent', 'AI agents', 'artificial agent',
            'digital agent', 'virtual agent', 'cognitive agent',
            'conversational agent', 'autonomous agent'
        ]
    }
}

# Create efficient regex patterns
def create_pattern(keywords):
    """Build a lowercase, word-bounded alternation regex from keywords.

    The keywords are lowercased and regex-escaped, so the pattern is meant
    to be searched against lowercased text. Word boundaries stop short
    acronyms from matching inside longer words (for example 'ai' inside
    'chain', or 'ml' inside 'html').

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A regex string of the form ``\\b(?:kw1|kw2|...)\\b``, or ``(?!)``
        (which never matches) when no non-empty keyword is supplied. A bare
        empty pattern would match every text.
    """
    escaped = [re.escape(kw.lower()) for kw in keywords if kw]
    if not escaped:
        return r'(?!)'
    return r'\b(?:' + '|'.join(escaped) + r')\b'

# Regex strings per category: one entry per subcategory, plus a combined
# 'all' entry built from every keyword in the category.
PATTERNS = {}
for category, subcategories in KEYWORD_TAXONOMY.items():
    category_patterns = {
        subcat: create_pattern(keywords)
        for subcat, keywords in subcategories.items()
    }
    all_keywords = [kw for keywords in subcategories.values() for kw in keywords]
    category_patterns['all'] = create_pattern(all_keywords)
    PATTERNS[category] = category_patterns

# Summarise how many keywords each category contributes
print("=== KEYWORD TAXONOMY CREATED ===")
for category, subcats in KEYWORD_TAXONOMY.items():
    total_keywords = sum(map(len, subcats.values()))
    print(f"{category:15}: {total_keywords:3d} keywords across {len(subcats)} subcategories")

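## 3. Efficient Text Analysis Functions

*Note: the cell below redefines `fast_keyword_match` and `extract_matching_terms` from the earlier Section 3. This `extract_matching_terms` takes an additional `pattern` argument.*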

In [None]:
def fast_keyword_match(text, pattern):
    """Return True when ``pattern`` is found in the lowercased ``text``.

    Args:
        text: Cell value to search. ``None`` and scalar missing values
            (NaN, ``pd.NA``, ``NaT``) never match; any other value is
            lowercased via ``str()`` and searched, including list-like cells.
        pattern: Regular expression; it is applied without flags, so it must
            be written in lowercase to match.

    Returns:
        bool.
    """
    if text is None:
        return False
    # pd.isna() returns an array for list-like input, whose truth value is
    # ambiguous; only scalars are tested for missingness.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return False
    return bool(re.search(pattern, str(text).lower()))

def extract_matching_terms(text, pattern, original_keywords):
    """List the keywords that occur in ``text`` as whole words.

    Matching ignores case and requires word boundaries, so a short keyword
    such as 'AI' is not reported for text that merely contains those letters
    inside another word ('chain').

    Args:
        text: Cell value to search. ``None`` and scalar missing values give
            an empty result; other values are searched via ``str()``.
        pattern: Unused; kept so existing call sites keep working.
        original_keywords: Iterable of keyword strings.

    Returns:
        The matching keywords in their original spelling and input order.
    """
    if text is None:
        return []
    # pd.isna() returns an array for list-like input; only test scalars.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    text_lower = str(text).lower()
    return [
        keyword
        for keyword in original_keywords
        if keyword and re.search(r'\b' + re.escape(keyword.lower()) + r'\b', text_lower)
    ]

def analyze_publications_by_category(df, field_map):
    """Count publications matching each keyword category and subcategory.

    Searches the title, abstract and keywords columns named in ``field_map``
    with the regexes in ``PATTERNS``.

    Args:
        df: Publications DataFrame. Any index is supported; masks are built
            on ``df.index`` so filtered or re-indexed frames align correctly.
        field_map: Mapping of logical field name ('title', 'abstract',
            'keywords', 'year') to the column name in ``df``.

    Returns:
        Dict with the keys 'total_publications'; 'by_category' (per category:
        'total', 'percentage' and a 'subcategories' dict of plain counts);
        'temporal_data' (per category with usable years: 'year_counts',
        'year_range', 'peak_year', 'articles_with_years'); and
        'matching_articles' (per category: index labels of matching rows).
    """

    n_records = len(df)
    results = {
        'total_publications': n_records,
        'by_category': {},
        'temporal_data': {},
        'matching_articles': {}
    }

    # Text fields to search
    search_fields = [
        (name, field_map[name])
        for name in ('title', 'abstract', 'keywords')
        if name in field_map
    ]

    print(f"Searching in fields: {[f[0] for f in search_fields]}")

    # Year field for temporal analysis
    year_field = field_map.get('year')

    # Analyze each category
    for category, subcategories in KEYWORD_TAXONOMY.items():
        print(f"\nAnalyzing {category}...")

        # Masks share df's index; a default RangeIndex would misalign with
        # the per-column results whenever df has been filtered or re-indexed.
        category_mask = pd.Series(False, index=df.index)
        subcategory_results = {}

        # Check each subcategory
        for subcat in subcategories:
            subcat_pattern = PATTERNS[category][subcat]
            subcat_mask = pd.Series(False, index=df.index)

            # Search in all specified fields
            for field_name, col_name in search_fields:
                if col_name in df.columns:
                    field_matches = df[col_name].apply(
                        lambda x: fast_keyword_match(x, subcat_pattern)
                    )
                    subcat_mask |= field_matches

            subcategory_results[subcat] = int(subcat_mask.sum())
            category_mask |= subcat_mask

        # Store results; an empty frame reports 0% instead of dividing by zero
        total_matches = int(category_mask.sum())
        percentage = (total_matches / n_records) * 100 if n_records else 0.0
        results['by_category'][category] = {
            'total': total_matches,
            'percentage': percentage,
            'subcategories': subcategory_results
        }

        # Store matching article indices
        results['matching_articles'][category] = df.index[category_mask].tolist()

        # Temporal analysis if year data available
        if year_field and year_field in df.columns and total_matches > 0:
            # Clean year data: non-numeric values are dropped
            years = pd.to_numeric(df.loc[category_mask, year_field], errors='coerce').dropna()
            if len(years) > 0:
                year_counts = years.value_counts().sort_index()
                results['temporal_data'][category] = {
                    'year_counts': year_counts.to_dict(),
                    'year_range': (int(years.min()), int(years.max())),
                    'peak_year': int(year_counts.idxmax()),
                    'articles_with_years': len(years)
                }

        print(f"  Found {total_matches:,} articles ({percentage:.1f}%)")

    return results

# Status line printed once the function definitions in this cell have run
print("Analysis functions ready")

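## 4. Execute Comprehensive Analysis

*Note: this run uses `analyze_publications_by_category` and overwrites the `analysis_results` produced by the earlier Section 4.*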

In [None]:
# Run the comprehensive analysis
if df is not None and 'field_map' in globals():
    print("=== STARTING COMPREHENSIVE ANALYSIS ===")
    print(f"Dataset: {len(df):,} publications")
    print(f"Fields available: {list(field_map.keys())}")

    # Execute analysis
    analysis_results = analyze_publications_by_category(df, field_map)
    total = analysis_results['total_publications']

    print("\n=== ANALYSIS SUMMARY ===")
    for category, data in analysis_results['by_category'].items():
        print(f"{category.upper():15}: {data['total']:6,} articles ({data['percentage']:5.1f}%)")

        # Show subcategory breakdown
        for subcat, count in data['subcategories'].items():
            pct = (count / total) * 100
            print(f"  └─ {subcat:12}: {count:6,} articles ({pct:5.1f}%)")
        print()

    # Intersection analysis
    print("=== INTERSECTION ANALYSIS ===")

    # Build each category's index set once, not once per pair
    article_sets = {
        category: set(indices)
        for category, indices in analysis_results['matching_articles'].items()
    }
    categories = list(article_sets)

    # Report every unordered pair of categories that shares an article
    for i, cat1 in enumerate(categories):
        for cat2 in categories[i+1:]:
            intersection = len(article_sets[cat1] & article_sets[cat2])

            if intersection > 0:
                pct = (intersection / total) * 100
                print(f"{cat1} ∩ {cat2}: {intersection:,} articles ({pct:.2f}%)")
else:
    print("❌ Cannot run analysis - data or field mapping not available")
    # Clear any result left by an earlier cell so later cells cannot read it
    # by mistake.
    analysis_results = None

## 5. Temporal Evolution Analysis - Focus on 'Agent' Terms

In [None]:
# Deep dive into agent term evolution over time
def _clean_publication_years(frame, year_field, min_year, max_year):
    """Return a copy of *frame* with an integer ``clean_year`` column.

    Values of ``frame[year_field]`` that cannot be parsed as numbers are
    dropped, the remainder is truncated to ``int``, and only rows whose year
    lies in the inclusive range ``[min_year, max_year]`` are kept.
    """
    # assign/astype return new frames, so nothing is written into a slice
    cleaned = frame.assign(clean_year=pd.to_numeric(frame[year_field], errors='coerce'))
    cleaned = cleaned.dropna(subset=['clean_year'])
    cleaned = cleaned.astype({'clean_year': int})
    in_range = cleaned['clean_year'].between(min_year, max_year)
    return cleaned[in_range].copy()


def analyze_agent_evolution(df, field_map, analysis_results, *, min_year=1990, max_year=2024):
    """Detailed analysis of agent term evolution and AI/LLM connections.

    Args:
        df: Publications DataFrame. The article identifiers stored in
            ``analysis_results['matching_articles']`` are used with ``df.loc``,
            so they must be index labels of this frame.
        field_map: Mapping of logical field names to column names; the
            ``'year'`` entry names the publication-year column.
        analysis_results: Dict with a ``'matching_articles'`` mapping. The
            ``'agency'`` and ``'agent_terms'`` entries are merged into the
            agent group; ``'ai_llm'`` forms the AI group. Missing entries are
            treated as empty.
        min_year: First publication year to keep (inclusive).
        max_year: Last publication year to keep (inclusive).

    Returns:
        ``None`` when ``field_map`` has no ``'year'`` entry. Otherwise a dict
        with ``'year_analysis'`` (per-year counts and the Agent+AI share of
        agent articles, keyed by plain ``int`` year), the cleaned
        ``'agent_df'``, ``'ai_df'`` and ``'agent_ai_df'`` frames, and a
        ``'summary'`` dict of totals plus the ``(first, last)`` year range
        (``(None, None)`` when no article has a usable year).
    """
    if 'year' not in field_map:
        print("❌ No year field available for temporal analysis")
        return None

    year_field = field_map['year']
    matching = analysis_results['matching_articles']

    # Agent group = explicit agency matches plus generic agent-term matches
    all_agent_indices = set(matching.get('agency', [])) | set(matching.get('agent_terms', []))
    ai_indices = set(matching.get('ai_llm', []))
    agent_ai_indices = all_agent_indices & ai_indices

    print("=== AGENT TERM EVOLUTION ANALYSIS ===")
    print(f"Total agent-related articles: {len(all_agent_indices):,}")
    print(f"Agent + AI intersection: {len(agent_ai_indices):,}")

    def select_and_clean(indices):
        # Results stay local instead of being smuggled out through globals()
        return _clean_publication_years(df.loc[list(indices)], year_field, min_year, max_year)

    agent_df_clean = select_and_clean(all_agent_indices)
    ai_df_clean = select_and_clean(ai_indices)
    agent_ai_df_clean = select_and_clean(agent_ai_indices)

    # One pass per frame instead of re-filtering the frame for every year
    agent_by_year = agent_df_clean['clean_year'].value_counts()
    ai_by_year = ai_df_clean['clean_year'].value_counts()
    agent_ai_by_year = agent_ai_df_clean['clean_year'].value_counts()

    # Plain ints keep the keys friendly to JSON and to comparisons downstream
    all_years = sorted({int(y) for y in agent_by_year.index} | {int(y) for y in ai_by_year.index})

    year_analysis = {}
    for year in all_years:
        agent_count = int(agent_by_year.get(year, 0))
        agent_ai_count = int(agent_ai_by_year.get(year, 0))
        year_analysis[year] = {
            'agent_articles': agent_count,
            'ai_articles': int(ai_by_year.get(year, 0)),
            'agent_ai_articles': agent_ai_count,
            # max(..., 1) avoids dividing by zero in years without agent articles
            'agent_ai_ratio': agent_ai_count / max(agent_count, 1),
        }

    year_range = (all_years[0], all_years[-1]) if all_years else (None, None)

    return {
        'year_analysis': year_analysis,
        'agent_df': agent_df_clean,
        'ai_df': ai_df_clean,
        'agent_ai_df': agent_ai_df_clean,
        'summary': {
            'total_agent_articles': len(agent_df_clean),
            'total_ai_articles': len(ai_df_clean),
            'total_agent_ai_articles': len(agent_ai_df_clean),
            'year_range': year_range,
        },
    }

# Run the temporal analysis and print its summary plus the most recent yearly figures
if df is None or 'analysis_results' not in globals():
    print("❌ Cannot run evolution analysis")
else:
    # evolution_results is read by the visualization and report cells below
    evolution_results = analyze_agent_evolution(df, field_map, analysis_results)

    if evolution_results:
        summary = evolution_results['summary']
        first_year, last_year = summary['year_range'][0], summary['year_range'][1]

        print("\n=== EVOLUTION SUMMARY ===")
        print(f"Agent articles: {summary['total_agent_articles']:,}")
        print(f"AI articles: {summary['total_ai_articles']:,}")
        print(f"Agent+AI articles: {summary['total_agent_ai_articles']:,}")
        print(f"Year range: {first_year}-{last_year}")

        # Only years from 2014 onward are considered, capped at the latest ten
        print("\n=== KEY TRENDS (Last 10 Years) ===")
        yearly = evolution_results['year_analysis']
        recent_years = sorted(y for y in yearly if y >= 2014)

        for year in recent_years[-10:]:
            data = yearly[year]
            ratio = data['agent_ai_ratio'] * 100
            print(f"{year}: Agent={data['agent_articles']:3d}, AI={data['ai_articles']:4d}, Agent+AI={data['agent_ai_articles']:3d} ({ratio:.1f}%)")

## 6. Visualization Dashboard

In [None]:
# Create comprehensive visualizations
def create_analysis_dashboard(analysis_results, evolution_results=None):
    """Create a comprehensive visualization dashboard.

    Builds a 3x2 plotly figure:

    * (1,1) bar chart of publication totals per category
    * (1,2) bar chart of non-empty pairwise category overlaps
    * (2,1) yearly agent and AI article counts (needs ``evolution_results``)
    * (2,2) yearly Agent+AI share of agent articles (needs ``evolution_results``)
    * (3,1) supply-chain versus agency totals
    * (3,2) pie chart of the category totals

    Args:
        analysis_results: Dict with ``'by_category'`` (each value has a
            ``'total'``) and, optionally, ``'matching_articles'`` mapping each
            category to its matching article identifiers.
        evolution_results: Optional dict with a ``'year_analysis'`` mapping of
            year to ``'agent_articles'``, ``'ai_articles'`` and
            ``'agent_ai_ratio'``. When falsy, row 2 is left empty.

    Returns:
        The populated plotly figure.
    """
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=[
            'Publications by Category',
            'Category Intersections',
            'Temporal Evolution: Agent Terms',
            'Agent-AI Connection Over Time',
            'Supply Chain vs Agency Publications',
            'Research Domain Distribution'
        ],
        specs=[
            [{'type': 'bar'}, {'type': 'scatter'}],
            [{'type': 'scatter'}, {'type': 'scatter'}],
            [{'type': 'bar'}, {'type': 'pie'}]
        ]
    )

    by_category = analysis_results['by_category']
    categories = list(by_category.keys())
    counts = [by_category[cat]['total'] for cat in categories]

    # (1,1) Publications by Category
    fig.add_trace(
        go.Bar(name='Publications', x=categories, y=counts, marker_color='lightblue'),
        row=1, col=1
    )

    # (1,2) Category Intersections: size of every non-empty pairwise overlap
    article_sets = {
        cat: set(ids)
        for cat, ids in analysis_results.get('matching_articles', {}).items()
    }
    set_names = list(article_sets)
    pair_labels = []
    pair_sizes = []
    for position, cat1 in enumerate(set_names):
        for cat2 in set_names[position + 1:]:
            overlap = len(article_sets[cat1] & article_sets[cat2])
            if overlap > 0:
                pair_labels.append(f"{cat1} ∩ {cat2}")
                pair_sizes.append(overlap)

    fig.add_trace(
        go.Bar(name='Intersections', x=pair_labels, y=pair_sizes, marker_color='orange'),
        row=1, col=2
    )

    # (2,1) and (2,2) Temporal evolution, only when the temporal analysis ran
    if evolution_results and 'year_analysis' in evolution_results:
        year_analysis = evolution_results['year_analysis']
        years = sorted(year_analysis.keys())
        agent_counts = [year_analysis[y]['agent_articles'] for y in years]
        ai_counts = [year_analysis[y]['ai_articles'] for y in years]

        fig.add_trace(
            go.Scatter(x=years, y=agent_counts, mode='lines+markers', name='Agent Articles', line=dict(color='red')),
            row=2, col=1
        )

        fig.add_trace(
            go.Scatter(x=years, y=ai_counts, mode='lines+markers', name='AI Articles', line=dict(color='blue')),
            row=2, col=1
        )

        # Share of agent articles that also match the AI category, in percent
        ratios = [year_analysis[y]['agent_ai_ratio'] * 100 for y in years]
        fig.add_trace(
            go.Scatter(x=years, y=ratios, mode='lines+markers', name='Agent-AI Connection %', line=dict(color='green')),
            row=2, col=2
        )

    # (3,1) Supply Chain vs Agency Publications
    # NOTE(review): assumes the category keys are 'supply_chain' and 'agency';
    # a key that is absent from by_category is simply left out of the chart.
    focus = [cat for cat in ('supply_chain', 'agency') if cat in by_category]
    fig.add_trace(
        go.Bar(
            name='Supply Chain vs Agency',
            x=focus,
            y=[by_category[cat]['total'] for cat in focus],
            marker_color='mediumseagreen'
        ),
        row=3, col=1
    )

    # (3,2) Research Domain Distribution
    fig.add_trace(
        go.Pie(labels=categories, values=counts, name="Domains"),
        row=3, col=2
    )

    # Update layout
    fig.update_layout(
        height=1200,
        title_text="Research Publications Analysis Dashboard",
        showlegend=True
    )

    return fig

# Generate visualizations: interactive plotly dashboard plus a static matplotlib backup
if 'analysis_results' in globals():
    print("Creating visualization dashboard...")

    # The temporal analysis is optional: the name may be undefined or hold None
    evolution_for_plots = globals().get('evolution_results')

    # Both outputs go to the same folder; parents=True lets this run on a fresh checkout
    reports_dir = data_dir.parent / 'reports'
    reports_dir.mkdir(parents=True, exist_ok=True)

    # Create dashboard
    dashboard = create_analysis_dashboard(analysis_results, evolution_for_plots)

    # Save as HTML before displaying so a renderer problem cannot lose the file
    output_path = reports_dir / 'analysis_dashboard.html'
    dashboard.write_html(str(output_path))
    print(f"Dashboard saved to: {output_path}")

    # Display
    dashboard.show()

    # Create simple matplotlib plots for backup.
    # The grid is sized to the number of panels actually drawn, so no blank cells remain.
    panel_count = 2 if evolution_for_plots else 1
    fig, axes = plt.subplots(1, panel_count, figsize=(7.5 * panel_count, 5), squeeze=False)

    # Plot 1: Category overview
    category_ax = axes[0, 0]
    categories = list(analysis_results['by_category'].keys())
    counts = [analysis_results['by_category'][cat]['total'] for cat in categories]
    category_ax.bar(categories, counts, color='skyblue')
    category_ax.set_title('Publications by Category')
    category_ax.tick_params(axis='x', labelrotation=45)

    # Plot 2: Temporal evolution (if available)
    if evolution_for_plots:
        trend_ax = axes[0, 1]
        year_analysis = evolution_for_plots['year_analysis']
        years = sorted(year_analysis.keys())
        agent_counts = [year_analysis[y]['agent_articles'] for y in years]
        ai_counts = [year_analysis[y]['ai_articles'] for y in years]

        trend_ax.plot(years, agent_counts, 'r-o', label='Agent Articles', markersize=4)
        trend_ax.plot(years, ai_counts, 'b-s', label='AI Articles', markersize=4)
        trend_ax.set_title('Temporal Evolution')
        trend_ax.set_xlabel('Year')
        trend_ax.set_ylabel('Number of Articles')
        trend_ax.legend()
        trend_ax.grid(True, alpha=0.3)

    fig.tight_layout()

    # Save matplotlib figure
    plot_path = reports_dir / 'analysis_plots.png'
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"Static plots saved to: {plot_path}")

    plt.show()

else:
    print("❌ No analysis results available for visualization")

## 7. Generate Analysis Report

In [None]:
# Generate comprehensive analysis report
def generate_analysis_report(analysis_results, evolution_results=None, df=None):
    """Generate a comprehensive markdown report.

    Args:
        analysis_results: Dict with ``'total_publications'``, ``'by_category'``
            (each value has ``'total'``, ``'percentage'`` and
            ``'subcategories'``) and ``'matching_articles'`` (category ->
            article identifiers).
        evolution_results: Optional dict with ``'summary'`` and
            ``'year_analysis'``; when falsy the temporal section and the
            AI-growth finding are omitted.
        df: Unused; accepted so existing callers that pass the DataFrame keep
            working.

    Returns:
        The report as a single markdown string. An empty dataset (zero
        publications or no categories) produces a report with 0.0% shares
        instead of raising.
    """
    total = analysis_results['total_publications']
    by_category = analysis_results['by_category']

    def label(name):
        # 'supply_chain' -> 'Supply Chain'; applied uniformly across the report
        return name.title().replace('_', ' ')

    def share(count):
        # Percentage of the whole dataset; 0.0 for an empty dataset
        return (count / total) * 100 if total else 0.0

    report = [
        "# OpenAlex Publications Analysis Report",
        f"*Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n",
    ]

    # Executive Summary
    report.append("## Executive Summary")
    report.append(f"- **Total Publications Analyzed**: {total:,}")
    for category, data in by_category.items():
        report.append(f"- **{label(category)} Publications**: {data['total']:,} ({data['percentage']:.1f}%)")

    # Detailed Category Analysis
    report.append("\n## Detailed Analysis by Category")
    for category, data in by_category.items():
        report.append(f"\n### {label(category)} Publications")
        report.append(f"- **Total**: {data['total']:,} publications ({data['percentage']:.1f}% of dataset)")
        report.append("- **Subcategory Breakdown**:")
        for subcat, count in data['subcategories'].items():
            report.append(f"  - {label(subcat)}: {count:,} articles ({share(count):.1f}%)")

    # Temporal Analysis
    recent_data = {}
    if evolution_results:
        report.append("\n## Temporal Evolution Analysis")

        summary = evolution_results['summary']
        first_year, last_year = summary['year_range']
        # The range is (None, None) when no article had a usable year
        period = "n/a" if first_year is None or last_year is None else f"{first_year}-{last_year}"
        report.append(f"- **Analysis Period**: {period}")
        report.append(f"- **Agent-related Publications**: {summary['total_agent_articles']:,}")
        report.append(f"- **AI-related Publications**: {summary['total_ai_articles']:,}")
        report.append(f"- **Agent+AI Publications**: {summary['total_agent_ai_articles']:,}")

        # Key trends
        report.append("\n### Key Trends (2020-2024)")
        recent_data = {
            year: data
            for year, data in evolution_results['year_analysis'].items()
            if year >= 2020
        }

        if recent_data:
            report.append("| Year | Agent Articles | AI Articles | Agent+AI | Connection Rate |")
            report.append("|------|----------------|-------------|----------|-----------------|")

            for year in sorted(recent_data):
                data = recent_data[year]
                ratio = data['agent_ai_ratio'] * 100
                report.append(f"| {year} | {data['agent_articles']:,} | {data['ai_articles']:,} | {data['agent_ai_articles']:,} | {ratio:.1f}% |")

    # Intersection Analysis
    report.append("\n## Category Intersections")
    report.append("| Category 1 | Category 2 | Intersection | Percentage |")
    report.append("|------------|------------|--------------|------------|")

    # Build each set once rather than once per pair
    article_sets = {cat: set(ids) for cat, ids in analysis_results['matching_articles'].items()}
    categories = list(article_sets)
    for position, cat1 in enumerate(categories):
        for cat2 in categories[position + 1:]:
            intersection = len(article_sets[cat1] & article_sets[cat2])
            if intersection > 0:
                report.append(f"| {label(cat1)} | {label(cat2)} | {intersection:,} | {share(intersection):.2f}% |")

    # Recommendations: collected first, numbered afterwards so the list never skips a number
    report.append("\n## Key Findings & Recommendations")
    findings = []

    if by_category:
        # max() returns the first of equally large categories, like a stable descending sort
        top_name, top_data = max(by_category.items(), key=lambda item: item[1]['total'])
        findings.append(
            f"**Dominant Research Area**: {label(top_name)} represents the largest category with {top_data['total']:,} publications"
        )

    if len(recent_data) >= 2:
        years = sorted(recent_data)
        ai_growth = recent_data[years[-1]]['ai_articles'] - recent_data[years[0]]['ai_articles']
        if ai_growth > 0:
            trend = 'positive'
        elif ai_growth < 0:
            trend = 'negative'
        else:
            trend = 'no net'  # equal first and last counts are not a decline
        findings.append(f"**AI Research Growth**: AI-related publications showed {trend} growth in recent years")

    # Check supply chain relevance
    sc_relevant = by_category.get('supply_chain', {}).get('total', 0)
    sc_percentage = share(sc_relevant)
    if sc_percentage < 10:
        findings.append(f"**Supply Chain Relevance**: Only {sc_percentage:.1f}% of publications are supply chain-related. Consider refining search criteria for better relevance.")
    else:
        findings.append(f"**Supply Chain Relevance**: {sc_percentage:.1f}% of publications are supply chain-related, indicating good dataset relevance.")

    report.extend(f"{number}. {text}" for number, text in enumerate(findings, start=1))

    return "\n".join(report)

# Build the markdown report, write it under reports/, and print where the artifacts live
if 'analysis_results' not in globals():
    print("❌ Cannot generate report - analysis results not available")
else:
    print("Generating comprehensive analysis report...")

    # The temporal results are optional; pass None when that cell did not run
    temporal = evolution_results if 'evolution_results' in globals() else None
    report_content = generate_analysis_report(analysis_results, temporal, df)

    # Save report (the reports folder is created on demand)
    report_path = data_dir.parent / 'reports' / 'openalex_analysis_report.md'
    report_path.parent.mkdir(exist_ok=True)
    report_path.write_text(report_content, encoding='utf-8')

    print(f"✅ Report saved to: {report_path}")

    # Closing banner listing the output files, relative to the project root
    banner = "=" * 50
    closing_lines = [
        "\n" + banner,
        "ANALYSIS COMPLETE",
        banner,
        "📊 Dashboard: reports/analysis_dashboard.html",
        "📈 Plots: reports/analysis_plots.png",
        "📄 Report: reports/openalex_analysis_report.md",
        banner,
    ]
    for line in closing_lines:
        print(line)