# SLR Data Acquisition and Analysis Notebook

This notebook orchestrates the process of fetching, processing, and analyzing publication data for the "Agentic AI in SCM" Systematic Literature Review.

In [None]:
# Imports and Setup
import sys
import os

# Make the project root importable so that `slr_core` resolves.
# Assumes this notebook lives two levels below the project root
# (e.g. notebooks/slr_analytics/); adjust the '..' hops if it lives elsewhere,
# or drop this block when slr_core is installed as a package.
try:
    # __file__ exists only when this code runs as an exported .py script.
    base_dir = os.path.dirname(__file__)
except NameError:
    # Jupyter kernels do not define __file__, so the original expression raised
    # NameError on a fresh kernel. Fall back to the kernel's working directory.
    # NOTE(review): this presumes the kernel was started in the notebook's own
    # directory (the usual Jupyter default) — confirm for your setup.
    base_dir = os.getcwd()

module_path = os.path.abspath(os.path.join(base_dir, '..', '..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from slr_core.data_acquirer import DataAcquirer
from slr_core.data_processor import DataProcessor
# from slr_core.keyword_analyzer import KeywordAnalyzer # Placeholder for later
# from slr_core.semantic_analyzer import SemanticAnalyzer # Placeholder for later
# from slr_core.visualizer import Visualizer # Placeholder for later
from slr_core.config_manager import ConfigManager

print("Modules imported.")

## 1. Configuration

Load configuration settings using `ConfigManager`.
It reads `config/slr_config.yaml` and provides access to environment variables for API keys.

In [None]:
# Build the shared configuration object first; the acquirer and processor
# created below are both constructed from it.
config_mgr = ConfigManager()
print("ConfigManager initialized.")

# Sanity check: read one setting back by its dotted key and show it.
raw_data_dir = config_mgr.get('data_paths.raw_data_dir')
print(f"Using raw data directory: {raw_data_dir}")

# Both pipeline stages receive the same ConfigManager instance.
data_acquirer = DataAcquirer(config_manager=config_mgr)
print("DataAcquirer initialized.")
data_processor = DataProcessor(config_manager=config_mgr)
print("DataProcessor initialized.")

## 2. Data Acquisition

Fetch data from academic APIs.

In [None]:
# Search settings come from the configuration's default search parameters.
# Each lookup carries a fallback that is used when the key is absent.
default_params = config_mgr.default_search_params

search_query = default_params.get("query", "agentic AI in Supply Chain Management")
start_year = default_params.get("start_year", 2021)
end_year = default_params.get("end_year", 2023)
max_results_per_source = default_params.get("max_results_per_source", 10)

# For a quick experiment, uncomment and edit an override below:
# start_year = 2020
# search_query = "different query for this run"

# Announce the request, then hand the collected arguments to the acquirer.
print(f"Fetching data for query: '{search_query}' from {start_year}-{end_year} (max_results_per_source: {max_results_per_source})...")
fetch_kwargs = {
    "query": search_query,
    "start_year": start_year,
    "end_year": end_year,
    "max_results_per_source": max_results_per_source,
}
all_fetched_data = data_acquirer.fetch_all_sources(**fetch_kwargs)

# Preview the fetched records: a per-source count plus the first few titles.
PREVIEW_COUNT = 2  # number of papers listed per source

for source, results in all_fetched_data.items():
    # Normalise a missing result (None) to an empty list *before* len() is
    # called; otherwise the header line would raise TypeError and the fallback
    # message below could never be reached for that source.
    # NOTE(review): whether fetch_all_sources can yield None for a failed
    # source is not visible from this cell — confirm against DataAcquirer.
    results = results or []
    print(f"--- Results from {source} ({len(results)} articles) ---")
    if results:
        for i, paper in enumerate(results[:PREVIEW_COUNT], start=1):
            print(f"  Paper {i}: {paper.get('title', 'N/A')}")
            doi = paper.get('doi')
            if doi:
                print(f"     DOI: {doi}")
    else:
        print("  No results found or an error occurred.")

print("\nData acquisition test complete.")

## 3. Data Processing and Structuring

Process the raw fetched data: standardize formats, clean records, and remove duplicates.

In [None]:
# Convert the fetched results into a DataFrame, show a sample, and save it.
# Requires `all_fetched_data` from the Data Acquisition step and the
# `data_processor` / `config_mgr` objects from the Configuration step.
fetched_available = 'all_fetched_data' in locals() and all_fetched_data

if not fetched_available:
    # Nothing was fetched (or the acquisition cell has not been run yet).
    print("No fetched data available to process. Please run the Data Acquisition step first.")
else:
    print("\nProcessing fetched data...")
    processed_df = data_processor.process_raw_data(all_fetched_data)

    if processed_df.empty:
        print("No data was processed, or the resulting DataFrame is empty.")
    else:
        # Quick look at the result: first rows, dimensions, column names.
        print("\n--- Sample of Processed DataFrame ---")
        print(processed_df.head())
        print(f"\nShape of processed DataFrame: {processed_df.shape}")
        print(f"\nColumns: {processed_df.columns.tolist()}")

        # Persist the frame; the file name comes from config with a CSV fallback.
        processed_filename = config_mgr.get("data_paths.processed_articles_file", "slr_processed_articles.csv")
        processed_file_path = data_processor.save_processed_data(processed_df, filename=processed_filename)
        # A falsy return value is treated as a failed save.
        print(
            f"Processed data saved to: {processed_file_path}"
            if processed_file_path
            else "Failed to save processed data."
        )