### Data Processing Orchestrator

This notebook orchestrates a data processing workflow by preparing a configuration file and running an external analysis script (`run_sequence.py`) for one or more dates.

**Workflow:**

1.  **Prerequisites:**
    *   The `Yloader` application has been run to download OHLCV data.
    *   A `finviz` data generation process has created `.parquet` files (e.g., `df_finviz_YYYY-MM-DD.parquet`) in the `Downloads` directory.
2.  **Find Data:** The notebook scans the `Downloads` directory for recent Finviz data files.
3.  **Select Date(s):** It extracts all available dates from the filenames and selects a subset for processing based on user configuration (e.g., only the latest date).
4.  **Configure & Run:** For each selected date, it generates a `config.py` file and executes the `run_sequence.py` script.



### Setup and Configuration

**This is the only cell you need to modify.** Adjust the variables below to match your environment and desired processing scope.

In [None]:
import sys
import re
from pathlib import Path
import pandas as pd

# --- Project and Path Configuration ---

# Autodetect the project's root directory.
# Assumes this notebook is in `root/notebooks/` or the `root/` directory.
# The detection is based on the current working directory, so the kernel
# must have been started from one of those two locations.
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add the project's root directory to the Python path so we can import 'config'
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))

# Add the project's source directory to the Python path
SRC_DIR = ROOT_DIR / 'src'
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))
    
# Import the custom utility module now that the path is set
import utils
# NOTE: both names imported here are reassigned in the
# "config.py Generation Parameters" section below, so the values defined
# in this cell are the ones that later cells actually use.
# NOTE(review): this import presumably requires a config.py to already
# exist on the path before this notebook has generated one — confirm.
from config import ANNUAL_RISK_FREE_RATE, TRADING_DAYS_PER_YEAR

# --- Data File Configuration ---
DOWNLOADS_DIR = Path.home() / "Downloads"
DATA_FILE_PREFIX = 'df_finviz'  # Prefix for files like 'df_finviz_2024-01-15.parquet'
DATA_FILE_EXTENSION = 'parquet'
DATA_FILES_TO_SCAN = 100  # How many recent files to check for dates

# --- Analysis Run Configuration ---

# Define which dates to process using a slice.
# The slice is applied to the date list sorted in ascending order, so
# negative indices address the most recent dates.
# Examples:
#   slice(-1, None, None) -> Processes only the most recent date.
#   slice(None)           -> Processes ALL found dates.
#   slice(-5, None, None) -> Processes the 5 most recent dates.
#   slice(0, 5, None)     -> Processes the 5 oldest dates.
DATE_SLICE = slice(-1, None, None)  # Defaults to getting the last item

# --- config.py Generation Parameters ---
# These values will be written into the config.py file for each run.
DEST_DIR = ROOT_DIR / 'data' # Destination directory for processed data
ANNUAL_RISK_FREE_RATE = 0.04
TRADING_DAYS_PER_YEAR = 252
# Simple (non-compounded) per-day rate derived from the two values above.
DAILY_RISK_FREE_RATE = ANNUAL_RISK_FREE_RATE / TRADING_DAYS_PER_YEAR

# --- Verification ---
print(f"Project Root Directory: {ROOT_DIR}")
print(f"Source Directory: {SRC_DIR}")
print(f"Scanning for data files in: {DOWNLOADS_DIR}")
# print(f"Date selection rule: {DATE_SLICE}")

# Set pandas display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Enable auto-reloading of external modules
%load_ext autoreload
%autoreload 2

### Step 1: Find Recent Data Files

This step searches the configured directory for data files that match the specified prefix and extension.

In [None]:
# --- Execute Step 1 ---
print("--- Step 1: Finding recent data files ---")

# Ask the utility module for the most recent matching filenames.
# NOTE: the helper is given only the directory *name* (not the full path)
# through `directory_name`, as the existing `utils.py` signature expects.
found_files = utils.get_recent_files_in_directory(
    prefix=DATA_FILE_PREFIX,
    extension=DATA_FILE_EXTENSION,
    count=DATA_FILES_TO_SCAN,
    directory_name=DOWNLOADS_DIR.name,
)

if not found_files:
    print(f"No files matching '{DATA_FILE_PREFIX}*.{DATA_FILE_EXTENSION}' found in '{DOWNLOADS_DIR}'.")
    # Normalise any falsy result to an empty list so later cells can
    # iterate over it safely.
    found_files = []
else:
    print(f"Found {len(found_files)} potential data file(s).")
    # List at most the first 20 filenames (zero-based index) for brevity.
    for i, filename in enumerate(found_files[:20]):
        print(f"  {i}. {filename}")
    # An ellipsis signals that the listing above was truncated.
    if len(found_files) > 20:
        print("  ...")

### Step 2: Extract and Select Dates for Processing

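This step extracts dates from the found filenames, sorts them in ascending order, and then selects the dates to be processed based on the `DATE_SLICE` configuration. Before the selection is applied, you are prompted to either keep the configured `DATE_SLICE` (press ENTER) or enter a new one (e.g., `-5:`).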

In [None]:
import re
from typing import List

def extract_and_sort_dates(filenames: List[str]) -> List[str]:
    """
    Collect the distinct YYYY-MM-DD date strings found in a list of filenames.

    Only the first date-shaped substring of each filename is considered;
    filenames without one are ignored.

    Args:
        filenames: Filenames to inspect.

    Returns:
        The unique date strings in ascending (lexicographic) order.
    """
    iso_date = re.compile(r'\d{4}-\d{2}-\d{2}')

    unique_dates = set()
    for name in filenames:
        hit = iso_date.search(name)
        if hit is None:
            # No date-shaped substring in this filename; skip it.
            continue
        unique_dates.add(hit.group(0))

    # For zero-padded YYYY-MM-DD strings, lexicographic order is also
    # chronological order.
    ordered = list(unique_dates)
    ordered.sort()
    return ordered

In [None]:
def print_in_columns(items: List[str], num_columns: int = 5):
    """
    Print strings as a numbered grid with a fixed number of entries per row.

    Each entry is prefixed with its zero-based position in `items`, and the
    label is left-justified to 8 characters so short indices line up.

    Args:
        items: The strings to print.
        num_columns: How many entries to place on each printed row.
    """
    if not items:
        print("No items to display.")
        return

    # Walk the list one row at a time; `row_start` is the index of the
    # first entry on the current row.
    for row_start in range(0, len(items), num_columns):
        current_row = items[row_start:row_start + num_columns]

        # Number entries by their position in the full list, not in the row.
        cells = [
            f"{f'  {position}.'.ljust(8)}{entry}"
            for position, entry in enumerate(current_row, start=row_start)
        ]

        print("  ".join(cells))

In [None]:
# 1. Extract the data
# Build the sorted list of unique dates from the filenames found in Step 1.
available_dates = extract_and_sort_dates(found_files)

# 2. Display the data
# Each date is printed with its list index (five per row), which is the
# index that DATE_SLICE is applied against later in the notebook.
print(f"Extracted {len(available_dates)} unique dates from filenames:")
print_in_columns(available_dates, num_columns=5)

In [None]:
from typing import Optional

def parse_str_to_slice(slice_str: str) -> Optional[slice]:
    """
    Parse a string of the form "start:stop:step" into a slice object.

    Each part is optional; an empty or whitespace-only part becomes None.
    A string without any colon is treated as the start only, so "5"
    yields ``slice(5, None, None)``.

    Args:
        slice_str: The text to parse, e.g. ":10", "-5:" or "::-1".

    Returns:
        The corresponding slice, or None if the text has more than three
        parts, contains a non-integer part, or specifies a step of zero.
    """
    parts = slice_str.split(':')
    if len(parts) > 3:
        return None  # Invalid format, too many colons

    try:
        bounds = [int(part) if part.strip() else None for part in parts]
    except ValueError:
        # At least one part was neither blank nor an integer.
        return None

    # Pad missing trailing parts so "a" and "a:b" unpack like "a:b:c".
    bounds += [None] * (3 - len(bounds))
    start, stop, step = bounds

    # A zero step builds a slice object without complaint but raises
    # ValueError as soon as it is applied to a list, so reject it up front.
    if step == 0:
        return None

    return slice(start, stop, step)

In [None]:
def prompt_for_slice_update(variable_name: str, current_value: slice) -> slice:
    """
    Display a slice's current value and prompt the user to keep or change it.
    Loops until the user presses ENTER or enters a valid slice.

    Args:
        variable_name: The name of the variable (e.g., "DATE_SLICE").
        current_value: The current slice object.

    Returns:
        The updated (or original) slice object.
    """
    def _bound_to_text(bound) -> str:
        # Only None means "omitted"; 0 is a real bound and must be shown.
        # (A truthiness test would render slice(-5, 0) as '-5:', which is
        # a different slice.)
        return '' if bound is None else str(bound)

    # Format the current slice object into a user-friendly string
    current_value_str = (
        f"{_bound_to_text(current_value.start)}:{_bound_to_text(current_value.stop)}"
    )
    if current_value.step is not None:
        current_value_str += f":{current_value.step}"

    prompt = (
        f"\n-> The current {variable_name} is: '{current_value_str}'\n"
        f"   Enter a new slice (e.g., ':10', '-5:', '::-1') or press ENTER to continue: "
    )

    # --- Start the user prompt loop ---
    while True:
        user_input = input(prompt).strip()

        # Case 1: User presses ENTER to keep the current value
        if not user_input:
            print("   Continuing with the current value.")
            return current_value

        # Case 2: User enters a new value, try to parse it
        new_slice = parse_str_to_slice(user_input)
        if new_slice is not None:
            print(f"   {variable_name} updated.")
            return new_slice

        # Case 3: The input was invalid, inform the user and loop again
        print(f"   Error: Invalid slice format '{user_input}'. Please try again.")

In [None]:
# Show the current DATE_SLICE and let the user replace it interactively.
# Pressing ENTER at the prompt keeps the value already assigned.
DATE_SLICE = prompt_for_slice_update("DATE_SLICE", DATE_SLICE)

In [None]:
# --- Continue execution with the (potentially new) value ---
# Echo the slice that the following cells will apply to the date list.
print("\nContinuing script execution...")
print(f"The script will now proceed using DATE_SLICE = '{DATE_SLICE}'")

In [None]:
# Select the dates to process by applying DATE_SLICE to the sorted list of
# available dates. An out-of-range slice simply yields fewer (or zero) dates.
dates_to_process = available_dates[DATE_SLICE]
print(f"Selected {len(dates_to_process)} date(s) to process:")
print(f'dates_to_process:\n{dates_to_process}')

### Step 3: Generate Configuration and Run Analysis for Each Selected Date

This is the main execution step. It iterates through the list of selected dates. For each date, it generates a fresh `config.py` and runs the `run_sequence.py` script.

In [None]:
def create_config_file(date_str: str, config_path: Path) -> None:
    """
    Write an auto-generated config.py for a single processing date.

    The path and analysis values are taken from the notebook's module-level
    variables (DOWNLOADS_DIR, DEST_DIR, ANNUAL_RISK_FREE_RATE,
    TRADING_DAYS_PER_YEAR, DAILY_RISK_FREE_RATE). Any existing file at
    `config_path` is overwritten.

    Args:
        date_str (str): The date to be written into the config file.
        config_path (Path): The path where the config.py file will be saved.
    """
    # `!r` renders each string as a valid Python literal, so backslashes in
    # Windows paths (e.g., 'C:\\Users\\...') and any quote characters are
    # escaped correctly. The date uses the same mechanism as the paths
    # instead of hand-written quotes.
    config_content = f"""# config.py
# This file is auto-generated by a notebook. DO NOT EDIT MANUALLY.

# --- File path configuration ---
DATE_STR = {date_str!r}
DOWNLOAD_DIR = {str(DOWNLOADS_DIR)!r}
DEST_DIR = {str(DEST_DIR)!r}

# --- Analysis Parameters ---
ANNUAL_RISK_FREE_RATE = {ANNUAL_RISK_FREE_RATE}
TRADING_DAYS_PER_YEAR = {TRADING_DAYS_PER_YEAR}
DAILY_RISK_FREE_RATE = {DAILY_RISK_FREE_RATE}
"""

    # Python reads source files as UTF-8 by default, so write the file with
    # that encoding explicitly rather than the platform's locale encoding
    # (which would break on non-ASCII characters in the paths).
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(config_content)

# --- Execute Step 3 ---
# For every selected date: overwrite config.py in the project root with
# that date's settings, then run run_sequence.py from the project root.
print("\n--- Step 3: Starting processing sequence ---")

if not dates_to_process:
    print("No dates to process. Halting execution.")
else:
    for date_str in dates_to_process:
        print(f"\n{'='*20} PROCESSING DATE: {date_str} {'='*20}")
        
        # Define the path for the config file (in the project root)
        config_file_path = ROOT_DIR / 'config.py'
        
        # 1. Create the config.py file for the current date
        # The same file is overwritten on every iteration.
        create_config_file(date_str, config_file_path)
        print(f"Successfully created config file: {config_file_path}")

        # 2. Run the external processing script
        # `%run -i` executes the script inside this notebook's namespace.
        # NOTE(review): run_sequence.py presumably picks up the freshly
        # written config.py; if it does so via `import config`, Python's
        # module cache may keep serving values from an earlier import when
        # several dates are processed in one loop — confirm.
        print(f"Executing run_sequence.py for {date_str}...")
        %run -i {ROOT_DIR / 'run_sequence.py'}
        print(f"--- Finished processing for {date_str} ---")

    print(f"\n{'='*20} WORKFLOW COMPLETE {'='*20}")