In [1]:
# --- Configuration & File Path Setup ---
import os
from IPython.display import display, Markdown

def get_ohlcv_files(temp_dir):
    """Return the OHLCV data files found in ``temp_dir``.

    The directory is created first if it does not exist, so a first run
    yields an empty list instead of raising ``FileNotFoundError``.

    Args:
        temp_dir: Directory to scan.

    Returns:
        Sorted list of file names (not full paths) that start with
        ``df_OHLCV_``. Sub-directories carrying the prefix are skipped
        because they cannot be loaded as data files.
    """
    os.makedirs(temp_dir, exist_ok=True)
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # numbers shown in the selection menu stable between runs.
    return sorted(
        name
        for name in os.listdir(temp_dir)
        if name.startswith('df_OHLCV_')
        and os.path.isfile(os.path.join(temp_dir, name))
    )

def display_file_selector(files):
    """Render the given file names as a numbered Markdown list.

    A bold heading is shown first, followed by one bullet per entry of
    ``files``; numbering starts at 1.
    """
    display(Markdown("**Available OHLCV files:**"))
    position = 0
    for name in files:
        position += 1
        display(Markdown(f"- ({position}) `{name}`"))
        
def get_user_choice(files):
    """Prompt until the user enters a valid 1-based index into ``files``.

    Args:
        files: Non-empty sequence of file names, in menu order.

    Returns:
        The element of ``files`` the user selected.

    Raises:
        ValueError: If ``files`` is empty. No number could ever be valid
            in that case, so prompting would loop forever.
    """
    total = len(files)
    if total == 0:
        raise ValueError("No files to choose from")

    # The prompt is the same on every attempt, so build it once.
    # \033[1m ... \033[0m are ANSI escape codes that switch bold on and off.
    prompt = f"\n\033[1mSelect file (1-{total}): \033[0m"
    while True:
        # Only the int() conversion is expected to fail on bad input.
        try:
            choice = int(input(prompt))
        except ValueError:
            display(Markdown("<span style='color:red'>Numbers only!</span>"))
            continue
        if 1 <= choice <= total:
            return files[choice - 1]
        display(Markdown(f"<span style='color:red'>Enter 1-{total}</span>"))

def generate_clean_filename(source_file):
    """Build the output file name by inserting ``_clean`` before the extension.

    For example ``data.pkl`` becomes ``data_clean.pkl``; a name without an
    extension simply gets ``_clean`` appended.
    """
    stem, suffix = os.path.splitext(source_file)
    cleaned_name = f"{stem}_clean{suffix}"
    return cleaned_name

def main_processor(temp_dir='temp'):
    """Run the interactive file selection and return the chosen paths.

    Lists the OHLCV files in ``temp_dir``, asks the user to pick one,
    derives the ``_clean`` output name and displays both paths.

    Args:
        temp_dir: Directory that holds the OHLCV files (default ``'temp'``).

    Returns:
        A ``(source_path, dest_path)`` tuple, or ``(None, None)`` when no
        OHLCV files were found.
    """
    # Get matching files
    ohlcv_files = get_ohlcv_files(temp_dir)
    if not ohlcv_files:
        display(Markdown("**Error:** No OHLCV files found!"))
        return None, None

    # User interaction flow
    display_file_selector(ohlcv_files)
    selected_file = get_user_choice(ohlcv_files)
    clean_file = generate_clean_filename(selected_file)

    # Build paths
    source_path = os.path.join(temp_dir, selected_file)
    dest_path = os.path.join(temp_dir, clean_file)

    # Final confirmation. Every line is kept flush-left on purpose: Markdown
    # treats lines indented by four or more spaces as a code block, which
    # would show the raw asterisks and dashes instead of a bold heading
    # and a bullet list.
    display(Markdown(
        "\n**Selected paths:**\n"
        f"- Source: `{source_path}`  \n"
        f"- Destination: `{dest_path}`\n"
    ))
    return source_path, dest_path

# --- Execute the processor ---
SOURCE_PATH, DEST_PATH = main_processor()
# The loading and saving cells read `source_path` / `dest_path`; expose the
# selection under those names as well so they work when this is the
# selection cell that was run.
source_path, dest_path = SOURCE_PATH, DEST_PATH

**Available OHLCV files:**

- (1) `df_OHLCV_2025-03-03.pkl`

- (2) `df_OHLCV_2025-03-03_clean.pkl`

- (3) `df_OHLCV_2025-03-06.pkl`


    **Selected paths:**
    - Source: `temp\df_OHLCV_2025-03-03.pkl`  
    - Destination: `temp\df_OHLCV_2025-03-03_clean.pkl`
    

In [None]:
# --- Configuration & File Path Setup ---
import os
from IPython.display import display, Markdown  # Jupyter-specific display tools

# Create temp directory if needed
TEMP_DIR = os.path.join(os.getcwd(), 'temp')
os.makedirs(TEMP_DIR, exist_ok=True)

# List files starting with 'df_OHLCV_' in TEMP_DIR.
# os.listdir() gives no ordering guarantee, so sort to keep the menu
# numbers stable between runs.
matching_files = sorted(f for f in os.listdir(TEMP_DIR) if f.startswith('df_OHLCV_'))

if not matching_files:
    display(Markdown("**Error:** No matching files found in temp directory!"))
    # Raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, discarding all state. An exception stops this
    # cell (and a "Run All") while leaving the kernel alive.
    raise FileNotFoundError(f"No files starting with 'df_OHLCV_' found in {TEMP_DIR}")

# Show a bold heading, then one numbered Markdown bullet per file
display(Markdown("**Available files:**"))
idx = 0
for filename in matching_files:
    idx += 1  # menu numbers are 1-based
    display(Markdown(f"- ({idx}) `{filename}`"))

# Keep asking until the user types a number that maps to a listed file
while True:
    try:
        print('\nInput a number into the prompt (top of the screen) to select file to process')
        # ANSI bold (\033[1m ... \033[0m) makes the prompt stand out
        choice = int(input(f"\n\033[1mEnter choice (1-{len(matching_files)}): \033[0m"))
    except ValueError:
        # Input could not be parsed as an integer: complain and ask again
        display(Markdown("<span style='color:red'>Numbers only please!</span>"))
    else:
        if choice in range(1, len(matching_files) + 1):
            # Menu numbers are 1-based, list indices 0-based
            SOURCE_FILENAME = matching_files[choice - 1]
            break
        display(Markdown(f"<span style='color:red'>Invalid: Enter 1-{len(matching_files)}</span>"))

# Derive the output name: same stem and extension with "_clean" in between
base, ext = os.path.splitext(SOURCE_FILENAME)
CLEANED_FILENAME = f"{base}_clean{ext}"

# Source and destination both live in TEMP_DIR
source_path, dest_path = (
    os.path.join(TEMP_DIR, SOURCE_FILENAME),
    os.path.join(TEMP_DIR, CLEANED_FILENAME),
)

# Echo the resolved paths so the selection can be checked before processing
display(Markdown(
    "\n**Selected paths:**\n"
    f"- Source: `{source_path}`  \n"
    f"- Destination: `{dest_path}`\n"
))

In [None]:
# --- Data Loading & Initial Inspection ---
import pandas as pd

# Read the pickled object stored at the selected source path
df = pd.read_pickle(source_path)

# Quick look at the raw data: the first five rows, then the info() summary
print("[Raw Data Overview]")
display(df.head(n=5))
df.info()

In [None]:
# --- Data Filtering & Cleaning ---
import utils  # Custom utility functions

# 1. Restrict the data to the dates of the reference symbol (SPY)
#    NOTE(review): the exact alignment rules live in utils; confirm there.
df = utils.filter_df_dates_to_reference_symbol(reference_symbol='SPY', df=df)

# 2. Separate out the symbols reported as having missing values
#    (presumably df_clean excludes them; verify against utils)
df_clean, missing_symbols = utils.filter_symbols_with_missing_values(df)

# Report what was dropped, then show the structure of what is left
removed_count = len(missing_symbols)
print("\n[Cleaning Report]")
print(f"Removed {removed_count} symbols with missing data: {missing_symbols}")
print("\n[Cleaned Data Structure]")
df_clean.info()

In [None]:
# --- Save Cleaned Data ---
# Write the cleaned frame to the destination path as a pickle file
df_clean.to_pickle(path=dest_path)
save_message = f"\n[Save Successful] Cleaned data saved to:\n{dest_path}"
print(save_message)