### Setup and Configuration

This cell contains all imports and user-configurable parameters for the analysis pipeline.

In [5]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pprint
import matplotlib.pyplot as plt # Import for plotting
from IPython.display import display, Markdown
from scipy.stats import linregress 

# --- 1. PANDAS & IPYTHON OPTIONS ---
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)
%load_ext autoreload
%autoreload 2

# --- 2. PROJECT PATH CONFIGURATION ---
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent.parent  # Adjust if your notebook is in a 'notebooks' subdirectory
DATA_DIR = ROOT_DIR / 'data'
SRC_DIR = ROOT_DIR / 'src'

# Add 'src' to the Python path to import custom modules
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# --- 3. IMPORT CUSTOM MODULES ---
import utils
import plotting_utils

# --- 4. ANALYSIS & FILTERING CONFIGURATION ---

# File searching parameters
# FILE_PREFIX = ''  # e.g., '2024'
FILE_CONTAINS_PATTERN = 'df_OHLCV_clean_stocks_etfs'

# # Parameters defining the time windows for metric calculation
PERIOD_PARAMS = {
    'lookback_days': 22,
    'recent_days': 0,
}

# This is not use for filtering, it's use to calculate metrics in SORT_ORDER
# Parameters for filtering the calculated metrics to find candidates
METRIC_FILTERS = {
    'min_lookback_improvement': 0,
    'current_rank_bracket_start': 1,
    'current_rank_bracket_end': 1000,
    # --- Select ONE mode by commenting out the others ---
    # 'Reversal' Mode
    'min_recent_bottom_to_recent_start': 0,
    'min_recent_bottom_to_current': 0,    
    # 'Dip' Mode
    # 'min_current_to_recent_start': 10,
}

# --- 5. VERIFICATION ---
print("--- Path Configuration ---")
print(f"✅ Project Root: {ROOT_DIR}")
print(f"✅ Data Dir:     {DATA_DIR}")
print(f"✅ Source Dir:   {SRC_DIR}")
assert all([ROOT_DIR.exists(), DATA_DIR.exists(), SRC_DIR.exists()]), "A key directory was not found!"

print("\n--- Module Verification ---")
print(f"✅ Successfully imported 'utils' and 'plotting_utils'.")



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
--- Path Configuration ---
✅ Project Root: c:\Users\ping\Files_win10\python\py311\stocks
✅ Data Dir:     c:\Users\ping\Files_win10\python\py311\stocks\data
✅ Source Dir:   c:\Users\ping\Files_win10\python\py311\stocks\src

--- Module Verification ---
✅ Successfully imported 'utils' and 'plotting_utils'.


In [6]:
print("--- Step 1: Loading latest consolidated Finviz data ---")

# Find the most recent file matching the pattern
# This function is now understood to return List[str] (filenames), not List[Path].
latest_finviz_filepaths = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    prefix='202',
    contains_pattern='df_finviz_merged_stocks_etfs',
    count=1
)

if not latest_finviz_filepaths:
    raise FileNotFoundError(f"No files found in '{DATA_DIR}' with prefix '{FILE_PREFIX}' and pattern '{FILE_CONTAINS_PATTERN}'")

# Get the filename string from the list
latest_filename = latest_finviz_filepaths[0]

# Manually construct the full path before loading
full_file_path = DATA_DIR / latest_filename
df_finviz_latest = pd.read_parquet(full_file_path, engine='pyarrow')


# --- Robust Index Setting (this logic remains correct) ---
if df_finviz_latest.index.name == 'Ticker':
    print("Info: 'Ticker' is already the index. No action needed.")
elif 'Ticker' in df_finviz_latest.columns:
    print("Info: 'Ticker' column found. Setting it as the DataFrame index.")
    df_finviz_latest.set_index('Ticker', inplace=True)
elif 'ticker' in df_finviz_latest.columns:
    print("Info: 'ticker' column found. Renaming and setting as index.")
    df_finviz_latest.rename(columns={'ticker': 'Ticker'}, inplace=True)
    df_finviz_latest.set_index('Ticker', inplace=True)
elif df_finviz_latest.index.name is None:
    print("Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.")
    df_finviz_latest.index.name = 'Ticker'
else:
    print("ERROR: Loaded DataFrame has an unexpected format.")
    print(f"Columns: {df_finviz_latest.columns.tolist()}")
    print(f"Index Name: '{df_finviz_latest.index.name}'")
    raise ValueError("Could not find a 'Ticker' column or a usable index to proceed.")


# Correct the print statement to work with the filename string
print(f"✅ Successfully loaded: {latest_filename}")
print(f"Shape: {df_finviz_latest.shape}")
print(df_finviz_latest.head(3))

--- Step 1: Loading latest consolidated Finviz data ---
Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.
✅ Successfully loaded: 2025-09-12_df_finviz_merged_stocks_etfs.parquet
Shape: (1463, 139)
        No.                Company               Index      Sector                   Industry Country Exchange                                   Info  MktCap AUM, M  Rank  Market Cap, M    P/E  Fwd P/E   PEG    P/S    P/B    P/C  P/FCF  Book/sh  Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %    EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %   ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf 3D %  Per

In [7]:
# Manually construct the full path before loading
full_file_path = DATA_DIR / 'df_OHLCV_clean_stocks_etfs.parquet'
df_OHLCV = pd.read_parquet(full_file_path, engine='pyarrow')

### Cell 2: The Chronological Split Code

This cell contains the logic to find the split date and create the `df_train` and `df_test` DataFrames.

In [8]:
# --- 1. Find the Chronological Split Point ---

# Get all unique dates from the index and sort them
unique_dates = df_OHLCV.index.get_level_values('Date').unique().sort_values()

# Determine the index for the 70% split
split_index = int(len(unique_dates) * 0.7)

# Find the actual date at that split index
split_date = unique_dates[split_index]

print(f"Total unique trading dates in dataset: {len(unique_dates)}")
print(f"The data will be split on the date: {split_date.date()}")

# --- 2. Create the Training and Testing Sets ---

# The training set includes all data UP TO and INCLUDING the split_date
df_train = df_OHLCV[df_OHLCV.index.get_level_values('Date') <= split_date]

# The testing set includes all data AFTER the split_date
df_test = df_OHLCV[df_OHLCV.index.get_level_values('Date') > split_date]


# --- 3. Verify the Split ---

print("\n--- Verification ---")
print(f"Original DataFrame shape: {df_OHLCV.shape}")
print(f"Training set shape:   {df_train.shape}")
print(f"Testing set shape:    {df_test.shape}")

print("\nDate Ranges:")
print(f"  Training: {df_train.index.get_level_values('Date').min().date()} to {df_train.index.get_level_values('Date').max().date()}")
print(f"  Testing:  {df_test.index.get_level_values('Date').min().date()} to {df_test.index.get_level_values('Date').max().date()}")

# Final check to ensure no overlap
assert df_train.index.get_level_values('Date').max() < df_test.index.get_level_values('Date').min()
print("\nVerification successful: There is no date overlap between train and test sets.")

Total unique trading dates in dataset: 250
The data will be split on the date: 2025-05-28

--- Verification ---
Original DataFrame shape: (371000, 5)
Training set shape:   (261184, 5)
Testing set shape:    (109816, 5)

Date Ranges:
  Training: 2024-09-13 to 2025-05-28
  Testing:  2025-05-29 to 2025-09-12

Verification successful: There is no date overlap between train and test sets.


In [9]:
df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2024-09-13,135.7660,136.5590,135.3290,136.2620,924860
AAL,2024-09-13,10.9800,11.1800,10.6100,10.6900,36956000
AAON,2024-09-13,93.6669,94.8421,93.0793,93.7367,208147
AAPL,2024-09-13,222.5440,223.0020,220.8820,221.4690,36937711
ABBV,2024-09-13,187.1660,187.8710,185.1870,187.5040,2815309
...,...,...,...,...,...,...
ZM,2025-05-28,79.1600,80.1750,79.0150,79.6700,2481400
ZS,2025-05-28,256.0000,256.0000,252.5000,253.6500,2223400
ZTO,2025-05-28,17.0900,17.6400,17.0900,17.5300,4477600
ZTS,2025-05-28,165.6060,166.0950,163.9720,164.8490,1729463


In [10]:
df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2025-05-29,116.7040,116.7440,112.7630,113.0420,3845481
AAL,2025-05-29,11.6400,11.6700,11.3100,11.4000,61318000
AAON,2025-05-29,98.1157,98.1157,95.4418,95.9307,547849
AAPL,2025-05-29,203.3490,203.5790,198.2850,199.7230,51455131
ABBV,2025-05-29,181.8890,185.1510,181.1260,184.0310,4968445
...,...,...,...,...,...,...
ZM,2025-09-12,84.6000,85.0000,83.8200,83.9800,1856513
ZS,2025-09-12,287.0000,288.5300,282.4600,283.1900,1340900
ZTO,2025-09-12,19.2600,19.2900,19.0003,19.0600,735599
ZTS,2025-09-12,149.6450,150.0500,146.5350,148.2000,3111221


### High-Level Review

*   **Configuration Management:** The parameters are split. `strategy_params` are static, while `lookback` and `rolling` are loop variables. This is a common pattern, but it can be improved by unifying them into a single configuration object for each run. This makes logging and debugging much clearer.
*   **Encapsulation:** The optimization loop contains three distinct responsibilities: 1) generating parameter combinations, 2) running the backtest for each combination, and 3) calculating performance metrics. These can be encapsulated into their own functions to make the top-level script cleaner and the components more reusable.
*   **Performance:** The current structure is logically sound for an optimization task. The most expensive part, `precompute_signals`, *must* be re-run for each parameter combination because its calculations depend on `lookback` and `rolling`. While we can't eliminate this work, we can ensure the structure is clean and acknowledge that for massive speed-ups, one might need to switch to a specialized backtesting library (like `vectorbt`) that can test many parameter combinations at once. We will focus on making the current Python/Pandas implementation as clean and well-structured as possible.

### Refactoring Strategy

1.  **Consolidate Parameters:** We'll combine the loop parameters and the static `strategy_params` into a single configuration dictionary for each backtest run.
2.  **Create a Performance Analysis Function:** We'll extract the metric calculation logic (win rate, total return, etc.) into a dedicated `analyze_performance` function. This isolates analysis from the optimization loop.
3.  **Create an Optimization Orchestrator:** We'll wrap the entire optimization loop in a new function, `run_parameter_optimization`. This function will be responsible for iterating through parameter sets, calling the backtest, calling the performance analysis, and collecting the results.
4.  **Refine the `run_backtest` Signature:** We'll update `run_backtest` and its helpers to accept the new consolidated configuration dictionary. This simplifies the function calls.

### Refactored Code

Here is the complete, refactored solution. I've included the previously refactored functions with slight modifications to accept the new configuration structure.

In [11]:
import numpy as np
import pandas as pd

def calculate_rolling_z_scores(df_group, rolling_window=20):
    """
    Calculates a rolling Z-score for the 'Adj Low' price for an entire ticker history.
    
    This function is designed to be used with pandas' groupby().apply().
    
    Args:
        df_group (pd.DataFrame): The DataFrame for a single ticker.
        rolling_window (int): The lookback window for calculating mean and std.
        
    Returns:
        pd.Series: A Series of Z-scores with the same index as the input df_group.
    """
    # Use the column directly, which is more efficient.
    low_price = df_group['Adj Low']
    
    # Calculate rolling statistics. min_periods=1 ensures we get values even at the start.
    rolling_mean = low_price.rolling(window=rolling_window, min_periods=1).mean()
    rolling_std = low_price.rolling(window=rolling_window, min_periods=1).std()
    
    # Calculate the Z-score for all dates at once.
    z_score = (low_price - rolling_mean) / rolling_std
    
    # --- ROBUSTNESS IMPROVEMENTS ---
    # 1. Handle division by zero: if std is 0, z_score is inf. Replace with 0.
    z_score = z_score.replace([np.inf, -np.inf], 0)
    
    # 2. Fill any initial NaNs (from std dev calculation) with 0.
    z_score = z_score.fillna(0)
    
    # Return the entire series, renamed appropriately.
    return z_score.rename('low_rolling_z_score')


In [12]:
import numpy as np
import pandas as pd
from scipy.stats import linregress # We no longer need this for the vectorized function

def analyze_ticker_trends_vectorized(df_group, lookback_days=60):
    """
    Vectorized analysis of trends for a ticker's price and volume.
    
    This version calculates linear regression metrics using rolling covariance
    and variance, avoiding the limitations of .apply() and maximizing performance.
    """
    if len(df_group) < lookback_days:
        return None # groupby().apply() will correctly handle None returns by skipping the group

    # --- 1. Setup for Vectorized Regression ---
    
    # Create a time-series index [0, 1, 2, ...] that aligns with the data
    time_index = pd.Series(np.arange(len(df_group)), index=df_group.index)
    
    # The variance of a sequence [0, 1, ..., n-1] is constant. Pre-calculate it.
    var_time = np.var(np.arange(lookback_days), ddof=0)
    
    # --- 2. Calculate Rolling Metrics for Each Column ---

    # We'll work with a dictionary of the series we want to analyze
    series_to_analyze = {
        'high': df_group['Adj High'],
        'low': df_group['Adj Low'],
        'volume': df_group['Volume'].astype(float) # Ensure volume is float
    }
    
    df_results = pd.DataFrame(index=df_group.index)

    for name, series in series_to_analyze.items():
        # Rolling covariance between the series and the time index
        rolling_cov = time_index.rolling(window=lookback_days).cov(series, ddof=0)
        
        # Rolling variance of the series itself
        rolling_var_series = series.rolling(window=lookback_days).var(ddof=0)
        
        # Slope = cov(t, y) / var(t)
        df_results[f'{name}_slope'] = rolling_cov / var_time
        
        # R-squared = cov(t, y)^2 / (var(t) * var(y))
        # Add a small epsilon to prevent division by zero
        denominator = (var_time * rolling_var_series) + 1e-9
        df_results[f'{name}_r_squared'] = (rolling_cov**2) / denominator

    # --- 3. Calculate Volatility and Penalty Scores (as before) ---
    
    yesterday_low = df_group['Adj Low'].shift(1)
    worst_case_returns = (df_group['Adj High'] - yesterday_low) / yesterday_low
    df_results['unified_std_dev_returns'] = worst_case_returns.rolling(window=lookback_days).std(ddof=0)
    
    volume_std_dev = df_group['Volume'].pct_change().rolling(window=lookback_days).std(ddof=0)
    df_results['volume_std_dev_returns'] = volume_std_dev
    
    df_results['low_penalty_score'] = (1 - df_results['low_r_squared']) * (df_results['unified_std_dev_returns'] + 1e-9)
    df_results['high_penalty_score'] = (1 - df_results['high_r_squared']) * (df_results['unified_std_dev_returns'] + 1e-9)
    df_results['volume_penalty_score'] = (1 - df_results['volume_r_squared']) * (df_results['volume_std_dev_returns'] + 1e-9)
    
    return df_results


#### Step 1: Refined Core Backtesting Functions

We'll modify the function signatures to accept a single `config` dictionary. This makes them more modular.

In [13]:
import pandas as pd
from tqdm import tqdm
from itertools import product

# Assume analyze_ticker_trends_vectorized and calculate_rolling_z_scores are defined elsewhere

def precompute_signals(df_ohlcv, config):
    """
    Pre-computes all trading signals and returns the full features DataFrame
    for the dates where signals were triggered.
    """
    print("Pre-computing features for this parameter set...")
    
    trends = df_ohlcv.groupby(level='Ticker', group_keys=False).apply(
        analyze_ticker_trends_vectorized, config['lookback_days']
    )
    
    z_scores = df_ohlcv.groupby(level='Ticker', group_keys=False).apply(
        calculate_rolling_z_scores, config['rolling_window']
    )
    
    features = trends.join(z_scores).dropna()

    signals = features[
        (features['low_slope'] > config['slope_thresh']) &
        (features['low_r_squared'] > config['r2_thresh']) &
        (features['volume_slope'] > 0) &
        (features['low_rolling_z_score'] < config['z_entry_thresh'])
    ]
    
    # --- KEY CHANGE: Return the full signals DataFrame, not just the index ---
    return signals

def run_backtest(df_ohlcv, config):
    """
    Orchestrates the backtesting process with enhanced logging.
    """
    # Now returns a DataFrame of features for triggered signals
    entry_signals_features = precompute_signals(df_ohlcv, config)
    
    trades = []
    open_positions = {}
    
    all_dates = df_ohlcv.index.get_level_values('Date').unique().sort_values()
    start_index = max(config['lookback_days'], config['rolling_window'])

    for i in tqdm(range(start_index, len(all_dates) - 1), desc="Backtesting"):
        current_date = all_dates[i]
        next_day_date = all_dates[i+1]

        closed_trades, open_positions = handle_exits_for_day(
            current_date, next_day_date, open_positions, df_ohlcv, config
        )
        trades.extend(closed_trades)

        # --- KEY CHANGE: Filter the features DataFrame for today's signals ---
        signals_today = entry_signals_features[
            entry_signals_features.index.get_level_values('Date') == current_date
        ]
        
        # Pass the full signals_today DataFrame to the handler
        open_positions = handle_entries_for_day(
            current_date, next_day_date, signals_today, open_positions, df_ohlcv
        )
                
    # --- Create the final DataFrame and reorder columns for clarity ---
    if not trades:
        return pd.DataFrame()

    final_trades_df = pd.DataFrame(trades)
    log_columns = [
        'ticker', 'signal_date', 'entry_date', 'exit_signal_date', 'exit_date', 'reason',
        'return', 'entry_price_actual', 'exit_price_actual', 'exit_trigger_price', 
        'exit_target_value', 'entry_signal_features'
    ]
    # Ensure all columns exist, fill missing with None
    for col in log_columns:
        if col not in final_trades_df.columns:
            final_trades_df[col] = None
            
    return final_trades_df[log_columns]


### Step 2: New Encapsulated Helper Functions

These new functions isolate the logic for performance analysis and the optimization loop itself.

In [None]:
def analyze_performance(trade_results):
    """
    Calculates performance metrics from a DataFrame of trades.
    
    Returns a dictionary of key metrics.
    """
    if trade_results.empty:
        return {'num_trades': 0, 'win_rate': 0, 'avg_return': 0, 'total_return': 0}
    
    win_rate = (trade_results['return'] > 0).mean()
    total_return = (1 + trade_results['return']).prod() - 1
    avg_return = trade_results['return'].mean()
    
    return {
        'num_trades': len(trade_results),
        'win_rate': win_rate,
        'avg_return': avg_return,
        'total_return': total_return
    }

def run_parameter_optimization(df, param_grid, static_params):
    """
    Orchestrates the entire parameter optimization process.
    
    Args:
        df (pd.DataFrame): The OHLCV data.
        param_grid (dict): Dictionary with lists of parameters to test.
        static_params (dict): Dictionary of parameters that are not being optimized.

    Returns:
        pd.DataFrame: A summary of results for each parameter combination.
    """
    results_log = []
    
    # Use itertools.product to create a clean generator for all combinations
    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in product(*values)]

    print(f"Starting optimization for {len(param_combinations)} combinations...")
    
    for param_set in tqdm(param_combinations, desc="Optimization Progress"):
        # Combine static and dynamic parameters into a single config for this run
        current_config = {**static_params, **param_set}
        
        # 1. Run the backtest with the current configuration
        trade_results = run_backtest(df, current_config)
        
        # 2. Analyze the performance of this run
        performance_metrics = analyze_performance(trade_results)
        
        # 3. Log the results
        log_entry = {**param_set, **performance_metrics}
        results_log.append(log_entry)
        
    return pd.DataFrame(results_log)

def handle_entries_for_day(current_date, next_day_date, signals_today, open_positions, df_ohlcv):
    """
    Processes entries and stores signal details in the open_positions dict.
    """
    # --- KEY CHANGE: Loop through the signals DataFrame ---
    for ticker, signal_row in signals_today.iterrows():
        # The ticker is now in the index of signal_row, so we use its name
        ticker_name = ticker[0] 
        
        if ticker_name not in open_positions:
            try:
                entry_price = df_ohlcv.loc[(ticker_name, next_day_date), 'Adj High']
                
                # --- LOGGING: Store more info about the entry signal ---
                open_positions[ticker_name] = {
                    'entry_date': next_day_date,
                    'entry_price': entry_price,
                    'signal_date': current_date,
                    'signal_features': signal_row.to_dict() # Store all features that triggered the signal
                }
            except KeyError:
                pass
                
    return open_positions

def handle_exits_for_day(current_date, next_day_date, open_positions, df_ohlcv, config):
    """
    Checks for exits and logs detailed information about the exit trigger.
    Corrected version with valid syntax for the if/elif chain.
    """
    closed_trades = []
    positions_to_close = []

    for ticker, pos in open_positions.items():
        try:
            current_close_price = df_ohlcv.loc[(ticker, current_date), 'Adj Close']
        except KeyError:
            continue 

        exit_reason = None
        exit_target_value = None 
        
        # --- SYNTAX FIX: Calculate all threshold values *before* the conditional block ---
        profit_target_price = pos['entry_price'] * (1 + config['profit_target'])
        stop_loss_price = pos['entry_price'] * (1 - config['stop_loss'])
        days_held = (current_date.to_pydatetime().date() - pos['entry_date'].to_pydatetime().date()).days
        
        # --- Now, check conditions in a contiguous if/elif/elif block ---
        if current_close_price >= profit_target_price:
            exit_reason = "Profit Target"
            exit_target_value = profit_target_price 

        elif current_close_price <= stop_loss_price:
            exit_reason = "Stop-Loss"
            exit_target_value = stop_loss_price 

        elif days_held >= config['time_hold_days']:
            exit_reason = "Time Hold"
            exit_target_value = days_held 

        if exit_reason:
            try:
                exit_price = df_ohlcv.loc[(ticker, next_day_date), 'Adj Low']
                trade_return = (exit_price - pos['entry_price']) / pos['entry_price']
                
                trade_log = {
                    'ticker': ticker, 
                    'entry_date': pos['entry_date'], 
                    'exit_date': next_day_date,
                    'return': trade_return, 
                    'reason': exit_reason,
                    'signal_date': pos['signal_date'],
                    'entry_signal_features': pos['signal_features'],
                    'entry_price_actual': pos['entry_price'],
                    'exit_signal_date': current_date,
                    'exit_trigger_price': current_close_price,
                    'exit_target_value': exit_target_value,
                    'exit_price_actual': exit_price,
                }
                closed_trades.append(trade_log)
                positions_to_close.append(ticker)
            except KeyError:
                pass
                
    for ticker in positions_to_close:
        del open_positions[ticker]
        
    return closed_trades, open_positions

#### Step 3: The New, Clean Top-Level Script

Your main script is now incredibly simple and readable. It's all about configuration and orchestration.

In [40]:
# --- 1. DEFINE CONFIGURATION ---

# Parameters to be optimized, defining the search space
optimization_grid = {
    'lookback_days': [30, 60, 90],
    'rolling_window': [15, 20]
}

# Static strategy parameters that do not change during optimization
strategy_params = {
    'slope_thresh': 1.0,
    'r2_thresh': 0.50,
    'z_entry_thresh': 0,
    'profit_target': 0.10,
    'stop_loss': 0.05,
    'time_hold_days': 20
}


# --- 2. RUN ORCHESTRATOR ---

# The main call is now a single, descriptive function
optimization_results = run_parameter_optimization(
    df_train, optimization_grid, strategy_params
)


# --- 3. ANALYZE RESULTS ---

print("\n\n--- Optimization Complete ---")
print(optimization_results.sort_values(by='total_return', ascending=False))

Starting optimization for 6 combinations...


Optimization Progress:   0%|          | 0/6 [00:00<?, ?it/s]

Pre-computing features for this parameter set...


Backtesting: 100%|██████████| 145/145 [00:01<00:00, 103.66it/s]
Optimization Progress:  17%|█▋        | 1/6 [00:46<03:51, 46.28s/it]

Pre-computing features for this parameter set...


Backtesting: 100%|██████████| 145/145 [00:01<00:00, 116.10it/s]
Optimization Progress:  33%|███▎      | 2/6 [01:30<02:59, 44.82s/it]

Pre-computing features for this parameter set...


Backtesting: 100%|██████████| 115/115 [00:01<00:00, 111.47it/s]
Optimization Progress:  50%|█████     | 3/6 [02:16<02:16, 45.50s/it]

Pre-computing features for this parameter set...


Backtesting: 100%|██████████| 115/115 [00:00<00:00, 131.42it/s]
Optimization Progress:  67%|██████▋   | 4/6 [03:02<01:31, 45.70s/it]

Pre-computing features for this parameter set...


Backtesting: 100%|██████████| 85/85 [00:00<00:00, 214.89it/s]
Optimization Progress:  83%|████████▎ | 5/6 [03:45<00:44, 44.88s/it]

Pre-computing features for this parameter set...


Backtesting: 100%|██████████| 85/85 [00:00<00:00, 215.61it/s]
Optimization Progress: 100%|██████████| 6/6 [04:29<00:00, 44.90s/it]



--- Optimization Complete ---
   lookback_days  rolling_window  num_trades  win_rate  avg_return  total_return
5             90              20          59  0.186441   -0.081050     -0.994506
4             90              15          66  0.151515   -0.080568     -0.996883
3             60              20         159  0.207547   -0.050479     -0.999825
1             30              20         157  0.210191   -0.057217     -0.999946
2             60              15         178  0.196629   -0.053968     -0.999968
0             30              15         252  0.238095   -0.052574     -1.000000





### Step 1: The "One-Trade" Deep Dive

The most powerful debugging technique is to isolate a single trade and follow it from signal generation to exit. If the logic holds for one trade, it's likely correct for all of them.

1.  **Pick a Winning Trade and a Losing Trade:** Run one of the backtests again (e.g., the one with `lookback=30`, `rolling=20`) and save the `trade_results` DataFrame.

In [41]:
# 1. Pick a configuration to analyze
config_to_test = {
    **strategy_params, 
    'lookback_days': 30, 
    'rolling_window': 20
}

# 2. Run the backtest to get the detailed trade log
trade_log_df = run_backtest(df_train, config_to_test)

# 3. Isolate and inspect a single trade
if not trade_log_df.empty:
    # Get the first losing trade
    losing_trade = trade_log_df[trade_log_df['return'] < 0].iloc[0]

    print("--- Detailed Log for a Single Losing Trade ---")
    # Using .T transposes the Series for easy vertical reading
    print(losing_trade.T)
else:
    print("No trades were made for this configuration.")

Pre-computing features for this parameter set...


Backtesting: 100%|██████████| 145/145 [00:00<00:00, 172.88it/s]

--- Detailed Log for a Single Losing Trade ---
ticker                                                                  UI
signal_date                                            2024-11-05 00:00:00
entry_date                                             2024-11-06 00:00:00
exit_signal_date                                       2024-11-06 00:00:00
exit_date                                              2024-11-07 00:00:00
reason                                                           Stop-Loss
return                                                           -0.052919
entry_price_actual                                                 266.805
exit_price_actual                                                  252.686
exit_trigger_price                                                 250.481
exit_target_value                                                253.46475
entry_signal_features    {'high_slope': 1.907652280311449, 'high_r_squa...
Name: 3, dtype: object





In [42]:
losing_trade.entry_signal_features

{'high_slope': 1.907652280311449,
 'high_r_squared': 0.9624262377381637,
 'low_slope': 1.7543586206896535,
 'low_r_squared': 0.9102128986812584,
 'volume_slope': 785.3339265850957,
 'volume_r_squared': 0.12062909556348794,
 'unified_std_dev_returns': 0.015844150831881298,
 'volume_std_dev_returns': 0.3589794852718475,
 'low_penalty_score': 0.0014226004658386508,
 'high_penalty_score': 0.0005953243941715457,
 'volume_penalty_score': 0.315676115517029,
 'low_rolling_z_score': -0.3616785939465286}

In [43]:
df_loss = trade_log_df[trade_log_df['return'] < 0]
print(df_loss)

    ticker signal_date entry_date exit_signal_date  exit_date         reason    return  entry_price_actual  exit_price_actual  exit_trigger_price  exit_target_value                              entry_signal_features
3       UI  2024-11-05 2024-11-06       2024-11-06 2024-11-07      Stop-Loss -0.052919             266.805            252.686             250.481          253.46475  {'high_slope': 1.907652280311449, 'high_r_squa...
5     CACI  2024-11-13 2024-11-14       2024-11-14 2024-11-15      Stop-Loss -0.142562             542.220            464.920             491.320          515.10900  {'high_slope': 2.4967675194661108, 'high_r_squ...
6     LDOS  2024-11-14 2024-11-15       2024-11-18 2024-11-19      Stop-Loss -0.066974             167.333            156.126             157.525          158.96635  {'high_slope': 1.3691176863181362, 'high_r_squ...
7     MSCI  2024-10-29 2024-10-30       2024-11-19 2024-11-20      Time Hold -0.017101             581.385            571.443           

In [48]:
df_win = trade_log_df[trade_log_df['return'] > 0]
print(df_win)

    ticker signal_date entry_date exit_signal_date  exit_date         reason    return  entry_price_actual  exit_price_actual  exit_trigger_price  exit_target_value                              entry_signal_features
0      FIX  2024-10-25 2024-10-28       2024-11-06 2024-11-07  Profit Target  0.110984             389.425            432.645             435.664           428.3675  {'high_slope': 1.780408898776421, 'high_r_squa...
1     COIN  2024-10-31 2024-11-01       2024-11-06 2024-11-07  Profit Target  0.269195             190.940            242.340             254.310           210.0340  {'high_slope': 1.907668520578417, 'high_r_squa...
2       GS  2024-11-04 2024-11-05       2024-11-06 2024-11-07  Profit Target  0.102491             516.913            569.892             584.062           568.6043  {'high_slope': 1.3004215795328118, 'high_r_squ...
4       CW  2024-11-01 2024-11-04       2024-11-11 2024-11-12  Profit Target  0.086341             352.764            383.222           

In [44]:
optimization_results

Unnamed: 0,lookback_days,rolling_window,num_trades,win_rate,avg_return,total_return
0,30,15,252,0.238095,-0.052574,-1.0
1,30,20,157,0.210191,-0.057217,-0.999946
2,60,15,178,0.196629,-0.053968,-0.999968
3,60,20,159,0.207547,-0.050479,-0.999825
4,90,15,66,0.151515,-0.080568,-0.996883
5,90,20,59,0.186441,-0.08105,-0.994506


In [45]:
# Pick a configuration to analyze
config_to_test = {**strategy_params, 'lookback_days': 30, 'rolling_window': 20}

# Run a single backtest and get the detailed trade log
single_run_trades = run_backtest(df_train, config_to_test)

# Find a winning and a losing trade to investigate
print("Sample winning trade:")
print(single_run_trades[single_run_trades['return'] > 0].head(1))

print("\nSample losing trade:")
print(single_run_trades[single_run_trades['return'] < 0].head(1))

Pre-computing features for this parameter set...


Backtesting: 100%|██████████| 145/145 [00:00<00:00, 175.26it/s]


Sample winning trade:
  ticker signal_date entry_date exit_signal_date  exit_date         reason    return  entry_price_actual  exit_price_actual  exit_trigger_price  exit_target_value                              entry_signal_features
0    FIX  2024-10-25 2024-10-28       2024-11-06 2024-11-07  Profit Target  0.110984             389.425            432.645             435.664           428.3675  {'high_slope': 1.780408898776421, 'high_r_squa...

Sample losing trade:
  ticker signal_date entry_date exit_signal_date  exit_date     reason    return  entry_price_actual  exit_price_actual  exit_trigger_price  exit_target_value                              entry_signal_features
3     UI  2024-11-05 2024-11-06       2024-11-06 2024-11-07  Stop-Loss -0.052919             266.805            252.686             250.481          253.46475  {'high_slope': 1.907652280311449, 'high_r_squa...


In [46]:
df_trends = analyze_ticker_trends_vectorized(df_train, lookback_days=30)


In [47]:
df_trends.loc['FIX', '2024-10-25']

high_slope                      7.393045
high_r_squared                  0.018086
low_slope                       7.150013
low_r_squared                   0.017813
volume_slope              -264511.524138
volume_r_squared                0.091981
unified_std_dev_returns         4.392628
volume_std_dev_returns         14.814005
low_penalty_score               4.314382
high_penalty_score              4.313183
volume_penalty_score           13.451396
Name: (FIX, 2024-10-25 00:00:00), dtype: float64