### Setup and Configuration

This cell contains all imports and user-configurable parameters for the analysis pipeline.

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pprint
import matplotlib.pyplot as plt # Import for plotting
from IPython.display import display, Markdown
from scipy.stats import linregress 

# --- 1. PANDAS & IPYTHON OPTIONS ---
# Raise pandas' display limits: up to 200 rows, every column, 3000-character lines.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)
# Load the autoreload extension so edits to the project modules imported below
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# --- 2. PROJECT PATH CONFIGURATION ---
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent.parent  # Project root is taken to be two levels above the working directory; adjust if the notebook is moved
DATA_DIR = ROOT_DIR / 'data'
SRC_DIR = ROOT_DIR / 'src'

# Add 'src' to the Python path to import custom modules
# (guarded so that re-running the cell does not append the entry twice)
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# --- 3. IMPORT CUSTOM MODULES ---
# These are resolved through the SRC_DIR entry added to sys.path above.
import utils
import plotting_utils

# --- 4. ANALYSIS & FILTERING CONFIGURATION ---

# File searching parameters
# FILE_PREFIX = ''  # e.g., '2024'
FILE_CONTAINS_PATTERN = 'df_OHLCV_clean_stocks_etfs'

# Parameters defining the time windows for metric calculation.
# NOTE(review): presumably these are not used for filtering, only to calculate the
# metrics referenced by SORT_ORDER (not defined in this cell) — confirm.
PERIOD_PARAMS = {
    'lookback_days': 22,
    'recent_days': 0,
}

# Parameters for filtering the calculated metrics to find candidates
METRIC_FILTERS = {
    'min_lookback_improvement': 0,
    'current_rank_bracket_start': 1,
    'current_rank_bracket_end': 1000,
    # --- Select ONE mode by commenting out the others ---
    # 'Reversal' Mode
    'min_recent_bottom_to_recent_start': 0,
    'min_recent_bottom_to_current': 0,    
    # 'Dip' Mode
    # 'min_current_to_recent_start': 10,
}

# --- 5. VERIFICATION ---
# Print the resolved paths, then fail fast if any of the three directories is missing.
print("--- Path Configuration ---")
print(f"✅ Project Root: {ROOT_DIR}")
print(f"✅ Data Dir:     {DATA_DIR}")
print(f"✅ Source Dir:   {SRC_DIR}")
assert all([ROOT_DIR.exists(), DATA_DIR.exists(), SRC_DIR.exists()]), "A key directory was not found!"

print("\n--- Module Verification ---")
# Reaching this line means both `import` statements above succeeded.
print(f"✅ Successfully imported 'utils' and 'plotting_utils'.")



--- Path Configuration ---
✅ Project Root: c:\Users\ping\Files_win10\python\py311\stocks
✅ Data Dir:     c:\Users\ping\Files_win10\python\py311\stocks\data
✅ Source Dir:   c:\Users\ping\Files_win10\python\py311\stocks\src

--- Module Verification ---
✅ Successfully imported 'utils' and 'plotting_utils'.


In [2]:
print("--- Step 1: Loading latest consolidated Finviz data ---")

# Search criteria for the consolidated Finviz snapshot. They are named so that the
# error message below reports exactly what was searched for (the previous message
# referenced FILE_PREFIX, which is not defined, and the OHLCV file pattern).
FINVIZ_FILE_PREFIX = '202'
FINVIZ_FILE_PATTERN = 'df_finviz_merged_stocks_etfs'

# Find the most recent file matching the pattern.
# utils.get_recent_files is understood to return List[str] (filenames), not List[Path].
latest_finviz_filepaths = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    prefix=FINVIZ_FILE_PREFIX,
    contains_pattern=FINVIZ_FILE_PATTERN,
    count=1
)

if not latest_finviz_filepaths:
    raise FileNotFoundError(
        f"No files found in '{DATA_DIR}' with prefix '{FINVIZ_FILE_PREFIX}' "
        f"and pattern '{FINVIZ_FILE_PATTERN}'"
    )

# Get the filename string from the list
latest_filename = latest_finviz_filepaths[0]

# Manually construct the full path before loading
full_file_path = DATA_DIR / latest_filename
df_finviz_latest = pd.read_parquet(full_file_path, engine='pyarrow')


# --- Robust Index Setting ---
# Make sure the frame ends up indexed by 'Ticker', whichever layout the file used.
if df_finviz_latest.index.name == 'Ticker':
    print("Info: 'Ticker' is already the index. No action needed.")
elif 'Ticker' in df_finviz_latest.columns:
    print("Info: 'Ticker' column found. Setting it as the DataFrame index.")
    df_finviz_latest.set_index('Ticker', inplace=True)
elif 'ticker' in df_finviz_latest.columns:
    print("Info: 'ticker' column found. Renaming and setting as index.")
    df_finviz_latest.rename(columns={'ticker': 'Ticker'}, inplace=True)
    df_finviz_latest.set_index('Ticker', inplace=True)
elif df_finviz_latest.index.name is None:
    # No ticker column and an unnamed index: the index is assumed to hold tickers.
    print("Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.")
    df_finviz_latest.index.name = 'Ticker'
else:
    print("ERROR: Loaded DataFrame has an unexpected format.")
    print(f"Columns: {df_finviz_latest.columns.tolist()}")
    print(f"Index Name: '{df_finviz_latest.index.name}'")
    raise ValueError("Could not find a 'Ticker' column or a usable index to proceed.")


# Report what was loaded (latest_filename is a plain filename string).
print(f"✅ Successfully loaded: {latest_filename}")
print(f"Shape: {df_finviz_latest.shape}")
print(df_finviz_latest.head(3))

--- Step 1: Loading latest consolidated Finviz data ---
Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.
✅ Successfully loaded: 2025-09-11_df_finviz_merged_stocks_etfs.parquet
Shape: (1434, 139)
        No.                Company               Index      Sector                   Industry Country Exchange                                   Info  MktCap AUM, M  Rank  Market Cap, M    P/E  Fwd P/E   PEG    P/S    P/B    P/C  P/FCF  Book/sh  Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %    EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %   ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf 3D %  Per

In [3]:
# Load the cleaned OHLCV data from its fixed filename inside DATA_DIR.
# Note: this reassigns `full_file_path`, which the previous cell used for the Finviz file.
full_file_path = DATA_DIR / 'df_OHLCV_clean_stocks_etfs.parquet'
df_OHLCV = pd.read_parquet(full_file_path, engine='pyarrow')

In [4]:
# Show the loaded OHLCV frame as the cell output.
df_OHLCV

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2024-09-12,137.1450,137.5620,133.2250,135.3490,1628935
AA,2024-09-12,30.6070,31.3675,30.2218,30.9625,5961883
AAL,2024-09-12,11.0300,11.1400,10.4300,10.8800,37815400
AAON,2024-09-12,91.4261,93.1491,91.1672,93.0893,214272
AAPL,2024-09-12,221.4690,222.5140,218.8020,221.7380,37629918
...,...,...,...,...,...,...
ZM,2025-09-11,84.1000,85.1300,82.6358,84.7500,2843228
ZS,2025-09-11,282.0000,292.9400,281.4500,286.6600,1946425
ZTO,2025-09-11,19.0000,19.3400,19.0000,19.3400,834024
ZTS,2025-09-11,149.7750,151.9500,149.4200,150.6100,2808558


In [None]:
# Inspect the full OHLCV frame. `df_train` is only created by the split cell further
# down, so referencing it here fails on a fresh kernel (Restart & Run All).
df_OHLCV.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 364500 entries, ('A', Timestamp('2024-09-12 00:00:00')) to ('ZWS', Timestamp('2025-09-11 00:00:00'))
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Adj Open   364500 non-null  float64
 1   Adj High   364500 non-null  float64
 2   Adj Low    364500 non-null  float64
 3   Adj Close  364500 non-null  float64
 4   Volume     364500 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 15.3+ MB


In [None]:
# Single boolean: True if any NaN exists anywhere in the full OHLCV frame.
# The check runs on df_OHLCV because `df_train` does not exist yet at this point
# of a top-to-bottom run (it is created by the split cell below).
has_nan = df_OHLCV.isna().any().any()

print(f"Are there any NaN values in the DataFrame? {has_nan}")

Are there any NaN values in the DataFrame? False


### Cell 2: The Chronological Split Code

This cell contains the logic to find the split date and create the `df_train` and `df_test` DataFrames.

In [None]:
# --- 1. Find the Chronological Split Point ---

# Fraction of the unique trading dates used to locate the split date.
TRAIN_FRACTION = 0.7

# Date level of the FULL frame. The split is derived from df_OHLCV, never from
# df_train: df_train is an output of this cell, so reading it here would fail on a
# fresh kernel and would shrink the date range every time the cell is re-run.
ohlcv_dates = df_OHLCV.index.get_level_values('Date')

# Get all unique dates from the index and sort them
unique_dates = ohlcv_dates.unique().sort_values()

# Determine the positional index of the split date
split_index = int(len(unique_dates) * TRAIN_FRACTION)

# Find the actual date at that split index. The split date itself goes to the
# training set, so training covers split_index + 1 unique dates.
split_date = unique_dates[split_index]

print(f"Total unique trading dates in dataset: {len(unique_dates)}")
print(f"The data will be split on the date: {split_date.date()}")

# --- 2. Create the Training and Testing Sets ---

# The training set includes all data UP TO and INCLUDING the split_date
df_train = df_OHLCV[ohlcv_dates <= split_date]

# The testing set includes all data AFTER the split_date
df_test = df_OHLCV[ohlcv_dates > split_date]


# --- 3. Verify the Split ---

print("\n--- Verification ---")
print(f"Original DataFrame shape: {df_OHLCV.shape}")
print(f"Training set shape:   {df_train.shape}")
print(f"Testing set shape:    {df_test.shape}")

print("\nDate Ranges:")
print(f"  Training: {df_train.index.get_level_values('Date').min().date()} to {df_train.index.get_level_values('Date').max().date()}")
print(f"  Testing:  {df_test.index.get_level_values('Date').min().date()} to {df_test.index.get_level_values('Date').max().date()}")

# Every row must land in exactly one of the two sets
assert len(df_train) + len(df_test) == len(df_OHLCV), "Train/test split lost or duplicated rows."

# Final check to ensure no overlap
assert df_train.index.get_level_values('Date').max() < df_test.index.get_level_values('Date').min()
print("\nVerification successful: There is no date overlap between train and test sets.")

Total unique trading dates in dataset: 250
The data will be split on the date: 2025-05-27

--- Verification ---
Original DataFrame shape: (364500, 5)
Training set shape:   (256608, 5)
Testing set shape:    (107892, 5)

Date Ranges:
  Training: 2024-09-12 to 2025-05-27
  Testing:  2025-05-28 to 2025-09-11

Verification successful: There is no date overlap between train and test sets.


In [11]:
# Show the training split (rows dated on or before the split date).
df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2024-09-12,137.1450,137.5620,133.2250,135.3490,1628935
AA,2024-09-12,30.6070,31.3675,30.2218,30.9625,5961883
AAL,2024-09-12,11.0300,11.1400,10.4300,10.8800,37815400
AAON,2024-09-12,91.4261,93.1491,91.1672,93.0893,214272
AAPL,2024-09-12,221.4690,222.5140,218.8020,221.7380,37629918
...,...,...,...,...,...,...
ZM,2025-05-27,79.9250,80.3400,78.3400,78.9000,3780900
ZS,2025-05-27,257.7000,259.4000,256.1400,257.3000,1758700
ZTO,2025-05-27,16.8300,17.0400,16.7300,16.7600,3232300
ZTS,2025-05-27,163.4530,166.3340,163.2840,165.7060,2330868


In [12]:
# Show the testing split (rows dated after the split date).
df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2025-05-28,110.5970,111.3460,109.9090,110.6470,2738855
AA,2025-05-28,28.1636,28.1836,27.5139,27.7849,4875623
AAL,2025-05-28,11.7000,11.7400,11.3900,11.4300,61811800
AAON,2025-05-28,99.9215,100.3110,96.6191,96.9084,480295
AAPL,2025-05-28,200.3630,202.5000,199.6730,200.1930,45391157
...,...,...,...,...,...,...
ZM,2025-09-11,84.1000,85.1300,82.6358,84.7500,2843228
ZS,2025-09-11,282.0000,292.9400,281.4500,286.6600,1946425
ZTO,2025-09-11,19.0000,19.3400,19.0000,19.3400,834024
ZTS,2025-09-11,149.7750,151.9500,149.4200,150.6100,2808558


### 2. The Enhanced Analysis Function

This is the core of our new logic. The function `analyze_ticker_trends` will be applied to each ticker. It contains a nested helper function, `_perform_price_regression`, so that the High and Low regressions share one implementation instead of repeating the code.


In [13]:
def analyze_ticker_trends(df_group, lookback_days=60):
    """
    Summarise the recent trend of one ticker's High/Low channel and of its Volume.

    The last `lookback_days` rows are rebased to their first value and regressed
    against the positions 0..n-1. Each trend gets a penalty score equal to
    (1 - R^2) times a volatility figure (plus a tiny constant).

    - Both price series share one "worst-case" volatility: the population standard
      deviation of (today's High - yesterday's Low) / yesterday's Low.
    - Volume uses the population standard deviation of its own pct_change().

    Args:
        df_group: rows of a single ticker providing the 'Adj High', 'Adj Low' and
            'Volume' columns (assumed to be in chronological order — TODO confirm).
        lookback_days: number of trailing rows to analyse.

    Returns:
        pd.Series with eleven metrics. All of them are NaN when fewer than
        `lookback_days` rows (or fewer than 10 rows) are available.
    """
    tiny = 1e-9  # keeps a penalty from collapsing to exactly zero when volatility is zero
    output_fields = [
        'high_slope', 'high_r_squared', 'high_penalty_score',
        'low_slope', 'low_r_squared', 'low_penalty_score',
        'unified_std_dev_returns',
        'volume_slope', 'volume_r_squared',
        'volume_std_dev_returns', 'volume_penalty_score',
    ]

    def _zero_if_missing(value):
        """Replace an undefined (NaN) volatility with 0.0."""
        return 0.0 if pd.isna(value) else value

    def _channel_side(prices, steps, shared_vol):
        """Regress one rebased price series; returns (slope, r_squared, penalty)."""
        rebased = prices / prices.iloc[0]
        try:
            fit = linregress(x=steps, y=rebased)
            slope, r_squared = fit.slope, fit.rvalue**2
        except (ValueError, ZeroDivisionError):
            return 0.0, np.nan, np.nan
        return slope, r_squared, (1 - r_squared) * (shared_vol + tiny)

    # --- Lookback window and the not-enough-history guard ---
    recent = df_group.tail(lookback_days)
    n_rows = len(recent)
    if n_rows < max(lookback_days, 10):
        return pd.Series(np.nan, index=output_fields)

    # --- Shared "worst-case" price volatility ---
    prior_low = recent['Adj Low'].shift(1)
    spread_returns = (recent['Adj High'] - prior_low) / prior_low
    shared_vol = _zero_if_missing(spread_returns.std(ddof=0))  # ddof=0: population std dev

    # --- Volume trend and volatility ---
    steps = np.arange(n_rows)
    first_volume = recent['Volume'].iloc[0]
    # Defaults cover both a non-positive starting volume and a failed regression.
    volume_slope, volume_r_squared = 0.0, np.nan
    if first_volume > 0:
        rebased_volume = recent['Volume'] / first_volume
        try:
            volume_fit = linregress(x=steps, y=rebased_volume)
            volume_slope, volume_r_squared = volume_fit.slope, volume_fit.rvalue**2
        except ValueError:
            pass

    volume_vol = _zero_if_missing(recent['Volume'].pct_change().std(ddof=0))
    volume_penalty = (1 - volume_r_squared) * (volume_vol + tiny)

    # --- Price channel: High and Low share the same volatility figure ---
    high_slope, high_r2, high_penalty = _channel_side(recent['Adj High'], steps, shared_vol)
    low_slope, low_r2, low_penalty = _channel_side(recent['Adj Low'], steps, shared_vol)

    # --- Assemble the metrics in the documented order ---
    values = [
        high_slope, high_r2, high_penalty,
        low_slope, low_r2, low_penalty,
        shared_vol,
        volume_slope, volume_r_squared,
        volume_vol, volume_penalty,
    ]
    return pd.Series(dict(zip(output_fields, values)))

### 3. Running the Analysis

Now we apply the function to every ticker in the training DataFrame (`df_train`) using `groupby().apply()`.


In [14]:
# Compute the trend metrics once per ticker over the training set.
# level=0 selects the first index level (shown as 'Ticker' in the frames displayed above).
channel_analysis = df_train.groupby(level=0).apply(
    analyze_ticker_trends, lookback_days=60
)

# Plain-text dump of the per-ticker metrics table.
print("Channel Analysis Results:")
print(channel_analysis)

Channel Analysis Results:
        high_slope  high_r_squared  high_penalty_score  low_slope  low_r_squared  low_penalty_score  unified_std_dev_returns  volume_slope  volume_r_squared  volume_std_dev_returns  volume_penalty_score
Ticker                                                                                                                                                                                                    
A        -0.002311        0.463536            0.013775  -0.002208       0.359508           0.016446                 0.025678     -0.002725          0.016087                0.336662              0.331246
AA       -0.004000        0.421278            0.024472  -0.003863       0.316432           0.028906                 0.042287      0.006194          0.050110                0.368025              0.349583
AAL      -0.001568        0.107854            0.043716  -0.001291       0.065503           0.045791                 0.049001     -0.003689          0.037066      

In [15]:
# (rows, columns) of the metrics table: one row per ticker group, one column per metric.
channel_analysis.shape

(1458, 11)

In [16]:
# Collapse the element-wise missing-value mask to one flag:
# True as soon as a single cell of the metrics table is NaN.
has_nan = channel_analysis.isna().to_numpy().any()

print(f"Are there any NaN values in the DataFrame? {has_nan}")

Are there any NaN values in the DataFrame? False


# Normalize the Residual:  
To solve the "volatility is not constant" problem, we should not use the raw dollar residual. We should normalize it by dividing it by the standard deviation of the residuals over the lookback period. This creates a "Standardized Residual" or "Z-Score".

### Setup: Imports and a Special Sample Price Series

First, let's create a price series that has a clear uptrend but with two distinct volatility "regimes": a quiet first half and a choppy, more volatile second half. This will help us see the problem with raw residuals.

In [None]:
# import pandas as pd
# import numpy as np
# from scipy.stats import linregress
# import matplotlib.pyplot as plt

# # --- A PERFECT, MANUALLY-CRAFTED EXAMPLE (Corrected) ---
# time_index = np.arange(100)
# trend = 0.5 * time_index + 100

# # Manually create the residuals
# quiet_residuals = pd.Series([0.5, -0.5, 1.0, -1.0, 0.25] * 10) # 50 elements
# volatile_residuals = pd.Series([1.5, -1.5, 3.0, -3.0, 0.75] * 10) # 50 elements

# # The Key Intervention
# quiet_residuals.iloc[20] = -3.0  # -$3 dip in the quiet period
# volatile_residuals.iloc[20] = -9.0 # -$9 dip in the volatile period

# # Combine to create the price series
# all_residuals = pd.Series(np.concatenate([quiet_residuals, volatile_residuals]))
# price = trend + all_residuals
# price_series = pd.Series(price, name='Adj_High')


# # --- THE CORRECT IMPLEMENTATION: ROLLING Z-SCORE ---

# # First, get the trend and raw residuals as before
# res = linregress(x=time_index, y=price_series)
# predicted_price = res.intercept + res.slope * time_index
# raw_residuals = price_series - predicted_price

# # Now, calculate the ROLLING mean and std dev over a 30-day window
# rolling_mean = raw_residuals.rolling(window=30, min_periods=10).mean()
# rolling_std = raw_residuals.rolling(window=30, min_periods=10).std()

# # Calculate the ROLLING Z-SCORE
# # Z-Score = (Current Residual - Rolling Mean of Residuals) / Rolling Std Dev of Residuals
# rolling_z_score = (raw_residuals - rolling_mean) / rolling_std

# # --- Plot the final, correct result ---
# plt.figure(figsize=(14, 6))
# plt.plot(time_index, rolling_z_score, label='Rolling Z-Score (30-day window)', color='purple')
# plt.axhline(0, color='black', linestyle='--')
# plt.axhline(2.0, color='red', linestyle=':', label='+2 Std Dev')
# plt.axhline(-2.0, color='green', linestyle=':', label='-2 Std Dev')
# plt.axvspan(0, 49, color='green', alpha=0.1)
# plt.axvspan(49, 99, color='red', alpha=0.1)

# # Highlight our specific events to confirm
# plt.scatter(20, rolling_z_score.iloc[20], color='blue', s=150, zorder=5, label=f"-$3 Dip Z-Score: {rolling_z_score.iloc[20]:.2f}")
# plt.scatter(70, rolling_z_score.iloc[70], color='magenta', s=150, zorder=5, label=f"-$9 Dip Z-Score: {rolling_z_score.iloc[70]:.2f}")

# plt.title('The Correct Solution: Rolling Z-Scores ARE Comparable Across Regimes')
# plt.ylabel('Standard Deviations from LOCAL Trend')
# plt.xlabel('Days')
# plt.legend()
# plt.grid(True)
# plt.ylim(-4, 4) # Set a consistent y-axis
# plt.show()

### Phase B: The Code Implementation (Revised for T+1 and Slippage)

#### Cell 1: Setup and Feature Engineering Functions

This cell is largely the same, but I've added `Adj Close` to our sample data, as we'll need it to generate the exit signals at the close of the day.

In [17]:
import pandas as pd
import numpy as np
from scipy.stats import linregress
from tqdm.notebook import tqdm # For progress bars!

# # --- Sample Data (Now with Adj Close) ---
# np.random.seed(42)
# dates = pd.to_datetime(pd.date_range(start='2023-01-01', periods=252)) # One year of data
# data = {
#     'ticker': np.repeat(['GOOD', 'BAD'], 252),
#     'date': np.tile(dates, 2),
# }
# # GOOD: A nice trending stock with dips
# good_trend = 100 + 0.1 * np.arange(252) + np.sin(np.arange(252) * 0.5) * 2
# good_vol = 500_000 * (1 + 0.001 * np.arange(252))
# # BAD: A choppy, downward trending stock
# bad_trend = 150 - 0.05 * np.arange(252) + np.random.randn(252) * 3
# bad_vol = 300_000 * (1 - 0.001 * np.arange(252))
# data.update({
#     'Adj Low': np.concatenate([good_trend, bad_trend]),
#     'Adj High': np.concatenate([good_trend + 2, bad_trend + 2]),
#     'Adj Close': np.concatenate([good_trend + 1.5, bad_trend + 1.5]), # Added Close
#     'Volume': np.concatenate([good_vol, bad_vol]).astype(int)
# })
# train = pd.DataFrame(data).set_index(['ticker', 'date'])

# --- FEATURE FUNCTION 1: TRENDS ---
# (Unchanged)
def analyze_ticker_trends(df_group, lookback_days=60):
    """
    Fit linear trends to the last `lookback_days` rows of one ticker.

    Note: this definition replaces the same-named, more detailed function defined
    earlier in the notebook. It regresses the raw (un-normalised) 'Adj Low' and
    'Volume' values against the positions 0..n-1.

    Args:
        df_group: rows of a single ticker with 'Adj Low' and 'Volume' columns.
        lookback_days: number of trailing rows to regress over.

    Returns:
        pd.Series with 'low_slope', 'low_r_squared' and 'volume_slope'. When the
        ticker has fewer than `lookback_days` rows (or fewer than the two points a
        regression needs) every entry is NaN. The fixed set of keys matters:
        groupby().apply() only assembles a DataFrame when every group returns the
        same index, so an empty Series for some tickers would change the shape of
        the combined result.
    """
    feature_names = ['low_slope', 'low_r_squared', 'volume_slope']

    window = df_group.tail(lookback_days)
    if len(window) < max(lookback_days, 2):
        return pd.Series(np.nan, index=feature_names)

    time_index = np.arange(len(window))
    low_res = linregress(time_index, window['Adj Low'])
    vol_res = linregress(time_index, window['Volume'])
    return pd.Series({
        'low_slope': low_res.slope,
        'low_r_squared': low_res.rvalue**2,
        'volume_slope': vol_res.slope
    })

# --- FEATURE FUNCTION 2: ROLLING Z-SCORE ---
# (Unchanged)
def calculate_rolling_z_scores(df_group, rolling_window=20):
    """
    Z-score of the latest 'Adj Low' relative to its trailing window.

    Args:
        df_group: rows of a single ticker with an 'Adj Low' column.
        rolling_window: number of trailing rows in the window.

    Returns:
        pd.Series with the single key 'low_rolling_z_score': (latest low - window
        mean) / window sample standard deviation. The value is NaN when the ticker
        has fewer than `rolling_window` rows; the key is always present so that
        groupby().apply() assembles a DataFrame for every mix of tickers.
    """
    if rolling_window < 1 or len(df_group) < rolling_window:
        return pd.Series({'low_rolling_z_score': np.nan})

    # Only the last value is used, and it depends solely on the final
    # `rolling_window` rows, so roll over that tail instead of the whole history.
    recent_lows = df_group['Adj Low'].tail(rolling_window)
    roller = recent_lows.rolling(rolling_window)
    z_score = (recent_lows - roller.mean()) / roller.std()
    return pd.Series({'low_rolling_z_score': z_score.iloc[-1]})

#### Cell 2: The Revised Backtesting Engine (The Core Changes are Here)

This function has been significantly updated to handle the T+1 logic and the buy-high/sell-low rule. I've added comments to highlight exactly where the changes are.

In [None]:
def run_backtest_revised(df_ohlcv, lookback_days, rolling_window, params):
    """
    Walk forward through the trading dates with T+1 execution and conservative fills.

    Signals are generated from data up to and including day T. Orders are filled on
    the next available date: buys at that day's 'Adj High', sells at its 'Adj Low'.

    Args:
        df_ohlcv: frame indexed by the levels 'Ticker' and 'Date'. This function
            reads 'Adj Close', 'Adj Low' and 'Adj High'; the feature functions it
            calls (analyze_ticker_trends, calculate_rolling_z_scores) read their
            own columns.
        lookback_days: passed to analyze_ticker_trends.
        rolling_window: passed to calculate_rolling_z_scores.
        params: dict with the keys 'profit_target', 'stop_loss', 'time_hold_days',
            'slope_thresh', 'r2_thresh' and 'z_entry_thresh'.

    Returns:
        pd.DataFrame with one row per CLOSED trade and the columns 'ticker',
        'entry_date', 'exit_date', 'return', 'reason'. The columns are present even
        when no trade was closed. Positions still open when the data ends are not
        reported.
    """
    trade_columns = ['ticker', 'entry_date', 'exit_date', 'return', 'reason']
    trades = []
    open_positions = {}

    # The date level does not change during the run, so look it up once.
    date_level = df_ohlcv.index.get_level_values('Date')

    # Get all unique dates to iterate through
    all_dates = date_level.unique().sort_values()

    # Skip the first dates so the feature functions have history to work with.
    start_index = max(lookback_days, rolling_window)

    # Stop one date early: a signal on the last date has no next day to execute on.
    for i in tqdm(range(start_index, len(all_dates) - 1), desc="Backtesting"):
        current_date = all_dates[i]       # Signal Generation Date (T)
        next_day_date = all_dates[i + 1]  # Trade Execution Date (T+1)

        # --- 1. EXIT LOGIC ---
        # Iterate over a copy of the keys because positions are deleted inside the loop.
        for ticker in list(open_positions.keys()):
            pos = open_positions[ticker]

            try:
                # Use CURRENT day's close to DECIDE whether to sell
                current_close_price = df_ohlcv.loc[(ticker, current_date), 'Adj Close']
            except KeyError:
                continue  # Skip if ticker has no data today

            # Check exit conditions based on today's close; the first match wins.
            exit_reason = None
            if current_close_price >= pos['entry_price'] * (1 + params['profit_target']):
                exit_reason = "Profit Target"
            elif current_close_price <= pos['entry_price'] * (1 - params['stop_loss']):
                exit_reason = "Stop-Loss"
            elif (current_date - pos['entry_date']).days >= params['time_hold_days']:
                # Holding period is measured in calendar days between the two dates.
                exit_reason = "Time Hold"

            if exit_reason is None:
                continue

            try:
                # Execute the sale on the NEXT DAY at the LOW
                exit_price = df_ohlcv.loc[(ticker, next_day_date), 'Adj Low']
            except KeyError:
                # No data on the next day: the exit cannot be executed, so the
                # position stays open and is re-evaluated on the following date.
                continue

            trade_return = (exit_price - pos['entry_price']) / pos['entry_price']
            trades.append({
                'ticker': ticker, 'entry_date': pos['entry_date'], 'exit_date': next_day_date,
                'return': trade_return, 'reason': exit_reason
            })
            del open_positions[ticker]

        # --- 2. ENTRY LOGIC ---
        # Data up to the current date is used for signal generation
        data_so_far = df_ohlcv[date_level <= current_date]

        trends = data_so_far.groupby(level='Ticker').apply(analyze_ticker_trends, lookback_days)
        z_scores = data_so_far.groupby(level='Ticker').apply(calculate_rolling_z_scores, rolling_window)
        # dropna() removes tickers for which any feature could not be computed.
        features = trends.join(z_scores).dropna()

        signals = features[
            (features['low_slope'] > params['slope_thresh']) &
            (features['low_r_squared'] > params['r2_thresh']) &
            (features['volume_slope'] > 0) &
            (features['low_rolling_z_score'] < params['z_entry_thresh'])
        ]

        for ticker in signals.index:
            if ticker in open_positions:
                continue  # at most one open position per ticker
            try:
                # Execute the buy on the NEXT DAY at the HIGH
                entry_price = df_ohlcv.loc[(ticker, next_day_date), 'Adj High']
            except KeyError:
                # If no data on the next day, we can't enter the trade.
                continue
            open_positions[ticker] = {
                'entry_date': next_day_date,  # Entry date is tomorrow
                'entry_price': entry_price
            }

    # Explicit columns keep the schema stable when no trade was closed.
    return pd.DataFrame(trades, columns=trade_columns)


#### Cell 3: The Optimization Loop

This cell remains the same, but it now calls our new, more robust `run_backtest_revised` function.

In [None]:
# --- STRATEGY & BACKTEST PARAMETERS ---
# Signal thresholds and exit rules handed unchanged to every backtest run.
strategy_params = dict(
    slope_thresh=0.05,
    r2_thresh=0.50,
    z_entry_thresh=-1.5,
    profit_target=0.10,  # 10%
    stop_loss=0.05,      # 5%
    time_hold_days=20,
)

# --- OPTIMIZATION PARAMETERS ---
lookback_days_options = [30, 60, 90]
rolling_window_options = [15, 20]

# Every (lookback, rolling window) pair, lookback varying slowest.
parameter_grid = [
    (lookback_choice, rolling_choice)
    for lookback_choice in lookback_days_options
    for rolling_choice in rolling_window_options
]

results_log = []

for lookback, rolling in parameter_grid:
    print(f"\n--- Running for lookback={lookback}, rolling_window={rolling} ---")

    # One full backtest on the training set for this parameter pair
    trade_results = run_backtest_revised(df_train, lookback, rolling, strategy_params)

    if trade_results.empty:
        # No closed trades: every summary figure is logged as 0
        win_rate, total_return, avg_return, num_trades = 0, 0, 0, 0
    else:
        trade_returns = trade_results['return']
        win_rate = (trade_returns > 0).mean()
        total_return = (1 + trade_returns).prod() - 1  # all trade returns compounded together
        avg_return = trade_returns.mean()
        num_trades = len(trade_results)

    results_log.append(dict(
        lookback=lookback, rolling=rolling, num_trades=num_trades,
        win_rate=win_rate, avg_return=avg_return, total_return=total_return,
    ))

# --- ANALYZE OPTIMIZATION RESULTS ---
# One row per parameter pair, best compounded return first.
optimization_results = pd.DataFrame(results_log)
print("\n\n--- Optimization Complete (Revised, Conservative Method) ---")
print(optimization_results.sort_values(by='total_return', ascending=False))


--- Running for lookback=30, rolling_window=15 ---


Backtesting:   0%|          | 0/146 [00:00<?, ?it/s]


--- Running for lookback=30, rolling_window=20 ---


Backtesting:   0%|          | 0/146 [00:00<?, ?it/s]

In [None]:
# Summarise the training split (index range, column dtypes, non-null counts, memory).
df_train.info()