### Setup and Configuration

This cell contains all imports and user-configurable parameters for the analysis pipeline.

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pprint
import matplotlib.pyplot as plt # Import for plotting
from IPython.display import display, Markdown
from scipy.stats import linregress 

# --- 1. PANDAS & IPYTHON OPTIONS ---
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)
%load_ext autoreload
%autoreload 2

# --- 2. PROJECT PATH CONFIGURATION ---
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent.parent  # Adjust if your notebook is in a 'notebooks' subdirectory
DATA_DIR = ROOT_DIR / 'data'
SRC_DIR = ROOT_DIR / 'src'

# Add 'src' to the Python path to import custom modules
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# --- 3. IMPORT CUSTOM MODULES ---
import utils
import plotting_utils

# --- 4. ANALYSIS & FILTERING CONFIGURATION ---

# File searching parameters
# FILE_PREFIX = ''  # e.g., '2024'
FILE_CONTAINS_PATTERN = 'df_OHLCV_clean_stocks_etfs'

# # Parameters defining the time windows for metric calculation
PERIOD_PARAMS = {
    'lookback_days': 22,
    'recent_days': 0,
}

# This is not use for filtering, it's use to calculate metrics in SORT_ORDER
# Parameters for filtering the calculated metrics to find candidates
METRIC_FILTERS = {
    'min_lookback_improvement': 0,
    'current_rank_bracket_start': 1,
    'current_rank_bracket_end': 1000,
    # --- Select ONE mode by commenting out the others ---
    # 'Reversal' Mode
    'min_recent_bottom_to_recent_start': 0,
    'min_recent_bottom_to_current': 0,    
    # 'Dip' Mode
    # 'min_current_to_recent_start': 10,
}

# --- 5. VERIFICATION ---
print("--- Path Configuration ---")
print(f"✅ Project Root: {ROOT_DIR}")
print(f"✅ Data Dir:     {DATA_DIR}")
print(f"✅ Source Dir:   {SRC_DIR}")
assert all([ROOT_DIR.exists(), DATA_DIR.exists(), SRC_DIR.exists()]), "A key directory was not found!"

print("\n--- Module Verification ---")
print(f"✅ Successfully imported 'utils' and 'plotting_utils'.")



In [None]:
print("--- Step 1: Loading latest consolidated Finviz data ---")

# Search criteria for the Finviz snapshot. They are named once so that the lookup
# and its error message always describe the same search. (The previous message
# used FILE_PREFIX, which is commented out in the setup cell and therefore raised
# a NameError, and FILE_CONTAINS_PATTERN, which is the OHLCV pattern.)
FINVIZ_FILE_PREFIX = '202'
FINVIZ_FILE_CONTAINS_PATTERN = 'df_finviz_merged_stocks_etfs'

# Find the most recent file matching the pattern.
# Presumably returns a list of filename strings (not Path objects) — the path is
# therefore joined with DATA_DIR below; verify against utils.get_recent_files.
latest_finviz_filepaths = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    prefix=FINVIZ_FILE_PREFIX,
    contains_pattern=FINVIZ_FILE_CONTAINS_PATTERN,
    count=1
)

if not latest_finviz_filepaths:
    raise FileNotFoundError(
        f"No files found in '{DATA_DIR}' with prefix '{FINVIZ_FILE_PREFIX}' "
        f"and pattern '{FINVIZ_FILE_CONTAINS_PATTERN}'"
    )

# Get the filename string from the list
latest_filename = latest_finviz_filepaths[0]

# Manually construct the full path before loading
full_file_path = DATA_DIR / latest_filename
df_finviz_latest = pd.read_parquet(full_file_path, engine='pyarrow')


# --- Robust Index Setting ---
# Normalise the frame so that the index is always named 'Ticker', whichever of the
# supported layouts the file was saved in.
if df_finviz_latest.index.name == 'Ticker':
    print("Info: 'Ticker' is already the index. No action needed.")
elif 'Ticker' in df_finviz_latest.columns:
    print("Info: 'Ticker' column found. Setting it as the DataFrame index.")
    df_finviz_latest = df_finviz_latest.set_index('Ticker')
elif 'ticker' in df_finviz_latest.columns:
    print("Info: 'ticker' column found. Renaming and setting as index.")
    df_finviz_latest = df_finviz_latest.rename(columns={'ticker': 'Ticker'}).set_index('Ticker')
elif df_finviz_latest.index.name is None:
    print("Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.")
    df_finviz_latest.index.name = 'Ticker'
else:
    print("ERROR: Loaded DataFrame has an unexpected format.")
    print(f"Columns: {df_finviz_latest.columns.tolist()}")
    print(f"Index Name: '{df_finviz_latest.index.name}'")
    raise ValueError("Could not find a 'Ticker' column or a usable index to proceed.")


print(f"✅ Successfully loaded: {latest_filename}")
print(f"Shape: {df_finviz_latest.shape}")
print(df_finviz_latest.head(3))

In [17]:
import pandas as pd

# Widen the pandas display limits for this session.
for _option_name, _option_value in (
    ('display.max_rows', 200),
    ('display.max_columns', None),
    ('display.width', 3000),
):
    pd.set_option(_option_name, _option_value)

# NOTE(review): machine-specific absolute path. The notebook will only run on a
# machine where this exact file exists; consider deriving it from a configurable
# data directory instead.
full_file_path = r'c:\Users\ping\Files_win10\python\py311\stocks\data\df_OHLCV_clean_stocks_etfs.parquet'
df_OHLCV = pd.read_parquet(full_file_path, engine='pyarrow')

### Cell 2: The Chronological Split Code

This cell contains the logic to find the split date and create the `df_train` and `df_test` DataFrames.

In [2]:
# --- 1. Find the Chronological Split Point ---

# One value per row: the 'Date' level of the (Ticker, Date) index.
ohlcv_row_dates = df_OHLCV.index.get_level_values('Date')

# Sorted distinct trading dates, and the position 70% of the way through them.
unique_dates = ohlcv_row_dates.unique().sort_values()
split_index = int(len(unique_dates) * 0.7)

# The date sitting at that position is the last date of the training period.
split_date = unique_dates[split_index]

print(f"Total unique trading dates in dataset: {len(unique_dates)}")
print(f"The data will be split on the date: {split_date.date()}")

# --- 2. Create the Training and Testing Sets ---

# Training rows: dated on or before split_date. Testing rows: strictly after it.
df_train = df_OHLCV[ohlcv_row_dates <= split_date]
df_test = df_OHLCV[ohlcv_row_dates > split_date]


# --- 3. Verify the Split ---

train_row_dates = df_train.index.get_level_values('Date')
test_row_dates = df_test.index.get_level_values('Date')

print("\n--- Verification ---")
print(f"Original DataFrame shape: {df_OHLCV.shape}")
print(f"Training set shape:   {df_train.shape}")
print(f"Testing set shape:    {df_test.shape}")

print("\nDate Ranges:")
print(f"  Training: {train_row_dates.min().date()} to {train_row_dates.max().date()}")
print(f"  Testing:  {test_row_dates.min().date()} to {test_row_dates.max().date()}")

# The last training date must precede the first testing date.
assert train_row_dates.max() < test_row_dates.min()
print("\nVerification successful: There is no date overlap between train and test sets.")

Total unique trading dates in dataset: 250
The data will be split on the date: 2025-05-28

--- Verification ---
Original DataFrame shape: (371000, 5)
Training set shape:   (261184, 5)
Testing set shape:    (109816, 5)

Date Ranges:
  Training: 2024-09-13 to 2025-05-28
  Testing:  2025-05-29 to 2025-09-12

Verification successful: There is no date overlap between train and test sets.


### Jupyter Notebook: Verifying `analyze_ticker_trends_vectorized`


#### Cell 1: Setup and Imports

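First, let's import the necessary libraries and define the log-vectorized function we want to verify. The original (normalized) function, `analyze_ticker_trends_original`, which Cell 3 compares against, is not defined in this cell and must be defined before Cell 3 is run.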

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import linregress

# --- Function 1: The NEW, Correct, Log-Vectorized Version (The one we are verifying) ---

def analyze_ticker_trends_log_vectorized(df_group, lookback_days=60):
    """
    Rolling trend statistics for one ticker, computed on LOG-transformed series.

    For each of log(Adj High), log(Adj Low) and log(Volume + 1) the function
    fits, over every trailing window of `lookback_days` rows, a straight line
    against the row position and reports:

    * ``<name>_slope``      - rolling covariance with time divided by the
      (population) variance of the time axis. On a log series this is the
      average per-row log change, which is comparable across tickers.
    * ``<name>_r_squared``  - cov^2 / (var_time * var_series + 1e-9); the small
      constant keeps the ratio finite for a flat series.

    It also reports the rolling population standard deviation of
    (Adj High - previous Adj Low) / previous Adj Low
    (``unified_std_dev_returns``) and of the row-to-row percentage change in
    Volume (``volume_std_dev_returns``).

    Args:
        df_group: frame for a single ticker with 'Adj High', 'Adj Low' and
            'Volume' columns.
        lookback_days: rolling window length in rows.

    Returns:
        A DataFrame indexed like `df_group`, or None when `df_group` has fewer
        than `lookback_days` rows. The slope / r_squared columns are NaN for
        the first ``lookback_days - 1`` rows; the two std-dev columns are NaN
        for the first ``lookback_days`` rows, because the first return is
        itself undefined. Penalty scores are not computed here.
    """
    n_rows = len(df_group)
    if n_rows < lookback_days:
        return None

    # Row position acts as the "time" regressor; its window variance is constant.
    positions = pd.Series(np.arange(n_rows), index=df_group.index)
    window_time_variance = np.var(np.arange(lookback_days), ddof=0)

    # Natural logs of the inputs; +1 on volume keeps log defined at zero volume.
    log_inputs = (
        ('high', np.log(df_group['Adj High'])),
        ('low', np.log(df_group['Adj Low'])),
        ('volume', np.log(df_group['Volume'].astype(float) + 1)),
    )

    trend_metrics = pd.DataFrame(index=df_group.index)
    for label, logged in log_inputs:
        # Population (ddof=0) covariance and variance, to match var_time above.
        cov_with_time = positions.rolling(window=lookback_days).cov(logged, ddof=0)
        logged_variance = logged.rolling(window=lookback_days).var(ddof=0)

        trend_metrics[f'{label}_slope'] = cov_with_time / window_time_variance
        trend_metrics[f'{label}_r_squared'] = (cov_with_time**2) / (
            (window_time_variance * logged_variance) + 1e-9
        )

    # Return measured from the previous row's low to the current row's high.
    prior_low = df_group['Adj Low'].shift(1)
    low_to_high_returns = (df_group['Adj High'] - prior_low) / prior_low
    trend_metrics['unified_std_dev_returns'] = (
        low_to_high_returns.rolling(window=lookback_days).std(ddof=0)
    )

    trend_metrics['volume_std_dev_returns'] = (
        df_group['Volume'].pct_change().rolling(window=lookback_days).std(ddof=0)
    )

    return trend_metrics

print("Verification functions are defined.")

Verification functions are defined.


#### Cell 2: Create Realistic Sample Data

We need some sample data that mimics your real dataset to perform the check.

In [None]:
# Fixed seed so the synthetic sample is identical on every run; without it the
# noise terms below change each time the cell is executed.
SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SAMPLE_SEED)

# Create a date range (200 business days)
dates = pd.date_range(start='2023-01-01', periods=200, freq='B')

# Create data for Ticker 'A' (upward trend)
price_a = 100 + np.linspace(0, 50, 200) + sample_rng.standard_normal(200) * 2
volume_a = 100000 + np.sin(np.arange(200)/10) * 20000 + sample_rng.integers(-5000, 5000, 200)
df_a = pd.DataFrame({
    'Ticker': 'A',
    'Date': dates,
    'Adj High': price_a + 1,
    'Adj Low': price_a - 1,
    'Volume': volume_a
})

# Create data for Ticker 'B' (downward trend)
price_b = 200 - np.linspace(0, 30, 200) + sample_rng.standard_normal(200) * 3
volume_b = 500000 + np.cos(np.arange(200)/5) * 50000 + sample_rng.integers(-10000, 10000, 200)
df_b = pd.DataFrame({
    'Ticker': 'B',
    'Date': dates,
    'Adj High': price_b + 1.5,
    'Adj Low': price_b - 1.5,
    'Volume': volume_b
})

# Combine and set the (Ticker, Date) multi-index used by the rest of the notebook
sample_df = pd.concat([df_a, df_b]).set_index(['Ticker', 'Date'])

print("Sample DataFrame created with shape:", sample_df.shape)
sample_df.head()

In [None]:
sample_df

In [4]:
MSFT = df_train.loc['MSFT'].copy()
print(f'MSFT.head(3):\n{MSFT.head(3)}')
print(f'\nMSFT.tail(3):\n{MSFT.tail(3)}')
print(f'\nlen(MSFT): {len(MSFT)}')

MSFT.head(3):
            Adj Open  Adj High  Adj Low  Adj Close    Volume
Date                                                        
2024-09-13   422.657   428.612  422.290    427.381  15993778
2024-09-16   427.391   430.300  425.029    428.126  13938564
2024-09-17   436.950   438.558  429.049    431.907  19015898

MSFT.tail(3):
            Adj Open  Adj High  Adj Low  Adj Close    Volume
Date                                                        
2025-05-23   449.242   452.945  448.173    449.441  16911254
2025-05-27   455.731   460.193  455.371    459.934  21008780
2025-05-28   460.463   461.761  456.180    456.609  17114387

len(MSFT): 176


#### Cell 3: The New Verification

Now, we will compare the log-vectorized results against the original normalized results.


In [5]:
MSFT.to_csv(r'C:\Users\ping\Desktop\MSFT.csv', index=True)

In [None]:
# --- Parameters for our Test ---
TICKER_TO_CHECK = 'MSFT'
DATE_TO_CHECK = pd.to_datetime('2024-09-26')
LOOKBACK_DAYS = 10

print(f"--- Verifying LOG-TRANSFORMED calculations for '{TICKER_TO_CHECK}' on {DATE_TO_CHECK.date()} ---\n")

# --- 1. Run the FAST Log-Vectorized function ---
ticker_history = df_train.loc[TICKER_TO_CHECK]
vectorized_results_full = analyze_ticker_trends_log_vectorized(ticker_history, LOOKBACK_DAYS)
if vectorized_results_full is None:
    # The function returns None when the history is shorter than the window.
    raise ValueError(
        f"'{TICKER_TO_CHECK}' has fewer than {LOOKBACK_DAYS} rows; nothing to verify."
    )
vectorized_result_today = vectorized_results_full.loc[DATE_TO_CHECK]

# --- 2. Run the SLOW Original (Normalized) function ---
# NOTE(review): `analyze_ticker_trends_original` is not defined in the cells shown
# above; it must be defined (or imported) before this cell is run.
historical_slice = ticker_history.loc[:DATE_TO_CHECK]
original_result_today = analyze_ticker_trends_original(historical_slice, LOOKBACK_DAYS)

# --- 3. Compare the results ---
# Only metrics present (non-NaN) in BOTH results survive the dropna().
comparison_df = pd.DataFrame({
    'Log-Vectorized': vectorized_result_today,
    'Original (Normalized)': original_result_today
}).dropna()

print("### Important Note ###")
print("The slope of a log(price) series is mathematically very similar to the slope of a normalized (price/price_0) series.")
print("Therefore, both the SLOPES and R-SQUARED values should now be very close.\n")

print("### Side-by-Side Comparison ###")
print(comparison_df)

# --- 4. Programmatic Check for ALL values ---
try:
    # We now test BOTH slope and r_squared
    pd.testing.assert_series_equal(
        comparison_df['Log-Vectorized'],
        comparison_df['Original (Normalized)'],
        # The two columns deliberately carry different names. With the default
        # check_names=True the comparison fails on the name alone, regardless of
        # how close the values are.
        check_names=False,
        atol=0.05 # Use a slightly larger tolerance for slope approximation
    )
    print("\n[SUCCESS]: Log-vectorized results closely match the original normalized results!")
except AssertionError as e:
    print("\n[FAILURE]: Results do not match.")
    print(e)

In [None]:
vectorized_results_full.head(20)

In [None]:
vectorized_results_full.tail()

### Refactored Code

Here is the complete, refactored solution. I've included the previously refactored functions with slight modifications to accept the new configuration structure.

In [6]:
import numpy as np
import pandas as pd

def calculate_rolling_z_scores_general(df_group, columns_to_process, rolling_window=20):
    """
    Calculates rolling Z-scores for a list of specified columns.

    For every requested column the Z-score of a row is
    ``(value - rolling_mean) / rolling_std`` over the trailing
    `rolling_window` rows (pandas' default sample standard deviation).

    Args:
        df_group (pd.DataFrame): The DataFrame for a single ticker.
        columns_to_process (list): Column names to calculate Z-scores for
            (any list-like of labels, e.g. ``df.columns``).
        rolling_window (int): The lookback window, in rows.

    Returns:
        pd.DataFrame: Z-score columns prefixed with 'z_score_'
        (e.g. 'Adj Low' -> 'z_score_Adj Low'), indexed like `df_group`.
        The first ``rolling_window - 1`` rows are NaN. Results of +/-inf are
        replaced by 0; a 0/0 result (a window whose values are all equal to
        their mean) stays NaN.
        When `df_group` is empty or shorter than `rolling_window`, an EMPTY
        float-typed frame with the expected columns and the same index
        type/names as `df_group` is returned, so that concatenating it with
        other groups' results cannot change their dtypes.
    """
    z_score_names = [f"z_score_{col}" for col in columns_to_process]

    if df_group.empty or len(df_group) < rolling_window:
        # Zero-row frame that still matches the schema of a computed result.
        return pd.DataFrame(columns=z_score_names, index=df_group.index[:0], dtype=float)

    # Select the subset of data to work on
    data_subset = df_group[columns_to_process]

    # Rolling stats for all columns at once; incomplete leading windows give NaN.
    rolling_mean = data_subset.rolling(window=rolling_window).mean()
    rolling_std = data_subset.rolling(window=rolling_window).std()

    # Z-scores for all columns in one vectorized operation.
    z_scores_df = (data_subset - rolling_mean) / rolling_std

    # A non-zero numerator over a zero std gives +/-inf; map those to 0.
    z_scores_df = z_scores_df.replace([np.inf, -np.inf], 0)

    return z_scores_df.add_prefix('z_score_')

In [27]:
z_scores = df_train.groupby(level='Ticker', group_keys=False).apply(
    calculate_rolling_z_scores_general, df_train.columns, rolling_window=10
)

In [28]:
z_MSFT_w10 = z_scores.loc['MSFT']
z_MSFT_w10.to_csv(r'C:\Users\ping\Desktop\z_MSFT_w10.csv', index=True)

#### Step 1: Refined Core Backtesting Functions

We'll modify the function signatures to accept a single `config` dictionary. This makes them more modular.

In [12]:
import pandas as pd
from tqdm import tqdm
from itertools import product

def precompute_signals(df_ohlcv, config):
    """
    Build per-ticker features and select the rows that satisfy the entry rules.

    Trend statistics (`analyze_ticker_trends_log_vectorized`) and rolling
    Z-scores (`calculate_rolling_z_scores_general`, for all five OHLCV
    columns) are computed per 'Ticker' group, joined, and stripped of rows
    containing NaN. A row is an entry signal when its low-price slope and
    R-squared exceed their thresholds, its volume slope is positive and its
    'z_score_Adj Low' is below the entry threshold.

    Args:
        df_ohlcv: OHLCV frame whose index has a 'Ticker' level.
        config: mapping providing 'lookback_days', 'rolling_window',
            'slope_thresh', 'r2_thresh' and 'z_entry_thresh'.

    Returns:
        tuple: ``(signals, features)`` - the filtered signal rows and the
        full NaN-free feature table they were selected from.
    """
    print("Pre-computing features for this parameter set...")

    def _by_ticker():
        # A fresh per-ticker grouping for each feature family.
        return df_ohlcv.groupby(level='Ticker', group_keys=False)

    trend_features = _by_ticker().apply(
        analyze_ticker_trends_log_vectorized, config['lookback_days']
    )

    z_features = _by_ticker().apply(
        calculate_rolling_z_scores_general,
        columns_to_process=['Adj Open', 'Adj High', 'Adj Low', 'Adj Close', 'Volume'],
        rolling_window=config['rolling_window'],
    )

    features = trend_features.join(z_features).dropna()

    # Z-score columns are named 'z_score_<column>', e.g. 'z_score_Adj Low'.
    entry_mask = (
        (features['low_slope'] > config['slope_thresh'])
        & (features['low_r_squared'] > config['r2_thresh'])
        & (features['volume_slope'] > 0)
        & (features['z_score_Adj Low'] < config['z_entry_thresh'])
    )

    return features[entry_mask], features


def run_backtest(df_ohlcv, config):
    """
    Orchestrates the backtesting process with enhanced logging.

    Walks the sorted unique dates of `df_ohlcv` one day at a time (starting
    once both the trend lookback and the Z-score window are filled, and
    stopping one day before the end because each step needs a next day).
    On each day it first processes exits via `handle_exits_for_day`, then
    passes that day's entry signals to `handle_entries_for_day`.

    Args:
        df_ohlcv: OHLCV frame indexed by ('Ticker', 'Date').
        config: mapping with at least 'lookback_days' and 'rolling_window',
            plus whatever `precompute_signals` and the exit handler read.

    Returns:
        pd.DataFrame: one row per closed trade, restricted to and ordered by
        the log columns listed below (columns the handlers did not supply
        are filled with None). If no trade closed, an empty frame with
        those same columns is returned.
    """
    log_columns = [
        'ticker', 'signal_date', 'entry_date', 'exit_signal_date', 'exit_date', 'reason',
        'return', 'entry_price_actual', 'exit_price_actual', 'exit_trigger_price',
        'exit_target_value', 'entry_signal_features'
    ]

    # `precompute_signals` exists in this notebook both as a signals-only function
    # and as one returning (signals, features); accept either so the backtest does
    # not fail with AttributeError on a tuple.
    precomputed = precompute_signals(df_ohlcv, config)
    entry_signals_features = precomputed[0] if isinstance(precomputed, tuple) else precomputed

    # Loop-invariant: the 'Date' level of the signal index.
    signal_dates = entry_signals_features.index.get_level_values('Date')

    trades = []
    open_positions = {}

    all_dates = df_ohlcv.index.get_level_values('Date').unique().sort_values()
    start_index = max(config['lookback_days'], config['rolling_window'])

    for i in tqdm(range(start_index, len(all_dates) - 1), desc="Backtesting"):
        current_date = all_dates[i]
        next_day_date = all_dates[i+1]

        closed_trades, open_positions = handle_exits_for_day(
            current_date, next_day_date, open_positions, df_ohlcv, config
        )
        trades.extend(closed_trades)

        # Feature rows of the signals triggered on the current date.
        signals_today = entry_signals_features[signal_dates == current_date]

        # Pass the full signals_today DataFrame to the handler
        open_positions = handle_entries_for_day(
            current_date, next_day_date, signals_today, open_positions, df_ohlcv
        )

    # --- Create the final DataFrame and reorder columns for clarity ---
    if not trades:
        # Keep the schema stable even when nothing was traded.
        return pd.DataFrame(columns=log_columns)

    final_trades_df = pd.DataFrame(trades)
    # Ensure all columns exist, fill missing with None
    for col in log_columns:
        if col not in final_trades_df.columns:
            final_trades_df[col] = None

    return final_trades_df[log_columns]


In [9]:
# 1. Define the full configuration
config = {
    'lookback_days': 30,
    'rolling_window': 10,  # Set the value you want to test
    'slope_thresh': 0.05,
    'r2_thresh': 0.50,
    'z_entry_thresh': -1.5,
    # ... other params if needed
}

# 2. Call the function correctly
signals_df, features_df = precompute_signals(df_train, config) 

Pre-computing features for this parameter set...


### Jupyter Notebook: Testing `precompute_signals`

#### Cell 1: Setup and Test Functions

First, we need the function definitions and a small, predictable sample dataset. Testing on the full `df_train` is too difficult to verify manually.


In [None]:
import pandas as pd
import numpy as np

# --- Paste your verified helper functions here ---
# analyze_ticker_trends_log_vectorized(...)
# calculate_rolling_z_scores_general(...)

# --- The function we are testing ---
def precompute_signals(df_ohlcv, config):
    """
    Compute entry signals from per-ticker trend features and Z-scores.

    Trend statistics and rolling Z-scores of 'Adj Low' and 'Volume' are
    computed per 'Ticker' group, joined, and stripped of rows containing
    NaN. The rows kept as signals have a low-price slope and R-squared above
    their thresholds, a positive volume slope, and a 'z_score_Adj Low' below
    the entry threshold.

    NOTE(review): an earlier cell of this notebook defines a function with
    the same name that returns ``(signals, features)``; whichever cell ran
    last determines the return shape callers see.

    Args:
        df_ohlcv: OHLCV frame whose index has a 'Ticker' level.
        config: mapping providing 'lookback_days', 'rolling_window',
            'slope_thresh', 'r2_thresh' and 'z_entry_thresh'.

    Returns:
        pd.DataFrame: the feature rows that pass every entry condition.
    """
    print("Pre-computing features for this parameter set...")

    def _by_ticker():
        # A fresh per-ticker grouping for each feature family.
        return df_ohlcv.groupby(level='Ticker', group_keys=False)

    trend_features = _by_ticker().apply(
        analyze_ticker_trends_log_vectorized, config['lookback_days']
    )

    z_features = _by_ticker().apply(
        calculate_rolling_z_scores_general,
        columns_to_process=['Adj Low', 'Volume'],
        rolling_window=config['rolling_window'],
    )

    features = trend_features.join(z_features).dropna()

    entry_mask = (
        (features['low_slope'] > config['slope_thresh'])
        & (features['low_r_squared'] > config['r2_thresh'])
        & (features['volume_slope'] > 0)
        & (features['z_score_Adj Low'] < config['z_entry_thresh'])
    )
    return features[entry_mask]

# --- A special version for "white-box" testing that returns the intermediate features ---
def precompute_signals_for_testing(df_ohlcv, config):
    """
    White-box variant of `precompute_signals` that also exposes intermediates.

    The entry filter is the same four-condition mask used by
    `precompute_signals`. The previous mask additionally demanded
    ``z_score_Adj Low > 0`` together with
    ``z_score_Adj Low < config['z_entry_thresh']`` - unsatisfiable for any
    non-positive threshold, so `signals` was always empty - and had replaced
    the slope threshold with ``high_slope > low_slope``.

    Unlike `precompute_signals`, Z-scores are computed for EVERY column of
    `df_ohlcv`, so the feature table can be inspected in full.

    Args:
        df_ohlcv: OHLCV frame whose index has a 'Ticker' level.
        config: mapping providing 'lookback_days', 'rolling_window',
            'slope_thresh', 'r2_thresh' and 'z_entry_thresh'.

    Returns:
        tuple: ``(features, signals, trends)`` - the NaN-free joined feature
        table, the rows of it that pass the entry filter, and the raw trend
        statistics before joining / NaN removal.
    """
    trends = df_ohlcv.groupby(level='Ticker', group_keys=False).apply(
        analyze_ticker_trends_log_vectorized, config['lookback_days']
    )

    # Z-score every available column (not only 'Adj Low' and 'Volume').
    z_score_columns = df_ohlcv.columns

    z_scores = df_ohlcv.groupby(level='Ticker', group_keys=False).apply(
        calculate_rolling_z_scores_general,
        columns_to_process=z_score_columns,
        rolling_window=config['rolling_window']
    )
    features = trends.join(z_scores).dropna()

    # --- final signal mask (kept identical to precompute_signals) ---
    signals = features[
        (features['low_slope'] > config['slope_thresh'])
        & (features['low_r_squared'] > config['r2_thresh'])
        & (features['volume_slope'] > 0)
        & (features['z_score_Adj Low'] < config['z_entry_thresh'])
    ]

    return features, signals, trends

print("Setup complete.")

Setup complete.


#### Cell 2: Create a Controlled Sample Dataset

We'll create a small DataFrame with one ticker that has a predictable trend.

In [None]:
# Create a date range (50 business days)
dates = pd.date_range(start='2023-01-01', periods=50, freq='B')

# Create data for Ticker 'TEST' with a clear upward trend and a sine-wave wobble
price = 100 + np.linspace(0, 20, 50) + np.sin(np.arange(50)/2) * 2
volume = 100000 + np.linspace(0, 50000, 50) # Strong positive volume slope
# Named `df_trend_ticker` (not `df_test`) so that the chronological test split
# `df_test` created earlier in the notebook is not overwritten by this sample.
df_trend_ticker = pd.DataFrame({
    'Ticker': 'TEST', 'Date': dates,
    'Adj High': price + 1, 'Adj Low': price - 1, 'Adj Close': price,
    'Volume': volume
})

# Add a ticker with insufficient data to test edge cases
df_short = pd.DataFrame({
    'Ticker': 'SHORT', 'Date': dates[:5],
    'Adj High': 10, 'Adj Low': 9, 'Adj Close': 9.5, 'Volume': 1000
})


# Combine and set index
sample_df = pd.concat([df_trend_ticker, df_short]).set_index(['Ticker', 'Date'])
print("Sample DataFrame created.")
sample_df.head()

Sample DataFrame created.


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj High,Adj Low,Adj Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TEST,2023-01-02,101.0,99.0,100.0,100000.0
TEST,2023-01-03,102.367014,100.367014,101.367014,101020.408163
TEST,2023-01-04,103.499269,101.499269,102.499269,102040.816327
TEST,2023-01-05,104.21948,102.21948,103.21948,103061.22449
TEST,2023-01-06,104.451248,102.451248,103.451248,104081.632653


In [31]:
sample_df.to_csv(r'C:\Users\ping\Desktop\sample_df.csv', index=True)

In [18]:
print(f'sample_df:\n{sample_df}')

sample_df:
                     Adj High     Adj Low   Adj Close         Volume
Ticker Date                                                         
TEST   2023-01-02  101.000000   99.000000  100.000000  100000.000000
       2023-01-03  102.367014  100.367014  101.367014  101020.408163
       2023-01-04  103.499269  101.499269  102.499269  102040.816327
       2023-01-05  104.219480  102.219480  103.219480  103061.224490
       2023-01-06  104.451248  102.451248  103.451248  104081.632653
       2023-01-09  104.237761  102.237761  103.237761  105102.040816
       2023-01-10  103.731220  101.731220  102.731220  106122.448980
       2023-01-11  103.155576  101.155576  102.155576  107142.857143
       2023-01-12  102.751701  100.751701  101.751701  108163.265306
       2023-01-13  102.718409  100.718409  101.718409  109183.673469
       2023-01-16  103.163784  101.163784  102.163784  110204.081633
       2023-01-17  104.078715  102.078715  103.078715  111224.489796
       2023-01-18  105.

#### Step 1: "White-Box" Test - Inspect the Intermediate `features`

This is the most important step. We use our special testing function to look "inside" and see if the features are being generated correctly before the filtering logic is applied.

In [101]:
# --- Test Configuration ---
test_config = {
    'lookback_days': 10,
    'rolling_window': 5,
    'slope_thresh': 0, # Set low to ensure we get some signals
    'r2_thresh': 0.3,
    'z_entry_thresh': -0.5,
    'volume_thresh': 0,
}

# --- Run the special testing function ---
all_features, signals_from_test_func, trends = precompute_signals_for_testing(sample_df, test_config)


print("--- Test 1: Inspecting the full 'features' DataFrame ---")
print(f"Shape of the features DataFrame: {all_features.shape}")
# The return-volatility columns need one row more than the slope columns (the
# first return is undefined), so dropna() removes the first `lookback_days` rows,
# not `lookback_days - 1`.
print("Note: The first lookback_days rows should be missing due to NaNs.")
print("The ticker 'SHORT' should not appear at all.\n")

# Display the head and tail to check values
print("Head of features:")
print(all_features.head())
print("\nTail of features:")
print(all_features.tail())

# --- Verification Checks for 'features' ---
assert 'SHORT' not in all_features.index.get_level_values('Ticker'), "FAIL: Ticker with insufficient data was not filtered out."
print("\n[SUCCESS]: Ticker with insufficient data was correctly ignored.")

assert not all_features.isnull().values.any(), "FAIL: The features DataFrame contains unexpected NaNs after dropna()."
print("[SUCCESS]: Features DataFrame contains no NaNs.")

print("\n--- Sanity Check Passed for Feature Generation ---")

--- Test 1: Inspecting the full 'features' DataFrame ---
Shape of the features DataFrame: (40, 12)
Note: The first (lookback_days - 1) rows should be missing due to NaNs.
The ticker 'SHORT' should not appear at all.

Head of features:
                   high_slope  high_r_squared  low_slope  low_r_squared  volume_slope  volume_r_squared  unified_std_dev_returns  volume_std_dev_returns  z_score_Adj High  z_score_Adj Low  z_score_Adj Close  z_score_Volume
Ticker Date                                                                                                                                                                                                                   
TEST   2023-01-16   -0.000585        0.064806  -0.000596       0.064794      0.009666          0.999850                 0.006645                0.000274          0.145474         0.145474           0.145474        1.264911
       2023-01-17   -0.000929        0.208933  -0.000947       0.208935      0.009574          0

In [103]:
trends.to_csv(r'C:\Users\ping\Desktop\trends.csv', index=True)
all_features.to_csv(r'C:\Users\ping\Desktop\all_features.csv', index=True)
signals_from_test_func.to_csv(r'C:\Users\ping\Desktop\signals_from_test_func.csv', index=True)

In [None]:
# Scratch cell: a copy of the entry-signal filter followed by an alternative
# test configuration.
# NOTE(review): in the cells shown in this notebook, `features` is only ever
# assigned inside function bodies, so this expression would raise NameError at
# notebook scope unless `features` is defined somewhere not shown — confirm.
# NOTE(review): the thresholds are read from `config`, not from the
# `test_config` defined just below — confirm which one was intended.
signals = features[
    (features['low_slope'] > config['slope_thresh']) &
    (features['low_r_squared'] > config['r2_thresh']) &
    (features['volume_slope'] > 0) &
    (features['z_score_Adj Low'] < config['z_entry_thresh'])
]

# Same keys as the earlier test configuration, but with 'slope_thresh' at 0.005.
test_config = {
    'lookback_days': 10,
    'rolling_window': 5,
    'slope_thresh': 0.005, # Set low to ensure we get some signals
    'r2_thresh': 0.3,
    'z_entry_thresh': -0.5,
    'volume_thresh': 0,
}

In [94]:
print(f'trends:\n{trends}')

trends:
                   high_slope  high_r_squared  low_slope  low_r_squared  volume_slope  volume_r_squared  unified_std_dev_returns  volume_std_dev_returns
Ticker Date                                                                                                                                             
TEST   2023-01-02         NaN             NaN        NaN            NaN           NaN               NaN                      NaN                     NaN
       2023-01-03         NaN             NaN        NaN            NaN           NaN               NaN                      NaN                     NaN
       2023-01-04         NaN             NaN        NaN            NaN           NaN               NaN                      NaN                     NaN
       2023-01-05         NaN             NaN        NaN            NaN           NaN               NaN                      NaN                     NaN
       2023-01-06         NaN             NaN        NaN            NaN   

In [27]:
print(f'sample_df:\n{sample_df}')

sample_df:
                     Adj High     Adj Low   Adj Close         Volume
Ticker Date                                                         
TEST   2023-01-02  101.000000   99.000000  100.000000  100000.000000
       2023-01-03  102.367014  100.367014  101.367014  101020.408163
       2023-01-04  103.499269  101.499269  102.499269  102040.816327
       2023-01-05  104.219480  102.219480  103.219480  103061.224490
       2023-01-06  104.451248  102.451248  103.451248  104081.632653
       2023-01-09  104.237761  102.237761  103.237761  105102.040816
       2023-01-10  103.731220  101.731220  102.731220  106122.448980
       2023-01-11  103.155576  101.155576  102.155576  107142.857143
       2023-01-12  102.751701  100.751701  101.751701  108163.265306
       2023-01-13  102.718409  100.718409  101.718409  109183.673469
       2023-01-16  103.163784  101.163784  102.163784  110204.081633
       2023-01-17  104.078715  102.078715  103.078715  111224.489796
       2023-01-18  105.

#### Step 2: "Black-Box" Test - Verify the Final `signals` DataFrame

Now we test the real function. Our goal is to prove that **every single row** in the final `signals_df` meets the filtering criteria defined in our `test_config`.

In [26]:
# --- Run the REAL function to get the final output ---
signals_df = precompute_signals(sample_df, test_config)

print("\n--- Test 2: Verifying the final 'signals' DataFrame ---")
print(f"Found {len(signals_df)} potential signals to verify.")

# --- Programmatic Verification ---
# Each signal is looked up in the unfiltered 'all_features' table and its feature
# values are re-checked against the thresholds in 'test_config'.

for idx, signal_row in signals_df.iterrows():
    # Unfiltered feature values for the same (Ticker, Date) key
    original_features = all_features.loc[idx]

    try:
        # All four entry conditions must hold for a row to count as a signal.
        assert original_features['low_slope'] > test_config['slope_thresh']
        assert original_features['low_r_squared'] > test_config['r2_thresh']
        assert original_features['volume_slope'] > test_config['volume_thresh']
        assert original_features['z_score_Adj Low'] < test_config['z_entry_thresh']
    except AssertionError:
        # Show the offending row and the thresholds, then let the failure propagate.
        print(f"\n[FAILURE]: Verification failed for signal at index {idx}!")
        print("Signal Row:")
        print(signal_row)
        print("\nThresholds:")
        print(test_config)
        raise

if signals_df.empty:
    print("\n[INFO]: No signals were generated with this config, test passed vacuously.")
else:
    print("\n[SUCCESS]: All rows in the final signals DataFrame correctly meet the filter criteria.")

print("\n--- Sanity Check Passed for Signal Filtering ---")

Pre-computing features for this parameter set...

--- Test 2: Verifying the final 'signals' DataFrame ---
Found 3 potential signals to verify.

[SUCCESS]: All rows in the final signals DataFrame correctly meet the filter criteria.

--- Sanity Check Passed for Signal Filtering ---


In [25]:
# Print the current `signals_df` as plain text under a label.
print(f'signals_df:\n{signals_df}')

signals_df:
                   high_slope  high_r_squared  low_slope  low_r_squared  volume_slope  volume_r_squared  unified_std_dev_returns  volume_std_dev_returns  z_score_Adj Low  z_score_Volume
Ticker Date                                                                                                                                                                              
TEST   2023-01-27    0.006534        0.796915   0.006659       0.796899      0.008892          0.999873                 0.006682                0.000231        -1.446472        1.264911
       2023-02-15    0.005364        0.710198   0.005462       0.710196      0.007971          0.999898                 0.006731                0.000186        -1.562509        1.264911
       2023-03-03    0.006194        0.819506   0.006302       0.819491      0.007275          0.999915                 0.005943                0.000154        -1.298805        1.264911


# TODO: penalty scores are not calculated
The `signals_df` output above has no penalty-score columns. Expected formulas:
* `penalty_score = (1 - r_squared) * (unified_std_dev + 1e-9)`
* `volume_penalty_score = (1 - volume_r_squared) * (volume_std_dev + 1e-9)`

In [10]:
# Bare expression: the notebook renders `signals_df` as this cell's output.
signals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,high_slope,high_r_squared,low_slope,low_r_squared,volume_slope,volume_r_squared,unified_std_dev_returns,volume_std_dev_returns,z_score_Adj Open,z_score_Adj High,z_score_Adj Low,z_score_Adj Close,z_score_Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


In [11]:
# Bare expression: the notebook renders `features_df` as this cell's output.
features_df

Unnamed: 0_level_0,Unnamed: 1_level_0,high_slope,high_r_squared,low_slope,low_r_squared,volume_slope,volume_r_squared,unified_std_dev_returns,volume_std_dev_returns,z_score_Adj Open,z_score_Adj High,z_score_Adj Low,z_score_Adj Close,z_score_Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A,2024-10-25,-0.001549,0.170512,-0.001499,0.147198,-0.002818,0.005825,0.016570,0.378456,-1.439517,-1.377791,-1.290638,-1.307466,-0.825383
AAL,2024-10-25,0.007047,0.794413,0.006360,0.785938,-0.006610,0.019615,0.029808,0.818874,0.586457,0.966663,0.795993,0.950780,0.522081
AAON,2024-10-25,0.003277,0.514929,0.003696,0.542243,-0.014090,0.073470,0.019885,0.625558,-0.494063,0.120329,-0.096700,0.783208,0.825011
AAPL,2024-10-25,0.001725,0.486748,0.001832,0.530640,-0.021447,0.178248,0.013944,0.748888,-1.164071,-0.487396,-0.518780,-0.685118,-0.265323
ABBV,2024-10-25,-0.000732,0.310288,-0.000643,0.243094,0.003946,0.014364,0.008131,0.455564,-0.102501,-0.515262,-0.603197,-0.728499,-0.860770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZM,2025-05-28,0.005305,0.784938,0.005267,0.720825,0.031592,0.508399,0.012891,0.310738,-1.643184,-1.725532,-0.971138,-1.160761,-0.477397
ZS,2025-05-28,0.009303,0.935603,0.009805,0.931124,0.006610,0.069734,0.018673,0.209714,1.409447,0.686478,1.079823,0.604444,-0.296513
ZTO,2025-05-28,-0.003039,0.464491,-0.003090,0.465276,0.039047,0.346424,0.019013,0.861881,-0.843505,-0.516990,-0.582071,-0.233126,0.003116
ZTS,2025-05-28,0.003754,0.893635,0.003965,0.842693,-0.002221,0.004114,0.015717,0.313638,1.667782,1.342668,1.245380,0.965542,-1.348780


### Step 2: New Encapsulated Helper Functions

These new functions isolate the logic for performance analysis, the optimization loop itself, and the per-day entry and exit handling.

In [None]:
def analyze_performance(trade_results):
    """
    Summarise a trade log into a handful of headline numbers.

    Args:
        trade_results (pd.DataFrame): One row per trade, with a numeric
            'return' column holding each trade's fractional return.

    Returns:
        dict: 'num_trades' (row count), 'win_rate' (share of trades with a
        strictly positive return), 'avg_return' (mean return) and
        'total_return' (compounded return across all trades). Every value is
        0 when `trade_results` is empty.
    """
    summary = {'num_trades': 0, 'win_rate': 0, 'avg_return': 0, 'total_return': 0}

    if not trade_results.empty:
        returns = trade_results['return']
        summary = {
            'num_trades': len(trade_results),
            'win_rate': (returns > 0).mean(),
            'avg_return': returns.mean(),
            # Compound the per-trade returns: prod(1 + r) - 1
            'total_return': (1 + returns).prod() - 1,
        }

    return summary

def run_parameter_optimization(df, param_grid, static_params):
    """
    Backtest every combination in a parameter grid and tabulate the results.

    Args:
        df (pd.DataFrame): The OHLCV data, passed unchanged to `run_backtest`.
        param_grid (dict): Maps each parameter name to the list of values to
            try. An empty dict is allowed: the backtest then runs exactly once
            using `static_params` alone.
        static_params (dict): Parameters that are not being optimized. A grid
            parameter with the same name overrides the static value for that run.

    Returns:
        pd.DataFrame: One row per parameter combination, holding the grid
        parameters for that run followed by the metrics returned by
        `analyze_performance`.
    """
    # Build the combinations from the key list and `product` directly.
    # `keys, values = zip(*param_grid.items())` fails to unpack on an empty
    # grid, whereas `product()` with no iterables yields one empty combination.
    param_names = list(param_grid)
    param_combinations = [
        dict(zip(param_names, combo))
        for combo in product(*(param_grid[name] for name in param_names))
    ]

    print(f"Starting optimization for {len(param_combinations)} combinations...")

    results_log = []
    for param_set in tqdm(param_combinations, desc="Optimization Progress"):
        # Combine static and dynamic parameters into a single config for this run
        current_config = {**static_params, **param_set}

        # 1. Run the backtest with the current configuration
        trade_results = run_backtest(df, current_config)

        # 2. Analyze the performance of this run
        performance_metrics = analyze_performance(trade_results)

        # 3. Log the results (only the varied parameters plus the metrics)
        results_log.append({**param_set, **performance_metrics})

    return pd.DataFrame(results_log)

def handle_entries_for_day(current_date, next_day_date, signals_today, open_positions, df_ohlcv):
    """
    Open a position for every signalled ticker that is not already held.

    Args:
        current_date: Date on which the signals fired; stored as 'signal_date'.
        next_day_date: Date whose 'Adj High' is used as the entry price; stored
            as 'entry_date'.
        signals_today (pd.DataFrame): One row per signal. Each index label is
            either a tuple whose first element is the ticker (MultiIndex) or
            the ticker itself (flat index).
        open_positions (dict): Maps ticker -> position dict. Updated in place.
        df_ohlcv (pd.DataFrame): Price data addressable as
            `.loc[(ticker, date), 'Adj High']`.

    Returns:
        dict: The same `open_positions` object, with any new entries added.
    """
    for index_label, signal_row in signals_today.iterrows():
        # MultiIndex rows yield tuple labels with the ticker first. A flat index
        # yields the ticker itself, and `label[0]` on a plain string would
        # silently return only its first character.
        ticker_name = index_label[0] if isinstance(index_label, tuple) else index_label

        if ticker_name in open_positions:
            continue

        try:
            entry_price = df_ohlcv.loc[(ticker_name, next_day_date), 'Adj High']
        except KeyError:
            # No price row for this ticker on the entry date: skip the signal
            # (deliberate best effort).
            continue

        # --- LOGGING: Store more info about the entry signal ---
        open_positions[ticker_name] = {
            'entry_date': next_day_date,
            'entry_price': entry_price,
            'signal_date': current_date,
            'signal_features': signal_row.to_dict(),  # all features that triggered the signal
        }

    return open_positions

def handle_exits_for_day(current_date, next_day_date, open_positions, df_ohlcv, config):
    """
    Evaluate every open position against the exit rules and close those that trigger.

    A position is flagged on `current_date` by the first rule that matches, in
    this order: 'Adj Close' at or above the profit target, 'Adj Close' at or
    below the stop-loss, or held for at least `config['time_hold_days']`
    calendar days. A flagged position is closed at the 'Adj Low' of
    `next_day_date`. Positions with no price row for either date stay open.

    Args:
        current_date: Timestamp-like date being evaluated (must provide
            `to_pydatetime()`).
        next_day_date: Date on which a triggered exit is filled.
        open_positions (dict): Maps ticker -> position dict with keys
            'entry_price', 'entry_date', 'signal_date' and 'signal_features'.
            Closed tickers are removed in place.
        df_ohlcv (pd.DataFrame): Price data addressable as
            `.loc[(ticker, date), column]`.
        config (dict): Must contain 'profit_target', 'stop_loss' and
            'time_hold_days'.

    Returns:
        tuple: (list of trade-log dicts for the positions closed today,
        the same `open_positions` dict).
    """
    completed_trades = []

    for symbol, position in open_positions.items():
        try:
            close_today = df_ohlcv.loc[(symbol, current_date), 'Adj Close']
        except KeyError:
            # No bar for this ticker today; leave the position untouched.
            continue

        # Thresholds are derived from the entry price of this position.
        target_price = position['entry_price'] * (1 + config['profit_target'])
        stop_price = position['entry_price'] * (1 - config['stop_loss'])
        held_days = (
            current_date.to_pydatetime().date()
            - position['entry_date'].to_pydatetime().date()
        ).days

        # First matching rule wins; remember the threshold that was hit.
        if close_today >= target_price:
            trigger, trigger_level = "Profit Target", target_price
        elif close_today <= stop_price:
            trigger, trigger_level = "Stop-Loss", stop_price
        elif held_days >= config['time_hold_days']:
            trigger, trigger_level = "Time Hold", held_days
        else:
            continue

        try:
            fill_price = df_ohlcv.loc[(symbol, next_day_date), 'Adj Low']
            completed_trades.append({
                'ticker': symbol,
                'entry_date': position['entry_date'],
                'exit_date': next_day_date,
                'return': (fill_price - position['entry_price']) / position['entry_price'],
                'reason': trigger,
                'signal_date': position['signal_date'],
                'entry_signal_features': position['signal_features'],
                'entry_price_actual': position['entry_price'],
                'exit_signal_date': current_date,
                'exit_trigger_price': close_today,
                'exit_target_value': trigger_level,
                'exit_price_actual': fill_price,
            })
        except KeyError:
            # Missing next-day bar (or missing position field): keep it open.
            continue

    # Remove closed positions only after iteration over the dict has finished.
    for trade in completed_trades:
        del open_positions[trade['ticker']]

    return completed_trades, open_positions

#### Step 3: The New, Clean Top-Level Script

Your main script is now incredibly simple and readable. It's all about configuration and orchestration.

In [None]:
# --- 1. DEFINE CONFIGURATION ---

# Search space: the lists of values to try for each optimized parameter
optimization_grid = dict(
    lookback_days=[30, 60, 90],
    rolling_window=[15, 20],
)

# Fixed strategy settings that are shared by every run
strategy_params = dict(
    slope_thresh=1.0,
    r2_thresh=0.50,
    z_entry_thresh=0,
    profit_target=0.10,
    stop_loss=0.05,
    time_hold_days=20,
)


# --- 2. RUN ORCHESTRATOR ---

# A single call drives the whole grid search
optimization_results = run_parameter_optimization(df_train, optimization_grid, strategy_params)


# --- 3. ANALYZE RESULTS ---

# Best total return first
print("\n\n--- Optimization Complete ---")
print(optimization_results.sort_values('total_return', ascending=False))

### Step 1: The "One-Trade" Deep Dive

The most powerful debugging technique is to isolate a single trade and follow it from signal generation to exit. If the logic holds for one trade, it's likely correct for all of them.

1.  **Pick a Winning Trade and a Losing Trade:** Run one of the backtests again (e.g., the one with `lookback_days=30`, `rolling_window=20`) and save the returned trade log as a DataFrame (`trade_log_df` in the cell below).

In [None]:
# 1. Pick a configuration to analyze
config_to_test = {
    **strategy_params,
    'lookback_days': 30,
    'rolling_window': 20
}

# 2. Run the backtest to get the detailed trade log
trade_log_df = run_backtest(df_train, config_to_test)

# 3. Isolate and inspect a single trade
# Reset first so a value left over from a previous run of this cell is never
# mistaken for a trade from the current backtest.
losing_trade = None

if trade_log_df.empty:
    print("No trades were made for this configuration.")
else:
    losing_trades = trade_log_df[trade_log_df['return'] < 0]
    if losing_trades.empty:
        # Guard: `.iloc[0]` on an empty selection would raise IndexError.
        print("Trades were made, but none of them has a negative return.")
    else:
        # Get the first losing trade
        losing_trade = losing_trades.iloc[0]

        print("--- Detailed Log for a Single Losing Trade ---")
        # A Series already prints one field per line, so no transpose is needed
        print(losing_trade)

In [None]:
# Show the value stored under 'entry_signal_features' for the selected losing trade.
# Requires `losing_trade` to have been set by the deep-dive cell above.
losing_trade.entry_signal_features

In [None]:
_df_trends = analyze_ticker_trends_vectorized(df_train, lookback_days=30)
print(f'_df_trends:\n{_df_trends}')

# Name the index levels so they become labelled columns in the export.
_df_trends.index.names = ['Ticker', 'Date']

# Resolve the Desktop folder from the current user's home directory instead of
# hard-coding one machine's absolute path.
_df_trends.reset_index().to_csv(Path.home() / 'Desktop' / '_df_trends.csv', index=False)

In [None]:
# Show the rows of `df_train` stored under the 'MSFT' label
# (presumably the ticker level of its index; confirm against how df_train is built).
df_train.loc['MSFT']

In [None]:
# Export to CSV on the current user's Desktop. The folder is resolved from the
# home directory rather than a hard-coded, machine-specific absolute path.
_df_trends.to_csv(Path.home() / 'Desktop' / '_df_trends.csv', index=True)

In [None]:
# Print the structural summary of `_df_trends` (index, columns, dtypes, non-null counts).
_df_trends.info()

In [None]:
# Losing trades: strictly negative return (break-even trades are excluded)
df_loss = trade_log_df.loc[trade_log_df['return'].lt(0)]
print(df_loss)

In [None]:
# Winning trades: strictly positive return (break-even trades are excluded)
df_win = trade_log_df.loc[trade_log_df['return'].gt(0)]
print(df_win)

In [None]:
# Write both trade subsets to the current user's Desktop. The folder is resolved
# from the home directory rather than a hard-coded, machine-specific path.
export_dir = Path.home() / 'Desktop'
df_loss.to_csv(export_dir / 'df_loss.csv', index=True)
df_win.to_csv(export_dir / 'df_win.csv', index=True)

In [None]:
# Bare expression: render the optimization summary returned by
# run_parameter_optimization (one row per parameter combination).
optimization_results

In [None]:
# Pick a configuration to analyze
config_to_test = {**strategy_params, 'lookback_days': 30, 'rolling_window': 20}

# Run a single backtest and get the detailed trade log
single_run_trades = run_backtest(df_train, config_to_test)

# Find a winning and a losing trade to investigate
if single_run_trades.empty:
    # An empty trade log may not have a 'return' column to filter on, so
    # report it instead of failing on the lookups below.
    print("No trades were made for this configuration.")
else:
    print("Sample winning trade:")
    print(single_run_trades[single_run_trades['return'] > 0].head(1))

    print("\nSample losing trade:")
    print(single_run_trades[single_run_trades['return'] < 0].head(1))

In [None]:
# Same call as the one that produced `_df_trends` earlier, stored under a second name.
df_trends = analyze_ticker_trends_vectorized(df_train, lookback_days=30)


In [None]:
# Look up the trend entry labelled ('FIX', '2024-10-25')
# (presumably ticker and date index levels; confirm the index layout of df_trends).
df_trends.loc['FIX', '2024-10-25']