In [1]:
import pandas as pd
from pathlib import Path

# Widen pandas' display limits so wide feature tables print without truncation.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)

# Single place to re-point the notebook at another machine's data folder.
DATA_DIR = Path(r'c:\Users\ping\Files_win10\python\py311\stocks\data')

# Full path to the cleaned OHLCV parquet file, built from DATA_DIR.
full_file_path = DATA_DIR / 'df_OHLCV_clean_stocks_etfs.parquet'
df_OHLCV = pd.read_parquet(full_file_path, engine='pyarrow')

In [2]:
# --- 1. Locate the chronological split point ---

# Date level of the index, one entry per row; reused by both masks below.
row_dates = df_OHLCV.index.get_level_values('Date')

# Sorted calendar of the distinct dates present in the data
unique_dates = row_dates.unique().sort_values()

# Position of the 70% mark in that calendar, and the date found there
split_index = int(len(unique_dates) * 0.7)
split_date = unique_dates[split_index]

print(f"Total unique trading dates in dataset: {len(unique_dates)}")
print(f"The data will be split on the date: {split_date.date()}")

# --- 2. Build the training and testing sets ---

# Training keeps rows dated on or before split_date, so it spans
# split_index + 1 distinct dates; testing keeps the rows strictly after it.
df_train = df_OHLCV[row_dates <= split_date]
df_test = df_OHLCV[row_dates > split_date]


# --- 3. Verify the split ---

train_dates = df_train.index.get_level_values('Date')
test_dates = df_test.index.get_level_values('Date')

print("\n--- Verification ---")
print(f"Original DataFrame shape: {df_OHLCV.shape}")
print(f"Training set shape:   {df_train.shape}")
print(f"Testing set shape:    {df_test.shape}")

print("\nDate Ranges:")
print(f"  Training: {train_dates.min().date()} to {train_dates.max().date()}")
print(f"  Testing:  {test_dates.min().date()} to {test_dates.max().date()}")

# The latest training date must precede the earliest testing date
assert train_dates.max() < test_dates.min()
print("\nVerification successful: There is no date overlap between train and test sets.")

Total unique trading dates in dataset: 250
The data will be split on the date: 2025-05-30

--- Verification ---
Original DataFrame shape: (371250, 5)
Training set shape:   (261360, 5)
Testing set shape:    (109890, 5)

Date Ranges:
  Training: 2024-09-17 to 2025-05-30
  Testing:  2025-06-02 to 2025-09-16

Verification successful: There is no date overlap between train and test sets.


### Jupyter Notebook: Verifying `analyze_ticker_trends_vectorized`


#### Cell 1: Setup and Imports

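First, let's import the necessary libraries and define the log-vectorized trend function that we want to verify. Note that the reference implementation it is compared against later (`analyze_ticker_trends_original`) is not defined in this cell and must be defined before the comparison is run.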

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import linregress

# --- Function 1: The NEW, Correct, Log-Vectorized Version (The one we are verifying) ---

def analyze_ticker_trends_log_vectorized(df_group, lookback_days=60):
    """
    Vectorized rolling trend analysis for a single ticker.

    For every OHLCV column that is present, a rolling regression of the
    log-transformed series against a 0..n-1 time index is computed, giving a
    slope and an R-squared per row. Rolling volatility of "worst case" returns
    and of volume changes is added, and each R-squared is combined with the
    matching volatility into a penalty score.

    Args:
        df_group (pd.DataFrame): Rows of one ticker. The columns used are
            'Adj Open', 'Adj High', 'Adj Low', 'Adj Close' and 'Volume';
            any of them may be absent.
        lookback_days (int): Size of every rolling window.

    Returns:
        pd.DataFrame | None: One row per input row with `<name>_slope`,
        `<name>_r_squared`, `unified_std_dev_returns`,
        `volume_std_dev_returns` and `<name>_penalty_score` columns for the
        inputs that were available. Rows with an incomplete window hold NaN.
        Returns None when the group has fewer rows than `lookback_days`.
    """
    if len(df_group) < lookback_days:
        return None

    # Regressor: row position, so the slope is "log units per row".
    time_index = pd.Series(np.arange(len(df_group)), index=df_group.index)
    # Every full window covers `lookback_days` consecutive positions, so the
    # population variance of the regressor is the same constant in all of them.
    var_time = np.var(np.arange(lookback_days), ddof=0)

    # --- 1. TREND ANALYSIS (OHLCV) ---
    trend_columns = ['Adj Open', 'Adj High', 'Adj Low', 'Adj Close', 'Volume']
    trend_columns_exist = [col for col in trend_columns if col in df_group.columns]

    df_results = pd.DataFrame(index=df_group.index)

    for name in trend_columns_exist:
        series = df_group[name].astype(float)
        # Volume is shifted by 1 before the log so a zero volume stays finite.
        log_series = np.log(series + 1) if name == 'Volume' else np.log(series)

        rolling_cov = time_index.rolling(window=lookback_days).cov(log_series, ddof=0)
        rolling_var_series = log_series.rolling(window=lookback_days).var(ddof=0)

        simple_name = name.replace('Adj ', '').lower()

        # slope = cov(t, y) / var(t);  r^2 = cov(t, y)^2 / (var(t) * var(y))
        df_results[f'{simple_name}_slope'] = rolling_cov / var_time
        # The epsilon keeps a perfectly flat window from dividing by zero.
        denominator = (var_time * rolling_var_series) + 1e-9
        df_results[f'{simple_name}_r_squared'] = (rolling_cov**2) / denominator

    # --- 2. VOLATILITY CALCULATION ---
    # Each volatility metric is only computed when its inputs exist, matching
    # the "use whatever columns are present" behaviour of the trend section.
    unified_std_dev = None
    if {'Adj High', 'Adj Low'}.issubset(df_group.columns):
        yesterday_low = df_group['Adj Low'].shift(1)
        # Return from yesterday's low to today's high.
        worst_case_returns = (df_group['Adj High'] - yesterday_low) / yesterday_low
        unified_std_dev = worst_case_returns.rolling(window=lookback_days).std(ddof=0)
        df_results['unified_std_dev_returns'] = unified_std_dev

    volume_std_dev = None
    if 'Volume' in df_group.columns:
        volume_std_dev = (
            df_group['Volume'].pct_change().rolling(window=lookback_days).std(ddof=0)
        )
        df_results['volume_std_dev_returns'] = volume_std_dev

    # --- 3. PENALTY SCORE CALCULATION ---
    # penalty = (1 - r_squared) * (volatility + epsilon)
    if unified_std_dev is not None:
        for name in ['open', 'high', 'low', 'close']:
            r_squared_col = f'{name}_r_squared'
            if r_squared_col in df_results.columns:
                df_results[f'{name}_penalty_score'] = (
                    (1 - df_results[r_squared_col]) * (unified_std_dev + 1e-9)
                )

    if volume_std_dev is not None and 'volume_r_squared' in df_results.columns:
        df_results['volume_penalty_score'] = (
            (1 - df_results['volume_r_squared']) * (volume_std_dev + 1e-9)
        )

    return df_results


print("Verification functions are defined.")

Verification functions are defined.


In [4]:
import numpy as np
import pandas as pd

def calculate_rolling_z_scores_general(df_group, columns_to_process, rolling_window=20):
    """
    Rolling Z-scores for a chosen set of columns of one ticker's data.

    Each value is standardised against the mean and standard deviation of the
    trailing `rolling_window` rows (the current row included).

    Args:
        df_group (pd.DataFrame): The DataFrame for a single ticker.
        columns_to_process (list): Column names to compute Z-scores for.
        rolling_window (int): Number of rows in the trailing window.

    Returns:
        pd.DataFrame: Same index as the selected data, columns renamed with a
            'z_score_' prefix. Rows whose window is incomplete are NaN, and
            infinite results are replaced by 0. When the group is empty or
            shorter than the window, an empty frame that only carries the
            prefixed column names is returned.
    """
    too_short = len(df_group) < rolling_window
    if df_group.empty or too_short:
        # Keep the column layout predictable even when nothing can be computed.
        placeholder_columns = [f"z_score_{col}" for col in columns_to_process]
        return pd.DataFrame(columns=placeholder_columns)

    selected = df_group[columns_to_process]

    # One rolling object serves both statistics; incomplete windows give NaN.
    trailing = selected.rolling(window=rolling_window)
    centred = selected - trailing.mean()
    standardised = centred / trailing.std()

    # A non-zero numerator over a zero spread yields +/-inf; map those to 0.
    standardised = standardised.replace([np.inf, -np.inf], 0)

    # e.g. 'Adj Low' -> 'z_score_Adj Low'
    return standardised.add_prefix('z_score_')

In [5]:
import pandas as pd
from tqdm import tqdm
from itertools import product

def run_backtest(df_ohlcv, config, rules=None):
    """
    Orchestrates the day-by-day backtest and returns a trade log.

    Entry signals are pre-computed once, then every trading date is walked in
    order: exits for open positions are processed first, followed by entries
    for that date's signals.

    Args:
        df_ohlcv (pd.DataFrame): OHLCV data indexed by ('Ticker', 'Date').
        config (dict): Run parameters. 'lookback_days' and 'rolling_window'
            are read here; the whole dict is forwarded to `precompute_signals`
            and `handle_exits_for_day`.
        rules (list | None): Entry rules forwarded to `precompute_signals`.
            When omitted, `config['rules']` is used.

    Returns:
        pd.DataFrame: One row per closed trade with a fixed column order
        (columns missing from the logged trades are filled with None), or an
        empty DataFrame when no trade was closed.

    Raises:
        ValueError: If no rules are given via `rules` or `config['rules']`.
    """
    # precompute_signals requires an explicit rule list.
    if rules is None:
        rules = config.get('rules')
    if rules is None:
        raise ValueError(
            "run_backtest needs entry rules: pass `rules=` or set config['rules']."
        )

    precomputed = precompute_signals(df_ohlcv, config, rules)
    # precompute_signals returns (signals, trends, features, df_ohlcv);
    # a bare signals DataFrame is accepted as well.
    entry_signals_features = precomputed[0] if isinstance(precomputed, tuple) else precomputed

    trades = []
    open_positions = {}

    all_dates = df_ohlcv.index.get_level_values('Date').unique().sort_values()
    # Skip the first max(lookback_days, rolling_window) dates.
    start_index = max(config['lookback_days'], config['rolling_window'])

    # Loop-invariant: the date of every pre-computed signal row.
    signal_dates = entry_signals_features.index.get_level_values('Date')

    # Stop one date early because the next date is passed to both handlers.
    for i in tqdm(range(start_index, len(all_dates) - 1), desc="Backtesting"):
        current_date = all_dates[i]
        next_day_date = all_dates[i+1]

        closed_trades, open_positions = handle_exits_for_day(
            current_date, next_day_date, open_positions, df_ohlcv, config
        )
        trades.extend(closed_trades)

        # Feature rows of the signals that fired on the current date
        signals_today = entry_signals_features[signal_dates == current_date]

        open_positions = handle_entries_for_day(
            current_date, next_day_date, signals_today, open_positions, df_ohlcv
        )

    # --- Create the final DataFrame and reorder columns for clarity ---
    if not trades:
        return pd.DataFrame()

    final_trades_df = pd.DataFrame(trades)
    log_columns = [
        'ticker', 'signal_date', 'entry_date', 'exit_signal_date', 'exit_date', 'reason',
        'return', 'entry_price_actual', 'exit_price_actual', 'exit_trigger_price',
        'exit_target_value', 'entry_signal_features'
    ]
    # Ensure all columns exist, fill missing with None
    for col in log_columns:
        if col not in final_trades_df.columns:
            final_trades_df[col] = None

    return final_trades_df[log_columns]


In [None]:
import operator

def apply_strategy_rules(features, rules, config):
    """
    Evaluate a list of filter rules against a features table.

    Two rule shapes are understood, both carrying an 'operator' key
    ('>', '<', '>=', '<=', '==', '!='):
      * column vs column: keys 'column_A' and 'column_B'
      * column vs value:  key 'column' plus either 'value' (a literal) or
        'value_from_config' (a key looked up in `config`)

    Args:
        features (pd.DataFrame): The DataFrame containing all calculated features.
        rules (list): Rule dictionaries as described above.
        config (dict): Source of thresholds for 'value_from_config' rules.

    Returns:
        pd.Series: Boolean mask on `features.index`, True where every rule holds.

    Raises:
        ValueError: If a rule matches neither shape, or a column-vs-value rule
            has neither 'value' nor 'value_from_config'.
        KeyError: If a rule's operator is missing or not one of the six above.
    """
    comparators = {
        '>': operator.gt,
        '<': operator.lt,
        '>=': operator.ge,
        '<=': operator.le,
        '==': operator.eq,
        '!=': operator.ne
    }

    def _mask_for(rule):
        # The operator is resolved first, before the rule's shape is inspected.
        compare = comparators[rule['operator']]

        # Shape 1: one column against another
        if 'column_A' in rule and 'column_B' in rule:
            return compare(features[rule['column_A']], features[rule['column_B']])

        if 'column' not in rule:
            raise ValueError(f"Invalid rule format: {rule}")

        # Shape 2: a column against a literal or a config-supplied threshold
        if 'value' in rule:
            threshold = rule['value']
        elif 'value_from_config' in rule:
            threshold = config[rule['value_from_config']]
        else:
            raise ValueError(f"Rule missing 'value' or 'value_from_config': {rule}")

        return compare(features[rule['column']], threshold)

    # Begin with "everything passes" and AND each rule's mask into it.
    combined = pd.Series(True, index=features.index)
    for rule in rules:
        combined = combined & _mask_for(rule)

    return combined
    

In [7]:
import pandas as pd
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product

def precompute_signals(df_ohlcv, config, rules):
    """
    Build the per-ticker feature table and filter it down to entry signals.

    Trend features (window `config['lookback_days']`) and rolling Z-scores
    (window `config['rolling_window']`) are computed ticker by ticker, joined,
    stripped of rows containing NaN, and then masked with `rules`.

    Args:
        df_ohlcv (pd.DataFrame): OHLCV data with a 'Ticker' index level.
        config (dict): Must provide 'lookback_days' and 'rolling_window'; also
            passed on to `apply_strategy_rules` for config-driven thresholds.
        rules (list): Rule dictionaries understood by `apply_strategy_rules`.

    Returns:
        tuple: (signals, trends, features, df_ohlcv) - the filtered signal
        rows, the raw trend features, the joined NaN-free feature table, and
        the input frame itself (the intermediates are returned for inspection).
    """
    print("Pre-computing features for this parameter set...")

    # --- 1. FEATURE GENERATION ---
    by_ticker = df_ohlcv.groupby(level='Ticker', group_keys=False)

    trends = by_ticker.apply(
        analyze_ticker_trends_log_vectorized, config['lookback_days']
    )

    # Z-scores are only requested for the OHLCV columns the input provides.
    available_for_z = [
        col
        for col in ['Adj Open', 'Adj High', 'Adj Low', 'Adj Close', 'Volume']
        if col in df_ohlcv.columns
    ]
    z_scores = by_ticker.apply(
        calculate_rolling_z_scores_general,
        columns_to_process=available_for_z,
        rolling_window=config['rolling_window'],
    )

    # Only rows where every feature could be computed survive.
    features = trends.join(z_scores).dropna()

    # --- 2. DYNAMIC FILTERING ---
    print("Applying dynamic strategy rules...")
    signal_mask = apply_strategy_rules(features, rules, config)
    signals = features[signal_mask]

    # Callers that only need the signals should take element 0 of the tuple.
    return signals, trends, features, df_ohlcv


#### Cell 2: Create Realistic Sample Data

We need some sample data that mimics your real dataset to perform the check.

In [8]:
import numpy as np

# Business-day calendar shared by both synthetic tickers
dates = pd.date_range(start='2023-01-01', periods=50, freq='B')

# Ticker 'TEST': a clear upward price trend with a sine wobble on top,
# and a volume series that rises linearly (strong positive volume slope).
price = 100 + np.linspace(0, 20, 50) + np.sin(np.arange(50)/2) * 2
volume = 100000 + np.linspace(0, 50000, 50)
# Deliberately NOT named `df_test`: that name already holds the hold-out
# split produced by the train/test cell and must not be overwritten.
sample_trend_df = pd.DataFrame({
    'Ticker': 'TEST', 'Date': dates,
    'Adj Open': price + 0.5, 'Adj High': price + 1, 'Adj Low': price - 1, 'Adj Close': price,
    'Volume': volume
})

# Ticker 'SHORT': only five rows (and no 'Adj Open') to exercise the
# insufficient-history edge case.
df_short = pd.DataFrame({
    'Ticker': 'SHORT', 'Date': dates[:5],
    'Adj High': 10, 'Adj Low': 9, 'Adj Close': 9.5, 'Volume': 1000
})


# Stack both tickers and index by (Ticker, Date) like the real dataset
sample_df = pd.concat([sample_trend_df, df_short]).set_index(['Ticker', 'Date'])
print("Sample DataFrame created.")
sample_df.head()

Sample DataFrame created.


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TEST,2023-01-02,100.5,101.0,99.0,100.0,100000.0
TEST,2023-01-03,101.867014,102.367014,100.367014,101.367014,101020.408163
TEST,2023-01-04,102.999269,103.499269,101.499269,102.499269,102040.816327
TEST,2023-01-05,103.71948,104.21948,102.21948,103.21948,103061.22449
TEST,2023-01-06,103.951248,104.451248,102.451248,103.451248,104081.632653


### Jupyter Notebook: Testing `precompute_signals`

#### Cell 1: Setup and Test Functions

First, we need the function definitions and a small, predictable sample dataset. Testing on the full `df_train` is too difficult to verify manually.


In [16]:
# --- 1. Define your configuration for thresholds ---
config = {
    'lookback_days': 30,
    'rolling_window': 20,
    'r2_thresh': 0.90,
    'z_entry_thresh': -1.5,
    # You can add new thresholds here anytime
    'max_penalty_score': 0.004 
}

# --- 2. Define your strategy as a list of rules ---
# This is where you can be creative and easily swap rules in and out.
strategy_1_rules = [
    # Static value comparison: the slope of the lows must exceed 0.009
    {'column': 'low_slope', 'operator': '>', 'value': 0.009},
    
    # # Dynamic threshold from config: R-squared must be high enough
    # {'column': 'low_r_squared', 'operator': '>', 'value_from_config': 'r2_thresh'},
    
    # # Dynamic threshold from config: Look for a price dip
    # {'column': 'z_score_Adj Low', 'operator': '<', 'value_from_config': 'z_entry_thresh'},
    
    # Column-vs-column comparison: Highs are trending up faster than lows
    {'column_A': 'high_slope', 'operator': '>', 'column_B': 'low_slope'},

    # Dynamic thresholds from config: cap the penalty score of lows and highs
    {'column': 'low_penalty_score', 'operator': '<', 'value_from_config': 'max_penalty_score'},
    {'column': 'high_penalty_score', 'operator': '<', 'value_from_config': 'max_penalty_score'},      
]

# --- 3. Run the pre-computation with your chosen strategy ---
# To test a new strategy, you would just pass a different list of rules!
# precompute_signals returns (signals, trends, features, df_ohlcv).
_signals, _trends, _features, _df_ohlcv = precompute_signals(df_train, config, rules=strategy_1_rules)

print("\n--- Strategy Results ---")
print(f"Found {len(_signals)} signals using the dynamic rules.")
print(_signals.head())
print(f'\n_signals.describe():\n{_signals.describe()}')

Pre-computing features for this parameter set...
Applying dynamic strategy rules...

--- Strategy Results ---
Found 376 signals using the dynamic rules.
                   open_slope  open_r_squared  high_slope  high_r_squared  low_slope  low_r_squared  close_slope  close_r_squared  volume_slope  volume_r_squared  unified_std_dev_returns  volume_std_dev_returns  open_penalty_score  high_penalty_score  low_penalty_score  close_penalty_score  volume_penalty_score  z_score_Adj Open  z_score_Adj High  z_score_Adj Low  z_score_Adj Close  z_score_Volume
Ticker Date                                                                                                                                                                                                                                                                                                                                                                                                     
APP    2024-10-29    0.010107        0.931826

In [17]:
print(_signals.tail())

                   open_slope  open_r_squared  high_slope  high_r_squared  low_slope  low_r_squared  close_slope  close_r_squared  volume_slope  volume_r_squared  unified_std_dev_returns  volume_std_dev_returns  open_penalty_score  high_penalty_score  low_penalty_score  close_penalty_score  volume_penalty_score  z_score_Adj Open  z_score_Adj High  z_score_Adj Low  z_score_Adj Close  z_score_Volume
Ticker Date                                                                                                                                                                                                                                                                                                                                                                                                     
DDS    2025-05-30    0.010494        0.926170    0.010673        0.896950   0.010567       0.925339     0.010302         0.878128      0.018398          0.400466                 0.027735            

In [13]:
_signals.describe()

Unnamed: 0,open_slope,open_r_squared,high_slope,high_r_squared,low_slope,low_r_squared,close_slope,close_r_squared,volume_slope,volume_r_squared,unified_std_dev_returns,volume_std_dev_returns,open_penalty_score,high_penalty_score,low_penalty_score,close_penalty_score,volume_penalty_score,z_score_Adj Open,z_score_Adj High,z_score_Adj Low,z_score_Adj Close,z_score_Volume
count,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0,8040.0
mean,0.009015,0.743279,0.009274,0.751379,0.008813,0.744099,0.009057,0.741965,0.016456,0.1652826,0.036925,0.588101,0.010732,0.010468,0.010705,0.010844,0.490797,1.031928,0.976339,0.996583,0.937,0.055427
std,0.004834,0.15225,0.004966,0.145476,0.004727,0.158818,0.00485,0.152959,0.017151,0.1595178,0.021663,0.289762,0.011675,0.011287,0.011912,0.01176,0.263029,0.869015,0.870889,0.908754,0.910232,1.025774
min,0.004408,0.033617,0.005012,0.037358,0.005001,0.029733,0.004035,0.035997,-0.049511,6.339136e-09,0.007653,0.206328,0.000278,0.000222,0.000306,0.000287,0.088978,-3.254827,-2.803485,-3.406237,-3.397731,-2.663222
25%,0.006029,0.6642,0.006221,0.676489,0.005875,0.663197,0.00607,0.663832,0.004875,0.03047089,0.021988,0.42023,0.003652,0.00348,0.003513,0.003609,0.337961,0.574733,0.524,0.545446,0.472331,-0.606835
50%,0.00737,0.779088,0.00757,0.78343,0.007196,0.782853,0.0074,0.77669,0.015315,0.1144041,0.030509,0.518065,0.006968,0.006852,0.006859,0.007093,0.436702,1.104643,1.057384,1.103731,1.038716,-0.19865
75%,0.009998,0.858597,0.01027,0.859726,0.009751,0.864601,0.010033,0.856142,0.027044,0.2632138,0.044607,0.66838,0.012909,0.012635,0.012936,0.013104,0.569279,1.575373,1.519392,1.586656,1.524373,0.452402
max,0.042944,0.980417,0.045192,0.985494,0.041779,0.978801,0.043204,0.985206,0.100791,0.811692,0.166839,3.846569,0.146593,0.146026,0.147183,0.146232,3.81426,4.098381,4.138848,4.10823,4.04546,4.218908


In [15]:
# Plain-text summary statistics of the signal rows (label typo fixed).
print(f'\n_signals.describe():\n{_signals.describe()}')


_signals.descride():
        open_slope  open_r_squared   high_slope  high_r_squared    low_slope  low_r_squared  close_slope  close_r_squared  volume_slope  volume_r_squared  unified_std_dev_returns  volume_std_dev_returns  open_penalty_score  high_penalty_score  low_penalty_score  close_penalty_score  volume_penalty_score  z_score_Adj Open  z_score_Adj High  z_score_Adj Low  z_score_Adj Close  z_score_Volume
count  8040.000000     8040.000000  8040.000000     8040.000000  8040.000000    8040.000000  8040.000000      8040.000000   8040.000000      8.040000e+03              8040.000000             8040.000000         8040.000000         8040.000000        8040.000000          8040.000000           8040.000000       8040.000000       8040.000000      8040.000000        8040.000000     8040.000000
mean      0.009015        0.743279     0.009274        0.751379     0.008813       0.744099     0.009057         0.741965      0.016456      1.652826e-01                 0.036925              

In [10]:
sample_df.to_csv(r'C:\Users\ping\Desktop\sample_df.csv', index=True)
_trends.to_csv(r'C:\Users\ping\Desktop\_trends.csv', index=True)
_features.to_csv(r'C:\Users\ping\Desktop\_features.csv', index=True)
_df_ohlcv.to_csv(r'C:\Users\ping\Desktop\_df_ohlcv.csv', index=True)

In [None]:
MSFT = df_train.loc['MSFT'].copy()
print(f'MSFT.head(3):\n{MSFT.head(3)}')
print(f'\nMSFT.tail(3):\n{MSFT.tail(3)}')
print(f'\nlen(MSFT): {len(MSFT)}')

MSFT.to_csv(r'C:\Users\ping\Desktop\MSFT.csv', index=True)

In [None]:
# --- Parameters for our Test ---
TICKER_TO_CHECK = 'MSFT'
DATE_TO_CHECK = pd.to_datetime('2024-09-26')
LOOKBACK_DAYS = 10

print(f"--- Verifying LOG-TRANSFORMED calculations for '{TICKER_TO_CHECK}' on {DATE_TO_CHECK.date()} ---\n")

# --- 1. Run the FAST Log-Vectorized function ---
# The vectorized function scores every date at once; keep only the check date.
ticker_history = df_train.loc[TICKER_TO_CHECK]
vectorized_results_full = analyze_ticker_trends_log_vectorized(ticker_history, LOOKBACK_DAYS)
vectorized_result_today = vectorized_results_full.loc[DATE_TO_CHECK]

# --- 2. Run the SLOW Original (Normalized) function ---
# NOTE(review): analyze_ticker_trends_original is not defined in the cells
# shown in this notebook section; it must be in scope before this cell runs.
historical_slice = ticker_history.loc[:DATE_TO_CHECK]
original_result_today = analyze_ticker_trends_original(historical_slice, LOOKBACK_DAYS)

# --- 3. Compare the results ---
# dropna() keeps only the metrics both implementations produced a number for.
comparison_df = pd.DataFrame({
    'Log-Vectorized': vectorized_result_today,
    'Original (Normalized)': original_result_today
}).dropna()

print("### Important Note ###")
print("The slope of a log(price) series is mathematically very similar to the slope of a normalized (price/price_0) series.")
print("Therefore, both the SLOPES and R-SQUARED values should now be very close.\n")

print("### Side-by-Side Comparison ###")
print(comparison_df)

# --- 4. Programmatic Check for ALL values ---
if comparison_df.empty:
    # With fewer than LOOKBACK_DAYS rows up to DATE_TO_CHECK the rolling
    # metrics are all NaN, nothing survives dropna(), and an equality check
    # on two empty Series would pass without having compared anything.
    print(
        f"\n[INCONCLUSIVE]: No comparable values on {DATE_TO_CHECK.date()} "
        f"({len(historical_slice)} rows of history available, LOOKBACK_DAYS={LOOKBACK_DAYS}). "
        "Choose a later DATE_TO_CHECK or a smaller LOOKBACK_DAYS."
    )
else:
    try:
        # We now test BOTH slope and r_squared
        pd.testing.assert_series_equal(
            comparison_df['Log-Vectorized'],
            comparison_df['Original (Normalized)'],
            atol=0.05 # Use a slightly larger tolerance for slope approximation
        )
        print("\n[SUCCESS]: Log-vectorized results closely match the original normalized results!")
    except AssertionError as e:
        print("\n[FAILURE]: Results do not match.")
        print(e)

In [None]:
vectorized_results_full.head(20)

In [None]:
vectorized_results_full.tail()

### Refactored Code

Here is the complete, refactored solution. I've included the previously refactored functions with slight modifications to accept the new configuration structure.

In [None]:
z_scores = df_train.groupby(level='Ticker', group_keys=False).apply(
    calculate_rolling_z_scores_general, df_train.columns, rolling_window=10
)

In [None]:
z_MSFT_w10 = z_scores.loc['MSFT']
z_MSFT_w10.to_csv(r'C:\Users\ping\Desktop\z_MSFT_w10.csv', index=True)

#### Step 1: Refined Core Backtesting Functions

We'll modify the function signatures to accept a single `config` dictionary. This makes them more modular.

In [None]:
# 1. Define the full configuration
config = {
    'lookback_days': 30,
    'rolling_window': 10,  # Set the value you want to test
    'slope_thresh': 0.05,
    'r2_thresh': 0.50,
    'z_entry_thresh': -1.5,
    # ... other params if needed
}

# 2. Express the entry filter as rules
# Same four conditions as the hand-written boolean filter used elsewhere in
# this notebook: rising lows, good fit, rising volume, and a dip in the low.
legacy_entry_rules = [
    {'column': 'low_slope', 'operator': '>', 'value_from_config': 'slope_thresh'},
    {'column': 'low_r_squared', 'operator': '>', 'value_from_config': 'r2_thresh'},
    {'column': 'volume_slope', 'operator': '>', 'value': 0},
    {'column': 'z_score_Adj Low', 'operator': '<', 'value_from_config': 'z_entry_thresh'},
]

# 3. Call the function correctly
# precompute_signals requires the rules and returns
# (signals, trends, features, df_ohlcv); only two of the four are kept here.
signals_df, _unused_trends, features_df, _unused_ohlcv = precompute_signals(
    df_train, config, rules=legacy_entry_rules
)

In [None]:
# --- Test Configuration ---
# Small windows so the 50-row synthetic ticker yields plenty of complete windows.
test_config = {
    'lookback_days': 10,
    'rolling_window': 5,
    'slope_thresh': 0, # Set low to ensure we get some signals
    'r2_thresh': 0.3,
    'z_entry_thresh': -0.5,
    'volume_thresh': 0,
}

# --- Run the special testing function ---
# NOTE(review): precompute_signals_for_testing is not defined in the cells
# shown in this notebook section; presumably it returns
# (features, signals, trends) in that order - confirm against its definition.
all_features, signals_from_test_func, trends = precompute_signals_for_testing(sample_df, test_config)


print("--- Test 1: Inspecting the full 'features' DataFrame ---")
print(f"Shape of the features DataFrame: {all_features.shape}")
print("Note: The first (lookback_days - 1) rows should be missing due to NaNs.")
print("The ticker 'SHORT' should not appear at all.\n")

# Display the head and tail to check values
print("Head of features:")
print(all_features.head())
print("\nTail of features:")
print(all_features.tail())

# --- Verification Checks for 'features' ---
# 'SHORT' has only 5 rows in sample_df, fewer than lookback_days, so it must be absent.
assert 'SHORT' not in all_features.index.get_level_values('Ticker'), "FAIL: Ticker with insufficient data was not filtered out."
print("\n[SUCCESS]: Ticker with insufficient data was correctly ignored.")

# No NaN may remain anywhere in the feature table.
assert not all_features.isnull().values.any(), "FAIL: The features DataFrame contains unexpected NaNs after dropna()."
print("[SUCCESS]: Features DataFrame contains no NaNs.")

print("\n--- Sanity Check Passed for Feature Generation ---")

In [None]:
# Hand-written form of the entry filter: rising lows, a good trend fit,
# rising volume, and a low that sits below its rolling mean.
# NOTE(review): `features` is only assigned inside precompute_signals in the
# cells shown here, so it must already exist at notebook scope for this cell
# to run - confirm where it is meant to come from.
# NOTE(review): the thresholds are read from `config`, not from the
# `test_config` defined just below - confirm that this is intended.
signals = features[
    (features['low_slope'] > config['slope_thresh']) &
    (features['low_r_squared'] > config['r2_thresh']) &
    (features['volume_slope'] > 0) &
    (features['z_score_Adj Low'] < config['z_entry_thresh'])
]

# Re-defines test_config; it differs from the earlier definition only in
# 'slope_thresh' (0.005 here instead of 0).
test_config = {
    'lookback_days': 10,
    'rolling_window': 5,
    'slope_thresh': 0.005, # Set low to ensure we get some signals
    'r2_thresh': 0.3,
    'z_entry_thresh': -0.5,
    'volume_thresh': 0,
}

In [None]:
print(f'trends:\n{trends}')

In [None]:
print(f'sample_df:\n{sample_df}')

#### Step 2: "Black-Box" Test - Verify the Final `signals` DataFrame

Now we test the real function. Our goal is to prove that **every single row** in the final `signals_df` meets the filtering criteria defined in our `test_config`.

In [None]:
# --- Run the REAL function to get the final output ---
# precompute_signals needs an explicit rule list and returns
# (signals, trends, features, df_ohlcv). The rules below are the same four
# conditions that the assertions further down verify.
test_rules = [
    {'column': 'low_slope', 'operator': '>', 'value_from_config': 'slope_thresh'},
    {'column': 'low_r_squared', 'operator': '>', 'value_from_config': 'r2_thresh'},
    {'column': 'volume_slope', 'operator': '>', 'value_from_config': 'volume_thresh'},
    {'column': 'z_score_Adj Low', 'operator': '<', 'value_from_config': 'z_entry_thresh'},
]
signals_df, _unused_trends, _unused_features, _unused_ohlcv = precompute_signals(
    sample_df, test_config, rules=test_rules
)

print(f"\n--- Test 2: Verifying the final 'signals' DataFrame ---")
print(f"Found {len(signals_df)} potential signals to verify.")

# --- Programmatic Verification ---
# For every signal found, we cross-reference it with the 'all_features' DataFrame
# (built by the feature-generation test cell) and assert that its values meet
# the criteria.

for idx, signal_row in signals_df.iterrows():
    # Find the original, unfiltered features for this specific signal
    original_features = all_features.loc[idx]

    # Assert that each condition is met
    try:
        assert original_features['low_slope'] > test_config['slope_thresh']
        assert original_features['low_r_squared'] > test_config['r2_thresh']
        assert original_features['volume_slope'] > test_config['volume_thresh']
        assert original_features['z_score_Adj Low'] < test_config['z_entry_thresh']
    except AssertionError:
        print(f"\n[FAILURE]: Verification failed for signal at index {idx}!")
        print("Signal Row:")
        print(signal_row)
        print("\nThresholds:")
        print(test_config)
        # Bare raise keeps the original traceback pointing at the failed assert.
        raise

if not signals_df.empty:
    print("\n[SUCCESS]: All rows in the final signals DataFrame correctly meet the filter criteria.")
else:
    print("\n[INFO]: No signals were generated with this config, test passed vacuously.")

print("\n--- Sanity Check Passed for Signal Filtering ---")

In [None]:
print(f'signals_df:\n{signals_df}')

# penalty_scores are not calculated???
* penalty_score = (1 - r_squared) * (unified_std_dev + 1e-9)  
* volume_penalty_score = (1 - volume_r_squared) * (volume_std_dev + 1e-9)  

In [None]:
signals_df

In [None]:
features_df

### Step 2: New Encapsulated Helper Functions

These new functions isolate the logic for performance analysis and the optimization loop itself.

In [None]:
def analyze_performance(trade_results):
    """
    Summarise a trade log into headline performance numbers.

    Args:
        trade_results (pd.DataFrame): Trade log with a 'return' column holding
            each trade's fractional return.

    Returns:
        dict: 'num_trades' (row count), 'win_rate' (share of trades with a
        positive return), 'avg_return' (mean return) and 'total_return'
        (all returns compounded in sequence, minus 1). Every metric is 0 when
        the log is empty.
    """
    if trade_results.empty:
        return {'num_trades': 0, 'win_rate': 0, 'avg_return': 0, 'total_return': 0}

    returns = trade_results['return']
    # Compounding: multiply the growth factors of all trades together.
    compounded = (1 + returns).prod() - 1

    return {
        'num_trades': len(trade_results),
        'win_rate': (returns > 0).mean(),
        'avg_return': returns.mean(),
        'total_return': compounded,
    }

def run_parameter_optimization(df, param_grid, static_params):
    """
    Runs one backtest per parameter combination and tabulates the results.

    Args:
        df (pd.DataFrame): The OHLCV data.
        param_grid (dict): Maps each parameter name to the list of values to
            try. The full Cartesian product is evaluated. An empty grid runs a
            single backtest using `static_params` only.
        static_params (dict | None): Parameters held fixed in every run. A
            grid parameter with the same name overrides the static value.

    Returns:
        pd.DataFrame: One row per combination, holding the grid parameters
        followed by the metrics returned by `analyze_performance`.
    """
    results_log = []
    fixed = static_params or {}

    # Built from the keys directly rather than by unpacking zip(*items), which
    # fails on an empty grid. product() with no iterables yields one empty
    # tuple, so an empty grid gives exactly one (empty) combination.
    keys = list(param_grid)
    param_combinations = [
        dict(zip(keys, combo)) for combo in product(*(param_grid[key] for key in keys))
    ]

    print(f"Starting optimization for {len(param_combinations)} combinations...")

    for param_set in tqdm(param_combinations, desc="Optimization Progress"):
        # Grid values take precedence over static ones with the same key.
        current_config = {**fixed, **param_set}

        # 1. Run the backtest with the current configuration
        trade_results = run_backtest(df, current_config)

        # 2. Analyze the performance of this run
        performance_metrics = analyze_performance(trade_results)

        # 3. Log the tested parameters next to the metrics they produced
        results_log.append({**param_set, **performance_metrics})

    return pd.DataFrame(results_log)

def handle_entries_for_day(current_date, next_day_date, signals_today, open_positions, df_ohlcv):
    """
    Open a position for every signal of the day whose ticker is not yet held.

    Args:
        current_date: Date on which the signals fired (stored as 'signal_date').
        next_day_date: Date of the entry; its 'Adj High' is used as entry price.
        signals_today (pd.DataFrame): Feature rows of today's signals; the
            first element of each index entry is used as the ticker.
        open_positions (dict): Ticker -> position details. Updated in place.
        df_ohlcv (pd.DataFrame): Price data looked up by (ticker, date).

    Returns:
        dict: The same `open_positions` object, with any new entries added.
        A signal whose (ticker, next_day_date) row is missing from `df_ohlcv`
        is skipped silently.
    """
    for index_key, features_row in signals_today.iterrows():
        # index_key is the row's full index entry; the ticker comes first.
        symbol = index_key[0]

        if symbol in open_positions:
            continue

        try:
            fill_price = df_ohlcv.loc[(symbol, next_day_date), 'Adj High']
        except KeyError:
            # No price row for that ticker on the entry date: no trade.
            continue

        # Keep the full feature snapshot so the trade log can show what
        # triggered the entry.
        open_positions[symbol] = {
            'entry_date': next_day_date,
            'entry_price': fill_price,
            'signal_date': current_date,
            'signal_features': features_row.to_dict()
        }

    return open_positions

def handle_exits_for_day(current_date, next_day_date, open_positions, df_ohlcv, config):
    """
    Close every open position whose exit rule fires on current_date.

    For each held ticker, the 'Adj Close' on current_date is compared with levels
    derived from the position's entry price. The first matching rule wins:
      1. "Profit Target": close >= entry_price * (1 + config['profit_target'])
      2. "Stop-Loss":     close <= entry_price * (1 - config['stop_loss'])
      3. "Time Hold":     calendar days since entry_date >= config['time_hold_days']
    A triggered position is closed at the 'Adj Low' of next_day_date.

    Tickers with no row on current_date are skipped. A triggered position stays
    open when a KeyError occurs while building its exit record (for example, no
    row on next_day_date).

    Returns:
        (closed_trades, open_positions): a list with one log dict per closed
        trade, and the same open_positions dict with the closed tickers removed.
    """
    finished_trades = []
    tickers_done = []

    for symbol, position in open_positions.items():
        try:
            signal_close = df_ohlcv.loc[(symbol, current_date), 'Adj Close']
        except KeyError:
            continue

        # All levels are computed up front, before any rule is evaluated.
        take_profit_level = position['entry_price'] * (1 + config['profit_target'])
        stop_level = position['entry_price'] * (1 - config['stop_loss'])
        held_span = current_date.to_pydatetime().date() - position['entry_date'].to_pydatetime().date()
        days_in_trade = held_span.days

        # Rules are checked in priority order; positions with no trigger are left alone.
        if signal_close >= take_profit_level:
            why, trigger_level = "Profit Target", take_profit_level
        elif signal_close <= stop_level:
            why, trigger_level = "Stop-Loss", stop_level
        elif days_in_trade >= config['time_hold_days']:
            why, trigger_level = "Time Hold", days_in_trade
        else:
            continue

        try:
            fill_price = df_ohlcv.loc[(symbol, next_day_date), 'Adj Low']
            pct_return = (fill_price - position['entry_price']) / position['entry_price']

            finished_trades.append({
                'ticker': symbol,
                'entry_date': position['entry_date'],
                'exit_date': next_day_date,
                'return': pct_return,
                'reason': why,
                'signal_date': position['signal_date'],
                'entry_signal_features': position['signal_features'],
                'entry_price_actual': position['entry_price'],
                'exit_signal_date': current_date,
                'exit_trigger_price': signal_close,
                'exit_target_value': trigger_level,
                'exit_price_actual': fill_price,
            })
            tickers_done.append(symbol)
        except KeyError:
            # The position remains open.
            continue

    # Removal happens after the scan so the dict is never resized while iterating.
    for symbol in tickers_done:
        open_positions.pop(symbol)

    return finished_trades, open_positions

#### Step 3: The New, Clean Top-Level Script

Your main script is now incredibly simple and readable. It's all about configuration and orchestration.

In [None]:
# --- 1. DEFINE CONFIGURATION ---

# Search space handed to the optimizer as its parameter grid
optimization_grid = dict(
    lookback_days=[30, 60, 90],
    rolling_window=[15, 20],
)

# Fixed strategy settings, passed unchanged alongside the grid
strategy_params = dict(
    slope_thresh=1.0,
    r2_thresh=0.50,
    z_entry_thresh=0,
    profit_target=0.10,
    stop_loss=0.05,
    time_hold_days=20,
)


# --- 2. RUN ORCHESTRATOR ---

# Only the training split is used here
optimization_results = run_parameter_optimization(df_train, optimization_grid, strategy_params)


# --- 3. ANALYZE RESULTS ---

print("\n\n--- Optimization Complete ---")
# Highest total_return first
print(optimization_results.sort_values('total_return', ascending=False))

### Step 1: The "One-Trade" Deep Dive

The most powerful debugging technique is to isolate a single trade and follow it from signal generation to exit. If the logic holds for one trade, it's likely correct for all of them.

1.  **Pick a Winning Trade and a Losing Trade:** Run one of the backtests again (e.g., the one with `lookback_days=30`, `rolling_window=20`) and save the returned trade log as a DataFrame (`trade_log_df` in the cell below).

In [None]:
# 1. Pick a configuration to analyze
config_to_test = {
    **strategy_params,
    'lookback_days': 30,
    'rolling_window': 20,
}

# 2. Run the backtest to get the detailed trade log
trade_log_df = run_backtest(df_train, config_to_test)

# 3. Isolate and inspect a single trade
# Reset first so a value left over from an earlier run can never be inspected by mistake.
losing_trade = None

if trade_log_df.empty:
    print("No trades were made for this configuration.")
else:
    losing_trades = trade_log_df[trade_log_df['return'] < 0]
    if losing_trades.empty:
        # Trades exist but none lost money; .iloc[0] on an empty selection would raise IndexError.
        print("No losing trades were made for this configuration.")
    else:
        # Get the first losing trade
        losing_trade = losing_trades.iloc[0]

        print("--- Detailed Log for a Single Losing Trade ---")
        # A Series already prints one field per line, so no transpose is needed.
        print(losing_trade)

In [None]:
# Show the signal features recorded for the selected losing trade.
# Requires `losing_trade` from the deep-dive cell, which only assigns it when a losing trade exists.
losing_trade.entry_signal_features

In [None]:
# Compute the trend table for the training split with a 30-day lookback.
_df_trends = analyze_ticker_trends_vectorized(df_train, lookback_days=30)
print(f'_df_trends:\n{_df_trends}')
# _df_trends.to_csv('C:\\Users\\ping\\Desktop\\_df_trends.csv', index=True)
# Name the index levels so that reset_index() emits 'Ticker' and 'Date' columns.
# NOTE: this assignment renames the levels in place, so later cells see the new names too.
_df_trends.index.names = ['Ticker', 'Date']
# Write a flat CSV (index levels become ordinary columns). The destination is a hard-coded local path.
_df_trends.reset_index().to_csv(r'C:\Users\ping\Desktop\_df_trends.csv', index=False)

In [None]:
# Display the training-split rows labelled 'MSFT'.
df_train.loc['MSFT']

In [None]:
# Export to CSV, keeping the index as leading columns (index=True).
# NOTE: this is the same destination path used by the reset_index() export earlier,
# so running this cell overwrites that file.
_df_trends.to_csv(r'C:\Users\ping\Desktop\_df_trends.csv', index=True)

In [None]:
# Print a structural summary of the trend table.
_df_trends.info()

In [None]:
# Losing trades: rows of the trade log whose return is strictly negative
# (break-even trades are excluded).
df_loss = trade_log_df.loc[trade_log_df['return'].lt(0)]
print(df_loss)

In [None]:
# Winning trades: rows of the trade log whose return is strictly positive
# (break-even trades are excluded).
df_win = trade_log_df.loc[trade_log_df['return'].gt(0)]
print(df_win)

In [None]:
# Save the losing and winning trade tables as CSV files, keeping the row index.
# The destinations are hard-coded local paths.
df_loss.to_csv('C:\\Users\\ping\\Desktop\\df_loss.csv', index=True)
df_win.to_csv('C:\\Users\\ping\\Desktop\\df_win.csv', index=True)

In [None]:
# Display the optimization results table in its original (unsorted) order.
optimization_results

In [None]:
# Configuration under inspection: the shared strategy settings plus one grid point
config_to_test = dict(strategy_params, lookback_days=30, rolling_window=20)

# Detailed trade log for that single configuration
single_run_trades = run_backtest(df_train, config_to_test)

# Print the first profitable and the first unprofitable trade as starting points
print("Sample winning trade:")
print(single_run_trades.loc[single_run_trades['return'].gt(0)].head(1))

print("\nSample losing trade:")
print(single_run_trades.loc[single_run_trades['return'].lt(0)].head(1))

In [None]:
# Compute the 30-day-lookback trend table for the training split.
# NOTE: this is the same call (same arguments) that produced `_df_trends` earlier, stored under a second name.
df_trends = analyze_ticker_trends_vectorized(df_train, lookback_days=30)


In [None]:
# Spot-check one trend entry: presumably ticker 'FIX' on 2024-10-25, assuming rows are
# indexed by (ticker, date). TODO confirm.
# NOTE(review): `df_trends.loc[('FIX', '2024-10-25'), :]` spells out the row key unambiguously.
df_trends.loc['FIX', '2024-10-25']