### Setup and Configuration

This cell contains all imports and user-configurable parameters for the analysis pipeline.

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pprint
import matplotlib.pyplot as plt # Import for plotting
from IPython.display import display, Markdown
from scipy.stats import linregress 

# --- 1. PANDAS & IPYTHON OPTIONS ---
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)
%load_ext autoreload
%autoreload 2

# --- 2. PROJECT PATH CONFIGURATION ---
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent.parent  # Adjust if your notebook is in a 'notebooks' subdirectory
DATA_DIR = ROOT_DIR / 'data'
SRC_DIR = ROOT_DIR / 'src'

# Add 'src' to the Python path to import custom modules
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# --- 3. IMPORT CUSTOM MODULES ---
import utils
import plotting_utils

# --- 4. ANALYSIS & FILTERING CONFIGURATION ---

# File searching parameters
# FILE_PREFIX = ''  # e.g., '2024'
FILE_CONTAINS_PATTERN = 'df_OHLCV_clean_stocks_etfs'

# # Parameters defining the time windows for metric calculation
PERIOD_PARAMS = {
    'lookback_days': 22,
    'recent_days': 0,
}

# This is not use for filtering, it's use to calculate metrics in SORT_ORDER
# Parameters for filtering the calculated metrics to find candidates
METRIC_FILTERS = {
    'min_lookback_improvement': 0,
    'current_rank_bracket_start': 1,
    'current_rank_bracket_end': 1000,
    # --- Select ONE mode by commenting out the others ---
    # 'Reversal' Mode
    'min_recent_bottom_to_recent_start': 0,
    'min_recent_bottom_to_current': 0,    
    # 'Dip' Mode
    # 'min_current_to_recent_start': 10,
}

# --- 5. VERIFICATION ---
print("--- Path Configuration ---")
print(f"✅ Project Root: {ROOT_DIR}")
print(f"✅ Data Dir:     {DATA_DIR}")
print(f"✅ Source Dir:   {SRC_DIR}")
assert all([ROOT_DIR.exists(), DATA_DIR.exists(), SRC_DIR.exists()]), "A key directory was not found!"

print("\n--- Module Verification ---")
print(f"✅ Successfully imported 'utils' and 'plotting_utils'.")



--- Path Configuration ---
✅ Project Root: c:\Users\ping\Files_win10\python\py311\stocks
✅ Data Dir:     c:\Users\ping\Files_win10\python\py311\stocks\data
✅ Source Dir:   c:\Users\ping\Files_win10\python\py311\stocks\src

--- Module Verification ---
✅ Successfully imported 'utils' and 'plotting_utils'.


In [2]:
print("--- Step 1: Loading latest consolidated Finviz data ---")

# Find the most recent file matching the pattern
# This function is now understood to return List[str] (filenames), not List[Path].
latest_finviz_filepaths = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    prefix='202',
    contains_pattern='df_finviz_merged_stocks_etfs',
    count=1
)

if not latest_finviz_filepaths:
    raise FileNotFoundError(f"No files found in '{DATA_DIR}' with prefix '{FILE_PREFIX}' and pattern '{FILE_CONTAINS_PATTERN}'")

# Get the filename string from the list
latest_filename = latest_finviz_filepaths[0]

# Manually construct the full path before loading
full_file_path = DATA_DIR / latest_filename
df_finviz_latest = pd.read_parquet(full_file_path, engine='pyarrow')


# --- Robust Index Setting (this logic remains correct) ---
if df_finviz_latest.index.name == 'Ticker':
    print("Info: 'Ticker' is already the index. No action needed.")
elif 'Ticker' in df_finviz_latest.columns:
    print("Info: 'Ticker' column found. Setting it as the DataFrame index.")
    df_finviz_latest.set_index('Ticker', inplace=True)
elif 'ticker' in df_finviz_latest.columns:
    print("Info: 'ticker' column found. Renaming and setting as index.")
    df_finviz_latest.rename(columns={'ticker': 'Ticker'}, inplace=True)
    df_finviz_latest.set_index('Ticker', inplace=True)
elif df_finviz_latest.index.name is None:
    print("Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.")
    df_finviz_latest.index.name = 'Ticker'
else:
    print("ERROR: Loaded DataFrame has an unexpected format.")
    print(f"Columns: {df_finviz_latest.columns.tolist()}")
    print(f"Index Name: '{df_finviz_latest.index.name}'")
    raise ValueError("Could not find a 'Ticker' column or a usable index to proceed.")


# Correct the print statement to work with the filename string
print(f"✅ Successfully loaded: {latest_filename}")
print(f"Shape: {df_finviz_latest.shape}")
print(df_finviz_latest.head(3))

--- Step 1: Loading latest consolidated Finviz data ---
Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.
✅ Successfully loaded: 2025-09-12_df_finviz_merged_stocks_etfs.parquet
Shape: (1463, 139)
        No.                Company               Index      Sector                   Industry Country Exchange                                   Info  MktCap AUM, M  Rank  Market Cap, M    P/E  Fwd P/E   PEG    P/S    P/B    P/C  P/FCF  Book/sh  Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %    EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %   ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf 3D %  Per

In [3]:
import pandas as pd
# DATA_DIR = r'c:\Users\ping\Files_win10\python\py311\stocks\data'
# Manually construct the full path before loading
full_file_path = r'c:\Users\ping\Files_win10\python\py311\stocks\data\df_OHLCV_clean_stocks_etfs.parquet'
df_OHLCV = pd.read_parquet(full_file_path, engine='pyarrow')

### Cell 2: The Chronological Split Code

This cell contains the logic to find the split date and create the `df_train` and `df_test` DataFrames.

In [4]:
# --- 1. Find the Chronological Split Point ---

# Get all unique dates from the index and sort them
unique_dates = df_OHLCV.index.get_level_values('Date').unique().sort_values()

# Determine the index for the 70% split
split_index = int(len(unique_dates) * 0.7)

# Find the actual date at that split index
split_date = unique_dates[split_index]

print(f"Total unique trading dates in dataset: {len(unique_dates)}")
print(f"The data will be split on the date: {split_date.date()}")

# --- 2. Create the Training and Testing Sets ---

# The training set includes all data UP TO and INCLUDING the split_date
df_train = df_OHLCV[df_OHLCV.index.get_level_values('Date') <= split_date]

# The testing set includes all data AFTER the split_date
df_test = df_OHLCV[df_OHLCV.index.get_level_values('Date') > split_date]


# --- 3. Verify the Split ---

print("\n--- Verification ---")
print(f"Original DataFrame shape: {df_OHLCV.shape}")
print(f"Training set shape:   {df_train.shape}")
print(f"Testing set shape:    {df_test.shape}")

print("\nDate Ranges:")
print(f"  Training: {df_train.index.get_level_values('Date').min().date()} to {df_train.index.get_level_values('Date').max().date()}")
print(f"  Testing:  {df_test.index.get_level_values('Date').min().date()} to {df_test.index.get_level_values('Date').max().date()}")

# Final check to ensure no overlap
assert df_train.index.get_level_values('Date').max() < df_test.index.get_level_values('Date').min()
print("\nVerification successful: There is no date overlap between train and test sets.")

Total unique trading dates in dataset: 250
The data will be split on the date: 2025-05-28

--- Verification ---
Original DataFrame shape: (371000, 5)
Training set shape:   (261184, 5)
Testing set shape:    (109816, 5)

Date Ranges:
  Training: 2024-09-13 to 2025-05-28
  Testing:  2025-05-29 to 2025-09-12

Verification successful: There is no date overlap between train and test sets.


### Jupyter Notebook: Verifying `analyze_ticker_trends_vectorized`


#### Cell 1: Setup and Imports

First, let's import the necessary libraries and define both versions of the function we want to compare.

In [5]:
import pandas as pd
import numpy as np
from scipy.stats import linregress

# --- Function 1: The Fast, Vectorized Version (The one we are verifying) ---

def analyze_ticker_trends_vectorized(df_group, lookback_days=60):
    """
    Vectorized analysis of trends for a ticker's price and volume.
    Calculates linear regression metrics using rolling covariance and variance.
    """
    if len(df_group) < lookback_days:
        return None 

    time_index = pd.Series(np.arange(len(df_group)), index=df_group.index)
    var_time = np.var(np.arange(lookback_days), ddof=0)
    
    series_to_analyze = {
        'high': df_group['Adj High'],
        'low': df_group['Adj Low'],
        'volume': df_group['Volume'].astype(float)
    }
    
    df_results = pd.DataFrame(index=df_group.index)

    for name, series in series_to_analyze.items():
        rolling_cov = time_index.rolling(window=lookback_days).cov(series, ddof=0)
        rolling_var_series = series.rolling(window=lookback_days).var(ddof=0)
        
        df_results[f'{name}_slope'] = rolling_cov / var_time
        denominator = (var_time * rolling_var_series) + 1e-9
        df_results[f'{name}_r_squared'] = (rolling_cov**2) / denominator

    yesterday_low = df_group['Adj Low'].shift(1)
    worst_case_returns = (df_group['Adj High'] - yesterday_low) / yesterday_low
    df_results['unified_std_dev_returns'] = worst_case_returns.rolling(window=lookback_days).std(ddof=0)
    
    volume_std_dev = df_group['Volume'].pct_change().rolling(window=lookback_days).std(ddof=0)
    df_results['volume_std_dev_returns'] = volume_std_dev
    
    df_results['low_penalty_score'] = (1 - df_results['low_r_squared']) * (df_results['unified_std_dev_returns'] + 1e-9)
    df_results['high_penalty_score'] = (1 - df_results['high_r_squared']) * (df_results['unified_std_dev_returns'] + 1e-9)
    df_results['volume_penalty_score'] = (1 - df_results['volume_r_squared']) * (df_results['volume_std_dev_returns'] + 1e-9)
    
    return df_results

# --- Function 2: The Original, Explicit Version (Our "Ground Truth") ---

def analyze_ticker_trends_original(df_group, lookback_days=60):
    """
    The original, non-vectorized version. We use this as our ground truth
    because its logic is explicit and easy to reason about for a single window.
    """
    
    def _perform_price_regression(price_series, time_index):
        """Helper for High/Low regression."""
        # NOTE: The original function normalized the price. This is the key difference.
        normalized_price = price_series / price_series.iloc[0] 
        try:
            res = linregress(x=time_index, y=normalized_price)
            return {'slope': res.slope, 'r_squared': res.rvalue**2}
        except (ValueError, ZeroDivisionError):
            return {'slope': 0.0, 'r_squared': np.nan}

    window = df_group.tail(lookback_days)
    if len(window) < lookback_days:
        return pd.Series()

    time_index = np.arange(len(window))
    
    # Analyze High/Low
    high_metrics = _perform_price_regression(window['Adj High'], time_index)
    low_metrics = _perform_price_regression(window['Adj Low'], time_index)

    # Analyze Volume
    start_volume = window['Volume'].iloc[0]
    if start_volume > 0:
        normalized_volume = window['Volume'] / start_volume
        try:
            vol_res = linregress(x=time_index, y=normalized_volume)
            volume_slope, volume_r_squared = vol_res.slope, vol_res.rvalue**2
        except ValueError:
            volume_slope, volume_r_squared = 0.0, np.nan
    else:
        volume_slope, volume_r_squared = 0.0, np.nan
        
    final_results = {
        'high_slope': high_metrics['slope'], 'high_r_squared': high_metrics['r_squared'],
        'low_slope': low_metrics['slope'], 'low_r_squared': low_metrics['r_squared'],
        'volume_slope': volume_slope, 'volume_r_squared': volume_r_squared
    }
    
    return pd.Series(final_results)


print("Verification functions are defined.")

Verification functions are defined.


In [6]:
def analyze_ticker_trends(df_group, lookback_days=60):
    """
    Analyzes the trends of a ticker's price channel (High/Low) and Volume.
    
    - Price channel analysis uses a unified "worst-case" volatility.
    - Volume analysis uses its own standard pct_change volatility.
    """
    
    def _perform_price_regression(price_series, time_index, unified_std_dev):
        """Helper for High/Low regression, using the shared volatility metric."""
        normalized_price = price_series / price_series.iloc[0]
        try:
            res = linregress(x=time_index, y=normalized_price)
            slope, r_squared = res.slope, res.rvalue**2
        except (ValueError, ZeroDivisionError):
            return {'slope': 0.0, 'r_squared': np.nan, 'penalty_score': np.nan}
        penalty_score = (1 - r_squared) * (unified_std_dev + 1e-9)
        return {'slope': slope, 'r_squared': r_squared, 'penalty_score': penalty_score}

    # --- Main function logic ---
    
    # 1. Select the lookback window
    window = df_group.tail(lookback_days)
    if len(window) < lookback_days or len(window) < 10:
        # Define all expected columns for a clean NaN return
        cols = ['high_slope', 'high_r_squared', 'high_penalty_score',
                'low_slope', 'low_r_squared', 'low_penalty_score',
                'unified_std_dev_returns', 'volume_slope', 'volume_r_squared',
                'volume_std_dev_returns', 'volume_penalty_score']
        return pd.Series(dict.fromkeys(cols, np.nan))

    # 2. Calculate the UNIFIED "Worst-Case" Volatility for PRICE
    yesterday_low = window['Adj Low'].shift(1)
    worst_case_returns = (window['Adj High'] - yesterday_low) / yesterday_low
    unified_std_dev = worst_case_returns.std(ddof=0) # ddof=0 for population std dev
    unified_std_dev = 0.0 if pd.isna(unified_std_dev) else unified_std_dev

    # 3. Analyze Volume Trend and Volatility (SELF-CONTAINED)
    time_index = np.arange(len(window))
    start_volume = window['Volume'].iloc[0]
    
    if start_volume > 0:
        normalized_volume = window['Volume'] / start_volume
        try:
            vol_res = linregress(x=time_index, y=normalized_volume)
            volume_slope, volume_r_squared = vol_res.slope, vol_res.rvalue**2
        except ValueError:
            volume_slope, volume_r_squared = 0.0, np.nan
    else: # Handle zero start volume edge case
        volume_slope, volume_r_squared = 0.0, np.nan
        
    volume_std_dev = window['Volume'].pct_change().std(ddof=0)
    volume_std_dev = 0.0 if pd.isna(volume_std_dev) else volume_std_dev
    volume_penalty_score = (1 - volume_r_squared) * (volume_std_dev + 1e-9)

    # 4. Analyze the High and Low series using the helper
    high_metrics = _perform_price_regression(window['Adj High'], time_index, unified_std_dev)
    low_metrics = _perform_price_regression(window['Adj Low'], time_index, unified_std_dev)
    
    # 5. Combine all results into a single Series
    final_results = {
        'high_slope': high_metrics['slope'], 'high_r_squared': high_metrics['r_squared'], 'high_penalty_score': high_metrics['penalty_score'],
        'low_slope': low_metrics['slope'], 'low_r_squared': low_metrics['r_squared'], 'low_penalty_score': low_metrics['penalty_score'],
        'unified_std_dev_returns': unified_std_dev,
        'volume_slope': volume_slope, 'volume_r_squared': volume_r_squared,
        'volume_std_dev_returns': volume_std_dev, 'volume_penalty_score': volume_penalty_score
    }
    
    return pd.Series(final_results)

#### Cell 2: Create Realistic Sample Data

We need some sample data that mimics your real dataset to perform the check.

In [7]:
# Create a date range
dates = pd.to_datetime(pd.date_range(start='2023-01-01', periods=200, freq='B'))

# Create data for Ticker 'A' (upward trend)
price_a = 100 + np.linspace(0, 50, 200) + np.random.randn(200) * 2
volume_a = 100000 + np.sin(np.arange(200)/10) * 20000 + np.random.randint(-5000, 5000, 200)
df_a = pd.DataFrame({
    'Ticker': 'A',
    'Date': dates,
    'Adj High': price_a + 1,
    'Adj Low': price_a - 1,
    'Volume': volume_a
})

# Create data for Ticker 'B' (downward trend)
price_b = 200 - np.linspace(0, 30, 200) + np.random.randn(200) * 3
volume_b = 500000 + np.cos(np.arange(200)/5) * 50000 + np.random.randint(-10000, 10000, 200)
df_b = pd.DataFrame({
    'Ticker': 'B',
    'Date': dates,
    'Adj High': price_b + 1.5,
    'Adj Low': price_b - 1.5,
    'Volume': volume_b
})

# Combine and set the multi-index
sample_df = pd.concat([df_a, df_b]).set_index(['Ticker', 'Date'])

print("Sample DataFrame created with shape:", sample_df.shape)
sample_df.head()

Sample DataFrame created with shape: (400, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj High,Adj Low,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,2023-01-02,98.727599,96.727599,101871.0
A,2023-01-03,99.656278,97.656278,98425.668333
A,2023-01-04,100.397974,98.397974,107919.386616
A,2023-01-05,100.609764,98.609764,101937.404133
A,2023-01-06,102.545761,100.545761,103213.366846


#### Cell 3: The Verification Process

Here, we'll pick a specific date and ticker, run both functions, and compare their outputs side-by-side.

In [8]:
# --- Parameters for our Test ---
TICKER_TO_CHECK = 'A'
DATE_TO_CHECK = pd.to_datetime('2023-06-21')
LOOKBACK_DAYS = 60


print(f"--- Verifying calculations for Ticker '{TICKER_TO_CHECK}' on {DATE_TO_CHECK.date()} with a {LOOKBACK_DAYS}-day lookback ---\n")

# --- 1. Run the FAST Vectorized function on the full history ---
ticker_history = sample_df.loc[TICKER_TO_CHECK]
vectorized_results_full = analyze_ticker_trends_vectorized(ticker_history, LOOKBACK_DAYS)

# Get the specific result for our target date
vectorized_result_today = vectorized_results_full.loc[DATE_TO_CHECK]

# --- 2. Run the SLOW Original function on a sliced window (our ground truth) ---
# Get all data up to and including our check date
historical_slice = ticker_history.loc[:DATE_TO_CHECK]
original_result_today = analyze_ticker_trends_original(historical_slice, LOOKBACK_DAYS)


# --- 3. Compare the results ---
# Create a comparison DataFrame for clarity
comparison_df = pd.DataFrame({
    'Vectorized': vectorized_result_today,
    'Original (Normalized)': original_result_today
}).dropna()

# Note on Slopes: The vectorized version doesn't normalize the input, so the slope magnitude will differ.
# However, the R-squared value, which is scale-invariant, should be nearly identical.
comparison_df['Difference'] = comparison_df['Vectorized'] - comparison_df['Original (Normalized)']

# For slopes, the sign should be the same
slope_cols = [col for col in comparison_df.index if 'slope' in col]
comparison_df['Sign Match?'] = np.nan
comparison_df.loc[slope_cols, 'Sign Match?'] = np.sign(comparison_df.loc[slope_cols, 'Vectorized']) == np.sign(comparison_df.loc[slope_cols, 'Original (Normalized)'])


print("### Important Note on Slopes ###")
print("The 'Original' function normalized prices/volume before regression (e.g., price / first_price).")
print("The 'Vectorized' function does not, for performance reasons. This means:")
print("  - R-SQUARED values SHOULD MATCH PERFECTLY (as R^2 is scale-invariant).")
print("  - SLOPE magnitudes WILL NOT MATCH, but their SIGN (+ or -) SHOULD.\n")

print("### Side-by-Side Comparison ###")
print(comparison_df)

# --- 4. Programmatic Check for R-Squared values ---
try:
    r_squared_cols = [col for col in comparison_df.index if 'r_squared' in col]
    pd.testing.assert_series_equal(
        comparison_df.loc[r_squared_cols, 'Vectorized'],
        comparison_df.loc[r_squared_cols, 'Original (Normalized)'],
        atol=1e-9 # Use a small tolerance for floating point math
    )
    print("\n[SUCCESS]: R-Squared values match perfectly!")
except AssertionError as e:
    print("\n[FAILURE]: R-Squared values do not match.")
    print(e)

--- Verifying calculations for Ticker 'A' on 2023-06-21 with a 60-day lookback ---

### Important Note on Slopes ###
The 'Original' function normalized prices/volume before regression (e.g., price / first_price).
The 'Vectorized' function does not, for performance reasons. This means:
  - R-SQUARED values SHOULD MATCH PERFECTLY (as R^2 is scale-invariant).
  - SLOPE magnitudes WILL NOT MATCH, but their SIGN (+ or -) SHOULD.

### Side-by-Side Comparison ###
                  Vectorized  Original (Normalized)    Difference Sign Match?
high_r_squared      0.833543               0.833543 -1.434408e-13         NaN
high_slope          0.250067               0.002111  2.479559e-01        True
low_r_squared       0.833543               0.833543 -1.431077e-13         NaN
low_slope           0.250067               0.002148  2.479196e-01        True
volume_r_squared    0.675416               0.675416 -1.554312e-15         NaN
volume_slope     -681.682635              -0.006553 -6.816761e+02      

  comparison_df.loc[slope_cols, 'Sign Match?'] = np.sign(comparison_df.loc[slope_cols, 'Vectorized']) == np.sign(comparison_df.loc[slope_cols, 'Original (Normalized)'])


In [9]:
# --- Parameters for our Test ---
TICKER_TO_CHECK = 'A'
DATE_TO_CHECK = pd.to_datetime('2023-06-21')
LOOKBACK_DAYS = 60


print(f"--- Verifying calculations for Ticker '{TICKER_TO_CHECK}' on {DATE_TO_CHECK.date()} with a {LOOKBACK_DAYS}-day lookback ---\n")

# --- 1. Run the FAST Vectorized function on the full history ---
ticker_history = sample_df.loc[TICKER_TO_CHECK]
vectorized_results_full = analyze_ticker_trends_vectorized(ticker_history, LOOKBACK_DAYS)

# Get the specific result for our target date
vectorized_result_today = vectorized_results_full.loc[DATE_TO_CHECK]

# --- 2. Run the SLOW Original function on a sliced window (our ground truth) ---
# Get all data up to and including our check date
historical_slice = ticker_history.loc[:DATE_TO_CHECK]


# original_result_today = analyze_ticker_trends_original(historical_slice, LOOKBACK_DAYS)
original_result_today = analyze_ticker_trends(historical_slice, LOOKBACK_DAYS)

# --- 3. Compare the results ---
# Create a comparison DataFrame for clarity
comparison_df = pd.DataFrame({
    'Vectorized': vectorized_result_today,
    'Original (Normalized)': original_result_today
}).dropna()

# Note on Slopes: The vectorized version doesn't normalize the input, so the slope magnitude will differ.
# However, the R-squared value, which is scale-invariant, should be nearly identical.
comparison_df['Difference'] = comparison_df['Vectorized'] - comparison_df['Original (Normalized)']

# For slopes, the sign should be the same
slope_cols = [col for col in comparison_df.index if 'slope' in col]
comparison_df['Sign Match?'] = np.nan
comparison_df.loc[slope_cols, 'Sign Match?'] = np.sign(comparison_df.loc[slope_cols, 'Vectorized']) == np.sign(comparison_df.loc[slope_cols, 'Original (Normalized)'])


print("### Important Note on Slopes ###")
print("The 'Original' function normalized prices/volume before regression (e.g., price / first_price).")
print("The 'Vectorized' function does not, for performance reasons. This means:")
print("  - R-SQUARED values SHOULD MATCH PERFECTLY (as R^2 is scale-invariant).")
print("  - SLOPE magnitudes WILL NOT MATCH, but their SIGN (+ or -) SHOULD.\n")

print("### Side-by-Side Comparison ###")
print(comparison_df)

# --- 4. Programmatic Check for R-Squared values ---
try:
    r_squared_cols = [col for col in comparison_df.index if 'r_squared' in col]
    pd.testing.assert_series_equal(
        comparison_df.loc[r_squared_cols, 'Vectorized'],
        comparison_df.loc[r_squared_cols, 'Original (Normalized)'],
        atol=1e-9 # Use a small tolerance for floating point math
    )
    print("\n[SUCCESS]: R-Squared values match perfectly!")
except AssertionError as e:
    print("\n[FAILURE]: R-Squared values do not match.")
    print(e)

--- Verifying calculations for Ticker 'A' on 2023-06-21 with a 60-day lookback ---

### Important Note on Slopes ###
The 'Original' function normalized prices/volume before regression (e.g., price / first_price).
The 'Vectorized' function does not, for performance reasons. This means:
  - R-SQUARED values SHOULD MATCH PERFECTLY (as R^2 is scale-invariant).
  - SLOPE magnitudes WILL NOT MATCH, but their SIGN (+ or -) SHOULD.

### Side-by-Side Comparison ###
                         Vectorized  Original (Normalized)    Difference Sign Match?
high_penalty_score         0.004176               0.004173  2.188011e-06         NaN
high_r_squared             0.833543               0.833543 -1.434408e-13         NaN
high_slope                 0.250067               0.002111  2.479559e-01        True
low_penalty_score          0.004176               0.004173  2.188011e-06         NaN
low_r_squared              0.833543               0.833543 -1.431077e-13         NaN
low_slope                  0.

  comparison_df.loc[slope_cols, 'Sign Match?'] = np.sign(comparison_df.loc[slope_cols, 'Vectorized']) == np.sign(comparison_df.loc[slope_cols, 'Original (Normalized)'])


### Refactored Code

Here is the complete, refactored solution. I've included the previously refactored functions with slight modifications to accept the new configuration structure.

In [None]:
import numpy as np
import pandas as pd

def calculate_rolling_z_scores(df_group, rolling_window=20):
    """
    Calculates a rolling Z-score for the 'Adj Low' price for an entire ticker history.
    
    This function is designed to be used with pandas' groupby().apply().
    
    Args:
        df_group (pd.DataFrame): The DataFrame for a single ticker.
        rolling_window (int): The lookback window for calculating mean and std.
        
    Returns:
        pd.Series: A Series of Z-scores with the same index as the input df_group.
    """
    # Use the column directly, which is more efficient.
    low_price = df_group['Adj Low']
    
    # Calculate rolling statistics. min_periods=1 ensures we get values even at the start.
    rolling_mean = low_price.rolling(window=rolling_window, min_periods=1).mean()
    rolling_std = low_price.rolling(window=rolling_window, min_periods=1).std()
    
    # Calculate the Z-score for all dates at once.
    z_score = (low_price - rolling_mean) / rolling_std
    
    # --- ROBUSTNESS IMPROVEMENTS ---
    # 1. Handle division by zero: if std is 0, z_score is inf. Replace with 0.
    z_score = z_score.replace([np.inf, -np.inf], 0)
    
    # 2. Fill any initial NaNs (from std dev calculation) with 0.
    z_score = z_score.fillna(0)
    
    # Return the entire series, renamed appropriately.
    return z_score.rename('low_rolling_z_score')


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import linregress # We no longer need this for the vectorized function

def analyze_ticker_trends_vectorized(df_group, lookback_days=60):
    """
    Vectorized analysis of trends for a ticker's price and volume.
    
    This version calculates linear regression metrics using rolling covariance
    and variance, avoiding the limitations of .apply() and maximizing performance.
    """
    if len(df_group) < lookback_days:
        return None # groupby().apply() will correctly handle None returns by skipping the group

    # --- 1. Setup for Vectorized Regression ---
    
    # Create a time-series index [0, 1, 2, ...] that aligns with the data
    time_index = pd.Series(np.arange(len(df_group)), index=df_group.index)
    
    # The variance of a sequence [0, 1, ..., n-1] is constant. Pre-calculate it.
    var_time = np.var(np.arange(lookback_days), ddof=0)
    
    # --- 2. Calculate Rolling Metrics for Each Column ---

    # We'll work with a dictionary of the series we want to analyze
    series_to_analyze = {
        'high': df_group['Adj High'],
        'low': df_group['Adj Low'],
        'volume': df_group['Volume'].astype(float) # Ensure volume is float
    }
    
    df_results = pd.DataFrame(index=df_group.index)

    for name, series in series_to_analyze.items():
        # Rolling covariance between the series and the time index
        rolling_cov = time_index.rolling(window=lookback_days).cov(series, ddof=0)
        
        # Rolling variance of the series itself
        rolling_var_series = series.rolling(window=lookback_days).var(ddof=0)
        
        # Slope = cov(t, y) / var(t)
        df_results[f'{name}_slope'] = rolling_cov / var_time
        
        # R-squared = cov(t, y)^2 / (var(t) * var(y))
        # Add a small epsilon to prevent division by zero
        denominator = (var_time * rolling_var_series) + 1e-9
        df_results[f'{name}_r_squared'] = (rolling_cov**2) / denominator

    # --- 3. Calculate Volatility and Penalty Scores (as before) ---
    
    yesterday_low = df_group['Adj Low'].shift(1)
    worst_case_returns = (df_group['Adj High'] - yesterday_low) / yesterday_low
    df_results['unified_std_dev_returns'] = worst_case_returns.rolling(window=lookback_days).std(ddof=0)
    
    volume_std_dev = df_group['Volume'].pct_change().rolling(window=lookback_days).std(ddof=0)
    df_results['volume_std_dev_returns'] = volume_std_dev
    
    df_results['low_penalty_score'] = (1 - df_results['low_r_squared']) * (df_results['unified_std_dev_returns'] + 1e-9)
    df_results['high_penalty_score'] = (1 - df_results['high_r_squared']) * (df_results['unified_std_dev_returns'] + 1e-9)
    df_results['volume_penalty_score'] = (1 - df_results['volume_r_squared']) * (df_results['volume_std_dev_returns'] + 1e-9)
    
    return df_results


#### Step 1: Refined Core Backtesting Functions

We'll modify the function signatures to accept a single `config` dictionary. This makes them more modular.

In [None]:
import pandas as pd
from tqdm import tqdm
from itertools import product

# Assume analyze_ticker_trends_vectorized and calculate_rolling_z_scores are defined elsewhere

def precompute_signals(df_ohlcv, config):
    """
    Pre-computes all trading signals and returns the full features DataFrame
    for the dates where signals were triggered.
    """
    print("Pre-computing features for this parameter set...")
    
    trends = df_ohlcv.groupby(level='Ticker', group_keys=False).apply(
        analyze_ticker_trends_vectorized, config['lookback_days']
    )
    
    z_scores = df_ohlcv.groupby(level='Ticker', group_keys=False).apply(
        calculate_rolling_z_scores, config['rolling_window']
    )
    
    features = trends.join(z_scores).dropna()

    signals = features[
        (features['low_slope'] > config['slope_thresh']) &
        (features['low_r_squared'] > config['r2_thresh']) &
        (features['volume_slope'] > 0) &
        (features['low_rolling_z_score'] < config['z_entry_thresh'])
    ]
    
    # --- KEY CHANGE: Return the full signals DataFrame, not just the index ---
    return signals

def run_backtest(df_ohlcv, config):
    """
    Orchestrates the backtesting process with enhanced logging.
    """
    # Now returns a DataFrame of features for triggered signals
    entry_signals_features = precompute_signals(df_ohlcv, config)
    
    trades = []
    open_positions = {}
    
    all_dates = df_ohlcv.index.get_level_values('Date').unique().sort_values()
    start_index = max(config['lookback_days'], config['rolling_window'])

    for i in tqdm(range(start_index, len(all_dates) - 1), desc="Backtesting"):
        current_date = all_dates[i]
        next_day_date = all_dates[i+1]

        closed_trades, open_positions = handle_exits_for_day(
            current_date, next_day_date, open_positions, df_ohlcv, config
        )
        trades.extend(closed_trades)

        # --- KEY CHANGE: Filter the features DataFrame for today's signals ---
        signals_today = entry_signals_features[
            entry_signals_features.index.get_level_values('Date') == current_date
        ]
        
        # Pass the full signals_today DataFrame to the handler
        open_positions = handle_entries_for_day(
            current_date, next_day_date, signals_today, open_positions, df_ohlcv
        )
                
    # --- Create the final DataFrame and reorder columns for clarity ---
    if not trades:
        return pd.DataFrame()

    final_trades_df = pd.DataFrame(trades)
    log_columns = [
        'ticker', 'signal_date', 'entry_date', 'exit_signal_date', 'exit_date', 'reason',
        'return', 'entry_price_actual', 'exit_price_actual', 'exit_trigger_price', 
        'exit_target_value', 'entry_signal_features'
    ]
    # Ensure all columns exist, fill missing with None
    for col in log_columns:
        if col not in final_trades_df.columns:
            final_trades_df[col] = None
            
    return final_trades_df[log_columns]


### Step 2: New Encapsulated Helper Functions

These new functions isolate the logic for performance analysis and the optimization loop itself.

In [None]:
def analyze_performance(trade_results):
    """
    Calculates performance metrics from a DataFrame of trades.
    
    Returns a dictionary of key metrics.
    """
    if trade_results.empty:
        return {'num_trades': 0, 'win_rate': 0, 'avg_return': 0, 'total_return': 0}
    
    win_rate = (trade_results['return'] > 0).mean()
    total_return = (1 + trade_results['return']).prod() - 1
    avg_return = trade_results['return'].mean()
    
    return {
        'num_trades': len(trade_results),
        'win_rate': win_rate,
        'avg_return': avg_return,
        'total_return': total_return
    }

def run_parameter_optimization(df, param_grid, static_params):
    """
    Orchestrates the entire parameter optimization process.
    
    Args:
        df (pd.DataFrame): The OHLCV data.
        param_grid (dict): Dictionary with lists of parameters to test.
        static_params (dict): Dictionary of parameters that are not being optimized.

    Returns:
        pd.DataFrame: A summary of results for each parameter combination.
    """
    results_log = []
    
    # Use itertools.product to create a clean generator for all combinations
    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in product(*values)]

    print(f"Starting optimization for {len(param_combinations)} combinations...")
    
    for param_set in tqdm(param_combinations, desc="Optimization Progress"):
        # Combine static and dynamic parameters into a single config for this run
        current_config = {**static_params, **param_set}
        
        # 1. Run the backtest with the current configuration
        trade_results = run_backtest(df, current_config)
        
        # 2. Analyze the performance of this run
        performance_metrics = analyze_performance(trade_results)
        
        # 3. Log the results
        log_entry = {**param_set, **performance_metrics}
        results_log.append(log_entry)
        
    return pd.DataFrame(results_log)

def handle_entries_for_day(current_date, next_day_date, signals_today, open_positions, df_ohlcv):
    """
    Processes entries and stores signal details in the open_positions dict.
    """
    # --- KEY CHANGE: Loop through the signals DataFrame ---
    for ticker, signal_row in signals_today.iterrows():
        # The ticker is now in the index of signal_row, so we use its name
        ticker_name = ticker[0] 
        
        if ticker_name not in open_positions:
            try:
                entry_price = df_ohlcv.loc[(ticker_name, next_day_date), 'Adj High']
                
                # --- LOGGING: Store more info about the entry signal ---
                open_positions[ticker_name] = {
                    'entry_date': next_day_date,
                    'entry_price': entry_price,
                    'signal_date': current_date,
                    'signal_features': signal_row.to_dict() # Store all features that triggered the signal
                }
            except KeyError:
                pass
                
    return open_positions

def handle_exits_for_day(current_date, next_day_date, open_positions, df_ohlcv, config):
    """
    Checks for exits and logs detailed information about the exit trigger.
    Corrected version with valid syntax for the if/elif chain.
    """
    closed_trades = []
    positions_to_close = []

    for ticker, pos in open_positions.items():
        try:
            current_close_price = df_ohlcv.loc[(ticker, current_date), 'Adj Close']
        except KeyError:
            continue 

        exit_reason = None
        exit_target_value = None 
        
        # --- SYNTAX FIX: Calculate all threshold values *before* the conditional block ---
        profit_target_price = pos['entry_price'] * (1 + config['profit_target'])
        stop_loss_price = pos['entry_price'] * (1 - config['stop_loss'])
        days_held = (current_date.to_pydatetime().date() - pos['entry_date'].to_pydatetime().date()).days
        
        # --- Now, check conditions in a contiguous if/elif/elif block ---
        if current_close_price >= profit_target_price:
            exit_reason = "Profit Target"
            exit_target_value = profit_target_price 

        elif current_close_price <= stop_loss_price:
            exit_reason = "Stop-Loss"
            exit_target_value = stop_loss_price 

        elif days_held >= config['time_hold_days']:
            exit_reason = "Time Hold"
            exit_target_value = days_held 

        if exit_reason:
            try:
                exit_price = df_ohlcv.loc[(ticker, next_day_date), 'Adj Low']
                trade_return = (exit_price - pos['entry_price']) / pos['entry_price']
                
                trade_log = {
                    'ticker': ticker, 
                    'entry_date': pos['entry_date'], 
                    'exit_date': next_day_date,
                    'return': trade_return, 
                    'reason': exit_reason,
                    'signal_date': pos['signal_date'],
                    'entry_signal_features': pos['signal_features'],
                    'entry_price_actual': pos['entry_price'],
                    'exit_signal_date': current_date,
                    'exit_trigger_price': current_close_price,
                    'exit_target_value': exit_target_value,
                    'exit_price_actual': exit_price,
                }
                closed_trades.append(trade_log)
                positions_to_close.append(ticker)
            except KeyError:
                pass
                
    for ticker in positions_to_close:
        del open_positions[ticker]
        
    return closed_trades, open_positions

#### Step 3: The New, Clean Top-Level Script

Your main script is now incredibly simple and readable. It's all about configuration and orchestration.

In [None]:
# --- 1. DEFINE CONFIGURATION ---

# Parameters to be optimized, defining the search space
optimization_grid = {
    'lookback_days': [30, 60, 90],
    'rolling_window': [15, 20]
}

# Static strategy parameters that do not change during optimization
strategy_params = {
    'slope_thresh': 1.0,
    'r2_thresh': 0.50,
    'z_entry_thresh': 0,
    'profit_target': 0.10,
    'stop_loss': 0.05,
    'time_hold_days': 20
}


# --- 2. RUN ORCHESTRATOR ---

# The main call is now a single, descriptive function
optimization_results = run_parameter_optimization(
    df_train, optimization_grid, strategy_params
)


# --- 3. ANALYZE RESULTS ---

print("\n\n--- Optimization Complete ---")
print(optimization_results.sort_values(by='total_return', ascending=False))

### Step 1: The "One-Trade" Deep Dive

The most powerful debugging technique is to isolate a single trade and follow it from signal generation to exit. If the logic holds for one trade, it's likely correct for all of them.

1.  **Pick a Winning Trade and a Losing Trade:** Run one of the backtests again (e.g., the one with `lookback=30`, `rolling=20`) and save the `trade_results` DataFrame.

In [None]:
# 1. Pick a configuration to analyze
config_to_test = {
    **strategy_params, 
    'lookback_days': 30, 
    'rolling_window': 20
}

# 2. Run the backtest to get the detailed trade log
trade_log_df = run_backtest(df_train, config_to_test)

# 3. Isolate and inspect a single trade
if not trade_log_df.empty:
    # Get the first losing trade
    losing_trade = trade_log_df[trade_log_df['return'] < 0].iloc[0]

    print("--- Detailed Log for a Single Losing Trade ---")
    # Using .T transposes the Series for easy vertical reading
    print(losing_trade.T)
else:
    print("No trades were made for this configuration.")

In [None]:
losing_trade.entry_signal_features

In [None]:
_df_trends = analyze_ticker_trends_vectorized(df_train, lookback_days=30)
print(f'_df_trends:\n{_df_trends}')
# _df_trends.to_csv('C:\\Users\\ping\\Desktop\\_df_trends.csv', index=True)
_df_trends.index.names = ['Ticker', 'Date']
_df_trends.reset_index().to_csv(r'C:\Users\ping\Desktop\_df_trends.csv', index=False)

In [None]:
df_train.loc['MSFT']

In [None]:
# Export to CSV
_df_trends.to_csv(r'C:\Users\ping\Desktop\_df_trends.csv', index=True)

In [None]:
_df_trends.info()

In [None]:
df_loss = trade_log_df[trade_log_df['return'] < 0]
print(df_loss)

In [None]:
df_win = trade_log_df[trade_log_df['return'] > 0]
print(df_win)

In [None]:
df_loss.to_csv('C:\\Users\\ping\\Desktop\\df_loss.csv', index=True)
df_win.to_csv('C:\\Users\\ping\\Desktop\\df_win.csv', index=True)

In [None]:
optimization_results

In [None]:
# Pick a configuration to analyze
config_to_test = {**strategy_params, 'lookback_days': 30, 'rolling_window': 20}

# Run a single backtest and get the detailed trade log
single_run_trades = run_backtest(df_train, config_to_test)

# Find a winning and a losing trade to investigate
print("Sample winning trade:")
print(single_run_trades[single_run_trades['return'] > 0].head(1))

print("\nSample losing trade:")
print(single_run_trades[single_run_trades['return'] < 0].head(1))

In [None]:
df_trends = analyze_ticker_trends_vectorized(df_train, lookback_days=30)


In [None]:
df_trends.loc['FIX', '2024-10-25']