In [1]:
# import numpy as np
# from scipy import stats
# import pandas as pd
# import matplotlib.pyplot as plt

# def analyze_rank_series(ranks: np.ndarray):
#     """
#     Performs linear regression on a rank series and calculates metrics
#     to quantify its steadiness and volatility.

#     Args:
#         ranks (np.ndarray): A 1D array of ranks.

#     Returns:
#         dict: A dictionary containing key metrics about the series.
#     """
#     if len(ranks) < 2:
#         return None # Not enough data to perform regression

#     # Create the time-step variable (x-axis)
#     time_steps = np.arange(len(ranks))

#     # --- Perform Linear Regression using SciPy (fast and efficient) ---
#     slope, intercept, r_value, p_value, std_err = stats.linregress(time_steps, ranks)
    
#     # --- Calculate Key Metrics ---
    
#     # 1. R-squared: The primary measure of "steadiness" or linearity
#     r_squared = r_value**2

#     # 2. Predicted values and residuals
#     predicted_ranks = intercept + slope * time_steps
#     residuals = ranks - predicted_ranks

#     # 3. Maximum Absolute Residual: The single biggest jump from the trend
#     max_abs_residual = np.max(np.abs(residuals))

#     # 4. Standard Deviation of Returns: The primary measure of "jumpiness"
#     # Use pandas for a convenient way to calculate percentage change
#     returns = pd.Series(ranks).pct_change().dropna()
#     std_dev_returns = returns.std()
    
#     # 5. Create a combined penalty score (example)
#     # A higher score means more penalty (less desirable series)
#     # We use (1 - r_squared) because a high r_squared is good.
#     # We add a small epsilon to avoid division by zero if std_dev is 0.
#     penalty_score = (1 - r_squared) * (std_dev_returns + 1e-9)

#     return {
#         "slope": slope,
#         "intercept": intercept,
#         "r_squared": r_squared,
#         "std_dev_returns": std_dev_returns,
#         "max_abs_residual": max_abs_residual,
#         "penalty_score": penalty_score,
#         "predicted_ranks": predicted_ranks # For plotting
#     }


In [2]:
# import numpy as np
# import pandas as pd
# # The 'analyze_rank_series' function is assumed to be defined as you provided
# # from scipy import stats # This would be needed by analyze_rank_series

# def calculate_rank_metrics(df_rank_history, tickers_list, lookback_days=20, recent_days=4):
#     """
#     Calculates a comprehensive set of rank metrics for a given list of tickers.

#     This function does NOT filter tickers based on performance criteria. It processes
#     every ticker provided in the tickers_list and returns its calculated metrics,
#     making it suitable for generating features for analysis or other models.

#     Args:
#         df_rank_history (pd.DataFrame): DataFrame with tickers as index and dates as columns.
#         tickers_list (list): A list of ticker symbols to calculate metrics for.
#         lookback_days (int): The number of days for the "lookback" period.
#         recent_days (int): The number of days for the "recent" period.

#     Returns:
#         list: A list of dictionaries, where each dictionary contains the calculated
#               rank metrics for one ticker. Tickers with insufficient data in the
#               specified period are skipped.
#     """
#     # --- Guard Clause & Date Setup ---
#     total_days_needed = lookback_days + recent_days
#     if len(df_rank_history.columns) < total_days_needed:
#         print(f"Error: Not enough data. Need {total_days_needed} days, have {len(df_rank_history.columns)}.")
#         return []

#     all_dates = df_rank_history.columns
#     last_date = all_dates[-1]
#     recent_period_start_date = all_dates[-recent_days]
#     lookback_period_end_date = all_dates[-(recent_days + 1)]
#     lookback_period_start_date = all_dates[-(recent_days + lookback_days)]
    
#     lookback_dates = df_rank_history.loc[:, lookback_period_start_date:lookback_period_end_date].columns
#     recent_dates = df_rank_history.loc[:, recent_period_start_date:last_date].columns
    
#     all_ticker_metrics = []

#     print(f"Calculating metrics for {len(tickers_list)} tickers...")

#     # --- Calculation Loop ---
#     for ticker in tickers_list:
#         # Skip if ticker is not in the dataframe index
#         if ticker not in df_rank_history.index:
#             continue

#         lookback_ranks = df_rank_history.loc[ticker, lookback_dates].dropna()
#         recent_ranks = df_rank_history.loc[ticker, recent_dates].dropna()
        
#         # Skip if there's not enough data for this specific ticker in the required periods
#         if len(lookback_ranks) < lookback_days or len(recent_ranks) < recent_days:
#             continue
            
#         # --- Perform all calculations without any filtering 'if' statements ---
        
#         # MODIFICATION: Use analyze_rank_series to calculate slope, r_squared, and penalty_score.
#         analysis_results = analyze_rank_series(lookback_ranks.values)
        
#         # The analyze_rank_series function returns None if there's not enough data,
#         # so we add a check here just in case.
#         if not analysis_results:
#             continue
            
#         slope = analysis_results['slope']
#         r_squared = analysis_results['r_squared']
#         penalty_score = analysis_results['penalty_score']
        
#         all_ranks_in_period = pd.concat([lookback_ranks, recent_ranks])

#         # Key reference points
#         current_rank = int(recent_ranks.iloc[-1])
#         rank_at_start_of_recent = int(recent_ranks.iloc[0])
#         recent_bottom_rank = int(recent_ranks.max()) # Worst rank in recent period
#         total_peak_rank = int(all_ranks_in_period.min()) # Best rank over the whole period
        
#         # This dictionary holds all calculated metrics for one ticker
#         metrics_dict = {
#             'ticker': ticker,
#             'lookback_slope': round(slope, 2),
#             'r_squared': round(r_squared, 4),      # Added r_squared
#             'penalty_score': round(penalty_score, 4), # Added penalty_score
            
#             # Key Ranks
#             'current': current_rank,
#             'recent_start': rank_at_start_of_recent,
#             'lookback_start': int(lookback_ranks.iloc[0]),
#             'lookback_end': int(lookback_ranks.iloc[-1]),

#             # Best/Worst Ranks by Period
#             'best_lookback': int(lookback_ranks.min()),
#             'worst_lookback': int(lookback_ranks.max()),
#             'best_recent': int(recent_ranks.min()),
#             'worst_recent': recent_bottom_rank, # Same value, more descriptive name
#             'best_total': total_peak_rank,
#             'worst_total': int(all_ranks_in_period.max()),

#             # Derived Metrics (using clearer names)
#             'current_to_total_peak': current_rank - total_peak_rank,
#             'current_to_recent_start': current_rank - rank_at_start_of_recent,
#             'recent_bottom_to_recent_start': recent_bottom_rank - rank_at_start_of_recent,
#             'recent_bottom_to_current': recent_bottom_rank - current_rank,
#         }
#         all_ticker_metrics.append(metrics_dict)

#     return all_ticker_metrics


### The Role of Each Metric in Your Strategy

First, let's assign a clear purpose to each metric from an investor's point of view:

1.  **Slope (The Growth Factor):** This is your **reward** metric. It quantifies the magnitude of the trend. A higher slope means faster price appreciation. A negative slope means the stock is losing value.
    *   **Purpose:** To identify stocks that are actually growing.

2.  **R-Squared (The Reliability Filter):** This is your **confidence** metric. It tells you if the slope is statistically meaningful or just a result of random noise. A high R-squared means the price movement is very trend-like and predictable.
    *   **Purpose:** To filter out erratic, unpredictable stocks where the "trend" isn't trustworthy.

3.  **Penalty Score (The Quality & Risk Factor):** This is your primary **selection** metric. It combines trend reliability (`1 - R²`) with path volatility (`std_dev_returns`). A lower score is better, indicating a smooth, linear, and low-volatility journey.
    *   **Purpose:** To rank the "good" stocks and find the highest quality, most stable performers.

---

### Recommended Strategy: The Tiered Filtering & Ranking Funnel

This is the most robust and logical approach. It works like a funnel, progressively narrowing the universe of stocks to find the best candidates.

#### Step 1: The Viability Filter (using `Slope`)

**Action:** Discard any stock where `Slope <= 0`.

**Reasoning:** Your goal is to find "steady price *gains*." The most basic requirement is that the stock's trend is positive. This is a simple, non-negotiable first pass to eliminate all assets that are flat or in a downtrend over your analysis period. You might even set a minimum threshold, like `Slope > 0.1`, to ensure you're only looking at stocks with meaningful upward momentum.

#### Step 2: The Reliability Filter (using `R-Squared`)

**Action:** From the remaining stocks, discard any where `R-Squared` is below a certain threshold. A good starting point is **`R-Squared < 0.70`**.

**Reasoning:** A high slope is meaningless if the trend is not reliable. A stock that jumps +20% one week and falls -19% the next might have a positive slope, but its movement is chaotic. By filtering for a high R-squared, you are ensuring that the growth (slope) is consistent and not just a statistical accident. This step removes the "trap" stocks that look good on slope alone but are far too unpredictable.

#### Step 3: The Quality Ranking (using `Penalty Score`)

**Action:** Take the final pool of stocks that have passed both filters and **rank them by `Penalty Score` from lowest to highest.**

**Reasoning:** You now have a list of stocks that are both *growing* (Step 1) and have a *reliable trend* (Step 2). The final step is to select the *best* among them based on your core philosophy: favoring steady, smooth gains. The `Penalty Score` was specifically designed for this. A lower penalty score signifies a smoother, more linear path with less volatility. The stocks at the top of this ranked list are your prime candidates.

**Final Selection:** Choose the top N stocks (e.g., top 5, 10, or 20) from this final ranked list to build your portfolio.

### Conceptual Example

Imagine you analyze four stocks:

| Stock | Slope | R-Squared | Penalty Score | Decision Process |
| :---- | :---- | :-------- | :------------ | :-------------------------------------------------------------------------------- |
| **A** | 1.5   | 0.95      | 0.2           | **Passes all filters.** High growth, reliable, and very smooth. **Top Candidate.**     |
| **B** | 2.0   | 0.55      | 1.8           | Passes Step 1 (high slope), but **FAILS Step 2** (R² is too low). It's an erratic gambler's stock, not a steady gainer. **Discard.** |
| **C** | 0.8   | 0.98      | 0.1           | **Passes all filters.** Moderate growth, but extremely reliable and smooth. Ranks higher than Stock A in the final ranking due to its lower penalty score. A great "safe" pick. |
| **D** | -0.5  | 0.90      | 0.4           | **FAILS Step 1.** Even though it's a very smooth and predictable trend, it's going down. **Discard immediately.** |

In this scenario, after the filtering and ranking, your final list would be:
1.  **Stock C** (Penalty Score: 0.1)
2.  **Stock A** (Penalty Score: 0.2)

---

### Alternative Strategy: The "Steadiness-Adjusted Growth" Score

If you want to combine everything into a single number, you can create a ratio. This is for investors who are willing to accept a bit more volatility in exchange for a significantly higher growth rate.

**Action:** For all stocks that pass the initial filters (`Slope > 0` and `R-Squared > 0.7`), calculate a new score:

`SAG_Score = Slope / Penalty_Score` (Steadiness-Adjusted Growth)

Then, rank the stocks from **highest to lowest** `SAG_Score`.

**Reasoning:** This score rewards stocks with a high slope (high reward) and a low penalty score (low risk/high quality). It directly creates a "bang for your buck" metric. A stock with a massive slope could overcome a slightly higher penalty score. This is a more aggressive approach than the primary strategy but is an excellent way to balance growth and quality.

### Final Recommendation

For your stated goal, the **Tiered Filtering & Ranking Funnel is the superior strategy.** It is more robust, methodologically sound, and directly aligned with your philosophy. It ensures you never compromise on the foundational principles of positive growth and trend reliability before optimizing for smoothness.

Start with that strategy. Once you are comfortable with it, you can experiment with the `SAG_Score` as a secondary way to view your top candidates and see if it reveals any high-growth opportunities you might have otherwise overlooked.

### Rank-Based Candidate Analysis

This notebook identifies promising stock tickers based on their historical rank performance.

**Workflow:**

1.  **Load Data:** It finds the latest comprehensive Finviz data file and all recent daily rank files.
2.  **Build Rank History:** It compiles the daily files into a single time-series DataFrame (`df_rank_history`) where each row is a ticker and each column is a date.
3.  **Calculate Metrics:** It processes the entire rank history to compute a rich set of performance metrics (slope, peak rank, etc.) for *every ticker*. This creates a master `df_all_tickers_metrics` DataFrame.
4.  **Filter & Sort Candidates:** The master metrics are filtered according to user-defined criteria (e.g., "Reversal" pattern) to find a small list of top candidates.
5.  **Analyze & Visualize:** The top candidates are enhanced with price data, sorted, and plotted. A separate analysis is also performed on a pre-defined portfolio of interest.

### Setup and Configuration

This cell contains all imports and user-configurable parameters for the analysis pipeline.

In [3]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pprint
import inspect  # <--- ADD THIS LINE
from IPython.display import display, Markdown

# --- 1. PANDAS & IPYTHON OPTIONS ---
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)
%load_ext autoreload
%autoreload 2

# --- 2. PROJECT PATH CONFIGURATION ---
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent.parent  # Adjust if your notebook is in a 'notebooks' subdirectory
DATA_DIR = ROOT_DIR / 'data'
SRC_DIR = ROOT_DIR / 'src'

# Add 'src' to the Python path to import custom modules
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# --- 3. IMPORT CUSTOM MODULES ---
import utils
import plotting_utils

# --- 4. ANALYSIS & FILTERING CONFIGURATION ---

# File searching parameters
FILE_PREFIX = '202'  # e.g., '2024'
FILE_CONTAINS_PATTERN = 'df_finviz_merged_stocks_etfs'
HISTORY_FILE_COUNT = 100 # Number of recent daily files to build rank history

############################################
# # Parameters defining the time windows for metric calculation
PERIOD_PARAMS = {
    'lookback_days': 40,
    'recent_days': 8,
}

############################################

# This is not use for filtering, it's use to calculate metrics in SORT_ORDER
# Parameters for filtering the calculated metrics to find candidates
METRIC_FILTERS = {
    'min_lookback_improvement': 0,
    'current_rank_bracket_start': 1,
    'current_rank_bracket_end': 1000,
    # --- Select ONE mode by commenting out the others ---
    # 'Reversal' Mode
    'min_recent_bottom_to_recent_start': 0,
    'min_recent_bottom_to_current': 0,    
    # 'Dip' Mode
    # 'min_current_to_recent_start': 10,
}

# Sorting is the filter to select the top tickers
# Sorting order for final candidate list (column_name: ascending_boolean)
SORT_ORDER = {
    'lookback_slope': True,             # Lower is better (steeper improving trend)    
    'current_to_total_peak': True,      # Lower is better (closer to all-time best rank)
    'Change/(ATR/Price)': False,        # Higher is better (strong upside price and low volatility)   
    'Change %': False,                  # Higher is better (stronger daily performance)    
    'recent_bottom_to_current': False,  # Higher is better (stronger bounce from recent low)
}


# List of tickers for a separate, focused portfolio analysis
PORTFOLIO_TICKERS = [
    "JOBY", "SYM", "RKLB", "MSTR", "ORCL",
    "SHOP", "COIN", "VGT", "AVAV", "META",
    "NVDA",
    # ####### Change/(ATR/Price) > 1.5
    "DELL", "FUTU", "VST",
    #########
     "U", "IONQ",
    "RBLX",
    # ####### Kimi pick
    "ASTS", "NET", "ANET", "CCJ",
]

CANDIDATES_TO_PLOT = 100

# --- 5. VERIFICATION ---
print("--- Path Configuration ---")
print(f"✅ Project Root: {ROOT_DIR}")
print(f"✅ Data Dir:     {DATA_DIR}")
print(f"✅ Source Dir:   {SRC_DIR}")
assert all([ROOT_DIR.exists(), DATA_DIR.exists(), SRC_DIR.exists()]), "A key directory was not found!"

print("\n--- Module Verification ---")
print(f"✅ Successfully imported 'utils' and 'plotting_utils'.")

print("\n--- Analysis Configuration ---")
print("\n--- Analysis Configuration ---")
print("Period Parameters (for calculation):")
pprint.pprint(PERIOD_PARAMS)
print("\nMetric Filters (for selection):")
pprint.pprint(METRIC_FILTERS, sort_dicts=False)

--- Path Configuration ---
✅ Project Root: c:\Users\ping\Files_win10\python\py311\stocks
✅ Data Dir:     c:\Users\ping\Files_win10\python\py311\stocks\data
✅ Source Dir:   c:\Users\ping\Files_win10\python\py311\stocks\src

--- Module Verification ---
✅ Successfully imported 'utils' and 'plotting_utils'.

--- Analysis Configuration ---

--- Analysis Configuration ---
Period Parameters (for calculation):
{'lookback_days': 40, 'recent_days': 8}

Metric Filters (for selection):
{'min_lookback_improvement': 0,
 'current_rank_bracket_start': 1,
 'current_rank_bracket_end': 1000,
 'min_recent_bottom_to_recent_start': 0,
 'min_recent_bottom_to_current': 0}


### Step 1: Load Latest Merged Finviz Data

Find and load the single most recent `df_finviz_merged` file. This DataFrame contains supplementary data like `Price` and `ATR/Price %` that will be used to enhance our final analysis.

In [4]:
print("--- Step 1: Loading latest consolidated Finviz data ---")

# Find the most recent file matching the pattern
# This function is now understood to return List[str] (filenames), not List[Path].
latest_finviz_filepaths = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    prefix=FILE_PREFIX,
    contains_pattern=FILE_CONTAINS_PATTERN,
    count=1
)

if not latest_finviz_filepaths:
    raise FileNotFoundError(f"No files found in '{DATA_DIR}' with prefix '{FILE_PREFIX}' and pattern '{FILE_CONTAINS_PATTERN}'")

# Get the filename string from the list
latest_filename = latest_finviz_filepaths[0]

# Manually construct the full path before loading
full_file_path = DATA_DIR / latest_filename
df_finviz_latest = pd.read_parquet(full_file_path, engine='pyarrow')


# --- Robust Index Setting (this logic remains correct) ---
if df_finviz_latest.index.name == 'Ticker':
    print("Info: 'Ticker' is already the index. No action needed.")
elif 'Ticker' in df_finviz_latest.columns:
    print("Info: 'Ticker' column found. Setting it as the DataFrame index.")
    df_finviz_latest.set_index('Ticker', inplace=True)
elif 'ticker' in df_finviz_latest.columns:
    print("Info: 'ticker' column found. Renaming and setting as index.")
    df_finviz_latest.rename(columns={'ticker': 'Ticker'}, inplace=True)
    df_finviz_latest.set_index('Ticker', inplace=True)
elif df_finviz_latest.index.name is None:
    print("Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.")
    df_finviz_latest.index.name = 'Ticker'
else:
    print("ERROR: Loaded DataFrame has an unexpected format.")
    print(f"Columns: {df_finviz_latest.columns.tolist()}")
    print(f"Index Name: '{df_finviz_latest.index.name}'")
    raise ValueError("Could not find a 'Ticker' column or a usable index to proceed.")


# Correct the print statement to work with the filename string
print(f"✅ Successfully loaded: {latest_filename}")
print(f"Shape: {df_finviz_latest.shape}")
print(df_finviz_latest.head(3))

--- Step 1: Loading latest consolidated Finviz data ---
Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.
✅ Successfully loaded: 2025-09-08_df_finviz_merged_stocks_etfs.parquet
Shape: (1525, 139)
        No.                Company               Index      Sector                   Industry Country Exchange                                   Info  MktCap AUM, M  Rank  Market Cap, M    P/E  Fwd P/E   PEG    P/S    P/B    P/C  P/FCF  Book/sh  Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %    EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %   ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf 3D %  Per

### Step 2: Build Rank History Matrix

Load the last `N` daily data files to construct a comprehensive rank history DataFrame. This matrix is the primary input for all subsequent trend and performance calculations.

In [5]:
print(f"--- Step 2: Building rank history from the latest {HISTORY_FILE_COUNT} files ---")

# Get a list of all recent daily files
daily_files_list = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    prefix=FILE_PREFIX,
    contains_pattern=FILE_CONTAINS_PATTERN,
    count=HISTORY_FILE_COUNT
)

# Use the utility function to create the rank history dataframe
# Assumes 'create_rank_history_df' is now in utils.py
df_rank_history = utils.create_rank_history_df(daily_files_list, DATA_DIR)

print(f"✅ Rank history matrix created successfully.")
print(f"Shape: {df_rank_history.shape} (Tickers, Days)")
print(f"Date Range: {df_rank_history.columns.min().strftime('%Y-%m-%d')} to {df_rank_history.columns.max().strftime('%Y-%m-%d')}")

--- Step 2: Building rank history from the latest 100 files ---
✅ Rank history matrix created successfully.
Shape: (1663, 92) (Tickers, Days)
Date Range: 2025-04-25 to 2025-09-08


In [6]:
# 2. Print the original shape
# df_rank_history.shape returns a tuple (number_of_rows, number_of_columns)
print("\nOriginal shape:", df_rank_history.shape)

# 3. Remove all rows with any NaN values
# The dropna() method returns a new DataFrame by default
df_rank_history_cleaned = df_rank_history.dropna()

# 4. Print the new shape and the cleaned DataFrame
print("\nDataFrame after removing rows with any NaN values:")
print(df_rank_history_cleaned)

print("\nNew shape:", df_rank_history_cleaned.shape)


Original shape: (1663, 92)

DataFrame after removing rows with any NaN values:
       2025-04-25  2025-04-28  2025-04-29  2025-04-30  2025-05-01  2025-05-02  2025-05-05  2025-05-06  2025-05-07  2025-05-08  2025-05-09  2025-05-12  2025-05-13  2025-05-14  2025-05-15  2025-05-16  2025-05-19  2025-05-20  2025-05-21  2025-05-22  2025-05-23  2025-05-27  2025-05-28  2025-05-29  2025-05-30  2025-06-02  2025-06-03  2025-06-04  2025-06-05  2025-06-06  2025-06-09  2025-06-10  2025-06-11  2025-06-12  2025-06-13  2025-06-16  2025-06-17  2025-06-18  2025-06-19  2025-06-20  2025-06-23  2025-06-24  2025-06-25  2025-06-26  2025-06-27  2025-06-30  2025-07-01  2025-07-02  2025-07-03  2025-07-07  2025-07-08  2025-07-09  2025-07-10  2025-07-11  2025-07-14  2025-07-15  2025-07-16  2025-07-17  2025-07-18  2025-07-21  2025-07-22  2025-07-23  2025-07-24  2025-07-25  2025-07-28  2025-07-29  2025-07-30  2025-07-31  2025-08-01  2025-08-04  2025-08-05  2025-08-06  2025-08-07  2025-08-08  2025-08-11  2025-08-12  2

### Step 3: Calculate Metrics for All Tickers

Process the rank history matrix to compute performance metrics for **every ticker**. This creates a master metrics DataFrame that serves as a single source of truth for all subsequent filtering and analysis.

In [8]:
print("--- Step 3: Calculating performance metrics for all tickers ---")

# This call remains the same. It correctly passes the arguments to the new function.
all_metrics_data = utils.calculate_rank_metrics( # Assuming it's in the same file, or use utils.
    df_rank_history,
    tickers_list=df_rank_history.index.tolist(),
    **PERIOD_PARAMS
)

# Convert the list of dicts into a DataFrame.
# The new 'r_squared' and 'penalty_score' keys will automatically become columns.
df_all_tickers_metrics = pd.DataFrame(all_metrics_data)
if not df_all_tickers_metrics.empty:
    df_all_tickers_metrics.set_index('ticker', inplace=True)
    df_all_tickers_metrics.index.name = 'Ticker'

print(f"✅ Calculated metrics for {len(df_all_tickers_metrics)} tickers.")
print("\nDataFrame head, now including 'r_squared' and 'penalty_score':")
# display(df_all_tickers_metrics.head()) # Use display() in a notebook, or print()
print(df_all_tickers_metrics.head())

--- Step 3: Calculating performance metrics for all tickers ---
Calculating metrics for 1663 tickers...
✅ Calculated metrics for 805 tickers.

DataFrame head, now including 'r_squared' and 'penalty_score':
        lookback_slope  r_squared  penalty_score  current  recent_start  lookback_start  lookback_end  best_lookback  worst_lookback  best_recent  worst_recent  best_total  worst_total  current_to_total_peak  current_to_recent_start  recent_bottom_to_recent_start  recent_bottom_to_current
Ticker                                                                                                                                                                                                                                                                                           
NVDA              0.00     0.0000         0.0000        1             1               1             1              1               1            1             1           1            1                      0       

### Step 4: Filter for 'Reversal' Candidates

Apply the predefined filtering rules from the configuration cell to the master metrics DataFrame to identify a list of promising candidates.


In [11]:
print("--- Step 4: Filtering metrics to find candidates ---")
print("METRIC_FILTERS")
pprint.pprint(METRIC_FILTERS)

# Use the utility function, passing only the relevant filter arguments.
# This is now much cleaner than the previous version.
df_filtered_candidates = utils.filter_rank_metrics(
    df_all_tickers_metrics,
    **METRIC_FILTERS
)

print(f"✅ Found {len(df_filtered_candidates)} candidates matching the criteria.")
display(df_filtered_candidates.head(20))

--- Step 4: Filtering metrics to find candidates ---
METRIC_FILTERS
{'current_rank_bracket_end': 1000,
 'current_rank_bracket_start': 1,
 'min_lookback_improvement': 0,
 'min_recent_bottom_to_current': 0,
 'min_recent_bottom_to_recent_start': 0}
Filtering in 'Reversal' mode...
✅ Found 300 candidates matching the criteria.


Unnamed: 0_level_0,lookback_slope,r_squared,penalty_score,current,recent_start,lookback_start,lookback_end,best_lookback,worst_lookback,best_recent,worst_recent,best_total,worst_total,current_to_total_peak,current_to_recent_start,recent_bottom_to_recent_start,recent_bottom_to_current
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
BG,-7.45,0.6678,0.013,779,768,1014,742,742,1069,768,804,742,1069,37,11,36,25
COOP,-5.24,0.7874,0.005,900,954,1096,939,921,1120,900,998,900,1120,0,-54,44,98
CHRW,-4.29,0.8162,0.0035,848,840,982,831,831,987,830,851,830,987,18,8,11,3
CX,-3.99,0.8843,0.0012,868,898,1050,891,891,1050,868,898,868,1050,0,-30,0,30
INCY,-3.96,0.8196,0.0027,777,789,886,771,761,905,767,798,761,905,16,-12,9,21
BWXT,-3.72,0.6305,0.0075,845,826,893,832,769,931,826,851,769,931,76,19,25,6
GFI,-3.65,0.721,0.0066,483,502,651,505,505,651,483,522,483,651,0,-19,20,39
SNN,-3.29,0.7246,0.0044,784,799,888,789,789,904,784,803,784,904,0,-15,4,19
KGC,-3.09,0.7487,0.0054,542,584,714,589,589,714,542,592,542,714,0,-42,8,50
TER,-3.08,0.7128,0.0061,721,715,828,713,713,849,707,721,707,849,14,6,6,0


### Step 5: Enhance, Sort, and Select Top Candidates

Enrich the filtered candidates with the latest price data, sort them according to the specified rules, and select the top N for visualization.


In [45]:
# Assuming the following columns have been calculated earlier for each stock
# and are present in df_filtered_candidates:
# - 'slope': The slope of the regression line (higher is better).
# - 'r_squared': The R-squared value (higher is better).
# - 'penalty_score': The combined penalty score (lower is better).

print("--- Step 5: Applying the Tiered Filtering & Ranking Funnel ---")

# --- Funnel Strategy Parameters (you can tune these) ---
# SLOPE_THRESHOLD = -1.0      # Viability: Trend must be meaningfully positive.
# R_SQUARED_THRESHOLD = 0.75  # Reliability: Trend must be consistent and not random.
SLOPE_THRESHOLD = 100      # Viability: Trend must be meaningfully positive.
R_SQUARED_THRESHOLD = -10  # Reliability: Trend must be consistent and not random.


# ==============================================================================
# ### MODIFICATION START: Replaced the original sorting logic with the funnel ###
# ==============================================================================

# --- Step 1: The Viability Filter (using Slope) ---
print(f"Starting with {len(df_filtered_candidates)} initial candidates.")
df_funnel_step1 = df_filtered_candidates[df_filtered_candidates['lookback_slope'] < SLOPE_THRESHOLD]
print(f"-> {len(df_funnel_step1)} candidates remain after Slope Filter (lookback_slope < {SLOPE_THRESHOLD}).")


# --- Step 2: The Reliability Filter (using R-Squared) ---
df_funnel_step2 = df_funnel_step1[df_funnel_step1['r_squared'] > R_SQUARED_THRESHOLD]
print(f"-> {len(df_funnel_step2)} candidates remain after R-Squared Filter (r_squared > {R_SQUARED_THRESHOLD}).")
df_funnel_candidates = df_funnel_step2

# --- Step 3: Enhance, Calculate, and Rank the High-Quality Candidates ---
# Join with latest Finviz data to add Price, MktCap, etc.
cols_to_add = ['Price', 'Change %', 'MktCap AUM, M', 'ATR/Price %', 'Rel Volume']
# Use .loc to avoid potential SettingWithCopyWarning
df_candidates_enhanced = df_funnel_candidates.join(df_finviz_latest[cols_to_add])

# Calculate the new 'Change/(ATR/Price)' metric
df_candidates_enhanced['Change/(ATR/Price)'] = np.where(
    df_candidates_enhanced['ATR/Price %'] != 0,
    df_candidates_enhanced['Change %'] / df_candidates_enhanced['ATR/Price %'],
    0
)

# --- Define the NEW Sorting Order based on the Funnel Philosophy ---
# The primary sort is now by 'penalty_score'. Other metrics act as tie-breakers.
# This replaces your original SORT_ORDER.
SORT_ORDER_FUNNEL = {
    'lookback_slope': True,             # Lower is better (steeper improving trend) - PRIMARY SORT    
    'r_squared': False,                 # Higher is better (stronger growth) - TIE BREAKER 1    
    'penalty_score': True,              # Lower is better (smoother, more linear trend) - TIE BREAKER 2
    'Change/(ATR/Price)': False,        # Higher is better (strong recent momentum) - TIE BREAKER 3  
}

sort_keys = list(SORT_ORDER_FUNNEL.keys())
sort_ascending = list(SORT_ORDER_FUNNEL.values())

# Apply the new sorting logic
df_sorted_candidates = df_candidates_enhanced.sort_values(by=sort_keys, ascending=sort_ascending)

# ==============================================================================
# ### MODIFICATION END ###
# ==============================================================================


# --- Define and Apply Final Column Order (your existing logic is great here) ---
leading_cols = [
    'MktCap AUM, M', 'Price', 'Change %', 'ATR/Price %', 'Change/(ATR/Price)', 'Rel Volume',
    'lookback_slope', 'r_squared', 'penalty_score' # Add our key metrics to the front
]
priority_cols = list(dict.fromkeys(leading_cols + sort_keys))
remaining_cols = [c for c in df_sorted_candidates.columns if c not in priority_cols]
final_col_order = priority_cols + remaining_cols
df_sorted_candidates = df_sorted_candidates[final_col_order]


# --- Select Top Candidates for Plotting ---

####################################
# tickers_to_plot = df_sorted_candidates.head(CANDIDATES_TO_PLOT).index.tolist()
tickers_to_plot = df_sorted_candidates.index.tolist()
####################################


# --- Display Final Results with Context ---
print("\n" + "="*50)
print("      FINAL CANDIDATE REPORT (TIERED FUNNEL)")
print("="*50)

print("\nFunnel Filter Parameters:")
print(f"  - Minimum Slope: {SLOPE_THRESHOLD}")
print(f"  - Minimum R-Squared: {R_SQUARED_THRESHOLD}")

print("\nApplied Metric Filters (from earlier steps):")
pprint.pprint(METRIC_FILTERS, sort_dicts=False)

print("\nSorting Order (Primary: Penalty Score):")
pprint.pprint(SORT_ORDER_FUNNEL, sort_dicts=False)

# print(f"\nDisplaying Top {CANDIDATES_TO_PLOT} Candidates from Funnel:")
# display(df_sorted_candidates.head(CANDIDATES_TO_PLOT))

print(f"\nNumber of Tickers selected for plotting: : {len(tickers_to_plot)}")
print(f"\nTickers selected for plotting: {tickers_to_plot}")

--- Step 5: Applying the Tiered Filtering & Ranking Funnel ---
Starting with 300 initial candidates.
-> 300 candidates remain after Slope Filter (lookback_slope < 100).
-> 300 candidates remain after R-Squared Filter (r_squared > -10).

      FINAL CANDIDATE REPORT (TIERED FUNNEL)

Funnel Filter Parameters:
  - Minimum Slope: 100
  - Minimum R-Squared: -10

Applied Metric Filters (from earlier steps):
{'min_lookback_improvement': 0,
 'current_rank_bracket_start': 1,
 'current_rank_bracket_end': 1000,
 'min_recent_bottom_to_recent_start': 0,
 'min_recent_bottom_to_current': 0}

Sorting Order (Primary: Penalty Score):
{'lookback_slope': True,
 'r_squared': False,
 'penalty_score': True,
 'Change/(ATR/Price)': False}

Number of Tickers selected for plotting: : 300

Tickers selected for plotting: ['BG', 'COOP', 'CHRW', 'CX', 'INCY', 'BWXT', 'GFI', 'SNN', 'KGC', 'TER', 'CIEN', 'OHI', 'EBAY', 'NLY', 'AU', 'FSLR', 'ALLE', 'WDC', 'VTRS', 'ZG', 'PTC', 'SNPS', 'HUM', 'ICLR', 'LECO', 'SGI', 'DB',

In [46]:
# tickers_to_view = ['NVDA', 'META', 'MSFT', 'B', 'GOOG', 'AVGO']
# tickers_to_view = ['LYG', 'IEI']
tickers_to_view = ['META']
df_sorted_candidates.loc[tickers_to_view]
# df_sorted_candidates

KeyError: "None of [Index(['META'], dtype='object', name='Ticker')] are in the [index]"

In [None]:
print("--- Step 5: Enhancing and sorting final candidates ---")

# Join with latest Finviz data to add Price, MktCap, etc.
cols_to_add = ['Price', 'Change %', 'MktCap AUM, M', 'ATR/Price %', 'Rel Volume']
df_candidates_enhanced = df_filtered_candidates.join(df_finviz_latest[cols_to_add])

# --- Calculate New Metrics ---
# Create a normalized change metric by dividing the daily change by its recent volatility (ATR).
# This gives a sense of how significant the day's move is relative to its own behavior.
df_candidates_enhanced['Change/(ATR/Price)'] = np.where(
    df_candidates_enhanced['ATR/Price %'] != 0,
    df_candidates_enhanced['Change %'] / df_candidates_enhanced['ATR/Price %'],
    0  # Assign 0 if ATR/Price % is 0 to avoid division errors
)

# Sort the candidates based on the rules in the SORT_ORDER dictionary
sort_keys = list(SORT_ORDER.keys())
sort_ascending = list(SORT_ORDER.values())
df_sorted_candidates = df_candidates_enhanced.sort_values(by=sort_keys, ascending=sort_ascending)

# --- Define and Apply Final Column Order ---
# Define the columns that should always appear first, including our new metric.
leading_cols = [
    'MktCap AUM, M', 'Price', 'Change %', 'ATR/Price %', 'Change/(ATR/Price)', 'Rel Volume', 'current',
]

# Combine the leading columns with the sort keys for the master order.
priority_cols = list(dict.fromkeys(leading_cols + sort_keys))
remaining_cols = [c for c in df_sorted_candidates.columns if c not in priority_cols]
final_col_order = priority_cols + remaining_cols
df_sorted_candidates = df_sorted_candidates[final_col_order]

# --- Select Top Candidates for Plotting ---
tickers_to_plot = df_sorted_candidates.head(CANDIDATES_TO_PLOT).index.tolist()

# --- Display Final Results with Context ---
print("\n" + "="*50)
print("      FINAL CANDIDATE REPORT")
print("="*50)

print("\nPeriod Parameters (for calculation):")
pprint.pprint(PERIOD_PARAMS)

print("\nApplied Metric Filters:")
pprint.pprint(METRIC_FILTERS, sort_dicts=False)

print("\nSorting Order:")
pprint.pprint(SORT_ORDER, sort_dicts=False)

print(f"\nDisplaying Top {CANDIDATES_TO_PLOT} Candidates:")
display(df_sorted_candidates.head(CANDIDATES_TO_PLOT))

print(f"\nTickers selected for plotting: {tickers_to_plot}")

In [None]:
print(f'df_finviz_latest.info():\n{df_finviz_latest.info()}')
# print(f'\ndf_finviz_latest.columns:\n{df_finviz_latest.columns}')
print(f'\ndf_finviz_latest.columns:\n{list(df_finviz_latest.columns)}')
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     # 'display.max_rows' controls the truncation of the Index/Series representation
#     # 'display.max_columns' is good to include for printing the full DataFrame
#     print(f'\ndf_finviz_latest.columns:\n{df_finviz_latest.columns}')

In [None]:
df_sorted_candidates

In [None]:
# --- 4. Sort the DataFrame by the index using the desired order ---
# Use .loc with the list of valid, ordered tickers
# df_finviz_latest_sorted = df_finviz_latest.loc[valid_tickers_in_order_unique]
df_finviz_tickers_to_plot = df_finviz_latest.loc[tickers_to_plot]
print(f'df_finviz_tickers_to_plot:\n{df_finviz_tickers_to_plot}') 

### Step 6: Visualize Top Candidates

Plot the rank history for the top candidates to visually verify their performance and trends.

In [None]:
print("--- Step 6: Plotting rank history for top candidates ---")

if tickers_to_plot:
    plot_days = PERIOD_PARAMS['lookback_days'] + PERIOD_PARAMS['recent_days'] + 10
    # Combine both dictionaries for complete context in the plot's annotation
    full_criteria_for_plot = {**PERIOD_PARAMS, **METRIC_FILTERS}
    
    plotting_utils.plot_rank_with_criteria(
        df_rank_history=df_rank_history.iloc[:, -plot_days:],
        ticker_list=tickers_to_plot,
        title_suffix="Top Filtered Candidates",
        filter_criteria=full_criteria_for_plot, # Pass the combined dict
        width=1150,
        height=700,
    )
else:
    print("No candidates found to plot.")

df_sorted_candidates

### Step 7: Analyze Pre-defined Portfolio

Perform a detailed analysis on a specific list of tickers. This step correctly uses the master `df_all_tickers_metrics` DataFrame to ensure all portfolio tickers are included, regardless of filter outcomes.

In [None]:
df_sorted_candidates.columns

In [None]:
print("--- Step 7: Analyzing the pre-defined portfolio ---")

# Correctly filter the *master* metrics dataframe for the portfolio tickers
df_portfolio_analysis  = df_sorted_candidates[df_sorted_candidates.index.isin(PORTFOLIO_TICKERS)].copy()

# Calculate portfolio weights, only if the dataframe is not empty
if not df_portfolio_analysis.empty:
    total_aum = df_portfolio_analysis['MktCap AUM, M'].sum()
    inv_atr = 1 / df_portfolio_analysis['ATR/Price %']
    total_inv_atr = inv_atr.sum()

    df_portfolio_analysis['MktCap AUM Weight'] = df_portfolio_analysis['MktCap AUM, M'] / total_aum
    df_portfolio_analysis['ATR/Price INV Weight'] = (inv_atr / total_inv_atr)

    total_portf = df_portfolio_analysis['MktCap AUM Weight'].sum() + df_portfolio_analysis['ATR/Price INV Weight'].sum()
    df_portfolio_analysis['Portf Weight'] = (df_portfolio_analysis['MktCap AUM Weight'] + df_portfolio_analysis['ATR/Price INV Weight']) / total_portf

    # --- Define and Apply Final Column Order ---
    # The new columns to be inserted
    new_cols = ['MktCap AUM Weight', 'ATR/Price INV Weight', 'Portf Weight']

    # Convert the original column Index to a list for easy manipulation
    original_cols = list(df_sorted_candidates.columns)

    # Find the index of the column to insert after
    try:
        # Find the integer position of the 'current' column
        insert_index = original_cols.index('current') + 1 
    except ValueError:
        # Handle the case where 'current' isn't in the columns, perhaps append to the end
        print("Warning: 'current' column not found. Appending new columns to the end.")
        insert_index = len(original_cols)

    # Reconstruct the list with the new columns inserted
    PORTFOLIO_COLUMN_ORDER = original_cols[:insert_index] + new_cols + original_cols[insert_index:]

    # Filter the desired order to only include columns that actually exist in the DataFrame
    # This makes the code robust against missing data columns.
    final_portfolio_cols = [c for c in PORTFOLIO_COLUMN_ORDER if c in df_portfolio_analysis.columns]
    df_portfolio_analysis = df_portfolio_analysis[final_portfolio_cols]

print(f"✅ Portfolio analysis complete for {len(df_portfolio_analysis)} tickers.")
print("Portfolio metrics, sorted by final portfolio weight:")
print(df_portfolio_analysis.sort_values(by='Portf Weight', ascending=False))

### Step 8: Visualize Portfolio

Plot the rank history for the tickers in the pre-defined portfolio to compare their recent performance.

In [None]:
print("--- Step 8: Plotting rank history for the portfolio ---")

if PORTFOLIO_TICKERS:
    plot_days = PERIOD_PARAMS['lookback_days'] + PERIOD_PARAMS['recent_days'] + 10
    # Combine both dictionaries for complete context in the plot's annotation
    full_criteria_for_plot = {**PERIOD_PARAMS, **METRIC_FILTERS}
    
    plotting_utils.plot_rank_with_criteria(
        df_rank_history=df_rank_history.iloc[:, -plot_days:],
        ticker_list=PORTFOLIO_TICKERS,
        title_suffix="Pre-defined Portfolio",
        filter_criteria=full_criteria_for_plot, # Pass the combined dict
        width=1150,
        height=700,
    )
else:
    print("Portfolio ticker list is empty. Nothing to plot.")