### Setup and Configuration

This cell contains all imports and user-configurable parameters for the analysis pipeline.

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pprint
import inspect  # <--- ADD THIS LINE
from IPython.display import display, Markdown

# --- 1. PANDAS & IPYTHON OPTIONS ---
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)
%load_ext autoreload
%autoreload 2

# --- 2. PROJECT PATH CONFIGURATION ---
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent.parent  # Adjust if your notebook is in a 'notebooks' subdirectory
DATA_DIR = ROOT_DIR / 'data'
SRC_DIR = ROOT_DIR / 'src'

# Add 'src' to the Python path to import custom modules
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# --- 3. IMPORT CUSTOM MODULES ---
import utils
import plotting_utils

# --- 4. ANALYSIS & FILTERING CONFIGURATION ---

# File searching parameters
# FILE_PREFIX = ''  # e.g., '2024'
FILE_CONTAINS_PATTERN = 'df_OHLCV_clean_stocks_etfs'

# # Parameters defining the time windows for metric calculation
PERIOD_PARAMS = {
    'lookback_days': 40,
    'recent_days': 8,
}

# --- 5. VERIFICATION ---
print("--- Path Configuration ---")
print(f"✅ Project Root: {ROOT_DIR}")
print(f"✅ Data Dir:     {DATA_DIR}")
print(f"✅ Source Dir:   {SRC_DIR}")
assert all([ROOT_DIR.exists(), DATA_DIR.exists(), SRC_DIR.exists()]), "A key directory was not found!"

print("\n--- Module Verification ---")
print(f"✅ Successfully imported 'utils' and 'plotting_utils'.")



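### Step 1: Load Latest Consolidated OHLCV Data

Find and load the single most recent Parquet file whose name contains `df_OHLCV_clean_stocks_etfs`, and make sure the resulting `df_OHLCV` DataFrame is indexed by `Ticker`. Note: the merged Finviz data (`df_finviz_latest`), which supplies supplementary columns like `Price` and `ATR/Price %` for Step 5, is not loaded by this cell and must be available before Step 5 runs.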

In [None]:
print("--- Step 1: Loading latest consolidated OHLCV data ---")

# Ask the helper for the single most recent parquet file in DATA_DIR whose name
# contains FILE_CONTAINS_PATTERN. The result is treated below as a list of
# filenames (not full paths) — presumably List[str]; verify against utils.get_recent_files.
latest_OHLCV_filepaths = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    # prefix=FILE_PREFIX,
    contains_pattern=FILE_CONTAINS_PATTERN,
    count=1
)

if not latest_OHLCV_filepaths:
    # The message mentions only the criteria actually used in the search above.
    # FILE_PREFIX is deliberately not referenced: it is commented out in the
    # configuration, so interpolating it here would raise NameError and hide this error.
    raise FileNotFoundError(
        f"No parquet files found in '{DATA_DIR}' with pattern '{FILE_CONTAINS_PATTERN}'"
    )

# Get the filename from the list and build the full path before loading.
latest_filename = latest_OHLCV_filepaths[0]
full_file_path = DATA_DIR / latest_filename
df_OHLCV = pd.read_parquet(full_file_path, engine='pyarrow')


# --- Robust Index Setting ---
# Normalize the frame so that its index is named 'Ticker', whichever of the
# handled layouts the file was saved in. Anything else is rejected explicitly.
if df_OHLCV.index.name == 'Ticker':
    print("Info: 'Ticker' is already the index. No action needed.")
elif 'Ticker' in df_OHLCV.columns:
    print("Info: 'Ticker' column found. Setting it as the DataFrame index.")
    df_OHLCV.set_index('Ticker', inplace=True)
elif 'ticker' in df_OHLCV.columns:
    print("Info: 'ticker' column found. Renaming and setting as index.")
    df_OHLCV.rename(columns={'ticker': 'Ticker'}, inplace=True)
    df_OHLCV.set_index('Ticker', inplace=True)
elif df_OHLCV.index.name is None:
    print("Info: Index is unnamed. Assuming it contains tickers and assigning the name 'Ticker'.")
    df_OHLCV.index.name = 'Ticker'
else:
    print("ERROR: Loaded DataFrame has an unexpected format.")
    print(f"Columns: {df_OHLCV.columns.tolist()}")
    print(f"Index Name: '{df_OHLCV.index.name}'")
    raise ValueError("Could not find a 'Ticker' column or a usable index to proceed.")


# Summary of what was loaded.
print(f"✅ Successfully loaded: {latest_filename}")
print(f"Shape: {df_OHLCV.shape}")
print(df_OHLCV.head(3))

In [None]:
# Collapse the cell-wise NaN mask into a single flag: the first .any() reduces
# each column, the second reduces across the per-column results.
has_nan = df_OHLCV.isnull().any().any()

print("Are there any NaN values in the DataFrame? {}".format(has_nan))

In [None]:
# Pivot the 'Adj High' series into a wide table by moving index level 1 into the columns.
# NOTE(review): level=1 is the *second* level of the index; a previous note described
# the unstacked level as the first level ('Ticker') — confirm which level is intended.
df_adj_high = df_OHLCV['Adj High'].unstack(1)

# Show the first five rows of the reshaped DataFrame
print(df_adj_high.head(5))

### Step 2: Build Rank History Matrix

Load the last `N` daily data files to construct a comprehensive rank history DataFrame. This matrix is the primary input for all subsequent trend and performance calculations.

In [None]:
# NOTE(review): HISTORY_FILE_COUNT and FILE_PREFIX are not assigned in any visible
# notebook cell — confirm they are defined before this cell runs.
print(f"--- Step 2: Building rank history from the latest {HISTORY_FILE_COUNT} files ---")

# Collect the most recent daily parquet files matching the prefix and pattern.
daily_files_list = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    prefix=FILE_PREFIX,
    contains_pattern=FILE_CONTAINS_PATTERN,
    count=HISTORY_FILE_COUNT,
)

# Delegate construction of the rank-history frame to the shared utility.
df_rank_history = utils.create_rank_history_df(daily_files_list, DATA_DIR)

# The columns are formatted with strftime below, so they are expected to be date-like.
first_day = df_rank_history.columns.min()
last_day = df_rank_history.columns.max()

print("✅ Rank history matrix created successfully.")
print(f"Shape: {df_rank_history.shape} (Tickers, Days)")
print(f"Date Range: {first_day.strftime('%Y-%m-%d')} to {last_day.strftime('%Y-%m-%d')}")

In [None]:
# Report the (rows, columns) shape before any cleaning.
print("\nOriginal shape:", df_rank_history.shape)

# Build a new DataFrame without the rows that contain at least one NaN;
# df_rank_history itself is left unchanged.
df_rank_history_cleaned = df_rank_history.dropna(axis=0, how='any')

# Show the cleaned frame followed by its shape.
print("\nDataFrame after removing rows with any NaN values:")
print(df_rank_history_cleaned)

print("\nNew shape:", df_rank_history_cleaned.shape)

### Step 3: Calculate Metrics for All Tickers

Process the rank history matrix to compute performance metrics for **every ticker**. This creates a master metrics DataFrame that serves as a single source of truth for all subsequent filtering and analysis.

In [None]:
print("--- Step 3: Calculating performance metrics for all tickers ---")

# Compute the metrics for every ticker in the rank history; the window lengths
# come from PERIOD_PARAMS and are forwarded as keyword arguments.
all_metrics_data = utils.calculate_rank_metrics(
    df_rank_history,
    tickers_list=df_rank_history.index.to_list(),
    **PERIOD_PARAMS,
)

# # Convert the list of dicts into a DataFrame.
# # The new 'r_squared' and 'penalty_score' keys will automatically become columns.
# df_all_tickers_metrics = pd.DataFrame(all_metrics_data)
# if not df_all_tickers_metrics.empty:
#     df_all_tickers_metrics.set_index('ticker', inplace=True)
#     df_all_tickers_metrics.index.name = 'Ticker'

# print(f"✅ Calculated metrics for {len(df_all_tickers_metrics)} tickers.")
# print("\nDataFrame head, now including 'r_squared' and 'penalty_score':")
# # display(df_all_tickers_metrics.head()) # Use display() in a notebook, or print()
# print(df_all_tickers_metrics.head())

In [None]:
# Display the raw result of utils.calculate_rank_metrics for inspection.
all_metrics_data

In [None]:
print("--- Step 3: Calculating performance metrics for all tickers ---")

# Compute the metrics for every ticker in the rank history; the window lengths
# come from PERIOD_PARAMS and are forwarded as keyword arguments.
all_metrics_data = utils.calculate_rank_metrics(
    df_rank_history,
    tickers_list=df_rank_history.index.to_list(),
    **PERIOD_PARAMS,
)

# Turn the returned records into a DataFrame (presumably a list of dicts, one per
# ticker — verify against utils.calculate_rank_metrics). When there is data,
# index it by the 'ticker' field and name that index 'Ticker'.
df_all_tickers_metrics = pd.DataFrame(all_metrics_data)
if not df_all_tickers_metrics.empty:
    df_all_tickers_metrics = df_all_tickers_metrics.set_index('ticker')
    df_all_tickers_metrics.index.name = 'Ticker'

ticker_count = len(df_all_tickers_metrics)
print(f"✅ Calculated metrics for {ticker_count} tickers.")
print("\nDataFrame head, now including 'r_squared' and 'penalty_score':")
# display(df_all_tickers_metrics.head()) # Use display() in a notebook, or print()
print(df_all_tickers_metrics.head())

### Step 4: Filter for 'Reversal' Candidates

Apply the predefined filtering rules from the configuration cell to the master metrics DataFrame to identify a list of promising candidates.


In [None]:
# NOTE(review): METRIC_FILTERS is not assigned in any visible notebook cell —
# confirm it is defined before this cell runs.
print("--- Step 4: Filtering metrics to find candidates ---")
print("METRIC_FILTERS")
pprint.pprint(METRIC_FILTERS)

# Each entry of METRIC_FILTERS is forwarded to the filter utility as a keyword argument.
df_filtered_candidates = utils.filter_rank_metrics(df_all_tickers_metrics, **METRIC_FILTERS)

candidate_count = len(df_filtered_candidates)
print(f"✅ Found {candidate_count} candidates matching the criteria.")
display(df_filtered_candidates.head(20))

### Step 5: Enhance, Sort, and Select Top Candidates

Enrich the filtered candidates with the latest price data, sort them according to the specified rules, and select the top N for visualization.


In [None]:
# Tiered funnel: narrow df_filtered_candidates with two threshold filters, enrich the
# survivors with Finviz columns, then sort and report them.
# The code below reads these columns from df_filtered_candidates:
# - 'lookback_slope': kept when below SLOPE_THRESHOLD; sorted ascending.
# - 'r_squared':      kept when above R_SQUARED_THRESHOLD; sorted descending.
# - 'penalty_score':  sorted ascending.

print("--- Step 5: Applying the Tiered Filtering & Ranking Funnel ---")

# --- Funnel Strategy Parameters (you can tune these) ---
# Stricter alternative values:
# SLOPE_THRESHOLD = -1.0
# R_SQUARED_THRESHOLD = 0.75
# NOTE(review): the active values look deliberately permissive compared with the
# alternatives above — confirm they are the intended settings.
SLOPE_THRESHOLD = 100      # Upper bound (exclusive): keep rows with lookback_slope < this value.
R_SQUARED_THRESHOLD = -10  # Lower bound (exclusive): keep rows with r_squared > this value.


# --- Step 1: The Viability Filter (using Slope) ---
print(f"Starting with {len(df_filtered_candidates)} initial candidates.")
df_funnel_step1 = df_filtered_candidates[df_filtered_candidates['lookback_slope'] < SLOPE_THRESHOLD]
print(f"-> {len(df_funnel_step1)} candidates remain after Slope Filter (lookback_slope < {SLOPE_THRESHOLD}).")


# --- Step 2: The Reliability Filter (using R-Squared) ---
df_funnel_step2 = df_funnel_step1[df_funnel_step1['r_squared'] > R_SQUARED_THRESHOLD]
print(f"-> {len(df_funnel_step2)} candidates remain after R-Squared Filter (r_squared > {R_SQUARED_THRESHOLD}).")
df_funnel_candidates = df_funnel_step2

# --- Step 3: Enhance, Calculate, and Rank the Remaining Candidates ---
# Join with the latest Finviz data to add Price, MktCap, etc.
cols_to_add = ['Price', 'Change %', 'MktCap AUM, M', 'ATR/Price %', 'Rel Volume']
# .join() returns a new DataFrame, so the column assignment below does not write
# into a slice of df_filtered_candidates.
df_candidates_enhanced = df_funnel_candidates.join(df_finviz_latest[cols_to_add])

# Daily change expressed relative to 'ATR/Price %'; rows where that value is 0 get 0.
df_candidates_enhanced['Change/(ATR/Price)'] = np.where(
    df_candidates_enhanced['ATR/Price %'] != 0,
    df_candidates_enhanced['Change %'] / df_candidates_enhanced['ATR/Price %'],
    0
)

# --- Sorting order: key -> ascending flag. Dict order defines sort priority. ---
SORT_ORDER_FUNNEL = {
    'lookback_slope': True,             # ascending  (lower first)  - PRIMARY SORT
    'r_squared': False,                 # descending (higher first) - TIE BREAKER 1
    'penalty_score': True,              # ascending  (lower first)  - TIE BREAKER 2
    'Change/(ATR/Price)': False,        # descending (higher first) - TIE BREAKER 3
}

sort_keys = list(SORT_ORDER_FUNNEL.keys())
sort_ascending = list(SORT_ORDER_FUNNEL.values())

df_sorted_candidates = df_candidates_enhanced.sort_values(by=sort_keys, ascending=sort_ascending)


# --- Define and Apply Final Column Order ---
leading_cols = [
    'MktCap AUM, M', 'Price', 'Change %', 'ATR/Price %', 'Change/(ATR/Price)', 'Rel Volume',
    'lookback_slope', 'r_squared', 'penalty_score'  # key metrics at the front
]
# dict.fromkeys de-duplicates while preserving first-seen order.
priority_cols = list(dict.fromkeys(leading_cols + sort_keys))
remaining_cols = [c for c in df_sorted_candidates.columns if c not in priority_cols]
final_col_order = priority_cols + remaining_cols
df_sorted_candidates = df_sorted_candidates[final_col_order]


# --- Select Candidates for Plotting ---
# All sorted candidates are selected; the top-N variant is kept for reference.
# tickers_to_plot = df_sorted_candidates.head(CANDIDATES_TO_PLOT).index.tolist()
tickers_to_plot = df_sorted_candidates.index.tolist()


# --- Display Final Results with Context ---
print("\n" + "="*50)
print("      FINAL CANDIDATE REPORT (TIERED FUNNEL)")
print("="*50)

# The labels state the direction of each filter exactly as applied above.
print("\nFunnel Filter Parameters:")
print(f"  - Maximum lookback_slope (exclusive): {SLOPE_THRESHOLD}")
print(f"  - Minimum r_squared (exclusive): {R_SQUARED_THRESHOLD}")

print("\nApplied Metric Filters (from earlier steps):")
pprint.pprint(METRIC_FILTERS, sort_dicts=False)

# The primary key is read from the sort order itself so the heading cannot drift from it.
print(f"\nSorting Order (Primary: {sort_keys[0]}):")
pprint.pprint(SORT_ORDER_FUNNEL, sort_dicts=False)

# print(f"\nDisplaying Top {CANDIDATES_TO_PLOT} Candidates from Funnel:")
# display(df_sorted_candidates.head(CANDIDATES_TO_PLOT))

print(f"\nNumber of tickers selected for plotting: {len(tickers_to_plot)}")
print(f"\nTickers selected for plotting: {tickers_to_plot}")

In [None]:
# Show the sorted-candidate rows for a hand-picked list of tickers.
# tickers_to_view = ['NVDA', 'META', 'MSFT', 'B', 'GOOG', 'AVGO']
# tickers_to_view = ['LYG', 'IEI']
tickers_to_view = ['META']

# .loc with a list raises KeyError if any label is missing, and a requested ticker
# may not be in df_sorted_candidates. Report the missing ones and show the rest.
tickers_missing = [t for t in tickers_to_view if t not in df_sorted_candidates.index]
if tickers_missing:
    print(f"Tickers not among the sorted candidates (skipped): {tickers_missing}")
df_sorted_candidates.loc[[t for t in tickers_to_view if t in df_sorted_candidates.index]]
# df_sorted_candidates

In [None]:
# NOTE(review): SORT_ORDER and CANDIDATES_TO_PLOT are not assigned in any visible
# notebook cell — confirm they are defined before this cell runs. This cell also
# reassigns df_sorted_candidates and tickers_to_plot.
print("--- Step 5: Enhancing and sorting final candidates ---")

# Attach the Finviz columns (Price, MktCap, ...) to the filtered candidates.
cols_to_add = ['Price', 'Change %', 'MktCap AUM, M', 'ATR/Price %', 'Rel Volume']
finviz_subset = df_finviz_latest[cols_to_add]
df_candidates_enhanced = df_filtered_candidates.join(finviz_subset)

# --- Calculate New Metrics ---
# Daily change divided by 'ATR/Price %'; rows where 'ATR/Price %' equals 0 receive 0.
atr_pct = df_candidates_enhanced['ATR/Price %']
change_pct = df_candidates_enhanced['Change %']
df_candidates_enhanced['Change/(ATR/Price)'] = np.where(atr_pct != 0, change_pct / atr_pct, 0)

# SORT_ORDER maps column name -> ascending flag; its key order is the sort priority.
sort_keys = list(SORT_ORDER)
sort_ascending = [SORT_ORDER[key] for key in sort_keys]
df_sorted_candidates = df_candidates_enhanced.sort_values(by=sort_keys, ascending=sort_ascending)

# --- Define and Apply Final Column Order ---
# Columns that always come first, including the derived metric.
leading_cols = [
    'MktCap AUM, M', 'Price', 'Change %', 'ATR/Price %', 'Change/(ATR/Price)', 'Rel Volume', 'current',
]

# Leading columns, then sort keys (de-duplicated, first occurrence wins), then the rest.
priority_cols = list(dict.fromkeys([*leading_cols, *sort_keys]))
remaining_cols = [col for col in df_sorted_candidates.columns if col not in priority_cols]
final_col_order = priority_cols + remaining_cols
df_sorted_candidates = df_sorted_candidates[final_col_order]

# --- Select Top Candidates for Plotting ---
top_candidates = df_sorted_candidates.head(CANDIDATES_TO_PLOT)
tickers_to_plot = top_candidates.index.tolist()

# --- Display Final Results with Context ---
banner = "=" * 50
print("\n" + banner)
print("      FINAL CANDIDATE REPORT")
print(banner)

print("\nPeriod Parameters (for calculation):")
pprint.pprint(PERIOD_PARAMS)

print("\nApplied Metric Filters:")
pprint.pprint(METRIC_FILTERS, sort_dicts=False)

print("\nSorting Order:")
pprint.pprint(SORT_ORDER, sort_dicts=False)

print(f"\nDisplaying Top {CANDIDATES_TO_PLOT} Candidates:")
display(top_candidates)

print(f"\nTickers selected for plotting: {tickers_to_plot}")

In [None]:
# DataFrame.info() writes its report to stdout and returns None, so it is called as a
# statement after the heading; embedding it in an f-string would print "None".
print('df_finviz_latest.info():')
df_finviz_latest.info()
# print(f'\ndf_finviz_latest.columns:\n{df_finviz_latest.columns}')
print(f'\ndf_finviz_latest.columns:\n{list(df_finviz_latest.columns)}')
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     # 'display.max_rows' controls the truncation of the Index/Series representation
#     # 'display.max_columns' is good to include for printing the full DataFrame
#     print(f'\ndf_finviz_latest.columns:\n{df_finviz_latest.columns}')

In [None]:
# Display the full sorted-candidates table.
df_sorted_candidates

In [None]:
# --- 4. Select the Finviz rows for the plotted tickers, in plotting order ---
# tickers_to_plot comes from a left join onto df_finviz_latest, so it may contain
# tickers that have no Finviz row. .loc with a list raises KeyError on any missing
# label, so keep only the tickers that exist and report the others.
# df_finviz_latest_sorted = df_finviz_latest.loc[valid_tickers_in_order_unique]
tickers_in_finviz = [t for t in tickers_to_plot if t in df_finviz_latest.index]
tickers_not_in_finviz = [t for t in tickers_to_plot if t not in df_finviz_latest.index]
if tickers_not_in_finviz:
    print(f"Tickers without Finviz data (skipped): {tickers_not_in_finviz}")
df_finviz_tickers_to_plot = df_finviz_latest.loc[tickers_in_finviz]
print(f'df_finviz_tickers_to_plot:\n{df_finviz_tickers_to_plot}')

### Step 6: Visualize Top Candidates

Plot the rank history for the top candidates to visually verify their performance and trends.

In [None]:
print("--- Step 6: Plotting rank history for top candidates ---")

if not tickers_to_plot:
    print("No candidates found to plot.")
else:
    # Plot window: both period lengths plus a 10-column margin.
    plot_days = sum((PERIOD_PARAMS['lookback_days'], PERIOD_PARAMS['recent_days'], 10))

    # Merge period parameters and metric filters for the plot annotation;
    # on a key clash the METRIC_FILTERS value wins.
    full_criteria_for_plot = dict(PERIOD_PARAMS)
    full_criteria_for_plot.update(METRIC_FILTERS)

    # Only the trailing `plot_days` columns of the rank history are passed on.
    recent_rank_history = df_rank_history.iloc[:, -plot_days:]
    plotting_utils.plot_rank_with_criteria(
        df_rank_history=recent_rank_history,
        ticker_list=tickers_to_plot,
        title_suffix="Top Filtered Candidates",
        filter_criteria=full_criteria_for_plot,
        width=1150,
        height=700,
    )

# Show the sorted candidates table below the plot.
df_sorted_candidates

### Step 7: Analyze Pre-defined Portfolio

Perform a detailed analysis on a specific list of tickers. Note that the code selects the portfolio rows from `df_sorted_candidates` (not the master `df_all_tickers_metrics` DataFrame), so only portfolio tickers that survived the earlier filtering steps are included.

In [None]:
# Display the column labels of the sorted-candidates table.
df_sorted_candidates.columns

In [None]:
print("--- Step 7: Analyzing the pre-defined portfolio ---")

# Select the portfolio rows from df_sorted_candidates. Portfolio tickers that are
# not in that table are therefore absent from this analysis.
df_portfolio_analysis = df_sorted_candidates[df_sorted_candidates.index.isin(PORTFOLIO_TICKERS)].copy()

if df_portfolio_analysis.empty:
    # With no rows the weight columns are never created, so sorting by
    # 'Portf Weight' would raise KeyError. Report the situation instead.
    print("⚠️ None of the portfolio tickers are present in df_sorted_candidates. Nothing to analyze.")
else:
    # Two weighting schemes, each normalized to sum to 1:
    # share of total 'MktCap AUM, M', and share of total inverse 'ATR/Price %'.
    total_aum = df_portfolio_analysis['MktCap AUM, M'].sum()
    inv_atr = 1 / df_portfolio_analysis['ATR/Price %']
    total_inv_atr = inv_atr.sum()

    df_portfolio_analysis['MktCap AUM Weight'] = df_portfolio_analysis['MktCap AUM, M'] / total_aum
    df_portfolio_analysis['ATR/Price INV Weight'] = (inv_atr / total_inv_atr)

    # Final weight: the sum of the two weights, renormalized by their combined total.
    total_portf = df_portfolio_analysis['MktCap AUM Weight'].sum() + df_portfolio_analysis['ATR/Price INV Weight'].sum()
    df_portfolio_analysis['Portf Weight'] = (df_portfolio_analysis['MktCap AUM Weight'] + df_portfolio_analysis['ATR/Price INV Weight']) / total_portf

    # --- Define and Apply Final Column Order ---
    # Insert the new weight columns right after 'current' in the existing column order.
    new_cols = ['MktCap AUM Weight', 'ATR/Price INV Weight', 'Portf Weight']
    original_cols = list(df_sorted_candidates.columns)

    try:
        insert_index = original_cols.index('current') + 1
    except ValueError:
        # 'current' is not among the columns: append the new columns at the end.
        print("Warning: 'current' column not found. Appending new columns to the end.")
        insert_index = len(original_cols)

    PORTFOLIO_COLUMN_ORDER = original_cols[:insert_index] + new_cols + original_cols[insert_index:]

    # Keep only the columns that actually exist in the portfolio frame.
    final_portfolio_cols = [c for c in PORTFOLIO_COLUMN_ORDER if c in df_portfolio_analysis.columns]
    df_portfolio_analysis = df_portfolio_analysis[final_portfolio_cols]

    print(f"✅ Portfolio analysis complete for {len(df_portfolio_analysis)} tickers.")
    print("Portfolio metrics, sorted by final portfolio weight:")
    print(df_portfolio_analysis.sort_values(by='Portf Weight', ascending=False))

### Step 8: Visualize Portfolio

Plot the rank history for the tickers in the pre-defined portfolio to compare their recent performance.

In [None]:
print("--- Step 8: Plotting rank history for the portfolio ---")

if not PORTFOLIO_TICKERS:
    print("Portfolio ticker list is empty. Nothing to plot.")
else:
    # Plot window: both period lengths plus a 10-column margin.
    plot_days = sum((PERIOD_PARAMS['lookback_days'], PERIOD_PARAMS['recent_days'], 10))

    # Merge period parameters and metric filters for the plot annotation;
    # on a key clash the METRIC_FILTERS value wins.
    full_criteria_for_plot = dict(PERIOD_PARAMS)
    full_criteria_for_plot.update(METRIC_FILTERS)

    # Only the trailing `plot_days` columns of the rank history are passed on.
    recent_rank_history = df_rank_history.iloc[:, -plot_days:]
    plotting_utils.plot_rank_with_criteria(
        df_rank_history=recent_rank_history,
        ticker_list=PORTFOLIO_TICKERS,
        title_suffix="Pre-defined Portfolio",
        filter_criteria=full_criteria_for_plot,
        width=1150,
        height=700,
    )