In [1]:
# Configuration

# Look-back window lengths handed to utils.get_latest_dfs further down.
# The metric columns in the results carry matching "<N>d" suffixes (e.g. "Sharpe 3d").
# NOTE(review): presumably counted in trading days (rows per ticker) — confirm in src/utils.py.
DAYS_RATIO = [3, 5, 10, 15, 30, 60, 120, 250]

# Risk-free rate passed to utils.analyze_stock as `risk_free_rate`.
# NOTE(review): presumably an annualized rate (0.04 = 4%) — confirm in src/utils.py.
RISK_FREE_RATE = 0.04

In [2]:
# process_files.py
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Resolve the input (cleaned OHLCV snapshot for `date_str`) and the output file,
# both located in DEST_DIR. DOWNLOAD_DIR is imported above but not needed here.
source_path = Path(DEST_DIR, f'df_OHLCV_{date_str}_clean.parquet')
dest_path = Path(DEST_DIR, 'df_perf_ratios.parquet')

# Echo the run configuration so the notebook output records what was processed.
print(f"date_str: {date_str}")
print(f"DEST_DIR: {DEST_DIR}\n")
print(f"source_path: {source_path}")
print(f"dest_path: {dest_path}")

date_str: 2025-04-02
DEST_DIR: ..\data

source_path: ..\data\df_OHLCV_2025-04-02_clean.parquet
dest_path: ..\data\df_perf_ratios.parquet


In [3]:
import pandas as pd

# Load the Finviz table (parquet) and use its index as the ticker universe.
# The path is built from DEST_DIR with pathlib, like source_path/dest_path above,
# instead of the literal '..\data\...': that string relied on the invalid escape
# sequence '\d' and on Windows path separators.
df_finviz = pd.read_parquet(Path(DEST_DIR) / 'df_finviz.parquet')
tickers = df_finviz.index.to_list()

In [4]:
import pandas as pd

# Read the cleaned OHLCV snapshot (source_path) and keep only the adjusted close,
# the single price column used in the rest of this notebook.
df = pd.read_parquet(source_path, engine='pyarrow')[['Adj Close']].copy()

# Rich display of the frame to verify the load.
display(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close
Symbol,Date,Unnamed: 2_level_1
WMB,2025-04-02,61.60
WMB,2025-04-01,60.57
WMB,2025-03-31,59.76
WMB,2025-03-28,59.19
WMB,2025-03-27,59.43
...,...,...
PSO,2024-04-09,12.41
PSO,2024-04-08,12.53
PSO,2024-04-05,12.52
PSO,2024-04-04,12.41


In [5]:
# Sanity check of the loaded frame: index type, row count, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 385536 entries, ('WMB', Timestamp('2025-04-02 00:00:00')) to ('PSO', Timestamp('2024-04-03 00:00:00'))
Data columns (total 1 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Adj Close  385536 non-null  float64
dtypes: float64(1)
memory usage: 4.5+ MB


In [6]:
import pandas as pd

def select_tickers_data(df, tickers):
  """
  Select the rows of `df` that belong to the given tickers.

  `df` is expected to be indexed so that the first index level holds the ticker
  (e.g. a (Symbol, Date) MultiIndex). Tickers that are not present in that level
  are silently dropped from the request.

  Args:
    df (pd.DataFrame): The input DataFrame; first index level is the ticker.
    tickers (list): Ticker symbols to select.

  Returns:
    pd.DataFrame: The rows for the requested tickers that exist in `df`.
                  If none of the tickers exist, a message is printed and an
                  empty frame with the same columns and index levels as `df`
                  is returned, so downstream code sees a consistent schema.
  """
  # Build the membership set once; the previous version rebuilt the level-0
  # index for every ticker in the list.
  available = set(df.index.get_level_values(0).unique())
  valid_tickers = [t for t in tickers if t in available]

  if not valid_tickers:
    print("No valid tickers found in the DataFrame. Returning an empty DataFrame.")
    # Zero-row slice keeps column names, dtypes and index level names intact.
    return df.iloc[0:0].copy()

  # Every label in valid_tickers is known to exist, so .loc cannot raise KeyError here.
  return df.loc[valid_tickers]


# Restrict the price history to the Finviz ticker universe loaded earlier.
# For a quick manual check, pass a short list instead, e.g. ["UBS", "AAPL", "MSFT", "GEV"].
selected_data = select_tickers_data(df=df, tickers=tickers)
print(selected_data)

                   Adj Close
Symbol Date                 
AAPL   2025-04-02     223.89
       2025-04-01     223.19
       2025-03-31     222.13
       2025-03-28     217.90
       2025-03-27     223.85
...                      ...
FNDC   2024-04-09      34.34
       2024-04-08      34.26
       2024-04-05      34.05
       2024-04-04      33.92
       2024-04-03      34.13

[385536 rows x 1 columns]


In [7]:
import sys

# Locate the project root: when the notebook runs from <root>/notebooks, step up
# one level; otherwise treat the current working directory as the root.
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Make <root>/src importable. The membership check keeps the cell idempotent:
# re-running it no longer appends the same entry to sys.path again and again.
src_dir = str(ROOT_DIR / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

import utils

# NOTE(review): presumably returns one frame per look-back window in DAYS_RATIO,
# each holding the most recent rows per ticker — confirm in src/utils.py.
list_dfs = utils.get_latest_dfs(selected_data, DAYS_RATIO)

In [8]:
import numpy as np

# One metrics record per ticker; records are merged across all frames in list_dfs,
# so each ticker ends up with the columns produced for every frame.
all_results = {}

for _df in list_dfs:
    tickers_in_df = _df.index.get_level_values(0).unique()
    for ticker in tickers_in_df:
        # Division-by-zero / invalid-value warnings are silenced for this call only.
        with np.errstate(divide='ignore', invalid='ignore'):
            result_df = utils.analyze_stock(_df, ticker, risk_free_rate=RISK_FREE_RATE)

        # analyze_stock may return None; such tickers are skipped.
        if result_df is None:
            continue

        # The first row of the result is used: its index label is the ticker,
        # its values are the metrics.
        ticker_name = result_df.index[0]
        metrics = result_df.iloc[0].to_dict()

        # Start a record on first sight, then merge this frame's metrics into it.
        all_results.setdefault(ticker_name, {}).update(metrics)

if not all_results:
    print("No performance metrics were calculated.")
else:
    combined_df = pd.DataFrame.from_dict(all_results, orient='index')
    print("\nCombined performance metrics DataFrame:")
    print(combined_df)


Combined performance metrics DataFrame:
      Sharpe 3d  Sortino 3d  Omega 3d  Sharpe 5d  Sortino 5d  Omega 5d  \
A     -1.759850   -3.042670  0.728937  -4.911772   -6.132238  0.451568   
AA     3.558400   10.420028  1.928290  -5.353438   -6.521309  0.392458   
AAL    0.896490    1.948607  1.173596  -6.819074   -8.148965  0.360368   
AAON  20.423033         inf       NaN   4.158110    9.157712  2.153763   
AAPL  52.094135         inf       NaN   0.021746    0.031369  1.003952   
...         ...         ...       ...        ...         ...       ...   
ZION   4.761538   16.538619  2.473377   0.836698    1.486296  1.139951   
ZM    18.203738         inf       NaN  -5.222544   -6.249877  0.422538   
ZS    29.361199         inf       NaN  -1.360984   -1.945515  0.799521   
ZTO  -12.092754  -11.634661  0.000000 -13.561722  -11.154049  0.022420   
ZTS   -3.028159   -4.769619  0.575089  -2.745322   -3.714164  0.663939   

      Sharpe 10d  Sortino 10d  Omega 10d  Sharpe 15d  ...  Omega 30d  

In [9]:
import numpy as np
import pandas as pd

def find_nan_inf(df):
    """
    Report where a DataFrame holds NaN, +Inf or -Inf.

    Args:
        df (pd.DataFrame): DataFrame to scan, column by column.

    Returns:
        dict: Maps each affected column name to a dict with the keys
              'NaN', 'Inf' and '-Inf', each holding the list of index labels
              where that kind of value occurs. Columns without any such
              value are left out entirely.
    """
    report = {}
    for column_name in df.columns:
        series = df[column_name]
        labels = series.index
        # Same three checks for every column, evaluated in a fixed order so the
        # inner dict always lists 'NaN', 'Inf', '-Inf'.
        checks = (
            ('NaN', series.isnull()),
            ('Inf', series == np.inf),
            ('-Inf', series == -np.inf),
        )
        hits = {kind: labels[mask].tolist() for kind, mask in checks}
        # Only record the column when at least one list is non-empty.
        if any(hits.values()):
            report[column_name] = hits
    return report

def print_nan_inf_locations(locations, message):
    """
    Print a report produced by find_nan_inf, preceded by `message`.

    Args:
        locations (dict): {column: {kind: [index labels]}} mapping.
        message (str): Heading printed before the report.

    For every column only the kinds with at least one label are printed.
    An empty `locations` mapping prints a "nothing found" line instead.
    """
    print(message)

    if not locations:
        print("No NaN or Inf values found.")
        return

    for column_name, by_kind in locations.items():
        print(f"Column: {column_name}")
        for kind, labels in by_kind.items():
            if not labels:
                continue
            print(f"  {kind}: {labels}")

def get_column_replacement_values(df):
    """
    Determine, per numeric column, the largest and smallest finite value.

    These serve as stand-ins when cleaning: the maximum replaces +Inf and NaN,
    the minimum replaces -Inf.

    Args:
        df (pd.DataFrame): Input DataFrame; non-numeric columns are ignored.

    Returns:
        tuple: (max_values, min_values), two dicts keyed by column name.
               A column with no finite value at all falls back to
               1e5 and -1e5 respectively.
    """
    max_values, min_values = {}, {}

    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        # Drop NaN and +/-Inf before taking the extremes.
        finite_vals = series[np.isfinite(series)]

        if finite_vals.empty:
            # Nothing finite to derive bounds from: use the fixed defaults.
            upper, lower = 1e5, -1e5
        else:
            upper, lower = finite_vals.max(), finite_vals.min()

        max_values[col] = upper
        min_values[col] = lower

    return max_values, min_values

# Does the metrics table contain anything that is not a finite number?
has_nan_inf = combined_df.isnull().any().any() or not np.isfinite(combined_df).all().all()
print(f'combined_df has_nan_inf: {has_nan_inf}')

if not has_nan_inf:
    # Nothing to clean: save the table as it is (same object, no copy).
    print("No NaN or Inf values found.")
    df_to_save = combined_df
else:
    # Per-column stand-ins: finite max for NaN/+Inf, finite min for -Inf.
    max_values, min_values = get_column_replacement_values(combined_df)
    print("Replacing NaN/Inf values with column-specific max/min values")

    # Record where the bad values were; a later cell uses this for a spot check.
    nan_inf_locations_before = find_nan_inf(combined_df)
    print_nan_inf_locations(nan_inf_locations_before, "NaN/Inf Locations BEFORE Replacement:")

    # Work on a copy so combined_df keeps the raw values for comparison.
    df_to_save = combined_df.copy()

    for col in df_to_save.columns:
        # max_values only has entries for numeric columns; skip everything else.
        if col not in max_values:
            continue
        df_to_save[col] = (
            df_to_save[col]
            .fillna(max_values[col])               # NaN  -> column max
            .replace([np.inf], max_values[col])    # +Inf -> column max
            .replace([-np.inf], min_values[col])   # -Inf -> column min
        )

# Verification: the cleaned table should report no NaN/Inf.
print(f'df_to_save has_nan_inf: {df_to_save.isnull().any().any() or not np.isfinite(df_to_save).all().all()}')
print_nan_inf_locations(find_nan_inf(df_to_save), "NaN/Inf Locations AFTER Replacement:")

# Output cleaned DataFrame
print(df_to_save)

combined_df has_nan_inf: True
Replacing NaN/Inf values with column-specific max/min values
NaN/Inf Locations BEFORE Replacement:
Column: Sharpe 3d
  -Inf: ['BSCR']
Column: Sortino 3d
  Inf: ['AAON', 'AAPL', 'ABEV', 'ABNB', 'ACI', 'ACIW', 'ACM', 'ACN', 'ACWI', 'ACWX', 'ADP', 'ADSK', 'ADT', 'AEE', 'AEG', 'AER', 'AFRM', 'AIT', 'AKAM', 'ALAB', 'ALLE', 'ALLY', 'ALSN', 'ALV', 'AM', 'AMAT', 'AMD', 'AMLP', 'AMP', 'AMZN', 'AN', 'ANET', 'ANGL', 'ANSS', 'AOS', 'APA', 'APD', 'APG', 'APH', 'APO', 'APP', 'APPF', 'AR', 'ARCC', 'ARES', 'ARKB', 'ARKK', 'ARM', 'ARMK', 'AS', 'ASML', 'ASR', 'ASX', 'ATI', 'ATO', 'ATR', 'AU', 'AUR', 'AVDE', 'AVEM', 'AVGO', 'AVLV', 'AVUS', 'AVUV', 'AVY', 'AWI', 'AXON', 'AXP', 'AXS', 'AXTA', 'AYI', 'BAH', 'BAM', 'BAP', 'BBAX', 'BBCA', 'BBEU', 'BBIN', 'BBUS', 'BBVA', 'BBWI', 'BBY', 'BCH', 'BCS', 'BECN', 'BEKE', 'BEN', 'BEP', 'BG', 'BIRK', 'BITB', 'BJ', 'BKLC', 'BKNG', 'BKR', 'BLD', 'BLDR', 'BLV', 'BMI', 'BMO', 'BN', 'BNT', 'BR', 'BRBR', 'BRK-A', 'BRK-B', 'BSBR', 'BSY', 'BUD', 

In [10]:
# Pick the first ticker flagged with +Inf in the 'Sortino 3d' column as a spot-check sample.
# NOTE: this only works when the cleaning cell took its replacement branch (otherwise
# nan_inf_locations_before is undefined -> NameError) and 'Sortino 3d' actually contained
# +Inf (otherwise KeyError for the column or IndexError on the empty 'Inf' list).
ticker_w_inf = nan_inf_locations_before['Sortino 3d']['Inf'][0]
ticker_w_inf 

'AAON'

#### Use the cell below to check the Inf, -Inf and NaN replacement

In [11]:
# Show the sample ticker's row before (combined_df) and after (df_to_save) cleaning.
# The headings now interpolate the ticker; previously the f-strings had no
# placeholder and printed the literal text 'ticker_w_inf'.
print(f"combined_df.loc['{ticker_w_inf}']")
display(combined_df.loc[ticker_w_inf])
print(f"\ndf_to_save.loc['{ticker_w_inf}']")
display(df_to_save.loc[ticker_w_inf])

combined_df.loc['ticker_w_inf']


Sharpe 3d       20.423033
Sortino 3d            inf
Omega 3d              NaN
Sharpe 5d        4.158110
Sortino 5d       9.157712
Omega 5d         2.153763
Sharpe 10d       2.652507
Sortino 10d      5.082534
Omega 10d        1.576732
Sharpe 15d       2.304908
Sortino 15d      4.073028
Omega 15d        1.442754
Sharpe 30d      -2.502645
Sortino 30d     -2.823924
Omega 30d        0.597546
Sharpe 60d      -1.983280
Sortino 60d     -2.247786
Omega 60d        0.661262
Sharpe 120d     -0.689896
Sortino 120d    -0.893596
Omega 120d       0.864522
Sharpe 250d      0.125453
Sortino 250d     0.163144
Omega 250d       1.026227
Name: AAON, dtype: float64


df_to_save.loc['ticker_w_inf']


Sharpe 3d          20.423033
Sortino 3d      62933.509287
Omega 3d         5607.562617
Sharpe 5d           4.158110
Sortino 5d          9.157712
Omega 5d            2.153763
Sharpe 10d          2.652507
Sortino 10d         5.082534
Omega 10d           1.576732
Sharpe 15d          2.304908
Sortino 15d         4.073028
Omega 15d           1.442754
Sharpe 30d         -2.502645
Sortino 30d        -2.823924
Omega 30d           0.597546
Sharpe 60d         -1.983280
Sortino 60d        -2.247786
Omega 60d           0.661262
Sharpe 120d        -0.689896
Sortino 120d       -0.893596
Omega 120d          0.864522
Sharpe 250d         0.125453
Sortino 250d        0.163144
Omega 250d          1.026227
Name: AAON, dtype: float64

In [12]:
# Save the cleaned metrics table to dest_path as parquet (always do this).
# dest_path has a fixed file name without a date, so each run writes to the same file.
df_to_save.to_parquet(dest_path)
print(f"DataFrame successfully saved to {dest_path}")



DataFrame successfully saved to ..\data\df_perf_ratios.parquet


In [13]:
import pandas as pd
import numpy as np
from tabulate import tabulate

# Watchlist to eyeball in the final table.
_symbols = ['USFR','AAPL', 'MSFT', 'GOOG', 'NVDA', 'IBIT', 'GLD', 'VCIT']

# .loc with a list raises KeyError as soon as one label is absent, which would
# abort the whole cell if a watchlist symbol is missing from today's data.
# Report the missing ones and tabulate the rest instead.
_missing = [s for s in _symbols if s not in df_to_save.index]
if _missing:
    print(f"Symbols not found in df_to_save (skipped): {_missing}")

# Create and display the formatted table (cleaned values, watchlist order preserved).
selected_df = df_to_save.loc[[s for s in _symbols if s in df_to_save.index]]
print(tabulate(selected_df, headers='keys', tablefmt='grid', floatfmt='.4f'))

+------+-------------+--------------+------------+-------------+--------------+------------+--------------+---------------+-------------+--------------+---------------+-------------+--------------+---------------+-------------+--------------+---------------+-------------+---------------+----------------+--------------+---------------+----------------+--------------+
|      |   Sharpe 3d |   Sortino 3d |   Omega 3d |   Sharpe 5d |   Sortino 5d |   Omega 5d |   Sharpe 10d |   Sortino 10d |   Omega 10d |   Sharpe 15d |   Sortino 15d |   Omega 15d |   Sharpe 30d |   Sortino 30d |   Omega 30d |   Sharpe 60d |   Sortino 60d |   Omega 60d |   Sharpe 120d |   Sortino 120d |   Omega 120d |   Sharpe 250d |   Sortino 250d |   Omega 250d |
| USFR |     -6.7100 |      -8.3992 |     0.2517 |     -1.5471 |      -1.9370 |     0.7560 |      -0.4908 |       -0.7099 |      0.9225 |      -0.2191 |       -0.3744 |      0.9640 |      -1.3801 |       -1.9546 |      0.7994 |       1.3139 |        2.0456 |    