### Fix inf in sharpe 3d

In [1]:
# Configuration

# Look-back window lengths (in days) used when calculating the ratios;
# passed to utils.get_latest_dfs further down in this notebook.
DAYS_RATIO = [3, 5, 10, 15, 30, 60, 120, 250]

# Risk-free rate, passed to utils.analyze_stock as `risk_free_rate`
# (presumably an annualized rate, i.e. 4% -- TODO confirm against utils).
RISK_FREE_RATE = 0.04

In [2]:
# process_files.py
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Echo the run configuration so the cell output shows which inputs were used.
print(f"date_str: {date_str}")
print(f"DEST_DIR: {DEST_DIR}\n")

# Build paths: the cleaned OHLCV parquet that is read below and the
# performance-ratio parquet that is written at the end of the notebook.
source_path = Path(DEST_DIR) / f'df_OHLCV_{date_str}_clean.parquet'
dest_path = Path(DEST_DIR) / 'df_perf_ratios.parquet'  # plain literal: nothing to interpolate

print(f"source_path: {source_path}")
print(f"dest_path: {dest_path}")

date_str: 2025-04-01
DEST_DIR: ..\data

source_path: ..\data\df_OHLCV_2025-04-01_clean.parquet
dest_path: ..\data\df_perf_ratios.parquet


In [3]:
import pandas as pd

# Get tickers from the df_finviz.parquet file stored in DEST_DIR.
# Path(DEST_DIR) replaces a hard-coded '..\data\...' string literal: its "\d"
# sequences are invalid string escapes (a SyntaxWarning on Python 3.12+) and
# its backslashes only act as path separators on Windows.
df_finviz = pd.read_parquet(Path(DEST_DIR) / 'df_finviz.parquet')
tickers = df_finviz.index.to_list()

In [4]:
import pandas as pd

# Read the cleaned OHLCV parquet at source_path and keep only the adjusted
# close, as an independent copy rather than a slice of the loaded frame.
df = pd.read_parquet(source_path, engine='pyarrow')[['Adj Close']].copy()

# Show the frame to verify the load.
display(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close
Symbol,Date,Unnamed: 2_level_1
TBIL,2025-04-01,49.84
TBIL,2025-03-31,50.01
TBIL,2025-03-28,49.99
TBIL,2025-03-27,49.99
TBIL,2025-03-26,49.98
...,...,...
IEI,2024-04-08,111.25
IEI,2024-04-05,111.45
IEI,2024-04-04,111.91
IEI,2024-04-03,111.66


In [5]:
# Summarize the index range, column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 386038 entries, ('TBIL', Timestamp('2025-04-01 00:00:00')) to ('IEI', Timestamp('2024-04-02 00:00:00'))
Data columns (total 1 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Adj Close  386038 non-null  float64
dtypes: float64(1)
memory usage: 4.5+ MB


In [6]:
import pandas as pd

def select_tickers_data(df, tickers):
  """
  Select the rows of ``df`` whose first index level is one of ``tickers``.

  Args:
    df (pd.DataFrame): Input frame with a MultiIndex whose first level is the
      ticker symbol and whose second level is the date.
    tickers (list): Ticker symbols to select. Symbols that do not occur in
      ``df`` are dropped without raising.

  Returns:
    pd.DataFrame: The rows of ``df`` belonging to the requested tickers that
      exist. When none of the tickers is present (or ``tickers`` is empty) a
      message is printed and an empty ``pd.DataFrame()`` is returned.
  """
  # Build the lookup set once, outside the comprehension, so the membership
  # test does not recreate the level-0 Index of the whole frame per ticker.
  available = set(df.index.get_level_values(0))
  valid_tickers = [t for t in tickers if t in available]

  if not valid_tickers:
    print("No valid tickers found in the DataFrame. Returning an empty DataFrame.")
    return pd.DataFrame()

  try:
    return df.loc[valid_tickers]
  except KeyError as e:
    # Not expected after the filtering above; kept as a debugging aid.
    print(f"KeyError after filtering valid tickers: {e}")
    return pd.DataFrame()


# Restrict the price data to the tickers taken from the df_finviz index.
# To try a hand-picked subset instead, pass a list such as
# ["UBS", "AAPL", "MSFT", "GEV"]; symbols missing from df are dropped.
selected_data = select_tickers_data(df, tickers)
print(selected_data)

                   Adj Close
Symbol Date                 
AAPL   2025-04-01     223.19
       2025-03-31     222.13
       2025-03-28     217.90
       2025-03-27     223.85
       2025-03-26     221.53
...                      ...
FNDC   2024-04-08      34.26
       2024-04-05      34.05
       2024-04-04      33.92
       2024-04-03      34.13
       2024-04-02      33.87

[386038 rows x 1 columns]


In [7]:
# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

import sys

# Add src directory to Python path. The membership guard keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
src_dir = str(ROOT_DIR / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

import utils  # project module in src/; must be imported after the sys.path update

# Frames consumed by the metrics loop in the next cell
# (presumably one per window length in DAYS_RATIO -- confirm in utils).
list_dfs = utils.get_latest_dfs(selected_data, DAYS_RATIO)

In [8]:
import numpy as np

# Metrics per ticker, merged across every frame in list_dfs.
all_results = {}

for _df in list_dfs:
    tickers_in_df = _df.index.get_level_values(0).unique()
    for ticker in tickers_in_df:
        # Silence numpy divide-by-zero / invalid-operation warnings, scoped
        # to this one call only.
        with np.errstate(divide='ignore', invalid='ignore'):
            result_df = utils.analyze_stock(_df, ticker, risk_free_rate=RISK_FREE_RATE)

        # Nothing to record for this ticker in this frame.
        if result_df is None:
            continue

        # The first row's index label names the ticker; its values are the metrics.
        ticker_name = result_df.index[0]
        metrics = result_df.iloc[0].to_dict()

        # Merge into the ticker's entry, creating the entry on first sight.
        all_results.setdefault(ticker_name, {}).update(metrics)

if not all_results:
    print("No performance metrics were calculated.")
else:
    combined_df = pd.DataFrame.from_dict(all_results, orient='index')
    print("\nCombined performance metrics DataFrame:")
    print(combined_df)


Combined performance metrics DataFrame:
      Sharpe 3d  Sortino 3d  Omega 3d  Sharpe 5d  Sortino 5d  Omega 5d  \
A     -9.327941  -10.188909  0.092300 -16.261063  -12.153240  0.044361   
AA   -29.558097  -14.840413  0.000000 -20.863322  -13.255455  0.000000   
AAL  -44.112204  -15.384240  0.000000 -39.320129  -14.984988  0.000000   
AAON  12.087042         inf       NaN  -6.463557   -7.328231  0.320867   
AAPL  18.298948         inf       NaN   1.477026    2.209082  1.278318   
...         ...         ...       ...        ...         ...       ...   
ZION  -0.039988   -0.079691  0.992901  -9.627297   -9.799240  0.243597   
ZM    -7.124720   -8.716744  0.223451 -16.687102  -12.314276  0.061065   
ZS    -5.882270   -7.719341  0.312306  -8.778581   -8.905426  0.207729   
ZTO    1.391622    3.177131  1.283041  -0.763550   -1.152891  0.859453   
ZTS   -3.235151   -5.022707  0.552542  -1.559483   -2.284264  0.793318   

      Sharpe 10d  Sortino 10d  Omega 10d  Sharpe 15d  ...  Omega 30d  

In [9]:
import numpy as np
import pandas as pd

def find_nan_inf(df):
    """
    Locate every NaN, +Inf and -Inf entry of a DataFrame, column by column.

    Args:
        df (pd.DataFrame): Frame to scan.

    Returns:
        dict: Maps each affected column name to a dict with the keys
              'NaN', 'Inf' and '-Inf', each holding the list of index
              labels where that value occurs. Columns containing none
              of the three are omitted.
    """
    found = {}
    for col in df.columns:
        series = df[col]
        per_type = {}
        per_type['NaN'] = series.index[series.isnull()].tolist()
        per_type['Inf'] = series.index[series == np.inf].tolist()
        per_type['-Inf'] = series.index[series == -np.inf].tolist()
        # Record the column only when at least one of the lists is non-empty.
        if per_type['NaN'] or per_type['Inf'] or per_type['-Inf']:
            found[col] = per_type
    return found

def print_nan_inf_locations(locations, message):
    """
    Print ``message`` followed by the NaN/Inf index labels of each column.

    Args:
        locations (dict): Mapping of column name to a dict of
            {type: list of index labels}, as returned by find_nan_inf.
        message (str): Heading printed before the listing.
    """
    print(message)

    # Nothing to list: print the notice and stop.
    if not locations:
        print("No NaN or Inf values found.")
        return

    for col in locations:
        print(f"Column: {col}")
        for typ, indices in locations[col].items():
            # Skip value types that do not occur in this column.
            if not indices:
                continue
            print(f"  {typ}: {indices}")

def get_column_replacement_values(df):
    """
    Work out, per numeric column, the values used to replace non-finite entries.

    The largest finite value of a column is the replacement for Inf/NaN and
    the smallest finite value is the replacement for -Inf. A column with no
    finite value at all falls back to 1e5 and -1e5.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        tuple: (max_values, min_values) dictionaries keyed by column name;
               non-numeric columns are not included.
    """
    max_values, min_values = {}, {}

    for col in df.select_dtypes(include=[np.number]).columns:
        column = df[col]
        # Drop NaN, Inf and -Inf before taking the extremes.
        finite_only = column[np.isfinite(column)]

        if finite_only.empty:
            # No finite value to derive the bounds from: use the defaults.
            max_values[col], min_values[col] = 1e5, -1e5
            continue

        max_values[col] = finite_only.max()
        min_values[col] = finite_only.min()

    return max_values, min_values

# Does combined_df contain any NaN, Inf or -Inf?
has_nan_inf = combined_df.isnull().any().any() or not np.isfinite(combined_df).all().all()
print(f'combined_df has_nan_inf: {has_nan_inf}')

if not has_nan_inf:
    # Already clean: save the frame as it is.
    print("No NaN or Inf values found.")
    df_to_save = combined_df
else:
    # Per-column finite max/min that stand in for the non-finite entries.
    max_values, min_values = get_column_replacement_values(combined_df)
    print("Replacing NaN/Inf values with column-specific max/min values")

    nan_inf_locations_before = find_nan_inf(combined_df)
    print_nan_inf_locations(nan_inf_locations_before, "NaN/Inf Locations BEFORE Replacement:")

    # Work on a copy so combined_df keeps the raw values for comparison.
    df_to_save = combined_df.copy()

    for col in df_to_save.columns:
        # Only numeric columns have replacement values.
        if col not in max_values:
            continue
        # NaN and +Inf become the column max, -Inf becomes the column min.
        df_to_save[col] = (
            df_to_save[col]
            .fillna(max_values[col])
            .replace([np.inf], max_values[col])
            .replace([-np.inf], min_values[col])
        )

# Verification: the cleaned frame should report no NaN/Inf.
print(f'df_to_save has_nan_inf: {df_to_save.isnull().any().any() or not np.isfinite(df_to_save).all().all()}')
print_nan_inf_locations(find_nan_inf(df_to_save), "NaN/Inf Locations AFTER Replacement:")

# Output cleaned DataFrame
print(df_to_save)

combined_df has_nan_inf: True
Replacing NaN/Inf values with column-specific max/min values
NaN/Inf Locations BEFORE Replacement:
Column: Sortino 3d
  Inf: ['AAON', 'AAPL', 'ABEV', 'ACI', 'ACIW', 'ACM', 'ACN', 'ACWI', 'ADM', 'ADP', 'ADSK', 'ADT', 'AEE', 'AFG', 'AFL', 'AGG', 'AGI', 'AIZ', 'AKAM', 'ALL', 'ALLE', 'AM', 'AMAT', 'AMCR', 'AMP', 'AMT', 'AN', 'APA', 'AR', 'ARCC', 'ARES', 'ARMK', 'AS', 'ATI', 'ATO', 'ATR', 'AU', 'AUR', 'AVLV', 'AVUS', 'AVUV', 'AVY', 'AWI', 'AXP', 'AXS', 'AXTA', 'AYI', 'BALL', 'BBCA', 'BBUS', 'BBY', 'BEN', 'BERY', 'BIP', 'BIRK', 'BJ', 'BKR', 'BLDR', 'BLV', 'BMO', 'BN', 'BNT', 'BR', 'BRBR', 'BRFS', 'BRK-A', 'BRK-B', 'BSCP', 'BSX', 'BUFR', 'BURL', 'BX', 'BXSL', 'CACC', 'CALF', 'CARR', 'CART', 'CASY', 'CAT', 'CAVA', 'CBRE', 'CCCS', 'CCEP', 'CCI', 'CCK', 'CELH', 'CF', 'CG', 'CGCP', 'CGDV', 'CGUS', 'CHE', 'CHRW', 'CHWY', 'CI', 'CIGI', 'CL', 'CLX', 'CM', 'CMG', 'CMS', 'CNA', 'CNH', 'CNQ', 'COKE', 'COOP', 'COP', 'COST', 'CP', 'CPAY', 'CPRT', 'CQP', 'CR', 'CRBG', 'CRK', 

#### Use the cell below to check the inf, -inf, and NaN replacement

In [10]:
# BSCQ: in the recorded run its values are all finite, so the row should be
# identical before (combined_df) and after (df_to_save) cleaning.
display(combined_df.loc['BSCQ'])
display(df_to_save.loc['BSCQ'])
print('=======')
# ABEV: in the recorded run it has inf in 'Sortino 3d' and NaN in 'Omega 3d';
# df_to_save should show finite replacements and leave the other values unchanged.
display(combined_df.loc['ABEV'])
display(df_to_save.loc['ABEV'])

Sharpe 3d        4.283307
Sortino 3d      13.852584
Omega 3d         2.234086
Sharpe 5d       13.999801
Sortino 5d      45.299842
Omega 5d         6.707244
Sharpe 10d       0.546449
Sortino 10d      0.800819
Omega 10d        1.086077
Sharpe 15d       2.516347
Sortino 15d      4.615279
Omega 15d        1.483528
Sharpe 30d       1.773845
Sortino 30d      2.664171
Omega 30d        1.318463
Sharpe 60d       1.296363
Sortino 60d      2.208993
Omega 60d        1.247677
Sharpe 120d      0.252375
Sortino 120d     0.368535
Omega 120d       1.041556
Sharpe 250d      1.101944
Sortino 250d     1.578969
Omega 250d       1.205028
Name: BSCQ, dtype: float64

Sharpe 3d        4.283307
Sortino 3d      13.852584
Omega 3d         2.234086
Sharpe 5d       13.999801
Sortino 5d      45.299842
Omega 5d         6.707244
Sharpe 10d       0.546449
Sortino 10d      0.800819
Omega 10d        1.086077
Sharpe 15d       2.516347
Sortino 15d      4.615279
Omega 15d        1.483528
Sharpe 30d       1.773845
Sortino 30d      2.664171
Omega 30d        1.318463
Sharpe 60d       1.296363
Sortino 60d      2.208993
Omega 60d        1.247677
Sharpe 120d      0.252375
Sortino 120d     0.368535
Omega 120d       1.041556
Sharpe 250d      1.101944
Sortino 250d     1.578969
Omega 250d       1.205028
Name: BSCQ, dtype: float64



Sharpe 3d       16.485677
Sortino 3d            inf
Omega 3d              NaN
Sharpe 5d        3.466545
Sortino 5d       7.607027
Omega 5d         1.947030
Sharpe 10d      -1.647701
Sortino 10d     -2.257814
Omega 10d        0.762723
Sharpe 15d       5.371036
Sortino 15d      9.572514
Omega 15d        2.254690
Sharpe 30d       6.110339
Sortino 30d     15.569276
Omega 30d        3.051476
Sharpe 60d       4.309522
Sortino 60d      8.809576
Omega 60d        2.074300
Sharpe 120d      0.009914
Sortino 120d     0.014129
Omega 120d       1.001710
Sharpe 250d     -0.132258
Sortino 250d    -0.185550
Omega 250d       0.977714
Name: ABEV, dtype: float64

Sharpe 3d         16.485677
Sortino 3d      6826.028698
Omega 3d         609.110969
Sharpe 5d          3.466545
Sortino 5d         7.607027
Omega 5d           1.947030
Sharpe 10d        -1.647701
Sortino 10d       -2.257814
Omega 10d          0.762723
Sharpe 15d         5.371036
Sortino 15d        9.572514
Omega 15d          2.254690
Sharpe 30d         6.110339
Sortino 30d       15.569276
Omega 30d          3.051476
Sharpe 60d         4.309522
Sortino 60d        8.809576
Omega 60d          2.074300
Sharpe 120d        0.009914
Sortino 120d       0.014129
Omega 120d         1.001710
Sharpe 250d       -0.132258
Sortino 250d      -0.185550
Omega 250d         0.977714
Name: ABEV, dtype: float64

In [11]:
# Save the cleaned DataFrame to the parquet file at dest_path (always do this).
# Any file already at dest_path is replaced.
df_to_save.to_parquet(dest_path)
print(f"DataFrame successfully saved to {dest_path}")



DataFrame successfully saved to ..\data\df_perf_ratios.parquet


In [12]:
import pandas as pd
import numpy as np
from tabulate import tabulate

# Symbols to spot-check in the cleaned ratios table.
# NOTE: .loc with a list raises KeyError if any symbol is missing from df_to_save.
_symbols = ['USFR','AAPL', 'MSFT', 'GOOG', 'NVDA', 'IBIT', 'GLD', 'VCIT']
# Create and display the formatted table (grid layout, 4 decimal places)
selected_df = df_to_save.loc[_symbols] #Use df_to_save here
print(tabulate(selected_df, headers='keys', tablefmt='grid', floatfmt='.4f'))

+------+-------------+--------------+------------+-------------+--------------+------------+--------------+---------------+-------------+--------------+---------------+-------------+--------------+---------------+-------------+--------------+---------------+-------------+---------------+----------------+--------------+---------------+----------------+--------------+
|      |   Sharpe 3d |   Sortino 3d |   Omega 3d |   Sharpe 5d |   Sortino 5d |   Omega 5d |   Sharpe 10d |   Sortino 10d |   Omega 10d |   Sharpe 15d |   Sortino 15d |   Omega 15d |   Sharpe 30d |   Sortino 30d |   Omega 30d |   Sharpe 60d |   Sortino 60d |   Omega 60d |   Sharpe 120d |   Sortino 120d |   Omega 120d |   Sharpe 250d |   Sortino 250d |   Omega 250d |
| USFR |     -6.7065 |      -8.3964 |     0.2520 |      3.9176 |       8.0142 |     2.0097 |      -2.9552 |       -3.9265 |      0.6290 |      -1.4528 |       -2.3539 |      0.7903 |      -1.3773 |       -1.9507 |      0.7998 |       1.3171 |        2.0507 |    