### Fix inf values in Sharpe 3d

In [1]:
# Configuration

# Window lengths, in days, used when calculating the ratios
DAYS_RATIO = [
    3, 5, 10, 15,
    30, 60, 120, 250,
]

# Risk-free rate supplied to the ratio calculations
RISK_FREE_RATE = 0.04

In [2]:
# process_files.py
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Echo the run configuration (a blank line follows DEST_DIR)
print(f"date_str: {date_str}\nDEST_DIR: {DEST_DIR}\n")
# print(f"DOWNLOAD_DIR: {DOWNLOAD_DIR}")

# Input: the cleaned OHLCV pickle for this date; output: the performance-ratio pickle
source_path = Path(DEST_DIR).joinpath(f'df_OHLCV_{date_str}_clean.pkl')
dest_path = Path(DEST_DIR).joinpath('df_perf_ratios.pkl')

print(f"source_path: {source_path}\ndest_path: {dest_path}")

date_str: 2025-03-14
DEST_DIR: ..\data

source_path: ..\data\df_OHLCV_2025-03-14_clean.pkl
dest_path: ..\data\df_perf_ratios.pkl


In [3]:
import pandas as pd

# Get tickers from the df_finviz.pkl file.
# The path is built from DEST_DIR with pathlib rather than the literal
# '..\data\df_finviz.pkl': that literal relied on the invalid escape
# sequence '\d' and hard-coded a Windows path separator.
df_finviz = pd.read_pickle(Path(DEST_DIR) / 'df_finviz.pkl')
# The frame's index holds the ticker symbols
tickers = df_finviz.index.to_list()

In [4]:
import pandas as pd

# Read the cleaned OHLCV pickle and keep only the 'Adj Close' column
df = pd.read_pickle(source_path)[['Adj Close']].copy()

# Render the resulting frame to verify the load
display(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close
Symbol,Date,Unnamed: 2_level_1
UBS,2025-03-14,32.73
UBS,2025-03-13,31.71
UBS,2025-03-12,31.94
UBS,2025-03-11,31.38
UBS,2025-03-10,31.88
...,...,...
PCVX,2024-03-21,67.64
PCVX,2024-03-20,69.13
PCVX,2024-03-19,67.60
PCVX,2024-03-18,68.04


In [5]:
# Summarise the frame: index type and range, column dtypes, non-null counts, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 344750 entries, ('UBS', Timestamp('2025-03-14 00:00:00')) to ('PCVX', Timestamp('2024-03-15 00:00:00'))
Data columns (total 1 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Adj Close  344750 non-null  float64
dtypes: float64(1)
memory usage: 4.0+ MB


In [6]:
import pandas as pd

def select_tickers_data(df, tickers):
    """
    Selects data for a list of tickers from a DataFrame with a MultiIndex
    where the first level is the ticker and the second level is the date.

    Args:
        df (pd.DataFrame): The input DataFrame with a MultiIndex.
        tickers (list): A list of ticker symbols to select.

    Returns:
        pd.DataFrame: A DataFrame containing only the data for the specified tickers.
                      Rows for tickers not found will not be included. An empty
                      DataFrame is returned when none of the tickers are present.
    """
    # Materialise the first index level once; rebuilding it for every ticker
    # repeats the same full-length extraction on each membership test.
    level0_values = df.index.get_level_values(0)
    valid_tickers = [t for t in tickers if t in level0_values]  # Drop tickers that do not exist

    if not valid_tickers:
        print("No valid tickers found in the DataFrame. Returning an empty DataFrame.")
        return pd.DataFrame()

    try:
        return df.loc[valid_tickers]
    except KeyError as e:
        # Defensive: report the problem rather than raising to the caller.
        print(f"KeyError after filtering valid tickers: {e}")
        return pd.DataFrame()


# Example usage:
# ticker_list = ["UBS", "AAPL", "MSFT", "GEV"]  # Include some valid and invalid tickers
# Keep only the rows whose first index level is one of the tickers taken from df_finviz
selected_data = select_tickers_data(df, tickers)
print(selected_data)

                   Adj Close
Symbol Date                 
AAPL   2025-03-14     213.49
       2025-03-13     209.68
       2025-03-12     216.98
       2025-03-11     220.84
       2025-03-10     227.48
...                      ...
AIRR   2024-03-21      65.66
       2024-03-20      64.36
       2024-03-19      63.41
       2024-03-18      63.21
       2024-03-15      63.25

[344750 rows x 1 columns]


In [7]:
# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

import sys

# Add src directory to Python path, but only once: re-running this cell
# would otherwise append a duplicate entry to sys.path each time.
_src_dir = str(ROOT_DIR / 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Must come after the sys.path update so the module under src/ can be found
import utils

# Build the list of frames consumed by the metrics loop from the selected price data
list_dfs = utils.get_latest_dfs(selected_data, DAYS_RATIO)

In [8]:
import numpy as np

# ticker -> {metric name: value}, merged across every frame in list_dfs
all_results = {}

for _df in list_dfs:
    tickers_in_df = _df.index.get_level_values(0).unique()
    for ticker in tickers_in_df:
        # Silence numpy divide/invalid floating-point warnings for this call only
        with np.errstate(divide='ignore', invalid='ignore'):
            result_df = utils.analyze_stock(_df, ticker, risk_free_rate=RISK_FREE_RATE)

        # Nothing to record when the analysis produced no result
        if result_df is None:
            continue

        # The result's first index label is the ticker; its first row holds the metrics
        ticker_name = result_df.index[0]
        metrics = result_df.iloc[0].to_dict()

        # Merge into the ticker's existing entry, creating it on first sight
        all_results.setdefault(ticker_name, {}).update(metrics)

if not all_results:
    print("No performance metrics were calculated.")
else:
    # One row per ticker, one column per metric
    combined_df = pd.DataFrame.from_dict(all_results, orient='index')
    print("\nCombined performance metrics DataFrame:")
    print(combined_df)


Combined performance metrics DataFrame:
      Sharpe 3d  Sortino 3d  Omega 3d  Sharpe 5d  Sortino 5d  Omega 5d  \
A      2.391301    6.077265  1.541406  -0.967958   -1.857841  0.850624   
AA     1.559067    3.621075  1.322591   9.832251   22.572613  3.843882   
AAPL  -3.420852   -5.243674  0.532856 -10.609635  -10.355952  0.222952   
ABBV  -5.482265   -7.366660  0.343726 -14.058346  -11.366243  0.036393   
ABEV  58.252193         inf       NaN   6.300423   13.580392  2.710969   
...         ...         ...       ...        ...         ...       ...   
ZG    -2.603426   -4.226575  0.623467  -2.300781   -3.165323  0.702736   
ZM     0.821347    1.772382  1.157896   1.832998    3.315179  1.356618   
ZS     0.209591    0.427158  1.038054   1.851402    3.250818  1.349685   
ZTO   25.822797         inf       NaN  -0.275099   -0.456699  0.958836   
ZTS   -1.819928   -3.132050  0.720975 -11.861555  -10.493258  0.104667   

      Sharpe 10d  Sortino 10d  Omega 10d  Sharpe 15d  ...  Omega 30d  

In [9]:
import numpy as np
import pandas as pd

def find_nan_inf(df):
    """
    Locate NaN, +Inf, and -Inf entries in every column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame to inspect.

    Returns:
        dict: Maps each column that has at least one such entry to a dict
              with keys 'NaN', 'Inf', and '-Inf', each holding the list of
              index labels where that kind of value occurs. Columns with
              no such entries are omitted.
    """
    report = {}
    for column in df.columns:
        series = df[column]
        labels = series.index
        found = {
            'NaN': labels[series.isnull()].tolist(),
            'Inf': labels[series == np.inf].tolist(),
            '-Inf': labels[series == -np.inf].tolist(),
        }
        # Record the column only when at least one list is non-empty
        if any(found.values()):
            report[column] = found
    return report

def print_nan_inf_locations(locations, message):
    """Print `message`, then each column's non-empty NaN/Inf index lists.

    When `locations` is empty, a single "nothing found" line is printed instead.
    """
    print(message)
    if not locations:
        print("No NaN or Inf values found.")
        return

    for column, by_kind in locations.items():
        print(f"Column: {column}")
        for kind, labels in by_kind.items():
            # Skip kinds that have no occurrences in this column
            if not labels:
                continue
            print(f"  {kind}: {labels}")

def get_column_replacement_values(df, default_bound=1e5):
    """
    Calculate replacement values for each numeric column
    (max for Inf/NaN, min for -Inf), based on the column's finite values.

    Args:
        df (pd.DataFrame): Input DataFrame. Only numeric columns are considered.
        default_bound (float): Magnitude used when a column has no finite
            values at all: the max becomes `default_bound` and the min
            becomes `-default_bound`. Defaults to 1e5.

    Returns:
        tuple: (max_values, min_values) dictionaries keyed by column name.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    max_values = {}
    min_values = {}

    for col in numeric_cols:
        series = df[col]
        # Keep finite entries only (NaN and +/-Inf are excluded)
        finite_vals = series[np.isfinite(series)]
        if len(finite_vals) > 0:
            max_values[col] = finite_vals.max()
            min_values[col] = finite_vals.min()
        else:
            # No finite value to derive bounds from; fall back to the default
            max_values[col] = default_bound
            min_values[col] = -default_bound

    return max_values, min_values

# Detect whether any cell of the combined frame is NaN or infinite
has_nan_inf = combined_df.isnull().any().any() or not np.isfinite(combined_df).all().all()
print(f'combined_df has_nan_inf: {has_nan_inf}')

if not has_nan_inf:
    print("No NaN or Inf values found.")
    df_to_save = combined_df
else:
    # Per-column bounds taken from each column's finite values
    max_values, min_values = get_column_replacement_values(combined_df)
    print("Replacing NaN/Inf values with column-specific max/min values")

    nan_inf_locations_before = find_nan_inf(combined_df)
    print_nan_inf_locations(nan_inf_locations_before, "NaN/Inf Locations BEFORE Replacement:")

    # Work on a copy so combined_df keeps the raw values for comparison
    df_to_save = combined_df.copy()

    for col in df_to_save.columns:
        # Only numeric columns have an entry in max_values
        if col not in max_values:
            continue
        # NaN and +Inf become the column max; -Inf becomes the column min
        df_to_save[col] = (
            df_to_save[col]
            .fillna(max_values[col])
            .replace([np.inf], max_values[col])
            .replace([-np.inf], min_values[col])
        )

# Verification: re-run the same checks on the frame that will be saved
print(f'df_to_save has_nan_inf: {df_to_save.isnull().any().any() or not np.isfinite(df_to_save).all().all()}')
print_nan_inf_locations(find_nan_inf(df_to_save), "NaN/Inf Locations AFTER Replacement:")

# Output cleaned DataFrame
print(df_to_save)

combined_df has_nan_inf: True
Replacing NaN/Inf values with column-specific max/min values
NaN/Inf Locations BEFORE Replacement:
Column: Sharpe 3d
  -Inf: ['BSCQ', 'BSCR']
Column: Sortino 3d
  Inf: ['ABEV', 'ACGL', 'ACWV', 'ADC', 'AEE', 'AEM', 'AEP', 'AER', 'AES', 'AFG', 'AFL', 'AGI', 'AIG', 'AIZ', 'ALL', 'ALLE', 'AMAT', 'AMGN', 'AMT', 'ANSS', 'AR', 'ASND', 'AU', 'AUR', 'AVTR', 'AWK', 'AZN', 'BA', 'BABA', 'BAP', 'BAX', 'BBD', 'BCH', 'BF-A', 'BF-B', 'BG', 'BHP', 'BIL', 'BP', 'BR', 'BRK-A', 'BRK-B', 'BRO', 'BSAC', 'CB', 'CDNS', 'CHDN', 'CHE', 'CHRW', 'CHT', 'CI', 'CINF', 'CMS', 'CNA', 'CNP', 'COR', 'CVX', 'D', 'DGX', 'DOW', 'DRS', 'DTE', 'DUK', 'EA', 'EBR', 'EC', 'ED', 'EDU', 'EG', 'EIX', 'ELV', 'ENB', 'ERJ', 'ES', 'ESLT', 'ETR', 'EVRG', 'EWU', 'EWZ', 'EXC', 'EXE', 'FCX', 'FDL', 'FE', 'FIS', 'FNDE', 'FNF', 'FNV', 'FTI', 'FTS', 'FXI', 'G', 'GBIL', 'GD', 'GDX', 'GDXJ', 'GLD', 'GLDM', 'GLW', 'GOLD', 'GRAB', 'GSK', 'GUNR', 'HALO', 'HCA', 'HDV', 'HESM', 'HIG', 'HII', 'HON', 'HRL', 'HUM', 'IAU

#### Use the cell below to check the inf, -inf, and NaN replacements

In [10]:
# Raw row, then cleaned row, for BSCQ (listed under -Inf for 'Sharpe 3d' in the report above)
display(combined_df.loc['BSCQ'], df_to_save.loc['BSCQ'])
print('=======')
# Raw row, then cleaned row, for ABEV (listed under Inf for 'Sortino 3d' in the report above)
display(combined_df.loc['ABEV'], df_to_save.loc['ABEV'])

Sharpe 3d            -inf
Sortino 3d     -15.874508
Omega 3d         0.000000
Sharpe 5d      -22.264769
Sortino 5d     -13.507096
Omega 5d         0.000000
Sharpe 10d      -6.334970
Sortino 10d     -6.658983
Omega 10d        0.353700
Sharpe 15d      -1.432219
Sortino 15d     -1.853466
Omega 15d        0.794955
Sharpe 30d       0.493266
Sortino 30d      0.807582
Omega 30d        1.086414
Sharpe 60d       0.756043
Sortino 60d      1.127886
Omega 60d        1.137477
Sharpe 120d     -0.789356
Sortino 120d    -1.053189
Omega 120d       0.879003
Sharpe 250d      1.160820
Sortino 250d     1.654491
Omega 250d       1.213368
Name: BSCQ, dtype: float64

Sharpe 3d      -416.776453
Sortino 3d      -15.874508
Omega 3d          0.000000
Sharpe 5d       -22.264769
Sortino 5d      -13.507096
Omega 5d          0.000000
Sharpe 10d       -6.334970
Sortino 10d      -6.658983
Omega 10d         0.353700
Sharpe 15d       -1.432219
Sortino 15d      -1.853466
Omega 15d         0.794955
Sharpe 30d        0.493266
Sortino 30d       0.807582
Omega 30d         1.086414
Sharpe 60d        0.756043
Sortino 60d       1.127886
Omega 60d         1.137477
Sharpe 120d      -0.789356
Sortino 120d     -1.053189
Omega 120d        0.879003
Sharpe 250d       1.160820
Sortino 250d      1.654491
Omega 250d        1.213368
Name: BSCQ, dtype: float64



Sharpe 3d       58.252193
Sortino 3d            inf
Omega 3d              NaN
Sharpe 5d        6.300423
Sortino 5d      13.580392
Omega 5d         2.710969
Sharpe 10d       9.692998
Sortino 10d     29.534024
Omega 10d        6.533087
Sharpe 15d       8.889358
Sortino 15d     30.630226
Omega 15d        5.534262
Sharpe 30d       6.288916
Sortino 30d     19.717549
Omega 30d        3.528572
Sharpe 60d       1.284537
Sortino 60d      1.883595
Omega 60d        1.257583
Sharpe 120d      0.036723
Sortino 120d     0.052245
Omega 120d       1.006293
Sharpe 250d     -0.303003
Sortino 250d    -0.422983
Omega 250d       0.949455
Name: ABEV, dtype: float64

Sharpe 3d          58.252193
Sortino 3d      10952.881635
Omega 3d          976.760249
Sharpe 5d           6.300423
Sortino 5d         13.580392
Omega 5d            2.710969
Sharpe 10d          9.692998
Sortino 10d        29.534024
Omega 10d           6.533087
Sharpe 15d          8.889358
Sortino 15d        30.630226
Omega 15d           5.534262
Sharpe 30d          6.288916
Sortino 30d        19.717549
Omega 30d           3.528572
Sharpe 60d          1.284537
Sortino 60d         1.883595
Omega 60d           1.257583
Sharpe 120d         0.036723
Sortino 120d        0.052245
Omega 120d          1.006293
Sharpe 250d        -0.303003
Sortino 250d       -0.422983
Omega 250d          0.949455
Name: ABEV, dtype: float64

In [11]:
# Persist the cleaned ratios to dest_path as a pickle.
# Always run this cell: df_to_save is defined whether or not any values were replaced.
df_to_save.to_pickle(dest_path)
print(f"DataFrame successfully saved to {dest_path}")



DataFrame successfully saved to ..\data\df_perf_ratios.pkl


In [12]:
import pandas as pd
import numpy as np
from tabulate import tabulate

# Symbols to spot-check in the cleaned ratios
_symbols = [
    'USFR', 'AAPL', 'MSFT', 'GOOG',
    'NVDA', 'IBIT', 'GLD', 'VCIT',
]

# Pull those rows from df_to_save (the cleaned frame) and print them as a grid table
selected_df = df_to_save.loc[_symbols]
print(tabulate(selected_df, headers='keys', tablefmt='grid', floatfmt='.4f'))

+------+-------------+--------------+------------+-------------+--------------+------------+--------------+---------------+-------------+--------------+---------------+-------------+--------------+---------------+-------------+--------------+---------------+-------------+---------------+----------------+--------------+---------------+----------------+--------------+
|      |   Sharpe 3d |   Sortino 3d |   Omega 3d |   Sharpe 5d |   Sortino 5d |   Omega 5d |   Sharpe 10d |   Sortino 10d |   Omega 10d |   Sharpe 15d |   Sortino 15d |   Omega 15d |   Sharpe 30d |   Sortino 30d |   Omega 30d |   Sharpe 60d |   Sortino 60d |   Omega 60d |   Sharpe 120d |   Sortino 120d |   Omega 120d |   Sharpe 250d |   Sortino 250d |   Omega 250d |
| USFR |      5.2371 |      19.6354 |     2.7493 |     -0.5308 |      -1.1491 |     0.9164 |      -0.2920 |       -0.4635 |      0.9526 |      -0.2263 |       -0.3428 |      0.9611 |       1.2113 |        1.8822 |      1.2252 |       3.3734 |        5.8339 |    