In [None]:
import pandas as pd
import numpy as np

# Scenario: Data starts on Jan 6, but series begins Jan 1 (common with indicators)
dates = pd.date_range("2023-01-01", periods=10, freq="D")
prices = [np.nan, np.nan, np.nan, np.nan, np.nan, 100.0, 101.0, 102.0, 103.0, 104.0]
s = pd.Series(prices, index=dates)

print("Original Series:")
print(s)

# BUGGY: bfill() pulls Jan 6 price ($100) back to Jan 1
start_buggy = s.bfill().iloc[0]  # = 100.0 (from Jan 6!)
end_buggy = s.ffill().iloc[-1]  # = 104.0 (correct)

gain_buggy = (end_buggy / start_buggy) - 1
print(f"\n🐛 BUGGY:  ({end_buggy} / {start_buggy}) - 1 = {gain_buggy:.1%}")
print(f"   Start date uses data from {s.first_valid_index()}!")


# CORRECT: Only use actual available data
s_clean = s.dropna()
start_correct = s_clean.iloc[0]  # = 100.0 (Jan 6 - first real data)
end_correct = s_clean.iloc[-1]  # = 104.0 (Jan 10)

gain_correct = (end_correct / start_correct) - 1
print(f"\n✅ CORRECT: ({end_correct} / {start_correct}) - 1 = {gain_correct:.1%}")
print(f"   Calculated from actual data only")

print(
    f"\n⚠️  In this case both give {gain_correct:.1%}, but if the gap was at the end..."
)

Original Series:
2023-01-01      NaN
2023-01-02      NaN
2023-01-03      NaN
2023-01-04      NaN
2023-01-05      NaN
2023-01-06    100.0
2023-01-07    101.0
2023-01-08    102.0
2023-01-09    103.0
2023-01-10    104.0
Freq: D, dtype: float64

🐛 BUGGY:  (104.0 / 100.0) - 1 = 4.0%
   Start date uses data from 2023-01-06 00:00:00!

✅ CORRECT: (104.0 / 100.0) - 1 = 4.0%
   Calculated from actual data only

⚠️  In this case both give 4.0%, but if the gap was at the end...


In [6]:
import pandas as pd
import numpy as np

# Scenario: Data starts on Jan 6, but series begins Jan 1 (common with indicators)
dates = pd.date_range("2023-01-01", periods=10, freq="D")
prices = [np.nan, np.nan, np.nan, np.nan, np.nan, 100.0, 101.0, 102.0, 103.0, 104.0]
s = pd.Series(prices, index=dates)

print("Original Series:")
print(s)

# BUGGY: bfill() pulls Jan 6 price ($100) back to Jan 1
# start_buggy = s.bfill().iloc[0]  # = 100.0 (from Jan 6!)
# end_buggy = s.ffill().iloc[-1]  # = 104.0 (correct)

# res = (data.ffill().iloc[-1] / data.bfill().iloc[0]) - 1
s_buggy = s.copy()
# s_buggy = s_buggy.ffill().iloc[-1]
s_buggy = s_buggy.bfill().iloc[0]

print(f"\nBuggy Series:")
print(s_buggy)

# gain_buggy = (end_buggy / start_buggy) - 1
# print(f"\n🐛 BUGGY:  ({end_buggy} / {start_buggy}) - 1 = {gain_buggy:.1%}")
# print(f"   Start date uses data from {s.first_valid_index()}!")


# # CORRECT: Only use actual available data
# s_clean = s.dropna()
# start_correct = s_clean.iloc[0]  # = 100.0 (Jan 6 - first real data)
# end_correct = s_clean.iloc[-1]  # = 104.0 (Jan 10)

# gain_correct = (end_correct / start_correct) - 1
# print(f"\n✅ CORRECT: ({end_correct} / {start_correct}) - 1 = {gain_correct:.1%}")
# print(f"   Calculated from actual data only")

# print(
#     f"\n⚠️  In this case both give {gain_correct:.1%}, but if the gap was at the end..."
# )

Original Series:
2023-01-01      NaN
2023-01-02      NaN
2023-01-03      NaN
2023-01-04      NaN
2023-01-05      NaN
2023-01-06    100.0
2023-01-07    101.0
2023-01-08    102.0
2023-01-09    103.0
2023-01-10    104.0
Freq: D, dtype: float64

Buggy Series:
100.0


In [19]:
import pandas as pd
import numpy as np

# --- SETUP: Price data with leading NaN (indicator warmup) and middle gap ---
dates = pd.date_range("2023-01-01", periods=15, freq="D")
prices = [
    np.nan,  # Jan 1 - no data yet (warmup period)
    np.nan,  # Jan 2
    np.nan,  # Jan 3
    100.0,  # Jan 4 - first actual price
    101.0,  # Jan 5
    102.0,  # Jan 6
    np.nan,  # Jan 7 - gap (missing data)
    np.nan,  # Jan 8 - gap
    110.0,  # Jan 9 - price jumps (new data source or split)
    111.0,  # Jan 10
    112.0,  # Jan 11
    np.nan,  # Jan 12 - trailing NaN
    np.nan,  # Jan 13
    115.0,  # Jan 14
    116.0,  # Jan 15
]

data = pd.Series(prices, index=dates)
print("=" * 60)
print("RAW DATA")
print("=" * 60)
print(data)
print()

# --- DEMONSTRATION 1: data.bfill().iloc[0] ---
print("=" * 60)
print("STEP 1: data.bfill().iloc[0]  <-- THE PROBLEM")
print("=" * 60)
bfilled = data.bfill()
print("After bfill():")
print(bfilled)
print()
print(f"data.iloc[0]           = {data.iloc[0]}")


print(f"data.bfill().iloc[0]   = {bfilled.iloc[0]}")
print(f"                       ↑ pulled from {data.first_valid_index()}!")
print()

# Show exactly what happened
first_valid_date = data.first_valid_index()
first_valid_value = data.loc[first_valid_date]

print(f"🚨 LOOK-AHEAD BIAS: Jan 1 shows ${bfilled.iloc[0]}")


print(f"   But on Jan 1, you didn't know price was ${first_valid_value}")
print(f"   That data only became available on {first_valid_date.strftime('%b %d')}!")
print()

# --- DEMONSTRATION 2: data.ffill().iloc[-1] ---
print("=" * 60)
print("STEP 2: data.ffill().iloc[-1]  <-- Usually OK")
print("=" * 60)
ffilled = data.ffill()
print("After ffill():")
print(ffilled)
print()

print(f"data.iloc[-1]          = {data.iloc[-1]}")
print(f"data.ffill().iloc[-1]  = {ffilled.iloc[-1]}")
print(f"                       ← carried forward from last valid")
print()

# The dangerous calculation
print("=" * 60)
print("STEP 3: THE BUGGY CALCULATION")
print("=" * 60)
start_price = data.bfill().iloc[0]  # 100.0 (from Jan 4, known on Jan 4)
end_price = data.ffill().iloc[-1]  # 116.0 (from Jan 15, known on Jan 15)

buggy_gain = (end_price / start_price) - 1
print(f"Start: {start_price} (bfill pulled from future)")
print(f"End:   {end_price} (ffill is fine here)")
print(f"Gain:  ({end_price}/{start_price}) - 1 = {buggy_gain:.1%}")
print()

# What it should be
print("=" * 60)
print("STEP 4: CORRECT CALCULATION")
print("=" * 60)
clean = data.dropna()
correct_start = clean.iloc[0]  # 100.0 on Jan 4
correct_end = clean.iloc[-1]  # 116.0 on Jan 15
correct_gain = (correct_end / correct_start) - 1

print(f"Start: {correct_start} at {clean.index[0]} (first valid)")
print(f"End:   {correct_end} at {clean.index[-1]} (last valid)")
print(f"Gain:  ({correct_end}/{correct_start}) - 1 = {correct_gain:.1%}")
print()

print("=" * 60)
print("VERDICT")
print("=" * 60)
print(f"In this case: Both give {correct_gain:.1%} → Bias is hidden!")
print(f"But conceptually: bfill().iloc[0] uses FUTURE information")
print(f"If first valid price was different, results would diverge...")

RAW DATA
2023-01-01      NaN
2023-01-02      NaN
2023-01-03      NaN
2023-01-04    100.0
2023-01-05    101.0
2023-01-06    102.0
2023-01-07      NaN
2023-01-08      NaN
2023-01-09    110.0
2023-01-10    111.0
2023-01-11    112.0
2023-01-12      NaN
2023-01-13      NaN
2023-01-14    115.0
2023-01-15    116.0
Freq: D, dtype: float64

STEP 1: data.bfill().iloc[0]  <-- THE PROBLEM
After bfill():
2023-01-01    100.0
2023-01-02    100.0
2023-01-03    100.0
2023-01-04    100.0
2023-01-05    101.0
2023-01-06    102.0
2023-01-07    110.0
2023-01-08    110.0
2023-01-09    110.0
2023-01-10    111.0
2023-01-11    112.0
2023-01-12    115.0
2023-01-13    115.0
2023-01-14    115.0
2023-01-15    116.0
Freq: D, dtype: float64

data.iloc[0]           = nan
data.bfill().iloc[0]   = 100.0
                       ↑ pulled from 2023-01-04 00:00:00!

🚨 LOOK-AHEAD BIAS: Jan 1 shows $100.0
   But on Jan 1, you didn't know price was $100.0
   That data only became available on Jan 04!

STEP 2: data.ffill().iloc[

In [20]:
import pandas as pd
import numpy as np

# --- SETUP: Price data with leading NaN (indicator warmup) and middle gap ---
dates = pd.date_range("2023-01-01", periods=15, freq="D")
prices = [
    np.nan,  # Jan 1 - no data yet (warmup period)
    np.nan,  # Jan 2
    np.nan,  # Jan 3
    100.0,  # Jan 4 - first actual price
    101.0,  # Jan 5
    102.0,  # Jan 6
    np.nan,  # Jan 7 - gap (missing data)
    np.nan,  # Jan 8 - gap
    110.0,  # Jan 9 - price jumps (new data source or split)
    111.0,  # Jan 10
    112.0,  # Jan 11
    np.nan,  # Jan 12 - trailing NaN
    np.nan,  # Jan 13
    115.0,  # Jan 14
    # 116.0,  # Jan 15
    np.nan,  # Jan 15
]

data = pd.Series(prices, index=dates)
print("=" * 60)
print("RAW DATA")
print("=" * 60)
print(data)
print()

# --- DEMONSTRATION 1: data.bfill().iloc[0] ---
print("=" * 60)
print("STEP 1: data.bfill().iloc[0]  <-- THE PROBLEM")
print("=" * 60)
bfilled = data.bfill()
print("After bfill():")
print(bfilled)
print()
print(f"data.iloc[0]           = {data.iloc[0]}")


print(f"data.bfill().iloc[0]   = {bfilled.iloc[0]}")
print(f"                       ↑ pulled from {data.first_valid_index()}!")
print()

# Show exactly what happened
first_valid_date = data.first_valid_index()
first_valid_value = data.loc[first_valid_date]

print(f"🚨 LOOK-AHEAD BIAS: Jan 1 shows ${bfilled.iloc[0]}")


print(f"   But on Jan 1, you didn't know price was ${first_valid_value}")
print(f"   That data only became available on {first_valid_date.strftime('%b %d')}!")
print()

# --- DEMONSTRATION 2: data.ffill().iloc[-1] ---
print("=" * 60)
print("STEP 2: data.ffill().iloc[-1]  <-- Usually OK")
print("=" * 60)
ffilled = data.ffill()
print("After ffill():")
print(ffilled)
print()

print(f"data.iloc[-1]          = {data.iloc[-1]}")
print(f"data.ffill().iloc[-1]  = {ffilled.iloc[-1]}")
print(f"                       ← carried forward from last valid")
print()

# The dangerous calculation
print("=" * 60)
print("STEP 3: THE BUGGY CALCULATION")
print("=" * 60)
start_price = data.bfill().iloc[0]  # 100.0 (from Jan 4, known on Jan 4)
end_price = data.ffill().iloc[-1]  # 116.0 (from Jan 15, known on Jan 15)

buggy_gain = (end_price / start_price) - 1
print(f"Start: {start_price} (bfill pulled from future)")
print(f"End:   {end_price} (ffill is fine here)")
print(f"Gain:  ({end_price}/{start_price}) - 1 = {buggy_gain:.1%}")
print()

# What it should be
print("=" * 60)
print("STEP 4: CORRECT CALCULATION")
print("=" * 60)
clean = data.dropna()
correct_start = clean.iloc[0]  # 100.0 on Jan 4
correct_end = clean.iloc[-1]  # 116.0 on Jan 15
correct_gain = (correct_end / correct_start) - 1

print(f"Start: {correct_start} at {clean.index[0]} (first valid)")
print(f"End:   {correct_end} at {clean.index[-1]} (last valid)")
print(f"Gain:  ({correct_end}/{correct_start}) - 1 = {correct_gain:.1%}")
print()

print("=" * 60)
print("VERDICT")
print("=" * 60)
print(f"In this case: Both give {correct_gain:.1%} → Bias is hidden!")
print(f"But conceptually: bfill().iloc[0] uses FUTURE information")
print(f"If first valid price was different, results would diverge...")

RAW DATA
2023-01-01      NaN
2023-01-02      NaN
2023-01-03      NaN
2023-01-04    100.0
2023-01-05    101.0
2023-01-06    102.0
2023-01-07      NaN
2023-01-08      NaN
2023-01-09    110.0
2023-01-10    111.0
2023-01-11    112.0
2023-01-12      NaN
2023-01-13      NaN
2023-01-14    115.0
2023-01-15      NaN
Freq: D, dtype: float64

STEP 1: data.bfill().iloc[0]  <-- THE PROBLEM
After bfill():
2023-01-01    100.0
2023-01-02    100.0
2023-01-03    100.0
2023-01-04    100.0
2023-01-05    101.0
2023-01-06    102.0
2023-01-07    110.0
2023-01-08    110.0
2023-01-09    110.0
2023-01-10    111.0
2023-01-11    112.0
2023-01-12    115.0
2023-01-13    115.0
2023-01-14    115.0
2023-01-15      NaN
Freq: D, dtype: float64

data.iloc[0]           = nan
data.bfill().iloc[0]   = 100.0
                       ↑ pulled from 2023-01-04 00:00:00!

🚨 LOOK-AHEAD BIAS: Jan 1 shows $100.0
   But on Jan 1, you didn't know price was $100.0
   That data only became available on Jan 04!

STEP 2: data.ffill().iloc[

In [24]:
import pandas as pd
import numpy as np

print("=" * 60)
print("THE SANITIZATION TRAP: inf → NaN → 0")
print("=" * 60)

# Setup: Strategy hits a zero price (delisting, error, or bad tick)
prices = pd.Series([100.0, 0.0, 50.0])  # Day1=100, Day2=0 (crash), Day3=50 (recovery?)

print(f"\nPrices: {prices.tolist()}")
print(f"Day 1: ${prices.iloc[0]}")
print(f"Day 2: ${prices.iloc[1]} ← ZERO")
print(f"Day 3: ${prices.iloc[2]}")

# Real total return (Day3 vs Day1)
real_gain = (prices.iloc[-1] / prices.iloc[0]) - 1
print(f"\nActual gain (Day3/Day1): {real_gain:+.1%}")

# Buggy intermediate calculation (Day3/Day2)
print(f"\n--- Buggy Calculation (Day3/Day2) ---")
numerator = prices.iloc[2]  # 50
denominator = prices.iloc[1]  # 0
raw_result = (numerator / denominator) - 1  # 50/0 - 1 = inf

print(f"({numerator} / {denominator}) - 1 = {raw_result}")

# The sanitization chain
print(f"\n--- Sanitization Chain ---")
step1 = np.inf  # raw_result
print(f"Raw:           {step1}")

step2 = pd.Series([step1]).replace([np.inf, -np.inf], np.nan).iloc[0]
print(f"After replace: {step2} (inf → NaN)")

step3 = pd.Series([step2]).fillna(0.0).iloc[0]
print(f"After fillna:  {step3} (NaN → 0.0)")

print(f"\n>>> Reported gain: {step3:+.1%}")
print(f">>> Actual gain:   {real_gain:+.1%}")
print(f">>> ERROR: Hidden {real_gain - step3:+.1%} loss!")

print("\n" + "=" * 60)
print("WHY THIS IS DANGEROUS IN BACKTESTS")
print("=" * 60)
print(
    """Your backtest shows:
  - Strategy broke even (0%)
  
Reality:
  - Strategy lost 50% (if comparing to Day 1)
  - Or hit undefined state (if calculating from Day 2)
  
The .fillna(0.0) turned a disaster into "meh".
"""
)

print("=" * 60)
print("CORRECT HANDLING")
print("=" * 60)


def safe_gain(start, end):
    """Return gain only if calculation is valid."""
    if start == 0 or not np.isfinite(start) or not np.isfinite(end):
        return np.nan  # Explicit: we can't calculate this
    return (end / start) - 1


safe = safe_gain(prices.iloc[0], prices.iloc[-1])
print(f"Safe gain (Day1→Day3): {safe:+.1%}")

# If you really tried Day2→Day3 (invalid)
unsafe = safe_gain(prices.iloc[1], prices.iloc[2])
print(f"Safe gain (Day2→Day3): {unsafe} ← Rejected (division by zero)")

THE SANITIZATION TRAP: inf → NaN → 0

Prices: [100.0, 0.0, 50.0]
Day 1: $100.0
Day 2: $0.0 ← ZERO
Day 3: $50.0

Actual gain (Day3/Day1): -50.0%

--- Buggy Calculation (Day3/Day2) ---
(50.0 / 0.0) - 1 = inf

--- Sanitization Chain ---
Raw:           inf
After replace: nan (inf → NaN)
After fillna:  0.0 (NaN → 0.0)

>>> Reported gain: +0.0%
>>> Actual gain:   -50.0%
>>> ERROR: Hidden -50.0% loss!

WHY THIS IS DANGEROUS IN BACKTESTS
Your backtest shows:
  - Strategy broke even (0%)

Reality:
  - Strategy lost 50% (if comparing to Day 1)
  - Or hit undefined state (if calculating from Day 2)

The .fillna(0.0) turned a disaster into "meh".

CORRECT HANDLING
Safe gain (Day1→Day3): -50.0%
Safe gain (Day2→Day3): nan ← Rejected (division by zero)


  raw_result = (numerator / denominator) - 1  # 50/0 - 1 = inf


In [30]:
import pandas as pd
import numpy as np

print("=" * 60)
print("THE SANITIZATION TRAP: inf → NaN → 0")
print("=" * 60)

# Setup: Strategy hits a zero price (delisting, error, or bad tick)
prices = pd.Series([100.0, 0.0, 50.0])  # Day1=100, Day2=0 (crash), Day3=50 (recovery?)

print(f"\nPrices: {prices.tolist()}")
print(f"Day 1: ${prices.iloc[0]}")
print(f"Day 2: ${prices.iloc[1]} ← ZERO")
print(f"Day 3: ${prices.iloc[2]}\n")

gain = (prices.ffill().iloc[-1] / prices.bfill().iloc[0]) - 1
print(f"gain:\n{gain}\n")
gain.replace([np.inf, -np.inf], np.nan).fillna(0.0)
print(f"gain_after_replace:\n{gain}\n")

THE SANITIZATION TRAP: inf → NaN → 0

Prices: [100.0, 0.0, 50.0]
Day 1: $100.0
Day 2: $0.0 ← ZERO
Day 3: $50.0

gain:
-0.5



AttributeError: 'numpy.float64' object has no attribute 'replace'

In [31]:
import pandas as pd
import numpy as np

print("=" * 60)
print("THE SANITIZATION TRAP: inf → NaN → 0")
print("=" * 60)

# Setup: Strategy hits a zero price (delisting, error, or bad tick)
prices = pd.Series([100.0, 0.0, 50.0])  # Day1=100, Day2=0 (crash), Day3=50 (recovery?)

print(f"\nPrices: {prices.tolist()}")
print(f"Day 1: ${prices.iloc[0]}")
print(f"Day 2: ${prices.iloc[1]} ← ZERO")
print(f"Day 3: ${prices.iloc[2]}\n")

gain = (prices.ffill().iloc[-1] / prices.bfill().iloc[0]) - 1
print(f"gain type: {type(gain)}")  # <class 'numpy.float64'>
print(f"gain value: {gain}\n")

# ERROR: gain is a scalar float, not a pandas object
# gain.replace([np.inf, -np.inf], np.nan)  # AttributeError!

# FIX 1: Use pandas Series wrapper
gain_series = pd.Series([gain])
gain_clean = gain_series.replace([np.inf, -np.inf], np.nan).fillna(0.0)
print(f"gain_clean (Series): {gain_clean.iloc[0]}\n")

# FIX 2: Use numpy directly (better for scalars)
if np.isinf(gain) or np.isnan(gain):
    gain_numpy = 0.0
else:
    gain_numpy = gain
print(f"gain_numpy (scalar): {gain_numpy}\n")

# FIX 3: One-liner with numpy where
gain_where = np.where(np.isfinite(gain), gain, 0.0)
print(f"gain_where: {gain_where}\n")

# FIX 4: The pandas way that works on Series (original context)
print("=" * 60)
print("ORIGINAL CONTEXT: Works on Series, not scalar")
print("=" * 60)
returns = pd.Series([0.05, np.inf, -0.02, np.nan, 0.03])
print(f"Original returns:\n{returns}\n")

sanitized = returns.replace([np.inf, -np.inf], np.nan).fillna(0.0)
print(f"Sanitized (inf→NaN→0):\n{sanitized}")

THE SANITIZATION TRAP: inf → NaN → 0

Prices: [100.0, 0.0, 50.0]
Day 1: $100.0
Day 2: $0.0 ← ZERO
Day 3: $50.0

gain type: <class 'numpy.float64'>
gain value: -0.5

gain_clean (Series): -0.5

gain_numpy (scalar): -0.5

gain_where: -0.5

ORIGINAL CONTEXT: Works on Series, not scalar
Original returns:
0    0.05
1     inf
2   -0.02
3     NaN
4    0.03
dtype: float64

Sanitized (inf→NaN→0):
0    0.05
1    0.00
2   -0.02
3    0.00
4    0.03
dtype: float64


In [32]:
# If gain is a scalar (your error):
gain = float(gain)
if not np.isfinite(gain):
    gain = 0.0

# Or the one-liner:
gain = 0.0 if not np.isfinite(gain) else float(gain)

In [None]:
prices = pd.Series([100.0, 0.0, 50.0])

In [None]:
# Calculate daily price gain for the prices Series
daily_gain = prices.pct_change()
print("Daily price gain:")
print(daily_gain)

In [40]:
import pandas as pd
import numpy as np

print("=" * 60)
print("THE SANITIZATION TRAP: inf → NaN → 0")
print("=" * 60)

# Setup: Strategy hits a zero price (delisting, error, or bad tick)
prices = pd.Series([100.0, 0.0, 50.0])  # Day1=100, Day2=0 (crash), Day3=50 (recovery?)

print(f"\nPrices: {prices.tolist()}")
print(f"Day 1: ${prices.iloc[0]}")
print(f"Day 2: ${prices.iloc[1]} ← ZERO")
print(f"Day 3: ${prices.iloc[2]}\n")

gain = prices.pct_change()
print("Daily price gain:")
print(gain)
print()

print(f"gain type: {type(gain)}")  # <class 'numpy.float64'>
print(f"gain value:\n{gain}\n")

# ERROR: gain is a scalar float, not a pandas object
gain_replace_inf = gain.replace([np.inf, -np.inf], np.nan)  # AttributeError!
print(f"gain value after replace +/-inf with nan:\n{gain_replace_inf}\n")

gain_replace_inf_fillna = gain.replace([np.inf, -np.inf], np.nan).fillna(0.0)
print(f"gain value after replace +/-inf with nan fillna:\n{gain_replace_inf_fillna}\n")


# # FIX 1: Use pandas Series wrapper
# gain_series = pd.Series([gain])
# gain_clean = gain_series.replace([np.inf, -np.inf], np.nan).fillna(0.0)
# print(f"gain_clean (Series): {gain_clean.iloc[0]}\n")

# # FIX 2: Use numpy directly (better for scalars)
# if np.isinf(gain) or np.isnan(gain):
#     gain_numpy = 0.0
# else:
#     gain_numpy = gain
# print(f"gain_numpy (scalar): {gain_numpy}\n")

# # FIX 3: One-liner with numpy where
# gain_where = np.where(np.isfinite(gain), gain, 0.0)
# print(f"gain_where: {gain_where}\n")

# # FIX 4: The pandas way that works on Series (original context)
# print("=" * 60)
# print("ORIGINAL CONTEXT: Works on Series, not scalar")
# print("=" * 60)
# returns = pd.Series([0.05, np.inf, -0.02, np.nan, 0.03])
# print(f"Original returns:\n{returns}\n")

# sanitized = returns.replace([np.inf, -np.inf], np.nan).fillna(0.0)
# print(f"Sanitized (inf→NaN→0):\n{sanitized}")

THE SANITIZATION TRAP: inf → NaN → 0

Prices: [100.0, 0.0, 50.0]
Day 1: $100.0
Day 2: $0.0 ← ZERO
Day 3: $50.0

Daily price gain:
0    NaN
1   -1.0
2    inf
dtype: float64

gain type: <class 'pandas.core.series.Series'>
gain value:
0    NaN
1   -1.0
2    inf
dtype: float64

gain value after replace +/-inf with nan:
0    NaN
1   -1.0
2    NaN
dtype: float64

gain value after replace +/-inf with nan fillna:
0    0.0
1   -1.0
2    0.0
dtype: float64



In [None]:
############################
############################

In [36]:
import pandas as pd
import numpy as np

from dataclasses import dataclass, field, fields, asdict, is_dataclass
from typing import List, Dict, Optional, Any, Union, TypedDict, Tuple

pd.set_option("display.max_rows", 500)

In [3]:
data_path = (
    r"c:\Users\ping\Files_win10\python\py311\stocks\data\df_OHLCV_stocks_etfs.parquet"
)
df_ohlcv = pd.read_parquet(data_path, engine="pyarrow")

In [5]:
df_ohlcv.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9488873 entries, ('A', Timestamp('1999-11-18 00:00:00')) to ('ZWS', Timestamp('2026-02-04 00:00:00'))
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Adj Open   float64
 1   Adj High   float64
 2   Adj Low    float64
 3   Adj Close  float64
 4   Volume     int64  
dtypes: float64(4), int64(1)
memory usage: 398.9+ MB


In [42]:
num_tickers = 100

rng = np.random.default_rng(50)

# 1. Get unique tickers from the first level of the MultiIndex
unique_tickers = df_ohlcv.index.unique(level=0)

# 2. Randomly sample 10 tickers
sample_ticker = rng.choice(unique_tickers, size=num_tickers, replace=False)
# random_tickers = np.random.choice(unique_tickers, size=num_tickers, replace=False)

# 3. Extract all rows for the selected tickers
df_subset = df_ohlcv.loc[sample_ticker]

print(df_subset.info())
print()
print(df_subset.head())

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 600058 entries, ('SPTS', Timestamp('2011-12-01 00:00:00')) to ('PEP', Timestamp('2026-02-04 00:00:00'))
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Adj Open   600058 non-null  float64
 1   Adj High   600058 non-null  float64
 2   Adj Low    600058 non-null  float64
 3   Adj Close  600058 non-null  float64
 4   Volume     600058 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 25.9+ MB
None

                   Adj Open  Adj High  Adj Low  Adj Close  Volume
Ticker Date                                                      
SPTS   2011-12-01   23.8162   23.8162  23.8082    23.8082    3278
       2011-12-02   23.8082   23.8082  23.8082    23.8082       0
       2011-12-05   23.7844   23.7844  23.7844    23.7844    3908
       2011-12-06   23.7844   23.7844  23.7844    23.7844       0
       2011-12-07   23.8003   23.8082  23.8003    23.8082    4413


In [45]:
# Essential for performance
df_subset = df_subset.sort_index()

# Most efficient range selection
idx = pd.IndexSlice
df_period = df_subset.loc[idx[:, "2000-01-01":"2005-01-01"], :]

print(df_period.info())
print()
print(df_period.head())

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 59842 entries, ('ADM', Timestamp('2000-01-03 00:00:00')) to ('ZION', Timestamp('2004-12-31 00:00:00'))
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Adj Open   59842 non-null  float64
 1   Adj High   59842 non-null  float64
 2   Adj Low    59842 non-null  float64
 3   Adj Close  59842 non-null  float64
 4   Volume     59842 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 3.2+ MB
None

                   Adj Open  Adj High  Adj Low  Adj Close   Volume
Ticker Date                                                       
ADM    2000-01-03   6.01376   6.04509  5.95112    6.01376  1782311
       2000-01-04   5.91980   6.10773  5.91980    5.95112  1969282
       2000-01-05   5.95112   5.95112  5.82583    5.85716  1969082
       2000-01-06   5.82583   5.95112  5.79451    5.88848  1628863
       2000-01-07   5.95112   6.01377  5.91980    5.98244  2147074


In [46]:
df_period.index

MultiIndex([( 'ADM', '2000-01-03'),
            ( 'ADM', '2000-01-04'),
            ( 'ADM', '2000-01-05'),
            ( 'ADM', '2000-01-06'),
            ( 'ADM', '2000-01-07'),
            ( 'ADM', '2000-01-10'),
            ( 'ADM', '2000-01-11'),
            ( 'ADM', '2000-01-12'),
            ( 'ADM', '2000-01-13'),
            ( 'ADM', '2000-01-14'),
            ...
            ('ZION', '2004-12-17'),
            ('ZION', '2004-12-20'),
            ('ZION', '2004-12-21'),
            ('ZION', '2004-12-22'),
            ('ZION', '2004-12-23'),
            ('ZION', '2004-12-27'),
            ('ZION', '2004-12-28'),
            ('ZION', '2004-12-29'),
            ('ZION', '2004-12-30'),
            ('ZION', '2004-12-31')],
           names=['Ticker', 'Date'], length=59842)

In [47]:
pd.set_option("display.max_rows", None)
print(df_period)

                      Adj Open     Adj High     Adj Low    Adj Close  \
Ticker Date                                                            
ADM    2000-01-03     6.013760     6.045090    5.951120     6.013760   
       2000-01-04     5.919800     6.107730    5.919800     5.951120   
       2000-01-05     5.951120     5.951120    5.825830     5.857160   
       2000-01-06     5.825830     5.951120    5.794510     5.888480   
       2000-01-07     5.951120     6.013770    5.919800     5.982440   
       2000-01-10     6.013770     6.107730    5.982440     5.982440   
       2000-01-11     5.982440     5.982440    5.888480     5.888480   
       2000-01-12     5.919800     6.076410    5.888480     6.045090   
       2000-01-13     6.013760     6.170370    5.982440     6.013760   
       2000-01-14     6.076410     6.295660    5.982440     6.264340   
       2000-01-18     6.295650     6.640190    6.295650     6.546230   
       2000-01-19     6.546240     6.734170    6.514910     6.70

In [48]:
# 1. Identify rows with any NaN, then extract unique tickers from Level 0
# nan_tickers = (
#     df_period[df_period.isna().any(axis=1)].index.get_level_values(0).unique().tolist()
# )

nan_tickers = (
    df_ohlcv[df_ohlcv.isna().any(axis=1)].index.get_level_values(0).unique().tolist()
)
# nan_tickers = df_period[df_period.isna().any(axis=1)]

nan_tickers
# # 2. To see the count per column for those specific tickers
# nan_summary = df_period.loc[nan_tickers].isna().sum(level=0)

[]

In [None]:
import pandas as pd
import numpy as np


def generate_ohlcv(
    tickers,
    start_date,
    end_date,
    price_range=(100, 200),
    vol_range=(0.01, 0.05),
    seed=24,
):
    """
    Generate realistic OHLCV dataframe with proper High/Low constraints.

    Parameters
    ----------
    tickers : list
        List of ticker symbols (e.g., ["AAPL", "GOOG"])
    start_date : str
        Start date in "YYYY-MM-DD" format
    end_date : str
        End date in "YYYY-MM-DD" format
    price_range : tuple, optional
        Min and max price for Open generation, default (100, 200)
    vol_range : tuple, optional
        Min and max daily volatility range (as decimal), default (0.01, 0.05)
    seed : int, optional
        Random seed for reproducibility, default 24

    Returns
    -------
    pd.DataFrame
        MultiIndex DataFrame with Ticker, Date and OHLCV columns
    """
    np.random.seed(seed)

    # Create date range (business days)
    dates = pd.date_range(start=start_date, end=end_date, freq="B")

    # Build MultiIndex
    index = pd.MultiIndex.from_product([tickers, dates], names=["Ticker", "Date"])
    n = len(index)

    # Generate prices with realistic intraday structure
    open_prices = np.random.uniform(price_range[0], price_range[1], n)
    daily_range = np.random.uniform(vol_range[0], vol_range[1], n)

    # High must be >= Open, Low must be <= Open
    high_prices = open_prices * (1 + np.random.uniform(0, daily_range, n))
    low_prices = open_prices * (1 - np.random.uniform(0, daily_range, n))

    # Close between Low and High
    close_weight = np.random.uniform(0, 1, n)
    close_prices = low_prices + close_weight * (high_prices - low_prices)

    # Assemble
    data = {
        "Open": open_prices,
        "High": high_prices,
        "Low": low_prices,
        "Close": close_prices,
        "Volume": np.random.randint(1_000_000, 2_000_000, n),
    }

    df = pd.DataFrame(data=data, index=index)

    # Compact but complete
    assert (df["High"] == df[["Open", "High", "Low", "Close"]].max(axis=1)).all()
    assert (df["Low"] == df[["Open", "High", "Low", "Close"]].min(axis=1)).all()

    return df


# Test it
df = generate_ohlcv(["AAPL", "GOOG", "NVDA"], "2000-01-01", "2000-01-10")
print(df)

                         Open        High         Low       Close   Volume
Ticker Date                                                               
AAPL   2000-01-03  196.001730  201.285433  191.655301  200.213832  1123971
       2000-01-04  169.951205  174.768239  169.193125  170.782370  1655112
       2000-01-05  199.986729  207.518664  194.974010  203.247967  1340197
       2000-01-06  122.006730  122.635438  119.781567  122.555873  1965105
       2000-01-07  136.105635  138.035193  133.414880  137.096687  1732605
       2000-01-10  173.984099  178.026223  171.713308  172.846252  1345949
GOOG   2000-01-03  199.645573  201.039625  194.348560  199.598858  1207121
       2000-01-04  131.634698  132.374107  126.338406  132.193808  1878650
       2000-01-05  113.654458  115.914085  113.035943  114.080167  1217698
       2000-01-06  138.398001  141.148607  138.242716  138.498103  1910622
       2000-01-07  132.051928  136.351517  127.700316  130.674888  1887264
       2000-01-10  136.64

In [105]:
import pandas as pd
import numpy as np


def generate_ohlcv(tickers, start_date, end_date, seed=24):
    """
    Generate realistic OHLCV data with proper High/Low constraints.

    Parameters
    ----------
    tickers : list
        List of ticker symbols (e.g., ["AAPL", "GOOG"])
    start_date : str
        Start date in "YYYY-MM-DD" format
    end_date : str
        End date in "YYYY-MM-DD" format
    seed : int, optional
        Random seed for reproducibility, default 24

    Returns
    -------
    pd.DataFrame
        MultiIndex DataFrame with Ticker, Date and OHLCV columns
    """
    np.random.seed(seed)

    # Create date range (business days)
    dates = pd.date_range(start=start_date, end=end_date, freq="B")

    # Build MultiIndex
    index = pd.MultiIndex.from_product([tickers, dates], names=["Ticker", "Date"])
    n = len(index)

    # Generate prices with realistic intraday structure
    open_prices = np.random.uniform(100, 200, n)
    daily_range = np.random.uniform(0.01, 0.05, n)

    # High must be >= Open, Low must be <= Open
    high_prices = open_prices * (1 + np.random.uniform(0, daily_range, n))
    low_prices = open_prices * (1 - np.random.uniform(0, daily_range, n))

    # Close between Low and High
    close_weight = np.random.uniform(0, 1, n)
    close_prices = low_prices + close_weight * (high_prices - low_prices)

    # Assemble
    data = {
        "Open": open_prices,
        "High": high_prices,
        "Low": low_prices,
        "Close": close_prices,
        "Volume": np.random.randint(1_000_000, 2_000_000, n),
    }

    df = pd.DataFrame(data=data, index=index)

    # Optional: validate constraints (remove after testing)
    assert (df["High"] >= df[["Open", "Close"]].max(axis=1)).all()
    assert (df["Low"] <= df[["Open", "Close"]].min(axis=1)).all()

    return df


# Test it
df = generate_ohlcv(["AAPL", "GOOG", "NVDA"], "2000-01-01", "2000-01-10")
print(df)

                         Open        High         Low       Close   Volume
Ticker Date                                                               
AAPL   2000-01-03  196.001730  201.285433  191.655301  200.213832  1123971
       2000-01-04  169.951205  174.768239  169.193125  170.782370  1655112
       2000-01-05  199.986729  207.518664  194.974010  203.247967  1340197
       2000-01-06  122.006730  122.635438  119.781567  122.555873  1965105
       2000-01-07  136.105635  138.035193  133.414880  137.096687  1732605
       2000-01-10  173.984099  178.026223  171.713308  172.846252  1345949
GOOG   2000-01-03  199.645573  201.039625  194.348560  199.598858  1207121
       2000-01-04  131.634698  132.374107  126.338406  132.193808  1878650
       2000-01-05  113.654458  115.914085  113.035943  114.080167  1217698
       2000-01-06  138.398001  141.148607  138.242716  138.498103  1910622
       2000-01-07  132.051928  136.351517  127.700316  130.674888  1887264
       2000-01-10  136.64

In [None]:
def generate_df_ohlcv(tickers, start_date, end_date)

In [None]:
np.random.seed(24)
tickers = ["AAPL", "GOOG", "NVDA"]
start_date = "2000-01-01"
end_date = "2000-01-10"

dates = pd.date_range(start=start_date, end=end_date, freq="B")
index = pd.MultiIndex.from_product([tickers, dates], names=["Ticker", "Date"])
n = len(index)

# Start with Open
open_prices = np.random.uniform(100, 200, n)  # price between 100 - 200

# Intraday volatility (High-Low range)
daily_range = np.random.uniform(0.01, 0.05, n)  # 1% - 5% price range

# Generate High and Low first
high_prices = open_prices * (1 + np.random.uniform(0, daily_range, n))
low_prices = open_prices * (1 - np.random.uniform(0, daily_range, n))

# Close must be between Low and High
close_weight = np.random.uniform(0, 1, n)  # Where in the range
close_prices = low_prices + close_weight * (high_prices - low_prices)

# Assemble
data = {
    "Open": open_prices,
    "High": high_prices,
    "Low": low_prices,
    "Close": close_prices,
    "Volume": np.random.randint(1_000_000, 2_000_000, n),
}
df = pd.DataFrame(data=data, index=index)
print(df)

                         Open        High         Low       Close   Volume
Ticker Date                                                               
AAPL   2000-01-03  196.001730  201.285433  191.655301  200.213832  1123971
       2000-01-04  169.951205  174.768239  169.193125  170.782370  1655112
       2000-01-05  199.986729  207.518664  194.974010  203.247967  1340197
       2000-01-06  122.006730  122.635438  119.781567  122.555873  1965105
       2000-01-07  136.105635  138.035193  133.414880  137.096687  1732605
       2000-01-10  173.984099  178.026223  171.713308  172.846252  1345949
GOOG   2000-01-03  199.645573  201.039625  194.348560  199.598858  1207121
       2000-01-04  131.634698  132.374107  126.338406  132.193808  1878650
       2000-01-05  113.654458  115.914085  113.035943  114.080167  1217698
       2000-01-06  138.398001  141.148607  138.242716  138.498103  1910622
       2000-01-07  132.051928  136.351517  127.700316  130.674888  1887264
       2000-01-10  136.64

In [92]:
np.random.seed(24)
tickers = ["AAPL", "GOOG", "NVDA"]
dates = pd.date_range(start="2000-01-01", end="2000-01-10", freq="B")
index = pd.MultiIndex.from_product([tickers, dates], names=["Ticker", "Date"])
print(dates)
print()
print(index)

n = len(index)
data = {
    "Open": np.random.uniform(10, 20, n),
    "High": np.random.uniform(10, 20, n),
    "Low": np.random.uniform(10, 20, n),
    "Close": np.random.uniform(10, 20, n),
    "Volume": np.random.randint(1_000_000, 2_000_000, n),
}
# print(data.shape)
print(data)
df = pd.DataFrame(data=data, index=index)
print()
print(df)

DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
               '2000-01-07', '2000-01-10'],
              dtype='datetime64[ns]', freq='B')

MultiIndex([('AAPL', '2000-01-03'),
            ('AAPL', '2000-01-04'),
            ('AAPL', '2000-01-05'),
            ('AAPL', '2000-01-06'),
            ('AAPL', '2000-01-07'),
            ('AAPL', '2000-01-10'),
            ('GOOG', '2000-01-03'),
            ('GOOG', '2000-01-04'),
            ('GOOG', '2000-01-05'),
            ('GOOG', '2000-01-06'),
            ('GOOG', '2000-01-07'),
            ('GOOG', '2000-01-10'),
            ('NVDA', '2000-01-03'),
            ('NVDA', '2000-01-04'),
            ('NVDA', '2000-01-05'),
            ('NVDA', '2000-01-06'),
            ('NVDA', '2000-01-07'),
            ('NVDA', '2000-01-10')],
           names=['Ticker', 'Date'])
{'Open': array([19.60017303, 16.9951205 , 19.99867293, 12.200673  , 13.61056354,
       17.3984099 , 19.96455725, 13.16346978, 11.3654458 , 13.8398001

In [None]:
df_subset = df_subset.sort_index()

In [None]:
@staticmethod
def calculate_gain_fast(data: Union[pd.Series, pd.DataFrame]) -> float:
    """Optimized for hot path in RL training loop."""
    try:
        if isinstance(data, pd.DataFrame):
            # Pre-allocate, vectorized
            arr = data.values
            # Column-wise first/last valid
            mask = ~np.isnan(arr)
            gains = []
            for i in range(arr.shape[1]):
                col = arr[:, i]
                valid = col[mask[:, i]]
                if valid.size < 2 or valid[0] == 0:
                    gains.append(0.0)
                else:
                    g = (valid[-1] / valid[0]) - 1
                    gains.append(g if np.isfinite(g) else 0.0)
            return float(np.mean(gains))

        # Series: direct numpy (fastest)
        arr = data.values
        valid = arr[~np.isnan(arr)]
        if valid.size < 2 or valid[0] == 0:
            return 0.0
        result = (valid[-1] / valid[0]) - 1
        return float(result) if np.isfinite(result) else 0.0

    except Exception:
        return 0.0

In [None]:
import timeit

s = pd.Series(np.random.randn(1000) + 100)


# Original
def orig(d):
    if d.empty:
        return 0.0
    res = (d.ffill().iloc[-1] / d.bfill().iloc[0]) - 1
    print(f"res: {res}")
    if isinstance(res, pd.Series):
        return res.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return float(res) if np.isfinite(res) else 0.0


# Fixed
def fixed(d):
    return calculate_gain_fast(d)


# orig(s)
# print(f"Original: {timeit.timeit(lambda: orig(s), number=10000):.4f}s")
# print(f"Fixed:    {timeit.timeit(lambda: fixed(s), number=10000):.4f}s")
# Fixed is ~3-5x faster and correct

In [58]:
orig(s)

res: -0.03639802411143811


-0.03639802411143811

In [None]:
print(f"s:\n{s}")

s:
0      100.514619
1      100.134139
2       99.984792
3      100.828208
4       99.183533
          ...    
995     99.957080
996    100.652860
997     99.867585
998    100.901085
999    100.459286
Length: 1000, dtype: float64
