In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
import numpy as np



In [None]:
# Location of the predictions export; CSV and Parquet dumps are both accepted.
predictions_path = "/Users/sagardhal/Desktop/Practice/personal-stock/results/predictions_20250908_092155.parquet"   # example

# Reject unknown extensions first, then pick the matching pandas reader.
if not predictions_path.endswith((".csv", ".parquet")):
    raise ValueError("Unsupported file format for predictions")
df = pd.read_csv(predictions_path) if predictions_path.endswith(".csv") else pd.read_parquet(predictions_path)

print("Data loaded:", df.shape)
df.head()  # not the last expression of the cell, so nothing is rendered

# All later cells work on this copy; `df` keeps the frame exactly as loaded.
new_df = df.copy()

In [None]:
# Every column whose name starts with 'pred' is treated as a prediction
PREDICTIONS = [column for column in new_df.columns if column.startswith('pred')]
PREDICTIONS

In [None]:
new_df[(new_df.split=='test')].Date.nunique()

In [None]:
# Number of positive `pred13_rf_thresh_80` signals per Date on the TEST split (one-column frame)
pred13_rf_thresh_80 = (
    new_df[(new_df.split=='test') & (new_df.pred13_rf_thresh_80==1)]
    .groupby('Date')['pred13_rf_thresh_80']
    .count()
    .to_frame()
)

In [None]:
pred13_rf_thresh_80.hist()

In [None]:
# Function to find all predictions (columns starting with 'pred'), generate is_correct (correctness of each prediction)
# and print precision on the TEST dataset (assuming there is a df["split"] column with values 'train','validation','test')

# returns 2 lists of features: PREDICTIONS and IS_CORRECT

def get_predictions_correctness(df:pd.DataFrame, to_predict:str):
  """Flag each prediction as correct/incorrect and print its precision on the TEST split.

  Only the frame passed as `df` is read and modified (no notebook-level frame is touched).

  Args:
    df: frame with a 'split' column, the target column `to_predict` and any number of
      prediction columns whose names start with 'pred'. It is modified in place: one
      `is_correct_<prefix>` column (1 = prediction equals the target, 0 = otherwise) is
      added per prediction, where <prefix> is the text before the first '_' of the
      prediction column name.
    to_predict: name of the target column the predictions are compared with.

  Returns:
    (PREDICTIONS, IS_CORRECT): the prediction column names and the names of all columns
    starting with 'is_correct_' that exist in `df` after the call.
  """
  PREDICTIONS = [k for k in df.keys() if k.startswith('pred')]
  print(f'Prediction columns founded: {PREDICTIONS}')

  # add columns is_correct_<prefix> and remember which prediction produced which column,
  # so the report below never has to pair the two lists by position
  is_correct_column_of = {}
  for pred in PREDICTIONS:
    part1 = pred.split('_')[0] # first prefix before '_'
    is_correct_column = f'is_correct_{part1}'
    df[is_correct_column] = (df[pred] == df[to_predict]).astype(int)
    is_correct_column_of[pred] = is_correct_column

  # IS_CORRECT features set
  IS_CORRECT =  [k for k in df.keys() if k.startswith('is_correct_')]
  print(f'Created columns is_correct: {IS_CORRECT}')

  print('Precision on TEST set for each prediction:')
  # "Precision" = share of correct answers among the positive predictions on the TEST split
  for prediction_column, is_correct_column in is_correct_column_of.items():
    positives_on_test = df[(df['split']=='test') & (df[prediction_column]==1)]
    # recomputed from the prediction itself: two predictions sharing a prefix share one
    # is_correct_ column, which only holds the values of the last of them
    hits = (positives_on_test[prediction_column] == positives_on_test[to_predict]).astype(int).rename(is_correct_column)
    print(f'Prediction column:{prediction_column} , is_correct_column: {is_correct_column}')
    print(hits.value_counts())
    print(hits.value_counts()/len(positives_on_test))
    print('---------')

  return PREDICTIONS, IS_CORRECT

In [None]:
# Binary target column that every 'pred*' column is compared with
to_predict = 'is_positive_growth_30d_future'


# Adds the is_correct_* columns to new_df and prints precision per prediction on the TEST split
PREDICTIONS, IS_CORRECT = get_predictions_correctness(df = new_df, to_predict= to_predict)

In [None]:
new_df.groupby('split').Date.agg(['min','max'])

In [None]:
# Calculate fin. result for ALL predictions (manual and produced by models)

sim1_results = [] # one tuple of aggregated results per prediction

# Simulation assumptions, named once instead of being repeated as magic numbers
SIM1_BET = 100           # $ invested on every positive prediction
SIM1_FEE_RATE = 0.002    # fees as a fraction of the bet: 0.2% for buy+sell
SIM1_HOLDING_DAYS = 30   # capital estimate assumes 30 days in a row with positive predictions
SIM1_TEST_YEARS = 4      # length of the TEST period used for the CAGR

sim1_is_test = (new_df.split=='test')

# Iterate over all predictions
for pred in PREDICTIONS:
  # records on the TEST set where the current prediction is 1 (we invest $100, result is taken from growth_future_30d)
  filter_test_and_positive_pred = sim1_is_test&(new_df[pred]==1)
  sim1_count_investments = int(filter_test_and_positive_pred.sum())

  print(f'Calculating sumulation for prediction {pred}:')
  print(f"    Count times of investment {sim1_count_investments} out of {int(sim1_is_test.sum())} TEST records")

  # Prefix: e.g. pred1 or pred10
  pred_prefix= pred.split('_')[0]
  gross_rev_column = 'sim1_gross_rev_'+pred_prefix
  fees_column = 'sim1_fees_'+pred_prefix
  net_rev_column = 'sim1_net_rev_'+pred_prefix

  # Fin. result columns: non-zero only for records with a positive prediction
  new_df[gross_rev_column] = new_df[pred] * SIM1_BET * (new_df['growth_future_30d']-1)
  new_df[fees_column] = -new_df[pred] * SIM1_BET * SIM1_FEE_RATE
  new_df[net_rev_column] = new_df[gross_rev_column] + new_df[fees_column]

  # calculate agg. results for the current prediction on TEST
  # (the subset is taken after the columns above were added, so it contains them)
  sim1_test_positive = new_df[filter_test_and_positive_pred]
  sim1_totals = sim1_test_positive[[gross_rev_column,fees_column,net_rev_column]].sum()
  sim1_gross_rev = sim1_test_positive[gross_rev_column].sum()
  sim1_fees = sim1_test_positive[fees_column].sum()
  sim1_net_rev = sim1_test_positive[net_rev_column].sum()

  if sim1_gross_rev>0:
    sim1_fees_percentage = -sim1_fees/sim1_gross_rev
  else:
    sim1_fees_percentage = None

  if sim1_count_investments>0:
    sim1_average_net_revenue = sim1_net_rev/sim1_count_investments
  else:
    sim1_average_net_revenue = None

  # APPROXIMATE CAPITAL REQUIRED and CAGR Calculation
  df_investments_count_daily = pd.DataFrame(sim1_test_positive.groupby('Date')[pred].count())
  sim1_avg_investments_per_day = df_investments_count_daily[pred].mean()
  sim1_q75_investments_per_day = df_investments_count_daily[pred].quantile(0.75)  # 75% case - how many $100 investments per day do we have?
  sim1_capital = SIM1_BET * SIM1_HOLDING_DAYS * sim1_q75_investments_per_day

  # CAGR: average growth per year. E.g. if you have 1.5 return (50% growth in 4 years) --> (1.5)**(1/4) = 1.106 or 10.6% average
  # Undefined (NaN) when there is no capital at all or when more than the whole capital is lost
  if sim1_capital>0 and (sim1_capital+sim1_net_rev)>=0:
    sim1_CAGR = ((sim1_capital+sim1_net_rev)/sim1_capital)**(1/SIM1_TEST_YEARS)
  else:
    sim1_CAGR = np.nan

  # append to the results array
  sim1_results.append((pred,sim1_count_investments,sim1_gross_rev,sim1_fees,sim1_net_rev,sim1_fees_percentage,sim1_average_net_revenue,sim1_avg_investments_per_day,sim1_capital,sim1_CAGR))


  # detailed output only for predictions with more than one positive prediction
  if  sim1_count_investments>1:
    print(f"    Financial Result: \n {sim1_totals}")
    print(f"        Count Investments in 4 years (on TEST): {sim1_count_investments}")
    print(f"        Gross Revenue: ${int(sim1_gross_rev)}")
    print(f"        Fees (0.2% for buy+sell): ${int(-sim1_fees)}")
    print(f"        Net Revenue: ${int(sim1_net_rev)}")
    if sim1_gross_rev!=0:
      print(f"        Fees are {int(-100.0*sim1_fees/sim1_gross_rev)} % from Gross Revenue")
    else:
      # the share is undefined for a zero gross revenue (int(inf) would raise)
      print(f"        Fees are n/a % from Gross Revenue")
    print(f"        Capital Required : ${int(sim1_capital)} (Vbegin)")
    print(f"        Final value (Vbegin + Net_revenue) : ${int(sim1_capital + sim1_net_rev)} (Vfinal)")

    print(f"        Average CAGR on TEST (4 years) : {np.round(sim1_CAGR,3)}, or {np.round(100.0*(sim1_CAGR-1),1)}% ")

    print(f"        Average daily stats: ")
    print(f"            Average net revenue per investment: ${np.round(sim1_net_rev/sim1_count_investments,2)} ")
    print(f"            Average investments per day: {int(np.round(sim1_avg_investments_per_day))} ")
    print(f"            Q75 investments per day: {int(np.round(sim1_q75_investments_per_day))} ")
    print('=============================================+')


# results in a DataFrame from an Array
columns_simulation = ['prediction', 'sim1_count_investments', 'sim1_gross_rev', 'sim1_fees', 'sim1_net_rev', 'sim1_fees_percentage','sim1_average_net_revenue','sim1_avg_investments_per_day','sim1_capital','sim1_CAGR']

df_sim1_results = pd.DataFrame(sim1_results,columns=columns_simulation)

In [None]:
# Capital growth multiple: (net revenue + required capital) / required capital
df_sim1_results['sim1_growth_capital_4y'] = (df_sim1_results.sim1_net_rev+df_sim1_results.sim1_capital) / df_sim1_results.sim1_capital

In [None]:
df_sim1_results

In [None]:
import plotly.express as px

# Bubble chart of all simulated predictions: x = average investments per day, y = CAGR,
# bubble size = capital growth multiple. Rows with any missing value are dropped first.
fig = px.scatter(
    df_sim1_results.dropna(),
    x='sim1_avg_investments_per_day',
    y='sim1_CAGR',
    size='sim1_growth_capital_4y',  # bubble size follows the 4-year capital growth
    text='prediction',
    title='Compound Annual Growth vs. Time spent (Average investments per day)',
    # keys must be the names of the plotted columns, otherwise the label is never applied
    labels={
        'sim1_avg_investments_per_day': 'Average investments per day',
        'sim1_CAGR': 'CAGR (annual growth multiple)',
        'sim1_growth_capital_4y': '4-Year Capital Growth',
    }
)

# Put the prediction names above the bubbles to keep them readable
fig.update_traces(textposition='top center')

# Show the plot
fig.show()

In [None]:
# detailed info from the bubble chart above on the winners
# top 1 result
df_sim1_results[df_sim1_results.prediction.isin(['pred21_ens_auto1p_or_top3'])]

## Advanced Simulation
* to_predict = 'is_positive_growth_30d_future'
* invest ~3% of capital each day (sell positions from 30 days ago), REINVEST PREVIOUS GAINS
* use predict_proba predictions
* select top1..3..x predictions > threshold (0.53)
* invest proportionally to the prediction
* stop loss y%
* take profit z%
* Not included: portfolio optimization

In [None]:
new_df.columns.to_list()

In [None]:
# https://stackoverflow.com/questions/17775935/sql-like-window-functions-in-pandas-row-numbering-in-python-pandas-dataframe
# rank of prediction: within each Date, rows are ranked by rf_prob_30d in descending order
# (rank 1 = highest probability of the day); method="first" breaks ties by order of appearance

new_df["rf_pred_rank"] = new_df.groupby("Date")["rf_prob_30d"].rank(method="first", ascending=False)



In [None]:
new_df = new_df.sort_values(by=['Ticker', 'Date'])
new_df.head()

In [None]:
new_df['growth_future_30d'].describe().T

In [None]:
new_df['growth_future_30d'].quantile(0.95)

In [None]:
# stop loss when -11%
new_df['growth_future_30d'].quantile(0.1)

In [None]:
# check a sample on the test dataset
new_df[new_df.split=='test'][['Date','High','Low','Close','Ticker','ticker_type','growth_future_30d']].sample(10)

In [None]:
# Define a function to get the rolling max High and min Low for the next 30 trading days

# Sort the DataFrame: Sorting by Ticker and Date ensures that we are looking at each ticker's data in chronological order.
# Rolling window calculation: We use the rolling method with a window of `window` rows (30 by default) to calculate the maximum High and minimum Low over the window that starts at the current row.
# The shift method is used to align these values correctly with the current row.

def rolling_max_min(df, window=30):
    """Attach forward-looking High/Low extremes to the rows of one ticker.

    The input is expected to hold a single ticker in chronological order. It is not
    modified: the new columns are added to a copy, so the function is safe to use on
    groupby chunks.

    Args:
        df: frame with 'High' and 'Low' columns.
        window: number of rows in the look-ahead window; the current row is the first
            row of the window.

    Returns:
        A copy of `df` with three extra columns:
        'Max_High_Next_30' - max High over the `window` rows starting at the current row,
        'Min_Low_Next_30'  - min Low over the same rows
        (both are NaN for the last `window`-1 rows, where the window is incomplete),
        'Min_Low_Next_1'   - Low of the following row (NaN for the last row).
        The column names keep the '30' suffix whatever `window` is.
    """
    out = df.copy()

    # rolling() looks backwards, so the result is shifted up by window-1 rows to make the
    # window start (instead of end) at the current row
    # NOTE(review): the window therefore includes the current row's own High/Low - confirm this is intended
    look_ahead = -window+1
    out['Max_High_Next_30'] = out['High'].rolling(window=window, min_periods=1).max().shift(look_ahead)
    out['Min_Low_Next_30'] = out['Low'].rolling(window=window, min_periods=1).min().shift(look_ahead)

    # low of the next row (for lower entry)
    out['Min_Low_Next_1'] = out['Low'].shift(-1)
    return out

In [None]:
# Apply the function to each Ticker separately and stack the per-ticker frames.
# The groups are iterated explicitly instead of using groupby().apply(): apply() operating on
# the grouping column is deprecated in recent pandas, and losing 'Ticker' would break the merge below.
# Important!: ignore_index=True gives the result a fresh index - so that you can merge that afterwards
test_prices = new_df.loc[new_df.split=='test', ['Date','High','Low','Close','Ticker']]
result = pd.concat(
    [rolling_max_min(ticker_prices) for _, ticker_prices in test_prices.groupby('Ticker')],
    ignore_index=True,
)
result.head(30)

In [None]:
result[result.Ticker=='AAPL'].tail(10)

In [None]:
# Ratios of the look-ahead extremes to the current Close; rows with Close == 0 get NaN (safe divide)
result['Ratio_MaxHighNext30_to_Close'] = (result['Max_High_Next_30']/result['Close']).where(result['Close'] != 0)
result['Ratio_MinLowNext30_to_Close'] = (result['Min_Low_Next_30']/result['Close']).where(result['Close'] != 0)

result['Ratio_MinLowNext1_to_Close'] = (result['Min_Low_Next_1']/result['Close']).where(result['Close'] != 0)

In [None]:
result[result.Ticker=='AAPL'].head(10)

In [None]:
result.Ratio_MinLowNext1_to_Close.hist(bins=20)
plt.title(f'Distribution of MinLowNext1_to_Close (ratio)')
# Show the plot
plt.show()

In [None]:
result.Ratio_MinLowNext1_to_Close.describe().T

In [None]:
result.Ratio_MaxHighNext30_to_Close.hist()
plt.title(f'Distribution of Max_High_Next_30_to_Close (ratio)')
# Show the plot
plt.show()

In [None]:
result[result.Ratio_MaxHighNext30_to_Close>=1.3].sample(5)

In [None]:
# ~1.8% cases TAKE PROFIT (30%)
len(result[result.Ratio_MaxHighNext30_to_Close>=1.3])/len(result)

In [None]:
# High intra-day volatility, especially on earlier days after the IPO
result.Ratio_MinLowNext30_to_Close.hist(bins=20)

In [None]:
result.Ratio_MinLowNext30_to_Close.describe().T

In [None]:
result[result.Ratio_MinLowNext30_to_Close<=0.8].sample(5)

In [None]:
# Bring the look-ahead columns back into new_df, matching rows on (Date, Ticker).
# The join is an inner one (pandas' default, spelled out here), so new_df keeps only
# the rows that have a counterpart in `result`.
new_df = new_df.merge(
    result[[
        'Date', 'Ticker',
        'Max_High_Next_30', 'Min_Low_Next_30',
        'Ratio_MaxHighNext30_to_Close', 'Ratio_MinLowNext30_to_Close', 'Ratio_MinLowNext1_to_Close',
    ]],
    how='inner',
    on=['Date', 'Ticker'],
)


new_df.sample(5)

### Generate fin.result for one date

In [None]:
from dataclasses import dataclass

@dataclass
class SimulationParams:
    """Settings of one trading simulation run."""
    # capital available at the start of the simulation, in $
    initial_capital: float
    # a prediction is traded only if its predicted probability is strictly above this value
    threshold: float
    # trading fees as a fraction of the invested amount
    fees: float
    # keep only predictions whose daily rank is <= top_k; None disables the rank filter
    top_k: int
    # NOTE(review): not read by any code visible in this notebook section - confirm it is used elsewhere
    portfolio_optimization: bool
    # stop loss is flagged when the 30-day minimum Low / Close ratio is <= this value
    stop_loss: float
    # take profit is flagged when the 30-day maximum High / Close ratio is >= this value
    take_profit: float
    # the buy order counts as filled when next-day Low / Close is <= this value
    lower_entry: float

In [None]:
# Example of initializing with some values
sim_params = SimulationParams(
    initial_capital=10000,        # initial capital = $10k
    threshold=0.53,               # select all binary predictions with probability > 0.53
    fees=0.002,                   # trading fees = 0.2% (buy+sell)
    top_k=10,                     # select top_k predictions
    portfolio_optimization=False, # no portfolio optimization
    stop_loss=0.8,                # automatic sell (with loss) if price (any of next 30 days) is lower than -20% from Close
    take_profit=1.3,              # automatic sell (with profit) if price (any of next 30 days) is higher than +30% from Close
    lower_entry= 0.995                # buy next day with the price = [Close] * 0.995 (try to buy cheaper)
)

print(sim_params)

In [None]:
# sorted array of dates for Simulation
DATES = new_df[new_df.split=='test'].sort_values(by='Date').Date.unique()
print(f' Min date {DATES.min()}, max date {DATES.max()}')

In [None]:
date = DATES[0]
date

In [None]:
# 1. get predictions for `date` with rf_prob_30d higher than THE THRESHOLD,
#    limited to the TOP_K by daily rank when top_k is set
one_day_filter = (new_df.Date==date)&(new_df.rf_prob_30d > sim_params.threshold)
if sim_params.top_k is not None:
  one_day_filter = one_day_filter&(new_df.rf_pred_rank<=sim_params.top_k)

# .copy(): the following cells add columns to this frame; assigning to a filtered slice of
# new_df would be a chained assignment (SettingWithCopyWarning)
one_day_predictions_df = new_df[one_day_filter].copy()


one_day_predictions_df[['Date','Ticker',to_predict,'growth_future_30d','rf_prob_30d','rf_pred_rank',]]

In [None]:
# 2. Get non-normalized weights: probability-threshold + 0.01
one_day_predictions_df['weight'] = one_day_predictions_df.rf_prob_30d - sim_params.threshold +0.01

In [None]:
one_day_predictions_df[['Date','Ticker','Close', to_predict,'growth_future_30d','rf_prob_30d','rf_pred_rank','weight']]

In [None]:
# 3. Get normalized weights
one_day_predictions_df['weight_norm'] = one_day_predictions_df['weight']/one_day_predictions_df['weight'].sum()

In [None]:
one_day_predictions_df[['Date','Ticker','Close', to_predict,'growth_future_30d','rf_prob_30d','rf_pred_rank','weight','weight_norm']]

In [None]:
# 4. Capital: either 1/30 of initial (assuming you trade every day), or everything that you can sell from 30 days ago
one_day_predictions_df['investment'] = one_day_predictions_df['weight_norm'] * sim_params.initial_capital /30

In [None]:
one_day_predictions_df[['Date','Ticker','Close', to_predict,'growth_future_30d','rf_prob_30d','rf_pred_rank','weight','weight_norm','investment']]

In [None]:
# 5. Lower Entry: the trade is executed only if the Low price of the next day is at or below the bet (Close * sim_params.lower_entry)
one_day_predictions_df['lower_entry'] = (one_day_predictions_df['Ratio_MinLowNext1_to_Close']<=sim_params.lower_entry).astype(int)

In [None]:
one_day_predictions_df.tail(10)

In [None]:
# 6. Stop Loss: flagged (1) when the minimum Low of the 30-day window is at or below Close * sim_params.stop_loss
one_day_predictions_df['stop_loss'] = (one_day_predictions_df['Ratio_MinLowNext30_to_Close'] <= sim_params.stop_loss).astype(int)

In [None]:
# 7. Take Profit: flagged (1) when the maximum High of the 30-day window is at or above Close * sim_params.take_profit
one_day_predictions_df['take_profit'] = (one_day_predictions_df['Ratio_MaxHighNext30_to_Close'] >= sim_params.take_profit).astype(int)

In [None]:
one_day_predictions_df.tail(10)

In [None]:
# future_gross_return, depending on lower_entry, take_profit, stop_loss

import random

def get_future_gross_return(row, sim_params:SimulationParams):
  """Gross value of one bet at the end of the holding period (fees are computed by get_fees).

  Args:
    row: one bet with 'investment', 'lower_entry', 'take_profit', 'stop_loss' and
      'growth_future_30d' entries.
    sim_params: supplies the take_profit / stop_loss ratios and the lower_entry ratio.

  Returns:
    The untouched investment when the order was not filled, or when it was filled but
    neither exit triggered and the 30-day growth is unknown. Otherwise
    investment * (exit ratio + (1 - lower_entry)), where the exit ratio is take_profit,
    stop_loss or growth_future_30d.
  """
  stake = row['investment']
  if row['lower_entry']==0: # order not filled: the money is returned as is
    return stake

  # the buy order is filled in every case below
  entry_discount = 1-sim_params.lower_entry
  hit_take_profit = row['take_profit']==1
  hit_stop_loss = row['stop_loss']==1

  if hit_take_profit and hit_stop_loss:
    # both levels were reached and their order is unknown: flip a coin
    exit_ratio = sim_params.take_profit if random.random()>0.5 else sim_params.stop_loss
  elif hit_take_profit:
    exit_ratio = sim_params.take_profit
  elif hit_stop_loss:
    exit_ratio = sim_params.stop_loss
  elif pd.isna(row['growth_future_30d']):
    return stake # no information on growth in 30 days --> the investment is returned unchanged
  else:
    exit_ratio = row['growth_future_30d']

  return stake*(exit_ratio+entry_discount)

In [None]:
# fees, depending on lower_entry, take_profit, stop_loss

def get_fees(row, sim_params:SimulationParams):
  """Fees of one bet as a negative amount; 0 when the order was not filled.

  Args:
    row: one bet with 'investment' and 'lower_entry' entries.
    sim_params: supplies `fees`, the fee rate applied to the investment.
  """
  order_filled = row['lower_entry']!=0
  # a filled order always pays fees, whatever happens to the position afterwards
  return -row['investment']*sim_params.fees if order_filled else 0


In [None]:
# 8. calculate future returns (when the order is executed + stop_loss True/False + take_profit True/False)
one_day_predictions_df['future_gross_return'] = one_day_predictions_df.apply(lambda row: get_future_gross_return(row,sim_params=sim_params), axis=1)
one_day_predictions_df['fees'] =  one_day_predictions_df.apply(lambda row: get_fees(row,sim_params=sim_params), axis=1)
one_day_predictions_df['future_net_return'] = one_day_predictions_df['future_gross_return'] + one_day_predictions_df['fees']

# OLD code when no stop_loss, take_profit, and if order executed
# one_day_predictions_df['investment'] * one_day_predictions_df['growth_future_5d']
# one_day_predictions_df['fees'] =   - one_day_predictions_df['investment'] * sim_params.fees


In [None]:
one_day_predictions_df

## Wrap up one day simulation into a function

In [None]:
# Simulation Parameters
sim_params = SimulationParams(
    initial_capital=10000,                 # initial capital = $10k
    threshold=0.53,                        # select all binary predictions with probability > 0.53
    fees=0.002,                            # trading fees = 0.2% (buy+sell)
    top_k=10,                              # select top_k predictions
    portfolio_optimization=False,          # no portfolio optimization
    stop_loss=0.8,                         # automatic sell (with loss) if price (any of next 30 trading days) is lower than -20% from Close
    take_profit=1.3,                       # automatic sell (with profit) if price (any of next 30 trading days) is higher than +30% from Close
    lower_entry= 0.995                     # buy next day with the price = [Close] * 0.995 (try to buy cheaper)
)

In [None]:
def one_date_simulation(date:str, invest_sum:float, df:pd.DataFrame, sim_params:SimulationParams, predictor:str='proba_pred10'):
  """Simulate all bets placed on one date.

  Args:
    date: value compared for equality with df.Date.
    invest_sum: total amount that is split across the selected bets.
    df: frame with 'Close', 'Ticker', 'Date', 'growth_future_30d', the three Ratio_* columns,
      the `predictor` column and its rank column.
    sim_params: threshold, top_k, stop_loss, take_profit, lower_entry and fees of the run.
    predictor: name of the probability column. The rank column name is derived from the text
      before its first '_' plus '_pred_rank' (e.g. 'rf_prob_30d' --> 'rf_pred_rank').

  Returns:
    One row per selected bet with the weights, the investment, the lower_entry / stop_loss /
    take_profit flags and the future_gross_return, fees and future_net_return columns.
    The frame is empty (same columns) when no prediction qualifies on `date`.
  """
  rank_column = predictor.split('_')[0] +'_pred_rank'

  # 1. get TOP_K (or ALL, when top_k is None) predictions of the day that are higher than THE THRESHOLD
  selected = (df.Date==date)&(df[predictor] > sim_params.threshold)
  if sim_params.top_k is not None:
    selected = selected&(df[rank_column]<=sim_params.top_k)

  FIELDS = ['Close', 'Ticker', 'Date', predictor, rank_column, 'growth_future_30d', 'Ratio_MaxHighNext30_to_Close','Ratio_MinLowNext30_to_Close','Ratio_MinLowNext1_to_Close']
  result_df = df.loc[selected, FIELDS].copy()

  # 2. Get non-normalized weights: probability-threshold + 0.01
  result_df['weight'] = result_df[predictor] - sim_params.threshold +0.01

  # 3. Get normalized weights
  result_df['weight_norm'] = result_df['weight']/result_df['weight'].sum()

  # 4. Make bets to allocate 'invest_sum' across all suitable predictions
  result_df['investment'] = result_df['weight_norm'] * invest_sum

  # 5. Lower Entry: the trade is executed only if the Low price of the next day is at or below the bet (Close * sim_params.lower_entry)
    # [ONLY TRADES with lower_entry==1 are filled by the exchange]
  result_df['lower_entry'] = (result_df['Ratio_MinLowNext1_to_Close'] <= sim_params.lower_entry).astype(int)

  # 6. Stop Loss: the minimum Low of the 30-day window is at or below Close * sim_params.stop_loss
  result_df['stop_loss'] = (result_df['Ratio_MinLowNext30_to_Close'] <= sim_params.stop_loss).astype(int)

  # 7. Take Profit: the maximum High of the 30-day window is at or above Close * sim_params.take_profit
  result_df['take_profit'] = (result_df['Ratio_MaxHighNext30_to_Close'] >= sim_params.take_profit).astype(int)

  # No bets today: add the result columns directly. DataFrame.apply(axis=1) on an empty frame
  # has to guess the shape of its result, so the per-row path below is not relied on here.
  if result_df.empty:
    for empty_column in ('future_gross_return', 'fees', 'future_net_return'):
      result_df[empty_column] = 0.0
    return result_df

  # 8. Calculate future returns (when the order is executed + stop_loss True/False + take_profit True/False)
  result_df['future_gross_return'] = result_df.apply(lambda row: get_future_gross_return(row,sim_params=sim_params), axis=1)
  result_df['fees'] =  result_df.apply(lambda row: get_fees(row,sim_params=sim_params), axis=1)
  result_df['future_net_return'] = result_df['future_gross_return'] + result_df['fees']

  return result_df

In [None]:
predictor = 'rf_prob_30d'
rank_column = predictor.split('_')[0] +'_pred_rank'
print(rank_column)

In [None]:
r = one_date_simulation(date='2021-10-28', invest_sum=sim_params.initial_capital/30, df=new_df, sim_params=sim_params, predictor=predictor)

In [None]:
r

In [None]:
# initial investment
r.investment.sum()

In [None]:
# result in 30 days (returns+fees)
r.future_net_return.sum()

# Generate fin result for ALL days

In [None]:
all_dates = new_df[new_df.split=='test'].sort_values(by='Date').Date.unique()
all_dates

In [None]:
# simulate for all dates but last 5
all_dates[0:-5]

In [None]:
# these last days we only "sell" the positions
all_dates[-5:]

In [None]:
capital= 5 * [sim_params.initial_capital/5]
capital[-5]

In [None]:
def simulate(df:pd.DataFrame, sim_params:SimulationParams, predictor_column:str=None):
  """Run the day-by-day simulation over the TEST split and print a summary.

  The initial capital is split into 30 equal parts. Every simulated day invests the amount
  that became available 30 days earlier (at the start: one part of the initial capital) and
  the net result of that day becomes available 30 days later.

  Args:
    df: frame with a 'split' column plus everything one_date_simulation needs.
    sim_params: parameters of the run.
    predictor_column: probability column passed to one_date_simulation. When omitted, the
      notebook-level variable `predictor` is used, which keeps existing two-argument calls working.

  Returns:
    (simulation_df, capital_df): all bets of all simulated days, and the capital series
    indexed by date.

  Raises:
    ValueError: if the TEST split has 30 or fewer distinct dates (nothing to simulate).
  """
  HOLDING_DAYS = 30  # a position is closed 30 days after it was opened
  TEST_YEARS = 4     # length of the TEST period used for the CAGR

  if predictor_column is None:
    predictor_column = predictor  # notebook-level fallback

  # all dates for simulation
  all_dates = df[df.split=='test'].sort_values(by='Date').Date.unique()
  if len(all_dates) <= HOLDING_DAYS:
    raise ValueError(f'Need more than {HOLDING_DAYS} TEST dates to simulate, got {len(all_dates)}')

  # arrays of dates and capital available (capital for the first 30 days)
  dates = []
  capital= HOLDING_DAYS * [sim_params.initial_capital/HOLDING_DAYS]  # first 30 periods trade with 1/30 of the initial_capital. e.g. [333,...,333] = 10k in total
  daily_results = []  # one frame per simulated day, concatenated once after the loop

  for current_date in all_dates[0:-HOLDING_DAYS]:  # growth_future_30d is not defined for the last 30 days : ALL, but last 30 dates

    current_invest_sum = capital[-HOLDING_DAYS]    # everything that can be sold from 30 days ago

    one_day_simulation_results = one_date_simulation(date = current_date,
                                    invest_sum = current_invest_sum,
                                    df = df,
                                    sim_params=sim_params,
                                    predictor=predictor_column)

    # add capital available in 30 days
    if len(one_day_simulation_results)==0:  #no predictions -> no trades
      capital.append(current_invest_sum)
    else:
      capital.append(one_day_simulation_results.future_net_return.sum())
    dates.append(current_date)
    daily_results.append(one_day_simulation_results)

  simulation_df = pd.concat(daily_results, ignore_index=True)

  # add last 30 days to make the count of data points equal for dates/capital arrays
  dates.extend(all_dates[-HOLDING_DAYS:])
  capital_df = pd.DataFrame({'capital':capital}, index=pd.to_datetime(dates))

  # summary figures (ratios fall back to 0.0 when there were no bids at all)
  total_bids = len(simulation_df)
  filled_bids = len(simulation_df[simulation_df.lower_entry==1])
  avg_bids_per_day = total_bids/simulation_df.Date.nunique() if total_bids else 0.0
  fill_bids_percent = filled_bids/total_bids if total_bids else 0.0

  stop_loss_filter = (simulation_df.lower_entry==1)&(simulation_df.stop_loss==1)
  stop_loss_events = simulation_df[stop_loss_filter]
  take_profit_filter = (simulation_df.lower_entry==1)&(simulation_df.take_profit==1)
  take_profit_events = simulation_df[take_profit_filter]

  # the capital of the last 30 entries is everything that is invested or free at the end
  resulting_capital = capital_df[-HOLDING_DAYS:].capital.sum()
  cagr = (resulting_capital/sim_params.initial_capital)**(1/TEST_YEARS)

  # results:
  print(f'============================================================================================')
  print(f'SIMULATION STARTED')
  print(f'Simulations params: {sim_params}')
  print(f' Count bids {total_bids} in total, avg.bids per day {avg_bids_per_day},  filled bids {filled_bids}, fill bids percent = {fill_bids_percent}')
  print(f'  Stop loss events: count = {len(stop_loss_events)}, net loss = {stop_loss_events.future_net_return.sum()-stop_loss_events.investment.sum()} ')
  print(f'  Take profit events: count = {len(take_profit_events)}, net profit = {take_profit_events.future_net_return.sum()-take_profit_events.investment.sum()} ')
  print(f'  Start capital = {sim_params.initial_capital}, Resulting capital: {resulting_capital} ')
  print(f'  CAGR in 4 years: {np.round(cagr,3)} or {np.round((cagr-1)*100.0,2)} % of avg. growth per year')
  print(f'============================================================================================')
  return simulation_df,capital_df

In [None]:
# One initial simulation
sim_params = SimulationParams(
    initial_capital = 10000,        # initial capital = $10k
    threshold = 0.55,               # select all binary predictions with probability > 0.55
    fees = 0.002,                   # trading fees = 0.2% (buy+sell)
    top_k = 5,                      # select top_k=5 predictions
    portfolio_optimization = False, # no portfolio optimization
    stop_loss = 0.8,                # automatic sell (with loss) if price (any of next 30 days) is lower than -20% from Close
    take_profit = 1.3,              # automatic sell (with profit) if price (any of next 30 days) is higher than +30% from Close
    lower_entry = 0.99               # buy next day with the price = [Close] * 0.99 (try to buy cheaper)
)

# res: all simulated bets; capital: capital series by date (both returned by simulate)
res, capital = simulate(new_df, sim_params)

#  Find optimal parameters of simulation

In [None]:
# BEST PREVIOUS CAGR is 7.8%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3, lower_entry=0.99)
# iterate over lower_entry (all other parameters fixed)
for lower_entry in [0.97,0.98,0.99,1,1.01,1.02,1.03]:
  # One simulation
  sim_params = SimulationParams(
      initial_capital = 10000,        # initial capital = $10k
      threshold = 0.55,               # select all binary predictions with probability > 0.55
      fees = 0.002,                   # trading fees = 0.2% (buy+sell)
      top_k = 5,                     # select top_k predictions
      portfolio_optimization = False, # no portfolio optimization
      stop_loss = 0.8,                # automatic sell (with loss) if price (any of next 30 days) is lower than -20% from Close
      take_profit = 1.3,              # automatic sell (with profit) if price (any of next 30 days) is higher than +30% from Close
      lower_entry = lower_entry                # buy next day with the price = [Close] * lower_entry (iterated value)
  )

  res, capital = simulate(new_df, sim_params)

In [None]:
# BEST PREVIOUS is CAGR 11.16%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3, lower_entry=0.99 ==> lower_entry=1.0)

# iterate over take profit   ==> new best take_profit=1.5
for take_profit in [1.2,1.25,1.3,1.35,1.4,1.45,1.5]:
  # One simulation
  sim_params = SimulationParams(
      initial_capital = 10000,        # initial capital = $10k
      threshold = 0.55,               # select all binary predictions with probability > 0.55
      fees = 0.002,                   # trading fees = 0.2% (buy+sell)
      top_k = 5,                     # select top_k predictions
      portfolio_optimization = False, # no portfolio optimization
      stop_loss = 0.8,                # automatic sell (with loss) if price (any of next 30 days) is lower than -20% from Close
      take_profit = take_profit,        # automatic sell (with profit) if price (any of next 30 days) is higher than Close * take_profit (iterated value)
      lower_entry = 1                # buy next day with the price = [Close] * lower_entry (try to buy cheaper)
  )

  res, capital = simulate(new_df, sim_params)

In [None]:
# BEST PREVIOUS is CAGR 11.16%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3, lower_entry=1.0)
# BEST PREVIOUS is CAGR 11.91%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3 => take_profit=1.5, lower_entry=1.0)

# iterate over stop_loss (take_profit fixed at the best value found so far, 1.5)
for stop_loss in [0.55,0.6,0.65,0.7,0.75,0.8]:
  # One simulation
  sim_params = SimulationParams(
      initial_capital = 10000,        # initial capital = $10k
      threshold = 0.55,               # select all binary predictions with probability > 0.55
      fees = 0.002,                   # trading fees = 0.2% (buy+sell)
      top_k = 5,                     # select top_k predictions
      portfolio_optimization = False, # no portfolio optimization
      stop_loss = stop_loss,                # automatic sell (with loss) if price (any of next 30 days) is lower than Close * stop_loss (iterated value)
      take_profit = 1.5,              # automatic sell (with profit) if price (any of next 30 days) is higher than +50% from Close
      lower_entry = 1                # buy next day with the price = [Close] * 1
  )

  res, capital = simulate(new_df, sim_params)

In [None]:
# BEST PREVIOUS is CAGR 11.16%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3, lower_entry=1.0)
# BEST PREVIOUS is CAGR 11.91%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3 => take_profit=1.5, lower_entry=1.0)
# BEST PREVIOUS is CAGR 13.19%  (top_k=5, portfolio_optimization=False, stop_loss=0.8 => stop_loss=0.6, take_profit=1.5, lower_entry=1.0)


# iterate over threshold (the minimum predicted probability for a bet)
for threshold in [0.51,0.52,0.53,0.54,0.55,0.56,0.57]:
  # One simulation
  sim_params = SimulationParams(
      initial_capital = 10000,        # initial capital = $10k
      threshold = threshold,               # select all binary predictions with probability > threshold (iter param)
      fees = 0.002,                   # trading fees = 0.2% (buy+sell)
      top_k = 5,                     # select top_k predictions
      portfolio_optimization = False, # no portfolio optimization
      stop_loss = 0.6,                # automatic sell (with loss) if price (any of next 30 days) is lower than -40% from Close
      take_profit = 1.5,              # automatic sell (with profit) if price (any of next 30 days) is higher than +50% from Close
      lower_entry = 1                # buy next day with the price = [Close] * lower_entry (try to buy cheaper)
  )

  res, capital = simulate(new_df, sim_params)

In [None]:
# BEST PREVIOUS is CAGR 11.16%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3, lower_entry=1.0)
# BEST PREVIOUS is CAGR 11.91%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3 => take_profit=1.5, lower_entry=1.0)
# BEST PREVIOUS is CAGR 13.19%  (top_k=5, portfolio_optimization=False, stop_loss=0.8 => stop_loss=0.6, take_profit=1.5, lower_entry=1.0)
# BEST PREVIOUS is CAGR 13.5%  (threshold=0.55 -> threshold=0.56 -- for a decision rule, top_k=5, portfolio_optimization=False, stop_loss=0.6, take_profit=1.5, lower_entry=1.0)


# iterate over top_k (maximum number of bets per day)
for top_k in [1,2,3,4,5,6,8,10,20,33]:
  # One simulation
  sim_params = SimulationParams(
      initial_capital = 10000,        # initial capital = $10k
      threshold = 0.56,               # select all binary predictions with probability > 0.56
      fees = 0.002,                     # trading fees = 0.2% (buy+sell)
      top_k = top_k,                    # select top_k predictions (iterated value)
      portfolio_optimization = False,   # no portfolio optimization
      stop_loss = 0.6,                 # automatic sell (with loss) if price (any of next 30 days) is lower than -40% from Close
      take_profit = 1.5,              # automatic sell (with profit) if price (any of next 30 days) is higher than +50% from Close
      lower_entry = 1.0                # buy next day with the price = [Close] * 1.0
  )

  res, capital = simulate(new_df, sim_params)

In [None]:
# BEST PREVIOUS is CAGR 11.16%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3, lower_entry=1.0)
# BEST PREVIOUS is CAGR 11.91%  (top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.3 => take_profit=1.5, lower_entry=1.0)
# BEST PREVIOUS is CAGR 13.19%  (top_k=5, portfolio_optimization=False, stop_loss=0.8 => stop_loss=0.6, take_profit=1.5, lower_entry=1.0)
# BEST PREVIOUS is CAGR 13.5%  (threshold=0.55 -> threshold=0.56 -- for a decision rule, top_k=5, portfolio_optimization=False, stop_loss=0.6, take_profit=1.5, lower_entry=1.0)

# BEST PREVIOUS is CAGR 13.9%  (threshold=0.56, top_k=5 ==> top_k=4, portfolio_optimization=False, stop_loss=0.6, take_profit=1.5, lower_entry=1.0)


# Sweep stop_loss while every other simulation parameter stays fixed
# (top_k=4 is the best value found by the previous sweep, see the notes above).
# Note: `res` and `capital` are overwritten on every pass, so only the last
# stop_loss value's results remain in the namespace after the loop.
for stop_loss in [0.6, 0.65, 0.7,0.75, 0.8, 0.85]:
  # One simulation
  sim_params = SimulationParams(
      initial_capital = 10000,        # initial capital = $10k
      threshold = 0.56,               # select all binary predictions with probability>=0.56
      fees = 0.002,                   # trading fees = 0.2% (buy+sell)
      top_k = 4,                      # select top_k predictions
      portfolio_optimization = False, # no portfolio optimization
      stop_loss = stop_loss,                # swept value: automatic sell (with loss) if price drops below stop_loss * Close
      take_profit = 1.5,              # automatic sell (with profit) if price is higher than +50% from Close
      lower_entry = 1                # buy next day with the price = [Close] * lower_entry (1 = no discount)
  )

  res, capital = simulate(new_df, sim_params)

###  Explore the best simulation params 

In [None]:
sim_params = SimulationParams(initial_capital=10000, threshold=0.51, fees=0.002, top_k=8, portfolio_optimization=False, stop_loss=0.6, take_profit=1.5, lower_entry=1)

res, capital = simulate(new_df, sim_params)

In [None]:
res[res.Date=='2024-04-30']


In [None]:
len(res)

In [None]:
# filled deals
res.lower_entry.sum()

In [None]:
# stop loss (regardless of a filled bid)

res.stop_loss.sum()

In [None]:
# take profit (regardless of a filled bid)
res.take_profit.sum()

In [None]:
res[(res.stop_loss==1)&(res.lower_entry==1)]

In [None]:
# could take profit, but the bid was not executed
res[(res.take_profit==1)&(res.lower_entry==0)]

In [None]:
res[(res.take_profit==1) &(res.lower_entry==1)]

In [None]:
res[res.Date=='2024-04-30'].future_net_return.sum()

In [None]:
# last 20 days
capital[-20:]

In [None]:
capital.rolling(5).sum().plot.line()
print(capital[-5:].sum())

### Debug optimal strategy with k=4 max trades per day

In [None]:
sim_params = SimulationParams(initial_capital=10000, threshold=0.55, fees=0.002, top_k=4, portfolio_optimization=False, stop_loss=0.6, take_profit=1.5, lower_entry=1)

res, capital_4trades = simulate(new_df, sim_params)

In [None]:
filter_stop_loss = (res.lower_entry==1) & (res.stop_loss==1)
print(f'Average real close price if not stop_loss: {res[filter_stop_loss].growth_future_30d.mean()}')
res[filter_stop_loss].head(10)

In [None]:
filter_take_profit = (res.lower_entry==1) & (res.take_profit==1) & (res.stop_loss==0)
print(f'Average real close price if not take_profit: {res[filter_take_profit].growth_future_30d.mean()}')
res[filter_take_profit].head(10)

In [None]:
filter_no_stop_loss_no_take_profit = (res.lower_entry==1) & (res.take_profit==0) & (res.stop_loss==0)
print(f'Average real close price if no take_profit or stop loss: {res[filter_no_stop_loss_no_take_profit].growth_future_30d.mean()}')
res['realised_profit'] = res.future_net_return/res.investment
res[filter_no_stop_loss_no_take_profit][['growth_future_30d','realised_profit']].describe().T

In [None]:
capital_4trades.rolling(30).sum().plot.line()
print(capital_4trades[-30:].sum())

### Debug optimal strategy with k=1 max trades per day

In [None]:
sim_params = SimulationParams(initial_capital=10000, threshold=0.55, fees=0.002, top_k=1, portfolio_optimization=False, stop_loss=0.6, take_profit=1.5, lower_entry=1.0)

res, capital_1_trade = simulate(new_df, sim_params)

In [None]:
# NO STOP LOSS events
filter_stop_loss = (res.lower_entry==1) & (res.stop_loss==1)
print(f'Average real close price if not stop_loss: {res[filter_stop_loss].growth_future_30d.mean()}')
res[filter_stop_loss].head(10)

In [None]:
filter_take_profit = (res.lower_entry==1) & (res.take_profit==1) & (res.stop_loss==0)
print(f'Average real close price if not take_profit: {res[filter_take_profit].growth_future_30d.mean()}')
res[filter_take_profit].head(5)

In [None]:
filter_no_stop_loss_no_take_profit = (res.lower_entry==1) & (res.take_profit==0) & (res.stop_loss==0)
print(f'Average real close price if no take_profit or stop loss: {res[filter_no_stop_loss_no_take_profit].growth_future_30d.mean()}')
res['realised_profit'] = res.future_net_return/res.investment
res[filter_no_stop_loss_no_take_profit][['growth_future_30d','realised_profit']].describe().T

In [None]:
capital_1_trade.rolling(30).sum().plot.line()
print(capital_1_trade[-30:].sum())

In [None]:
import matplotlib.pyplot as plt

# Compare the 30-day rolling sums of the two capital series produced by the
# simulations above: `capital_4trades` (top_k=4) and `capital_1_trade` (top_k=1).
# Both are plotted against their own index on a shared axis.
capital_4trades_rolling = capital_4trades.rolling(30).sum()
capital_1_trade_rolling = capital_1_trade.rolling(30).sum()

# Legacy alias: this series used to be called `capital_6trades_rolling` even
# though it always held the 4-trade data. Kept so any cell that still refers to
# the old name keeps working.
capital_6trades_rolling = capital_4trades_rolling

# Plot the rolling sums on the same graph
plt.figure(figsize=(10, 6))
plt.plot(capital_4trades_rolling, label='Capital 4 Trades Rolling Sum (30)')
plt.plot(capital_1_trade_rolling, label='Capital 1 Trade Rolling Sum (30)')
plt.legend(loc='best')
plt.title('Rolling Sum of Capital for 4 Trades and 1 Trade (Window=30 days)')
plt.xlabel('Date')
plt.ylabel('Rolling Sum')
plt.grid(True)
plt.show()

# Print the ending values (sum over the last 30 observations of each series)
print(f"Capital 4 Trades Ending Sum: {capital_4trades[-30:].sum()}")
print(f"Capital 1 Trade Ending Sum: {capital_1_trade[-30:].sum()}")

In [None]:
df.rf_prob_30d.hist()



In [None]:
df.rf_prob_30d.value_counts()


In [None]:
TARGET_COL = "is_positive_growth_30d_future"
if TARGET_COL not in df.columns:
    raise KeyError(f"Target column '{TARGET_COL}' not found in df.columns")
y_true = df[TARGET_COL].astype(int)



In [None]:
# --- 4) Helpers ---
def precision_at_k(y_true: pd.Series, y_scores: pd.Series, k: int) -> float:
    """
    Precision@k: of the k highest-scored samples, which fraction is positive?

    Parameters
    ----------
    y_true : pd.Series
        Binary labels (0/1). Matched to ``y_scores`` by *position*, not by
        index label, so both series must be in the same row order.
    y_scores : pd.Series
        Scores used for ranking (higher = more confident). NaN scores are
        treated as the lowest possible score.
    k : int
        Number of top-ranked samples to evaluate. Values larger than
        ``len(y_scores)`` are clipped to ``len(y_scores)``.

    Returns
    -------
    float
        Mean of ``y_true`` over the selected rows, or NaN when there is
        nothing to evaluate (empty input or ``k <= 0``).
    """
    n = len(y_scores)
    # k <= 0 must be rejected explicitly: with k == 0 the slice `[-0:]` below
    # would select *every* row and silently return the overall base rate.
    if n == 0 or k <= 0:
        return np.nan
    k = min(k, n)
    # Replace NaNs with -inf so they go to the bottom
    scores = y_scores.fillna(float("-inf")).to_numpy()
    if k == n:
        topk_idx = np.argsort(scores)[::-1][:k]
    else:
        # argpartition finds the k largest in O(n) (unordered); only those k
        # are then sorted to obtain the true top-k order.
        part = np.argpartition(scores, -k)[-k:]
        topk_idx = part[np.argsort(scores[part])[::-1]]
    # iloc: positional lookup, consistent with the positional indices above
    topk_true = y_true.iloc[topk_idx]
    return float(topk_true.mean())


def lift_at_k(prec_k: float, base_rate: float) -> float:
    """
    Lift@k: how many times better precision@k is than the base rate.

    Returns ``prec_k / base_rate`` as a float, or NaN when ``base_rate`` is
    exactly zero (the ratio is undefined in that case).
    """
    return float(prec_k / base_rate) if base_rate != 0 else np.nan

In [None]:
df.columns.to_list()

In [None]:
# Build one metrics row per `pred*` column, evaluated against `y_true`
# over every row of `df`, then show the table sorted by ROC-AUC / PR-AUC.
pred_cols = [name for name in df.columns if name.startswith('pred')]
k_values = [5, 10, 20, 50, 100, 200]
base_rate = y_true.mean()
results = []

for col in pred_cols:
    y_scores = df[col]
    # Hard labels at the 0.5 cut; NaN scores compare False and become 0.
    y_pred = (y_scores >= 0.5).astype(int)
    # The AUC metrics reject NaN, so missing scores are pushed below every real one.
    scores_no_nan = y_scores.fillna(y_scores.min() - 1e9)

    row = {"model": col}
    row["accuracy"] = accuracy_score(y_true, y_pred)
    row["precision"] = precision_score(y_true, y_pred, zero_division=0)
    row["recall"] = recall_score(y_true, y_pred, zero_division=0)
    row["f1"] = f1_score(y_true, y_pred, zero_division=0)
    row["roc_auc"] = roc_auc_score(y_true, scores_no_nan)
    row["pr_auc"] = average_precision_score(y_true, scores_no_nan)  # area under PR curve
    row["base_rate_%"] = base_rate * 100

    # Ranking quality at several cut-offs
    for k in k_values:
        p_at_k = precision_at_k(y_true, y_scores, k)
        row.update({f"precision@{k}": p_at_k, f"lift@{k}": lift_at_k(p_at_k, base_rate)})

    results.append(row)

metrics_df = pd.DataFrame(results)
metrics_df = metrics_df.sort_values(["roc_auc", "pr_auc"], ascending=False)
metrics_df = metrics_df.reset_index(drop=True)

# Column layout: summary metrics first, then each precision@k next to its lift@k
cols_order = [
    "model", "roc_auc", "pr_auc", "accuracy", "precision", "recall", "f1", "base_rate_%",
] + [name for k in k_values for name in (f"precision@{k}", f"lift@{k}")]
display(metrics_df[cols_order])


In [None]:
# Structural overview. DataFrame.info() writes its report to stdout itself and
# returns None, so wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe(include="all").transpose())

# Missing values: share of null entries per column, as a fraction in [0, 1]
missing_summary = df.isnull().mean().sort_values(ascending=False)
print("Missing % per column:\n", missing_summary.head(20))

# Class balance of the target (relative frequencies)
print("Target balance:", y_true.value_counts(normalize=True))


In [None]:
binary_cols = [c for c in df.columns if c.startswith("pred") and c != "rf_prob_30d"]
prob_cols   = ["rf_prob_30d"]  # extend if you add more prob outputs


In [None]:
for col in binary_cols:
    y_pred = df[col].fillna(0).astype(int)
    print(
        col,
        "Acc:", accuracy_score(y_true, y_pred),
        "Prec:", precision_score(y_true, y_pred, zero_division=0),
        "Rec:", recall_score(y_true, y_pred, zero_division=0),
        "F1:", f1_score(y_true, y_pred, zero_division=0),
    )


In [None]:
for k in [5, 10, 20, 50]:
    p_at_k = precision_at_k(y_true, df["rf_prob_30d"], k)
    print(f"rf_prob_30d | Precision@{k}: {p_at_k:.3f}, Lift@{k}: {lift_at_k(p_at_k, base_rate):.2f}")


In [None]:
from sklearn.calibration import calibration_curve
prob_true, prob_pred = calibration_curve(y_true, df["rf_prob_30d"], n_bins=10)
plt.plot(prob_pred, prob_true, marker="o", label="rf_prob_30d")
plt.plot([0,1],[0,1],"--", color="gray")
plt.legend(); plt.title("Calibration curve"); plt.show()


In [None]:
def simulate_threshold(y_true, y_scores, threshold=0.6):
    """
    Precision and recall obtained when every sample scoring >= `threshold` is picked.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_scores : pd.Series of scores; NaN scores compare False and are never picked.
    threshold : float, inclusive score cut-off.

    Returns
    -------
    (precision, recall) tuple of floats.
    """
    picks = (y_scores >= threshold).astype(int)
    # zero_division=0 matches the other metric calls in this notebook: a
    # threshold that selects nothing yields 0.0 without an UndefinedMetricWarning.
    return (
        precision_score(y_true, picks, zero_division=0),
        recall_score(y_true, picks, zero_division=0),
    )

for t in [0.4, 0.5, 0.6, 0.7]:
    prec, rec = simulate_threshold(y_true, df["rf_prob_30d"], t)
    print(f"rf_prob_30d τ={t}: Precision={prec:.3f}, Recall={rec:.3f}")


In [None]:
def simulate_binary_strategy(df, pred_col, return_col="growth_future_30d", returns_are_factors=True):
    """
    Summarize the realized outcome of buying every row where `pred_col` == 1.

    Parameters
    ----------
    df : pd.DataFrame containing `pred_col` and `return_col`.
    pred_col : name of a 0/1 prediction column.
    return_col : name of the realized forward growth column.
    returns_are_factors : when True (default) `return_col` is read as a growth
        factor (1 + r, e.g. 1.03 for +3%) and converted to a simple return
        before computing statistics. Without this conversion `> 0` is true for
        practically every trade and the win rate is meaningless. Pass False if
        `return_col` already holds simple returns.

    Returns
    -------
    dict with avg_return, win_rate, sharpe (mean / std, not annualized) and
    n_trades, or NaN when no row is selected.
    """
    picks = df[df[pred_col] == 1]
    if picks.empty:
        return np.nan
    returns = picks[return_col] - 1.0 if returns_are_factors else picks[return_col]
    return {
        "avg_return": returns.mean(),
        "win_rate": (returns > 0).mean(),
        # small epsilon guards against division by zero when all returns are equal
        "sharpe": returns.mean() / (returns.std() + 1e-9),
        "n_trades": len(picks),
    }

for col in binary_cols:
    print(col, simulate_binary_strategy(df, col))


In [None]:
def simulate_prob_threshold(df, prob_col="rf_prob_30d", return_col="growth_future_30d", tau=0.6, returns_are_factors=True):
    """
    Summarize the realized outcome of buying every row whose probability is >= `tau`.

    Parameters
    ----------
    df : pd.DataFrame containing `prob_col` and `return_col`.
    prob_col : name of the predicted-probability column (NaN never passes the cut).
    return_col : name of the realized forward growth column.
    tau : inclusive probability threshold.
    returns_are_factors : when True (default) `return_col` is read as a growth
        factor (1 + r) and converted to a simple return first; otherwise
        `> 0` would count almost every trade as a win. Pass False if
        `return_col` already holds simple returns.

    Returns
    -------
    dict with avg_return, win_rate, sharpe (mean / std, not annualized) and
    n_trades, or NaN when no row passes the threshold.
    """
    picks = df[df[prob_col] >= tau]
    if picks.empty:
        return np.nan
    returns = picks[return_col] - 1.0 if returns_are_factors else picks[return_col]
    return {
        "avg_return": returns.mean(),
        "win_rate": (returns > 0).mean(),
        # small epsilon guards against division by zero when all returns are equal
        "sharpe": returns.mean() / (returns.std() + 1e-9),
        "n_trades": len(picks),
    }

for t in [0.4, 0.5, 0.6, 0.7,0.8,0.9,0.95]:
    print(f"τ={t}", simulate_prob_threshold(df, tau=t))


In [None]:
def simulate_topk(df, prob_col="rf_prob_30d", return_col="growth_future_30d", k=20, returns_are_factors=True):
    """
    Summarize the realized outcome of buying the `k` rows with the highest probability.

    Parameters
    ----------
    df : pd.DataFrame containing `prob_col` and `return_col`.
    prob_col : name of the predicted-probability column; rows with NaN
        probability sort last.
    return_col : name of the realized forward growth column.
    k : number of top-ranked rows to take (over the whole frame, not per date).
    returns_are_factors : when True (default) `return_col` is read as a growth
        factor (1 + r) and converted to a simple return first; otherwise
        `> 0` would count almost every trade as a win. Pass False if
        `return_col` already holds simple returns.

    Returns
    -------
    dict with avg_return, win_rate, sharpe (mean / std, not annualized) and n_trades.
    """
    ranked = df.sort_values(prob_col, ascending=False).head(k)
    returns = ranked[return_col] - 1.0 if returns_are_factors else ranked[return_col]
    return {
        "avg_return": returns.mean(),
        "win_rate": (returns > 0).mean(),
        # small epsilon guards against division by zero when all returns are equal
        "sharpe": returns.mean() / (returns.std() + 1e-9),
        "n_trades": len(ranked),
    }

for k in [5, 10, 20, 50,100,200,250]:
    print(f"Top-{k}", simulate_topk(df, k=k))


In [None]:
import numpy as np
import pandas as pd
from typing import List, Optional, Dict, Tuple
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)

def analyze_predictions(
    df: pd.DataFrame,
    target_col: str = "is_positive_growth_30d_future",
    return_col: str = "growth_future_30d",
    prob_cols: Optional[List[str]] = None,   # e.g. ["rf_prob_30d"]
    binary_cols: Optional[List[str]] = None, # e.g. [c for c in df if c.startswith("pred_")]
    k_values: List[int] = [5, 10, 20, 50, 100, 200],
    thresholds: List[float] = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95],
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Unified EDA + Simulation for predictions (without calibration).

    Parameters
    ----------
    df : frame holding the target, the realized return and the prediction columns.
    target_col : 0/1 ground-truth column.
    return_col : realized forward growth as a *factor* (1 + r); it is converted
        to a simple return (factor - 1) before any investment statistic.
    prob_cols : probability columns. Auto-detected (``*_prob_30d`` / ``rf_prob*``)
        when None.
    binary_cols : 0/1 prediction columns. Auto-detected (``pred*`` not in
        ``prob_cols``) when None.
    k_values : cut-offs for precision@k / lift@k and for the Top-K simulations.
        (The list default is only read, never mutated.)
    thresholds : probability cut-offs for the threshold simulations.
        (The list default is only read, never mutated.)

    Returns:
        metrics_df: rows per (strategy, variant) with classification metrics & precision@k/lift@k.
        simulations_df: rows per (strategy, variant) with investment metrics: avg_return, win_rate, sharpe, n_trades,
                        plus std_return, efficiency, ra_efficiency.

    Raises:
        AssertionError: if `target_col` or `return_col` is missing from `df`.

    Notes:
        Trades whose realized return is missing/unparseable are excluded from
        every simulation statistic, so that n_trades, win_rate and
        efficiency (= avg_return * n_trades) all refer to the same set of trades.
    """
    # ---------- helpers ----------
    def precision_at_k(y_true: pd.Series, y_scores: pd.Series, k: int) -> float:
        """Share of positives among the k highest scores (positional alignment; NaN scores rank last)."""
        n = len(y_scores)
        # k <= 0 would make the `[-k:]` slice below select every row.
        if n == 0 or k <= 0: return np.nan
        k = min(k, n)
        scores = y_scores.fillna(float("-inf")).to_numpy()
        if k == n:
            topk_idx = np.argsort(scores)[::-1][:k]
        else:
            part = np.argpartition(scores, -k)[-k:]
            topk_idx = part[np.argsort(scores[part])[::-1]]
        return float(y_true.iloc[topk_idx].mean())

    def lift_at_k(prec_k: float, base_rate: float) -> float:
        """precision@k relative to the base rate; NaN when undefined."""
        if base_rate == 0 or pd.isna(prec_k): return np.nan
        return float(prec_k / base_rate)

    def sharpe_ratio(returns: pd.Series, rf: float = 0.0) -> float:
        """Per-trade (not annualized) mean excess return divided by its sample std."""
        r = pd.to_numeric(returns, errors="coerce").dropna()
        if r.empty: return np.nan
        excess = r - rf
        std = excess.std(ddof=1)
        return float(excess.mean() / (std + 1e-12))

    def _safe_auc(y_true, scores):
        """(ROC-AUC, PR-AUC), or (NaN, NaN) when the scores carry no ranking information."""
        s = pd.to_numeric(scores, errors="coerce")
        if s.nunique(dropna=True) <= 1:
            return np.nan, np.nan
        # Missing scores are pushed below every real score.
        s = s.fillna(s.min() - 1e9)
        return float(roc_auc_score(y_true, s)), float(average_precision_score(y_true, s))

    def _metrics_row(strategy: str, variant: str, kind: str, y_pred: pd.Series,
                     rank_scores: pd.Series, roc_auc: float, pr_auc: float) -> Dict:
        """Classification metrics for one strategy, plus precision@k / lift@k ranked by `rank_scores`."""
        row = {
            "strategy": strategy,
            "variant": variant,
            "type": kind,
            "base_rate": base_rate,
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred, zero_division=0),
            "recall": recall_score(y_true, y_pred, zero_division=0),
            "f1": f1_score(y_true, y_pred, zero_division=0),
            "roc_auc": roc_auc,
            "pr_auc": pr_auc,
        }
        for k in k_values:
            p_at_k = precision_at_k(y_true, rank_scores, k)
            row[f"precision@{k}"] = p_at_k
            row[f"lift@{k}"] = lift_at_k(p_at_k, base_rate)
        return row

    def _simulation_row(strategy: str, variant: str, kind: str, picks_factor: pd.Series) -> Dict:
        """Investment statistics for one set of picked trades (given as growth factors)."""
        # factor -> simple return; drop trades with no realized return so that
        # every statistic below is computed over the same trades.
        picks = (pd.to_numeric(picks_factor, errors="coerce") - 1.0).dropna()
        n_trades = int(len(picks))
        avg_return = float(picks.mean()) if n_trades else np.nan
        std_ret = float(picks.std(ddof=1)) if n_trades > 1 else np.nan
        efficiency = float(avg_return * n_trades)  # NaN when there are no trades
        ra_efficiency = (efficiency / (std_ret + 1e-12)) if not np.isnan(std_ret) else np.nan
        return {
            "strategy": strategy,
            "variant": variant,
            "type": kind,
            "n_trades": n_trades,
            "avg_return": avg_return,
            "win_rate": float((picks > 0).mean()) if n_trades else np.nan,
            "sharpe": sharpe_ratio(picks),
            "std_return": std_ret,
            "efficiency": efficiency,
            "ra_efficiency": ra_efficiency,
        }

    # ---------- inputs & guards ----------
    assert target_col in df.columns, f"Missing target_col: {target_col}"
    assert return_col in df.columns, f"Missing return_col: {return_col}"

    if prob_cols is None:
        prob_cols = [c for c in df.columns if c.endswith("_prob_30d") or c.startswith("rf_prob")]
    if binary_cols is None:
        binary_cols = [c for c in df.columns if c.startswith("pred") and c not in prob_cols]

    y_true = df[target_col].astype(int)
    base_rate = float(y_true.mean())

    metrics_rows: List[Dict] = []
    sim_rows: List[Dict] = []

    # ===== 1) Binary strategies (0/1) =====
    for col in binary_cols:
        # Missing / unparseable predictions count as "no signal" (0).
        y_pred = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

        # AUCs are not defined for hard labels; precision@k ranks by the labels themselves.
        metrics_rows.append(_metrics_row(col, "binary@0.5", "binary", y_pred, y_pred, np.nan, np.nan))

        # Simulation: buy where label == 1
        sim_rows.append(_simulation_row(col, "binary@0.5", "binary", df.loc[y_pred == 1, return_col]))

    # ===== 2) Probabilistic strategies =====
    for col in prob_cols:
        scores = pd.to_numeric(df[col], errors="coerce")

        # Classifier view at 0.5 (missing probability -> not selected)
        y_pred = (scores.fillna(0) >= 0.5).astype(int)
        roc_auc, pr_auc = _safe_auc(y_true, scores)
        metrics_rows.append(_metrics_row(col, "prob_raw", "prob", y_pred, scores, roc_auc, pr_auc))

        # Threshold simulations: buy everything with probability >= τ
        for t in thresholds:
            sim_rows.append(_simulation_row(col, f"τ={t:.2f}", "prob", df.loc[scores >= t, return_col]))

        # Top-K simulations: buy the k highest-probability rows (NaN scores sort last).
        # The ranked returns are built once and sliced per k.
        sorted_idx = scores.sort_values(ascending=False).index
        ranked_returns = df.loc[sorted_idx, return_col]
        for k in k_values:
            sim_rows.append(_simulation_row(col, f"top-{k}", "prob", ranked_returns.iloc[:k]))

    metrics_df = pd.DataFrame(metrics_rows).reset_index(drop=True)
    simulations_df = pd.DataFrame(sim_rows).reset_index(drop=True)
    return metrics_df, simulations_df


In [None]:
prob_cols = ["rf_prob_30d"]
binary_cols = [c for c in df.columns if c.startswith("pred")]

metrics_df, simulations_df = analyze_predictions(
    df.query('split == "test"'),
    target_col="is_positive_growth_30d_future",
    return_col="growth_future_30d",
    prob_cols=prob_cols,
    binary_cols=binary_cols,
    k_values=[5, 10, 20, 50],
    thresholds=[0.4, 0.5, 0.6, 0.7,0.8,0.9,0.95]
)



In [None]:
df.split.unique()

In [None]:

print("=== METRICS ===")
metrics_df.head(20)


In [None]:
print("\n=== SIMULATIONS ===")
(simulations_df)

In [None]:
(simulations_df[["strategy","variant","n_trades","avg_return","win_rate"]]
        .sort_values(["avg_return","n_trades"], ascending= False).head(20))


In [None]:
(simulations_df.query("type=='prob'")[["strategy","variant","n_trades","avg_return","win_rate"]]
        .sort_values(["avg_return","n_trades"], ascending= False).head(20))


In [None]:
import numpy as np
import pandas as pd

def pareto_front(df: pd.DataFrame, cols=("avg_return","sharpe","n_trades")) -> pd.DataFrame:
    """
    Return non-dominated strategies on the given columns (maximize all).

    A row is dominated when some other row is >= on every column and strictly
    > on at least one. NaN is treated as a very bad value (-1e9), so rows with
    missing metrics lose against any row with real values. Exact duplicates do
    not dominate each other and are all kept.

    Parameters
    ----------
    df : frame with one row per strategy.
    cols : numeric columns to maximize.

    Returns
    -------
    The non-dominated rows of `df` (original index labels preserved), sorted
    by `cols` in descending order. An empty `df` is returned unchanged.
    """
    if df.empty:
        return df
    # Sanitize once up front instead of once per pairwise comparison.
    vals = np.nan_to_num(df[list(cols)].to_numpy(), nan=-1e9)
    keep = []
    for i, vi in enumerate(vals):
        # Compare row i against all rows at once. Row i can never dominate
        # itself (nothing is strictly greater), so it needs no special-casing.
        no_worse = (vals >= vi).all(axis=1)
        strictly_better = (vals > vi).any(axis=1)
        if not (no_worse & strictly_better).any():
            keep.append(i)
    return df.iloc[keep].sort_values(list(cols), ascending=False)

def rank_strategies(
    simulations_df: pd.DataFrame,
    prefer_type: str = "prob",     # focus on prob strategies by default
    min_trades: int = 5,           # basic capacity filter
    weights=(1.0, 2.0, 0.2),       # (avg_return, sharpe, log(1+n_trades)) for utility
) -> dict:
    """
    Rank simulated strategies through several lenses.

    The input is first narrowed to rows of `prefer_type` (skipped when falsy)
    with at least `min_trades` trades (skipped when None). A weighted
    `utility` column is then added; missing avg_return / sharpe are scored
    as -1e9 so such rows sink to the bottom.

    Returns a dict of DataFrames:
      - 'filtered': the narrowed rows, ordered by strategy and variant
      - 'pareto': Pareto frontier on (avg_return, sharpe, n_trades)
      - 'top_utility': best 20 by weighted utility
      - 'top_efficiency': best 20 by the `efficiency` column
      - 'top_ra_efficiency': best 20 by the `ra_efficiency` column
    An empty input is returned as-is under every key.
    """
    if simulations_df.empty:
        view_names = ("filtered", "pareto", "top_utility", "top_efficiency", "top_ra_efficiency")
        return {name: simulations_df for name in view_names}

    candidates = simulations_df.copy()
    if prefer_type:
        candidates = candidates[candidates["type"] == prefer_type].copy()
    if min_trades is not None:
        candidates = candidates[candidates["n_trades"] >= min_trades].copy()

    # Weighted utility: return + risk-adjusted return + (log-damped) capacity
    w_return, w_sharpe, w_capacity = weights
    return_term = w_return * candidates["avg_return"].fillna(-1e9)
    sharpe_term = w_sharpe * candidates["sharpe"].fillna(-1e9)
    capacity_term = w_capacity * np.log1p(candidates["n_trades"].clip(lower=0))
    candidates["utility"] = return_term + sharpe_term + capacity_term

    def _best_20(by: str) -> pd.DataFrame:
        # Highest values first, capped at 20 rows, with a fresh 0..n index
        return candidates.sort_values(by, ascending=False).head(20).reset_index(drop=True)

    filtered_view = candidates.sort_values(["strategy","variant"]).reset_index(drop=True)
    pareto_view = pareto_front(candidates, cols=("avg_return","sharpe","n_trades")).reset_index(drop=True)
    return {
        "filtered": filtered_view,
        "pareto": pareto_view,
        "top_utility": _best_20("utility"),
        "top_efficiency": _best_20("efficiency"),
        "top_ra_efficiency": _best_20("ra_efficiency"),
    }


In [None]:
# Select best strategies with multiple lenses
views = rank_strategies(simulations_df, prefer_type="prob", min_trades=5, weights=(1.0, 2.0, 0.2))

pareto_df         = views["pareto"]
top_utility_df    = views["top_utility"]
top_eff_df        = views["top_efficiency"]
top_ra_eff_df     = views["top_ra_efficiency"]

print("Pareto frontier:\n", pareto_df.head(10))
print("\nTop by utility:\n", top_utility_df.head(10))
print("\nTop by efficiency:\n", top_eff_df.head(10))
print("\nTop by risk-adjusted efficiency:\n", top_ra_eff_df.head(10))

In [None]:
import numpy as np
import pandas as pd
from typing import List, Optional, Dict, Tuple

# ----------------------------
# Utilities
# ----------------------------
def _ensure_dt(s: pd.Series) -> pd.Series:
    s = pd.to_datetime(s, errors="coerce")
    if s.isna().any():
        raise ValueError("Some dates could not be parsed. Check your date_col.")
    return s

def _max_drawdown_from_equity(equity: pd.Series) -> Tuple[float, float]:
    """Return (max_dd_abs, max_dd_pct). equity must be indexed by date and positive."""
    if equity.empty:
        return 0.0, 0.0
    peaks = equity.cummax()
    dd = equity - peaks
    dd_pct = equity / peaks - 1.0
    return float(dd.min()), float(dd_pct.min())  # negatives

def _sharpe(series: pd.Series, rf: float = 0.0) -> float:
    x = pd.to_numeric(series, errors="coerce").dropna()
    if x.empty:
        return np.nan
    excess = x - rf
    std = excess.std(ddof=1)
    return float(excess.mean() / (std + 1e-12))

# ----------------------------
# Core simulator
# ----------------------------
def simulate_from_predictions(
    df: pd.DataFrame,
    date_col: str,
    return_col: str = "growth_future_30d",     # realized forward *factor* (1+r), e.g., 1.03 for +3%
    prob_cols: Optional[List[str]] = None,     # e.g., ["rf_prob_30d", "xgb_prob_30d"]
    binary_cols: Optional[List[str]] = None,   # e.g., all columns starting with "pred_"
    k_values: List[int] = (5, 10, 20, 50),     # per-date Top-K by prob
    prob_thresholds: List[float] = (0.5,0.55,0.6,0.65,0.7,0.75, 0.8,0.85, 0.9,0.95),  # per-date prob cuts (global thresholds)
    hold_days: int = 30,                       # holding window in days
    invest_per_trade: float = 1000.0,          # notional per position
    fee_rate: float = 0.0005,                  # round-trip fee as fraction of notional (e.g., 5 bps)
    split_col: Optional[str] = None,           # if you want to restrict to a split
    split_name: Optional[str] = None,          # e.g., "test"
    risk_free_annual: float = 0.0,             # set to e.g. 0.05 for 5% annual US T-bill equiv
) -> pd.DataFrame:
    """
    Simulate fixed-notional trading strategies derived from prediction columns.

    Every selected row is one trade: ``invest_per_trade`` is invested on the
    row's date, held for ``hold_days`` calendar days and its P&L is booked on
    the exit day. Three families of strategies are evaluated:

    * binary columns   -> trade every row where the column equals 1 ("binary@1")
    * probability cols -> per-date Top-K rows by score ("top-K")
    * probability cols -> every row whose score is >= a threshold ("τ=0.xx")

    Parameters
    ----------
    df : input frame; it is not modified.
    date_col : column with the trade (prediction) date.
    return_col : realized forward growth *factor* (1+r); converted to a simple
        return internally (r = factor - 1). Rows with a missing factor are not
        traded.
    prob_cols / binary_cols : strategy columns. When None they are auto-detected
        from the column names (``*_prob_30d`` / ``rf_prob*`` and ``pred*``).
    k_values, prob_thresholds : variants evaluated for every probability column.
    hold_days : holding window in calendar days, must be >= 1.
    invest_per_trade, fee_rate : notional per trade and round-trip fee fraction.
    split_col, split_name : optional restriction to ``df[split_col] == split_name``.
    risk_free_annual : annual risk-free rate used for the per-trade Sharpe.

    Returns
    -------
    A summary DataFrame: one row per (strategy, variant) with:
      n_trades, avg_return, win_rate, per_trade_sharpe, daily_sharpe,
      capital_required, net_pnl, max_dd_abs, max_dd_pct, cagr,
      avg_pos_per_day, p75_pos_per_day, efficiency, ra_efficiency,
    sorted by strategy and variant.

    Raises
    ------
    AssertionError : ``return_col`` or ``date_col`` is not a column of ``df``.
    ValueError : ``hold_days`` < 1, the requested split is empty, no strategy
        columns were found, dates cannot be parsed, or ``return_col`` does not
        look like a factor (median outside (0.5, 1.5)).
    """
    assert return_col in df.columns, f"{return_col} not in df"
    assert date_col in df.columns, f"{date_col} not in df"
    if hold_days < 1:
        raise ValueError(f"hold_days must be >= 1, got {hold_days}")

    # Work on a private copy, optionally restricted to one split
    if split_col and split_name:
        work = df.loc[df[split_col] == split_name].copy()
        if work.empty:
            raise ValueError(f"No rows found for {split_col} == {split_name}")
    else:
        work = df.copy()

    # Autodetect columns if not provided
    if prob_cols is None:
        prob_cols = [c for c in work.columns if c.endswith("_prob_30d") or c.startswith("rf_prob")]
    if binary_cols is None:
        binary_cols = [c for c in work.columns if c.lower().startswith("pred") and c not in prob_cols]

    # Guard
    if len(prob_cols) == 0 and len(binary_cols) == 0:
        raise ValueError("No strategies found. Provide prob_cols and/or binary_cols.")

    # Prepare: parsed dates, chronological order. The sort is stable so that rows
    # sharing a date keep their input order (this decides Top-K ties deterministically).
    work[date_col] = _ensure_dt(work[date_col])
    work = work.sort_values(date_col, kind="stable").reset_index(drop=True)

    # Sanity: ensure return_col looks like a factor (~1.0 median)
    s = pd.to_numeric(work[return_col], errors="coerce").dropna()
    if not (0.5 < s.median() < 1.5):
        raise ValueError(
            f"{return_col} doesn't look like a factor (1+r). "
            f"If it's already simple returns, adjust the code to skip '- 1.0'. "
            f"Median value: {s.median():.4f}"
        )

    # Risk-free per trade (approx over hold_days)
    rf_per_trade = (1.0 + risk_free_annual)**(hold_days/365.25) - 1.0 if risk_free_annual > 0 else 0.0

    summaries: List[Dict] = []

    # Helper: compute summary from selected rows ("trades")
    def summarize_trades(trades: pd.DataFrame, label_strategy: str, label_variant: str) -> Dict:
        # --- Realized returns per trade (FACTOR -> SIMPLE) ---
        ret_factor = pd.to_numeric(trades[return_col], errors="coerce")
        has_return = ret_factor.notna().to_numpy()
        ret = ret_factor.dropna() - 1.0            # simple returns (e.g., 1.03 -> 0.03)
        n_trades = int(len(ret))

        if n_trades == 0:
            return {
                "strategy": label_strategy, "variant": label_variant,
                "n_trades": 0, "avg_return": np.nan, "win_rate": np.nan,
                "per_trade_sharpe": np.nan, "daily_sharpe": np.nan,
                "capital_required": 0.0, "net_pnl": 0.0,
                "max_dd_abs": 0.0, "max_dd_pct": 0.0, "cagr": 0.0,
                "avg_pos_per_day": 0.0, "p75_pos_per_day": 0.0,
                "efficiency": np.nan, "ra_efficiency": np.nan
            }

        # --- Per-trade economics (use SIMPLE return) ---
        gross = invest_per_trade * ret              # $ P&L per trade before fees
        fees  = -invest_per_trade * fee_rate        # constant round-trip fee per trade
        net_per_trade = gross + fees

        net_pnl = float(net_per_trade.sum())
        avg_return = float(ret.mean())
        win_rate = float((ret > 0).mean())
        per_trade_sharpe = _sharpe(ret, rf=rf_per_trade)

        # --- Entry/exit dates, only for trades that have a realized return ---
        # Using the same row filter as `ret` keeps P&L and dates the same length;
        # rows with an unknown return are neither booked nor counted as positions.
        # Dates are normalized to midnight so they line up with the daily calendar.
        entry = (
            pd.to_datetime(trades.loc[has_return, date_col])
            .dt.normalize()
            .reset_index(drop=True)
        )
        exit_ = entry + pd.Timedelta(days=hold_days - 1)

        # --- Daily P&L (book on EXIT day) ---
        pnl = pd.Series(net_per_trade.to_numpy(), index=exit_.index)
        daily = pnl.groupby(exit_).sum()

        # --- Active positions per day via difference array (vectorized) ---
        idx = pd.date_range(entry.min(), exit_.max(), freq="D", name="date")
        opened = entry.value_counts().reindex(idx, fill_value=0)
        # A position stops counting the day after its exit. Closings that fall
        # after the last calendar day are dropped by the reindex, so the series
        # stays exactly on the backtest window.
        closed = (exit_ + pd.Timedelta(days=1)).value_counts().reindex(idx, fill_value=0)
        active_pos = (opened - closed).cumsum().astype(float)

        avg_pos = float(active_pos.mean())
        p75_pos = float(active_pos.quantile(0.75))
        pos_for_cap = max(avg_pos, p75_pos, 1.0)          # ≥1 position equivalent
        capital_required = float(invest_per_trade * pos_for_cap)

        # --- Equity & drawdown ---
        daily_all = daily.reindex(idx, fill_value=0.0)
        equity = capital_required + daily_all.cumsum()

        max_dd_abs, max_dd_pct = _max_drawdown_from_equity(equity)

        # --- Daily Sharpe: time-weighted by capital in use ---
        cap_in_use = invest_per_trade * active_pos
        daily_ret_proxy = (daily_all / cap_in_use.replace(0, np.nan)).fillna(0.0)
        daily_sharpe = _sharpe(daily_ret_proxy, rf=0.0)

        # --- CAGR over the full backtest interval ---
        years = max((idx[-1] - idx[0]).days / 365.25, 0.5)   # ≥ 6 months
        starting = max(capital_required, invest_per_trade)
        ending   = float(equity.iloc[-1])
        growth = ending / starting
        # A non-positive ratio means the capital was wiped out. A fractional power
        # of a negative float is complex in Python, so report a total loss instead.
        cagr = float(growth ** (1.0 / years) - 1.0) if growth > 0 else -1.0

        # Custom efficiency metrics
        efficiency = float(avg_return * n_trades)
        std_ret = float(ret.std(ddof=1)) if n_trades > 1 else np.nan
        ra_efficiency = (efficiency / (std_ret + 1e-12)) if std_ret == std_ret else np.nan

        return {
            "strategy": label_strategy, "variant": label_variant,
            "n_trades": n_trades, "avg_return": avg_return, "win_rate": win_rate,
            "per_trade_sharpe": per_trade_sharpe, "daily_sharpe": daily_sharpe,
            "capital_required": capital_required, "net_pnl": net_pnl,
            "max_dd_abs": max_dd_abs, "max_dd_pct": max_dd_pct, "cagr": cagr,
            "avg_pos_per_day": avg_pos, "p75_pos_per_day": p75_pos,
            "efficiency": efficiency, "ra_efficiency": ra_efficiency
        }

    # ---------------------------------
    # A) Binary strategies (per-date)
    # ---------------------------------
    for col in (binary_cols or []):
        if col not in work.columns:
            continue
        trades = work.loc[work[col].astype(float) == 1.0, [date_col, return_col]]
        summaries.append(summarize_trades(trades, col, "binary@1"))

    # ---------------------------------
    # B) Prob strategies: per-date Top-K & thresholds
    # ---------------------------------
    for col in (prob_cols or []):
        if col not in work.columns:
            continue
        scores = pd.to_numeric(work[col], errors="coerce")

        # Top-K per date: rank each row within its date once (1 = highest score,
        # ties broken by row order), then every k is a plain filter. This avoids
        # groupby.apply, whose handling of the grouping column is deprecated.
        tmp = work[[date_col, return_col]].assign(_score=scores).dropna(subset=["_score"])
        day_rank = tmp.groupby(date_col)["_score"].rank(method="first", ascending=False)
        for k in k_values:
            picks = tmp.loc[day_rank <= k, [date_col, return_col]]
            summaries.append(summarize_trades(picks, col, f"top-{k}"))

        # Threshold (scores >= τ)
        for t in prob_thresholds:
            picks = work.loc[scores >= t, [date_col, return_col]]
            summaries.append(summarize_trades(picks, col, f"τ={t:.2f}"))

    result = pd.DataFrame(summaries)
    if not result.empty:
        result = result.sort_values(["strategy", "variant"]).reset_index(drop=True)
    return result


In [None]:
# Example wiring (tweak names if yours differ): run the simulator on the test
# split, then add an annualized figure and percentage-formatted columns.
hold_days = 30  # holding window, shared by the simulator call and the annualization

summary = simulate_from_predictions(
    df=df,                                  # the loaded predictions dataframe
    date_col="Date",                        # timestamp column
    return_col="growth_future_30d",         # realized forward growth factor
    prob_cols=["rf_prob_30d"],              # extend with further probability columns
    binary_cols=[name for name in df.columns if name.startswith("pred")],
    k_values=[5, 10, 20, 50, 75, 100],
    prob_thresholds=[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
    hold_days=hold_days,
    invest_per_trade=1000.0,
    fee_rate=0.0005,
    split_col="split",                      # column holding the split label
    split_name="test",                      # evaluate on the test split only
    risk_free_annual=0.05,                  # 5% annual risk-free rate for the per-trade Sharpe
)

# Per-trade annualization (geometric): (1 + avg_return) ** (365 / hold_days) - 1,
# computed in log space; the clip keeps log1p finite for returns near -100%.
log_growth = (
    np.log1p(summary["avg_return"].clip(lower=-0.999999))
    .replace([-np.inf, np.inf], np.nan)
)
summary = summary.assign(
    per_trade_annualized=np.exp((365 / hold_days) * log_growth) - 1
)

# Percentage versions (two decimals) of the headline return columns
summary = summary.assign(**{
    f"{name}_%": (summary[name] * 100).round(2)
    for name in ("avg_return", "per_trade_annualized", "cagr")
})



In [None]:
# Headline columns only: identity of the strategy, trade count and the
# percentage-formatted return figures.
summary[[
    "strategy",
    "variant",
    "n_trades",
    "avg_return_%",
    "win_rate",
    "per_trade_annualized_%",
    "cagr_%",
]]

In [None]:
# Best average per-trade return on top; equal averages are ordered by the
# smaller trade count first. Rows without an average go to the bottom.
summary.sort_values(["avg_return", "n_trades"], ascending=[False, True], na_position="last")

# How to read the two headline metrics:
# 1) avg_return: the mean simple return per trade over the hold window. Report it as: “Average 30-day return per trade: +X%.”
# 2) cagr: the annualized growth rate of the portfolio equity over the whole backtest, starting from capital_required and adding daily P&L (P&L is booked on the exit day). Report it as: “Run as a portfolio with realistic concurrency, the strategy achieved a CAGR of Z% over the backtest period.”

In [None]:
# Check if your returns are realistic.
# growth_future_30d is a growth factor (1.0 == flat). Rows whose forward return
# is missing are dropped first: `NaN > 1.0` evaluates to False, so keeping them
# would count them as losers and understate the win rate.
growth = df['growth_future_30d'].dropna()

ret_stats = growth.describe()
print(f"Return distribution:\n{ret_stats}")

# Check win rate vs market: share of rows with a known return that ended above 1.0
market_win_rate = (growth > 1.0).mean()
print(f"Market win rate: {market_win_rate:.3f}")


In [None]:
# Check your target variable timing.
# Dates are parsed explicitly so the arithmetic below also works when the
# predictions were loaded from CSV and `Date` is still a string column.
dates = pd.to_datetime(df['Date'])
# NOTE(review): assumes growth_future_30d spans 30 calendar days after `Date` — confirm
# against the feature pipeline (it may be 30 trading days).
horizon = pd.Timedelta(days=30)

print("Feature date range:", dates.min(), "to", dates.max())
print("Target date range for growth_future_30d:", dates.min(), "to", dates.max() + horizon)

# Make sure predictions are made BEFORE target period
sample_pos = min(100, len(df) - 1)       # row 100, or the last row of a shorter frame
sample_row = df.iloc[sample_pos]
sample_date = dates.iloc[sample_pos]
print(f"Prediction made on: {sample_date}")
print(f"Return measured from: {sample_date} to {sample_date + horizon}")

In [None]:
# Leakage screen: list every column whose name mentions "future" (case-insensitive).
suspicious_features = list(
    filter(lambda name: 'future' in name.lower(), df.columns)
)
print("Suspicious features:", suspicious_features)

In [None]:
# Chronology check: the test split should begin after the training split ends.
# Collect the dates of every split first, then print their ranges.
train_dates = df.loc[df['split'].eq('train'), 'Date']
test_dates = df.loc[df['split'].eq('test'), 'Date']
valid_dates = df.loc[df['split'].eq('validation'), 'Date']

print(f"Train period: {train_dates.min()} to {train_dates.max()}")
print(f"Test period: {test_dates.min()} to {test_dates.max()}")
print(f"Valid period: {valid_dates.min()} to {valid_dates.max()}")

# Expected output of the comparison: True
print(f"Test starts after train ends: {test_dates.min() > train_dates.max()}")