In [1]:
import backtrader as bt
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import yfinance as yf
import pickle
from typing import List, Dict
from backtrader.feeds import PandasData
import sys
import os

In [2]:
# Load model
with open('xgb_model_simp.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [3]:
def get_sp500_tickers():
    """Get S&P 500 tickers from Wikipedia"""
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    tables = pd.read_html(url)
    sp500_table = tables[0]
    return sp500_table['Symbol'].tolist()
sp500_tickers = get_sp500_tickers()
print(f"S&P 500 tickers: {len(sp500_tickers)}")

S&P 500 tickers: 503


In [4]:
data = yf.download(sp500_tickers[:2], start='2020-01-01', end='2023-12-31')
data.xs('AOS', axis=1, level=1)

  data = yf.download(sp500_tickers[:2], start='2020-01-01', end='2023-12-31')
[*********************100%***********************]  2 of 2 completed


Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,42.994770,43.183777,42.490748,43.120775,1093200
2020-01-03,42.616760,42.751767,42.076739,42.445754,883200
2020-01-06,42.886768,42.904766,42.085732,42.418749,1088400
2020-01-07,42.598759,42.967774,42.427749,42.661761,868200
2020-01-08,42.535751,43.030774,42.472749,42.679757,1119100
...,...,...,...,...,...
2023-12-22,79.236603,79.527446,78.761546,79.449885,449400
2023-12-26,79.352936,79.556528,78.936054,79.207510,420400
2023-12-27,79.653481,79.886157,79.197818,79.478972,441700
2023-12-28,79.731041,79.924944,79.333553,79.430501,452800


In [5]:
class TechnicalIndicators:
    """
    A class to calculate various technical indicators for stock price data
    """
    
    @staticmethod
    def moving_average(data: pd.Series, window: int) -> pd.Series:
        """Calculate Simple Moving Average"""
        return data.rolling(window=window).mean()
    
    @staticmethod
    def exponential_moving_average(data: pd.Series, window: int) -> pd.Series:
        """Calculate Exponential Moving Average"""
        return data.ewm(span=window).mean()
    
    @staticmethod
    def returns(data: pd.Series, periods: int = 1) -> pd.Series:
        """Calculate returns over specified periods"""
        return data.pct_change(periods=periods)
    
    @staticmethod
    def future_returns(data: pd.Series, periods: int = 1) -> pd.Series:
        """Calculate future returns over specified periods"""
        return (data.shift(periods=periods) - data) / data

    @staticmethod
    def volatility(data: pd.Series, window: int) -> pd.Series:
        """Calculate rolling volatility (standard deviation of returns)"""
        returns = data.pct_change()
        return returns.rolling(window=window).std() * np.sqrt(252)  # Annualized
    
    @staticmethod
    def rsi(data: pd.Series, window: int = 14) -> pd.Series:
        """Calculate Relative Strength Index"""
        delta = data.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
        rs = gain / loss
        rsi = 100 - (100 / (1 + rs))
        return rsi
    
    @staticmethod
    def macd(data: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9) -> Dict[str, pd.Series]:
        """Calculate MACD (Moving Average Convergence Divergence)"""
        ema_fast = data.ewm(span=fast).mean()
        ema_slow = data.ewm(span=slow).mean()
        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal).mean()
        histogram = macd_line - signal_line
        
        return {
            'MACD': macd_line,
            'MACD_Signal': signal_line,
            'MACD_Histogram': histogram
        }
    
    @staticmethod
    def bollinger_bands(data: pd.Series, window: int = 20, num_std: float = 2) -> Dict[str, pd.Series]:
        """Calculate Bollinger Bands"""
        sma = data.rolling(window=window).mean()
        std = data.rolling(window=window).std()
        
        return {
            'BB_Upper': sma + (std * num_std),
            'BB_Middle': sma,
            'BB_Lower': sma - (std * num_std),
            'BB_Width': (sma + (std * num_std)) - (sma - (std * num_std)),
            'BB_Position': (data - sma) / (std * num_std)
        }
    
    @staticmethod
    def average_true_range(high: pd.Series, low: pd.Series, close: pd.Series, window: int = 14) -> pd.Series:
        """Calculate Average True Range"""
        high_low = high - low
        high_close = np.abs(high - close.shift())
        low_close = np.abs(low - close.shift())
        
        true_range = np.maximum(high_low, np.maximum(high_close, low_close))
        atr = true_range.rolling(window=window).mean()
        return atr
    
    @staticmethod
    def stochastic_oscillator(high: pd.Series, low: pd.Series, close: pd.Series, 
                            k_window: int = 14, d_window: int = 3) -> Dict[str, pd.Series]:
        """Calculate Stochastic Oscillator"""
        lowest_low = low.rolling(window=k_window).min()
        highest_high = high.rolling(window=k_window).max()
        
        k_percent = 100 * ((close - lowest_low) / (highest_high - lowest_low))
        d_percent = k_percent.rolling(window=d_window).mean()
        
        return {
            'Stoch_K': k_percent,
            'Stoch_D': d_percent
        }

In [6]:
def calculate_all_indicators(stock_data, ticker) -> pd.DataFrame:
    """
    Calculate all technical indicators for all stocks
    """
    all_indicators = []

    
    # print("Calculating technical indicators...")
    
    try:
        # Basic price data
        close = stock_data.loc[:,'Close']
        high = stock_data.loc[:,'High']
        low = stock_data.loc[:,'Low']
        volume = stock_data.loc[:,'Volume']
        
        # Create a DataFrame for this stock's indicators
        indicators_df = pd.DataFrame(index=stock_data.index)
        indicators_df['Close'] = close
        indicators_df['High'] = high
        indicators_df['Low'] = low
        indicators_df['Volume'] = volume

        

        # Future Returns
        for period in [-1, -5, -10, -21, -63]:
            indicators_df[f'Future_Return_{abs(period)}d'] = TechnicalIndicators.future_returns(close, period)

        # Moving Averages
        for window in [5, 10, 20, 50, 100, 200]:
            indicators_df[f'SMA_{window}'] = TechnicalIndicators.moving_average(close, window)
            indicators_df[f'EMA_{window}'] = TechnicalIndicators.exponential_moving_average(close, window)
        
        # Returns
        for period in [1, 5, 10, 21, 63]:
            indicators_df[f'Return_{period}d'] = TechnicalIndicators.returns(close, period)

        # Volatility
        for window in [10, 21, 63]:
            indicators_df[f'Volatility_{window}d'] = TechnicalIndicators.volatility(close, window)
        
        # RSI
        indicators_df['RSI_14'] = TechnicalIndicators.rsi(close, 14)
        indicators_df['RSI_21'] = TechnicalIndicators.rsi(close, 21)
        
        # MACD
        macd_data = TechnicalIndicators.macd(close)
        for key, value in macd_data.items():
            indicators_df[key] = value
        
        # Bollinger Bands
        bb_data = TechnicalIndicators.bollinger_bands(close)
        for key, value in bb_data.items():
            indicators_df[key] = value
        
        # ATR
        indicators_df['ATR_14'] = TechnicalIndicators.average_true_range(high, low, close, 14)
        
        # Stochastic Oscillator
        stoch_data = TechnicalIndicators.stochastic_oscillator(high, low, close)
        for key, value in stoch_data.items():
            indicators_df[key] = value
        
        # Volume indicators
        indicators_df['Volume_SMA_20'] = TechnicalIndicators.moving_average(volume, 20)
        indicators_df['Volume_Ratio'] = volume / indicators_df['Volume_SMA_20']
        
        # Price momentum
        indicators_df['Momentum_10'] = close / close.shift(10) - 1
        indicators_df['Momentum_21'] = close / close.shift(21) - 1
        
        # Price position relative to moving averages
        indicators_df['Price_vs_SMA20'] = close / indicators_df['SMA_20'] - 1
        indicators_df['Price_vs_SMA50'] = close / indicators_df['SMA_50'] - 1
        
        # print(f"✓ Calculated indicators for {ticker}")
        # print(indicators_df)
        return indicators_df
        
    except Exception as e:
        print(f"✗ Error calculating indicators for {ticker}: {str(e)}")
    
    # # Combine all data
    # if all_indicators:
    #     combined_df = pd.concat(all_indicators, ignore_index=False)
    #     combined_df.reset_index(inplace=True)
    #     combined_df.rename(columns={'index': 'Date'}, inplace=True)
        
    #     # Set MultiIndex
    #     combined_df.set_index(['Date', 'Ticker'], inplace=True)
        
    #     print(f"\nFinal dataset shape: {combined_df.shape}")
    #     print(f"Columns: {list(combined_df.columns)}")
        
    #     return combined_df
    # else:
    #     print("No data to combine!")
    #     return pd.DataFrame()



In [7]:
class SignalData(PandasData):
    """
    Define pandas DataFrame structure
    """
    cols = ['close', 'high', 'low', 'open', 'volume'] + ['predict']
# create lines
    lines = tuple(cols)
# define parameters
    params = {c: -1 for c in cols}
    params.update({'datetime': None})
    params = tuple(params.items())


In [8]:
params = {c: -1 for c in ['close', 'high', 'low', 'open', 'volume'] + ['predict']}
params.update({'datetime': None})
params = tuple(params.items())
params

(('close', -1),
 ('high', -1),
 ('low', -1),
 ('open', -1),
 ('volume', -1),
 ('predict', -1),
 ('datetime', None))

In [54]:
def silent_download(*args, **kwargs):
    original_stdout = sys.stdout
    try:
        sys.stdout = open(os.devnull, 'w')
        return yf.download(*args, **kwargs)
    finally:
        sys.stdout.close()
        sys.stdout = original_stdout

# Custom analyzer for portfolio tracking
class PortfolioValue(bt.Analyzer):
    def __init__(self):
        
        self.portfolio_values = []
    
    def next(self):
        self.portfolio_values.append({
            'date': self.strategy.datetime.date(0),
            'value': self.strategy.broker.getvalue()
        })
    
    def get_analysis(self):
        return self.portfolio_values

class TestStrategy(bt.Strategy):
    params = (
        ('top_n', 10),  # Number of top stocks to buy
        ('rebalance_freq', 7),  # Rebalance every 7 calendar days (1 week)
    )
    
    
    def log(self, txt, dt=None):
        ''' Logging function for this strategy'''
        dt = dt or self.datas[0].datetime.date(0)
        print('%s, %s' % (dt.isoformat(), txt))


    def __init__(self):
        # Keep a reference to the "close" line in the data[0] dataseries
        self.dataclose = self.datas[0].close
        self.datapredict = self.datas[0].predict
        self.last_trade_date = None
        self.current_positions = []

    def first_trade_day_of_week(self, last_chosen, current):
        """
        Determines if current_date should be picked based on Monday-or-next logic.
        
        Args:
            last_chosen_date: Last date that was chosen (yyyy-mm-dd string) or None if first iteration
            current_date: Current date being evaluated (yyyy-mm-dd string)
        
        Returns:
            bool: True if current_date should be picked
        """
        
        # If this is the first date, pick it
        if last_chosen is None:
            if current.weekday() != 0:
                return False
            if current.weekday() == 0:
                return True
        
        # Find the Monday of current date's week
        current_monday = current - timedelta(days=current.weekday())
        
        # Find the Monday of last chosen date's week
        last_monday = last_chosen - timedelta(days=last_chosen.weekday())
        
        # If we're in a new week
        if current_monday > last_monday:
            # Pick this date if it's Monday OR if it's the first date we see in this new week
            return current.weekday() == 0 or current_monday <= current
        
        # If we're in the same week, don't pick
        return False

    def next(self):
        # Rebalance every week (Monday or first trading day of the week)
        cur_date = self.datas[0].datetime.date(0)
        
        if self.first_trade_day_of_week(self.last_trade_date, cur_date):
            self.rebalance_portfolio()
            self.last_trade_date = cur_date

    def rebalance_portfolio(self):
        """Rebalance portfolio based on XGBoost predictions"""
        
        # Close all current positions
        for data in self.datas:
            if self.getposition(data).size != 0:
                self.close(data=data)
        
        
        # Sort by predicted returns and get top N
        predictions = pd.Series()
        for data in self.datas:
            predictions[data._name] = data.predict[0]
        top_stocks = predictions.nlargest(self.params.top_n)
        
        # Calculate position size (equal weight)
        available_cash = self.broker.getcash()
        position_size = (available_cash / len(top_stocks)) * .95
        
        # Buy top predicted stocks
        for ticker, predicted_return in top_stocks.items():
            data = self.get_data_by_name(ticker)
            if data is not None:
                size = int(position_size / data.close[0])
                if size > 0:
                    self.buy(data=data, size=size)
                    self.log(f"Buying {size} shares of {ticker} at {data.close[0]:.2f} (predicted return: {predicted_return:.4f})")
    
    def get_data_by_name(self, name):
        """Get data feed by ticker name"""
        for data in self.datas:
            if data._name == name:
                return data
        return None
    
    def notify_order(self, order):
        if order.status in [order.Completed]:
            if order.isbuy():
                self.log(f'BUY EXECUTED, Price: {order.executed.price}, Size: {order.executed.size}')
            elif order.issell():
                self.log(f'SELL EXECUTED, Price: {order.executed.price}, Size: {order.executed.size}')
        if order.status in [order.Rejected, order.Canceled]:
            self.log(f'Order {order.Status[order.status]}')
    


def run_backtest(sp500_tickers, start_date='2017-05-01', end_date='2020-05-01'):
    """
    Run the backtest with your XGBoost model
    """
    cerebro = bt.Cerebro()

    # Add a strategy
    cerebro.addstrategy(TestStrategy)

    values_to_remove = ['BRK.B', 'BF.B']
    sample_tickers = [x for x in sp500_tickers if x not in values_to_remove]

    cv_date = '2018-05-01'
    pull_date = datetime(2018, 5, 1) - timedelta(days=300)
    all_stock_data = silent_download(sample_tickers, start=pull_date, end=end_date, progress=False)
    all_stock_data.dropna(inplace=True)
    for ticker in sample_tickers:
        try:
            stock_data = all_stock_data.xs(ticker, axis=1, level=1)
            features = calculate_all_indicators(stock_data, ticker)
            features = features.loc[cv_date:]
            X_test = features.drop(['Future_Return_1d', 'Future_Return_5d', 'Future_Return_10d', 'Future_Return_21d', 'Future_Return_63d'], axis=1)
            X_test = X_test[['Price_vs_SMA50', 'Return_63d', 'RSI_21', 'MACD_Histogram', 'Return_5d', 'SMA_200', 'SMA_100']]
            stock_data = stock_data.loc[cv_date:]
            stock_data['predict'] = loaded_model.predict(X_test)
            stock_data.rename(columns = {'Open':'open','High':'high','Low':'low','Close':'close','Volume':'volume',
                                    }, inplace=True)
            if len(data) > 100:  # Ensure sufficient data
                bt_data = SignalData(dataname=stock_data)
                cerebro.adddata(bt_data, name=ticker)
        except:
            continue

    # Set initial cash
    cerebro.broker.setcash(100000.0)
    
    # Set commission
    cerebro.broker.setcommission(commission=0.001)  # 0.1% commission

    # Add analyzers
    cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe')
    cerebro.addanalyzer(bt.analyzers.Returns, _name='returns')
    cerebro.addanalyzer(bt.analyzers.DrawDown, _name='drawdown')
    cerebro.addanalyzer(PortfolioValue, _name='portfolio')


    print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())

    # Run backtest
    results = cerebro.run()

    print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())

    # Print results
    strat = results[0]
    print(f'Sharpe Ratio: {strat.analyzers.sharpe.get_analysis()["sharperatio"]:.2f}')
    print(f'Total Return: {strat.analyzers.returns.get_analysis()["rtot"]:.2%}')
    print(f'Max Drawdown: {strat.analyzers.drawdown.get_analysis()["max"]["drawdown"]:.2%}')
    print(strat.analyzers.portfolio.get_analysis())
    
    return strat

In [20]:
results.analyzers.sharpe.get_analysis()

OrderedDict([('sharperatio', 2.3066087128248283)])

In [None]:
# Calculate Sharpe Ratio
def calculate_sharpe_ratio(daily_returns, risk_free_rate=0.02):
    """
    Calculate annualized Sharpe ratio from daily returns
    
    Parameters:
    daily_returns: pandas Series or numpy array of daily returns (as decimals, e.g., 0.01 for 1%)
    risk_free_rate: annual risk-free rate (default 2%)
    
    Returns:
    sharpe_ratio: annualized Sharpe ratio
    """
    
    # Convert to numpy array if pandas Series
    if isinstance(daily_returns, pd.Series):
        returns = daily_returns.values
    else:
        returns = daily_returns
    
    # Calculate daily risk-free rate
    daily_rf_rate = risk_free_rate / 252  # Assuming 252 trading days per year
    
    # Calculate excess returns
    excess_returns = returns - daily_rf_rate
    
    # Calculate mean and standard deviation of excess returns
    mean_excess_return = np.mean(excess_returns)
    std_excess_return = np.std(excess_returns, ddof=1)  # Sample standard deviation
    
    # Annualize the Sharpe ratio
    sharpe_ratio = (mean_excess_return / std_excess_return) * np.sqrt(252)
    
    return sharpe_ratio

Sample portfolio returns:
        date  daily_return
0 2023-01-01      0.010734
1 2023-01-02     -0.001965
2 2023-01-03      0.013754
3 2023-01-04      0.031261
4 2023-01-05     -0.003883
5 2023-01-06     -0.003883
6 2023-01-07      0.032384
7 2023-01-08      0.016149
8 2023-01-09     -0.008589
9 2023-01-10      0.011651

Total observations: 252

--- Sharpe Ratio Calculation ---
Annualized Sharpe Ratio: 0.530

--- Additional Metrics ---
Annualized Return: 18.3%
Annualized Volatility: 30.7%
Risk-free rate used: 2.0%


In [42]:
test = pd.DataFrame(results.analyzers.portfolio.get_analysis())
test['return'] = test['value'] / test['value'].shift() - 1
returns = test[test['date'] <= pd.to_datetime('2025-05-07')]['return'][1:]

rf_rate = .02

sharpe = calculate_sharpe_ratio(returns, rf_rate)

print(f"\n--- Sharpe Ratio Calculation ---")
print(f"Annualized Sharpe Ratio: {sharpe:.3f}")


annual_return = returns.mean() * 252
annual_volatility = returns.std() * np.sqrt(252)

print(f"\n--- Additional Metrics ---")
print(f"Annualized Return: {annual_return:.1%}")
print(f"Annualized Volatility: {annual_volatility:.1%}")
print(f"Risk-free rate used: {rf_rate: .1%}")



--- Sharpe Ratio Calculation ---
Annualized Sharpe Ratio: 0.652

--- Additional Metrics ---
Annualized Return: 13.6%
Annualized Volatility: 17.8%
Risk-free rate used:  2.0%


  returns = test[test['date'] <= pd.to_datetime('2025-05-07')]['return'][1:]


In [51]:
data = yf.download('^GSPC', start='2024-05-06', end='2025-05-06')
sp_500_data = data.xs('^GSPC', axis=1, level=1)
sp_500_data['return'] = sp_500_data['Close'] / sp_500_data['Close'].shift() - 1
returns = sp_500_data['return'][1:]

rf_rate = .04

sharpe = calculate_sharpe_ratio(returns, rf_rate)

print(f"\n--- Sharpe Ratio Calculation ---")
print(f"Annualized Sharpe Ratio: {sharpe:.3f}")


annual_return = returns.mean() * 252
annual_volatility = returns.std() * np.sqrt(252)

print(f"\n--- Additional Metrics ---")
print(f"Annualized Return: {annual_return:.1%}")
print(f"Annualized Volatility: {annual_volatility:.1%}")
print(f"Risk-free rate used: {rf_rate: .1%}")


  data = yf.download('^GSPC', start='2024-05-06', end='2025-05-06')
[*********************100%***********************]  1 of 1 completed


--- Sharpe Ratio Calculation ---
Annualized Sharpe Ratio: 0.342

--- Additional Metrics ---
Annualized Return: 10.6%
Annualized Volatility: 19.4%
Risk-free rate used:  4.0%





In [55]:
results = run_backtest(sp500_tickers)

  return yf.download(*args, **kwargs)

11 Failed downloads:
['PLTR', 'COIN', 'SOLV', 'CEG', 'GEV', 'KVUE', 'GEHC', 'EXE', 'VLTO', 'ABNB', 'DASH']: YFPricesMissingError('possibly delisted; no price data found  (1d 2017-07-05 00:00:00 -> 2020-05-01) (Yahoo error = "Data doesn\'t exist for startDate = 1499227200, endDate = 1588305600")')


Starting Portfolio Value: 100000.00


ZeroDivisionError: float division by zero

In [45]:
len(list(all_stock_data))

50

In [21]:
values_to_remove = ['BRK.B', 'BF.B']
sample_tickers = [x for x in sp500_tickers if x not in values_to_remove]
len(sample_tickers)

501

In [31]:
all_stock_data.xs('AAPL', axis=1, level=1)

Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-07-06,189.908585,190.116510,187.324457,187.958112,45094300
2023-07-07,188.789780,190.760058,188.354154,189.512554,46815000
2023-07-10,186.740311,188.106636,185.185867,187.383861,59922200
2023-07-11,186.215576,187.423484,184.750252,187.284872,46638100
2023-07-12,187.888824,189.799685,186.601708,187.799705,60750200
...,...,...,...,...,...
2025-07-31,207.570007,209.839996,207.160004,208.490005,80698400
2025-08-01,202.380005,213.580002,201.500000,210.869995,104434500
2025-08-04,203.350006,207.880005,201.679993,204.509995,75109300
2025-08-05,202.919998,205.339996,202.160004,203.399994,44155100


In [16]:
# all_stock_data = yf.download(sp500_tickers, start=datetime(2024, 5, 1) - timedelta(days=300), progress=False)
all_stock_data.xs('BRK.B', axis=1, level=1)

  all_stock_data = yf.download(sp500_tickers, start=datetime(2024, 5, 1) - timedelta(days=300), progress=False)

2 Failed downloads:
['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')
['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2023-07-06 00:00:00 -> 2025-08-07)')


Price,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-07-06,,,,,,
2023-07-07,,,,,,
2023-07-10,,,,,,
2023-07-11,,,,,,
2023-07-12,,,,,,
...,...,...,...,...,...,...
2025-07-31,,,,,,
2025-08-01,,,,,,
2025-08-04,,,,,,
2025-08-05,,,,,,


In [None]:
print(f'Sharpe Ratio: {results.analyzers.sharpe.get_analysis()["sharperatio"]:.2f}')
print(f'Total Return: {results.analyzers.returns.get_analysis()["rtot"]:.2%}')
print(f'Max Drawdown: {results.analyzers.drawdown.get_analysis()["max"]["drawdown"]:.2%}')
pd.DataFrame(results.analyzers.portfolio.get_analysis())

  return yf.download(*args, **kwargs)


Starting Portfolio Value: 100000.00
Final Portfolio Value: 100000.00


IndexError: list index out of range

In [None]:
class TechnicalIndicators:
    """
    A class to calculate various technical indicators for stock price data
    """
    
    @staticmethod
    def moving_average(data: pd.Series, window: int) -> pd.Series:
        """Calculate Simple Moving Average"""
        return data.rolling(window=window).mean()
    
    @staticmethod
    def exponential_moving_average(data: pd.Series, window: int) -> pd.Series:
        """Calculate Exponential Moving Average"""
        return data.ewm(span=window).mean()
    
    @staticmethod
    def returns(data: pd.Series, periods: int = 1) -> pd.Series:
        """Calculate returns over specified periods"""
        return data.pct_change(periods=periods)
    
    @staticmethod
    def future_returns(data: pd.Series, periods: int = 1) -> pd.Series:
        """Calculate future returns over specified periods"""
        return (data.shift(periods=periods) - data) / data

    @staticmethod
    def volatility(data: pd.Series, window: int) -> pd.Series:
        """Calculate rolling volatility (standard deviation of returns)"""
        returns = data.pct_change()
        return returns.rolling(window=window).std() * np.sqrt(252)  # Annualized
    
    @staticmethod
    def rsi(data: pd.Series, window: int = 14) -> pd.Series:
        """Calculate Relative Strength Index"""
        delta = data.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
        rs = gain / loss
        rsi = 100 - (100 / (1 + rs))
        return rsi
    
    @staticmethod
    def macd(data: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9) -> Dict[str, pd.Series]:
        """Calculate MACD (Moving Average Convergence Divergence)"""
        ema_fast = data.ewm(span=fast).mean()
        ema_slow = data.ewm(span=slow).mean()
        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal).mean()
        histogram = macd_line - signal_line
        
        return {
            'MACD': macd_line,
            'MACD_Signal': signal_line,
            'MACD_Histogram': histogram
        }
    
    @staticmethod
    def bollinger_bands(data: pd.Series, window: int = 20, num_std: float = 2) -> Dict[str, pd.Series]:
        """Calculate Bollinger Bands"""
        sma = data.rolling(window=window).mean()
        std = data.rolling(window=window).std()
        
        return {
            'BB_Upper': sma + (std * num_std),
            'BB_Middle': sma,
            'BB_Lower': sma - (std * num_std),
            'BB_Width': (sma + (std * num_std)) - (sma - (std * num_std)),
            'BB_Position': (data - sma) / (std * num_std)
        }
    
    @staticmethod
    def average_true_range(high: pd.Series, low: pd.Series, close: pd.Series, window: int = 14) -> pd.Series:
        """Calculate Average True Range"""
        high_low = high - low
        high_close = np.abs(high - close.shift())
        low_close = np.abs(low - close.shift())
        
        true_range = np.maximum(high_low, np.maximum(high_close, low_close))
        atr = true_range.rolling(window=window).mean()
        return atr
    
    @staticmethod
    def stochastic_oscillator(high: pd.Series, low: pd.Series, close: pd.Series, 
                            k_window: int = 14, d_window: int = 3) -> Dict[str, pd.Series]:
        """Calculate Stochastic Oscillator"""
        lowest_low = low.rolling(window=k_window).min()
        highest_high = high.rolling(window=k_window).max()
        
        k_percent = 100 * ((close - lowest_low) / (highest_high - lowest_low))
        d_percent = k_percent.rolling(window=d_window).mean()
        
        return {
            'Stoch_K': k_percent,
            'Stoch_D': d_percent
        }
    
def calculate_features(data, ticker):
    """Calculate features for a given stock data"""
    try:
        # Convert backtrader data to pandas DataFrame
        df = pd.DataFrame({
            'Open': [data.open[i] for i in range(-50, 0)],
            'High': [data.high[i] for i in range(-50, 0)],
            'Low': [data.low[i] for i in range(-50, 0)],
            'Close': [data.close[i] for i in range(-50, 0)],
            'Volume': [data.volume[i] for i in range(-50, 0)]
        })
        
        # Use your feature calculator function
        features = self.feature_calculator(df)
        return features
    except:
        return None


In [None]:



# Example usage function
def run_backtest(model, sp500_tickers, start_date='2020-01-01', end_date='2023-12-31'):
    """
    Run the backtest with your XGBoost model
    """
    cerebro = bt.Cerebro()
    
    # Add strategy
    strategy = cerebro.addstrategy(XGBoostStrategy, top_n=10, rebalance_freq=5)
    
    # Download and add data for S&P 500 stocks (sample)
    sample_tickers = sp500_tickers[:50]  # Use first 50 for demo
    
    for ticker in sample_tickers:
        try:
            data = yf.download(ticker, start=start_date, end=end_date)
            if len(data) > 100:  # Ensure sufficient data
                bt_data = bt.feeds.PandasData(
                    dataname=data,
                    name=ticker
                )
                cerebro.adddata(bt_data)
        except:
            continue
    
    # Set initial cash
    cerebro.broker.setcash(100000.0)
    
    # Set commission
    cerebro.broker.setcommission(commission=0.001)  # 0.1% commission
    
    # Add analyzers
    cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe')
    cerebro.addanalyzer(bt.analyzers.Returns, _name='returns')
    cerebro.addanalyzer(bt.analyzers.DrawDown, _name='drawdown')
    
    print(f'Starting Portfolio Value: {cerebro.broker.getvalue():.2f}')
    
    # Run backtest
    results = cerebro.run()
    
    print(f'Final Portfolio Value: {cerebro.broker.getvalue():.2f}')
    
    # Print results
    strat = results[0]
    print(f'Sharpe Ratio: {strat.analyzers.sharpe.get_analysis()["sharperatio"]:.2f}')
    print(f'Total Return: {strat.analyzers.returns.get_analysis()["rtot"]:.2%}')
    print(f'Max Drawdown: {strat.analyzers.drawdown.get_analysis()["max"]["drawdown"]:.2%}')
    
    # Plot results
    cerebro.plot(style='candlestick')
    
    return results

# Save the code to a file
with open('xgboost_backtrader_strategy.py', 'w') as f:
    f.write('''
# XGBoost + Backtrader Strategy Implementation
# This file contains the complete strategy code

import backtrader as bt
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import yfinance as yf

# [Include all the code above here]

# To use this strategy:
# 1. Train your XGBoost model
# 2. Define your feature calculation function
# 3. Get S&P 500 ticker list
# 4. Run: results = run_backtest(model, sp500_tickers)
''')

print("Strategy code saved to 'xgboost_backtrader_strategy.py'")
print("\nTo use this strategy:")
print("1. Replace calculate_features() with your actual feature engineering")
print("2. Load your trained XGBoost model")
print("3. Get S&P 500 ticker list")
print("4. Call run_backtest(model, sp500_tickers)")

In [None]:
class XGBoostStrategy(bt.Strategy):
    params = (
        ('top_n', 10),  # Number of top stocks to buy
        ('rebalance_freq', 5),  # Rebalance every 5 trading days (1 week)
    )
    
    def __init__(self):
        self.model = None  # Your trained XGBoost model
        self.feature_calculator = None  # Your feature calculation function
        self.rebalance_counter = 0
        self.current_positions = []
        self.sp500_tickers = []  # List of S&P 500 tickers
        
    def set_model(self, model, feature_calculator, sp500_tickers):
        """Set the XGBoost model and feature calculator"""
        self.model = model
        self.feature_calculator = feature_calculator
        self.sp500_tickers = sp500_tickers
    
    def next(self):
        # Rebalance every week (5 trading days)
        if self.rebalance_counter % self.params.rebalance_freq == 0:
            self.rebalance_portfolio()
        
        self.rebalance_counter += 1
    
    def rebalance_portfolio(self):
        """Rebalance portfolio based on XGBoost predictions"""
        if self.model is None:
            return
        
        # Close all current positions
        for data in self.datas:
            if self.getposition(data).size != 0:
                self.close(data=data)
        
        # Get predictions for all stocks
        predictions = self.get_predictions()
        
        if len(predictions) == 0:
            return
        
        # Sort by predicted returns and get top N
        top_stocks = predictions.nlargest(self.params.top_n)
        
        # Calculate position size (equal weight)
        available_cash = self.broker.getcash()
        position_size = available_cash / len(top_stocks)
        
        # Buy top predicted stocks
        for ticker, predicted_return in top_stocks.items():
            data = self.get_data_by_name(ticker)
            if data is not None:
                size = int(position_size / data.close[0])
                if size > 0:
                    self.buy(data=data, size=size)
                    print(f"Buying {size} shares of {ticker} at {data.close[0]:.2f} (predicted return: {predicted_return:.4f})")
    
    def get_predictions(self):
        """Get XGBoost predictions for all stocks"""
        predictions = {}
        
        for ticker in self.sp500_tickers:
            data = self.get_data_by_name(ticker)
            if data is not None and len(data) > 50:  # Ensure enough data for features
                try:
                    # Calculate features for current stock
                    features = self.calculate_features(data, ticker)
                    if features is not None:
                        # Get prediction
                        pred = self.model.predict([features])[0]
                        predictions[ticker] = pred
                except Exception as e:
                    print(f"Error predicting for {ticker}: {e}")
                    continue
        
        return pd.Series(predictions)
    

    def calculate_all_indicators(stock_data: Dict[str, pd.DataFrame], ticker) -> pd.DataFrame:
        """
        Calculate all technical indicators for all stocks
        """
        all_indicators = []
        
        print("Calculating technical indicators...")
        
        for ticker in stock_data.columns.get_level_values(1).unique().tolist():
            try:
                # Basic price data
                close = stock_data.loc[:, ('Close', ticker)]
                high = stock_data.loc[:,('High', ticker)]
                low = stock_data.loc[:, ('Low', ticker)]
                volume = stock_data.loc[:, ('Volume', ticker)]
                
                # Create a DataFrame for this stock's indicators
                indicators_df = pd.DataFrame(index=data.index)
                indicators_df['Ticker'] = ticker
                indicators_df['Close'] = close
                indicators_df['High'] = high
                indicators_df['Low'] = low
                indicators_df['Volume'] = volume
                

                # Future Returns
                for period in [-1, -5, -10, -21, -63]:
                    indicators_df[f'Future_Return_{abs(period)}d'] = TechnicalIndicators.future_returns(close, period)

                # Moving Averages
                for window in [5, 10, 20, 50, 100, 200]:
                    indicators_df[f'SMA_{window}'] = TechnicalIndicators.moving_average(close, window)
                    indicators_df[f'EMA_{window}'] = TechnicalIndicators.exponential_moving_average(close, window)
                
                # Returns
                for period in [1, 5, 10, 21, 63]:
                    indicators_df[f'Return_{period}d'] = TechnicalIndicators.returns(close, period)

                # Volatility
                for window in [10, 21, 63]:
                    indicators_df[f'Volatility_{window}d'] = TechnicalIndicators.volatility(close, window)
                
                # RSI
                indicators_df['RSI_14'] = TechnicalIndicators.rsi(close, 14)
                indicators_df['RSI_21'] = TechnicalIndicators.rsi(close, 21)
                
                # MACD
                macd_data = TechnicalIndicators.macd(close)
                for key, value in macd_data.items():
                    indicators_df[key] = value
                
                # Bollinger Bands
                bb_data = TechnicalIndicators.bollinger_bands(close)
                for key, value in bb_data.items():
                    indicators_df[key] = value
                
                # ATR
                indicators_df['ATR_14'] = TechnicalIndicators.average_true_range(high, low, close, 14)
                
                # Stochastic Oscillator
                stoch_data = TechnicalIndicators.stochastic_oscillator(high, low, close)
                for key, value in stoch_data.items():
                    indicators_df[key] = value
                
                # Volume indicators
                indicators_df['Volume_SMA_20'] = TechnicalIndicators.moving_average(volume, 20)
                indicators_df['Volume_Ratio'] = volume / indicators_df['Volume_SMA_20']
                
                # Price momentum
                indicators_df['Momentum_10'] = close / close.shift(10) - 1
                indicators_df['Momentum_21'] = close / close.shift(21) - 1
                
                # Price position relative to moving averages
                indicators_df['Price_vs_SMA20'] = close / indicators_df['SMA_20'] - 1
                indicators_df['Price_vs_SMA50'] = close / indicators_df['SMA_50'] - 1
                
                all_indicators.append(indicators_df)
                print(f"✓ Calculated indicators for {ticker}")
                
            except Exception as e:
                print(f"✗ Error calculating indicators for {ticker}: {str(e)}")
        
        # Combine all data
        if all_indicators:
            combined_df = pd.concat(all_indicators, ignore_index=False)
            combined_df.reset_index(inplace=True)
            combined_df.rename(columns={'index': 'Date'}, inplace=True)
            
            # Set MultiIndex
            combined_df.set_index(['Date', 'Ticker'], inplace=True)
            
            print(f"\nFinal dataset shape: {combined_df.shape}")
            print(f"Columns: {list(combined_df.columns)}")
            
            return combined_df
        else:
            print("No data to combine!")
            return pd.DataFrame()
    
    def calculate_features(self, data, ticker):
        """Calculate features for a given stock data"""
        try:
            # Convert backtrader data to pandas DataFrame
            df = pd.DataFrame({
                'Open': [data.open[i] for i in range(-50, 0)],
                'High': [data.high[i] for i in range(-50, 0)],
                'Low': [data.low[i] for i in range(-50, 0)],
                'Close': [data.close[i] for i in range(-50, 0)],
                'Volume': [data.volume[i] for i in range(-50, 0)]
            })
            
            # Use your feature calculator function
            features = self.feature_calculator(df)
            return features
        except:
            return None
    
    def get_data_by_name(self, name):
        """Get data feed by ticker name"""
        for data in self.datas:
            if data._name == name:
                return data
        return None

# Example feature calculation function (replace with your actual function)
def calculate_features(df):
    """
    Calculate your 40 features from yfinance data
    Replace this with your actual feature engineering
    """
    features = []
    
    # Example features - replace with your actual features
    # Price-based features
    features.append(df['Close'].pct_change(1).iloc[-1])  # 1-day return
    features.append(df['Close'].pct_change(5).iloc[-1])  # 5-day return
    features.append(df['Close'].pct_change(20).iloc[-1])  # 20-day return
    
    # Moving averages
    features.append(df['Close'].iloc[-1] / df['Close'].rolling(10).mean().iloc[-1] - 1)  # Price vs 10-day MA
    features.append(df['Close'].iloc[-1] / df['Close'].rolling(20).mean().iloc[-1] - 1)  # Price vs 20-day MA
    
    # Volatility
    features.append(df['Close'].pct_change().rolling(10).std().iloc[-1])  # 10-day volatility
    features.append(df['Close'].pct_change().rolling(20).std().iloc[-1])  # 20-day volatility
    
    # Volume features
    features.append(df['Volume'].iloc[-1] / df['Volume'].rolling(10).mean().iloc[-1])  # Volume vs average
    
    # Technical indicators (simplified)
    # RSI approximation
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    features.append(rsi.iloc[-1])
    
    # Add more features to reach 40...
    # For now, pad with dummy features
    while len(features) < 40:
        features.append(np.random.random())  # Replace with actual features
    
    return features[:40]

