In [None]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split



In [None]:
# Fetch historical data
def get_stock_data(ticker, start_date, end_date):
    stock = yf.Ticker(ticker)
    data = stock.history(start=start_date, end=end_date)
    return data

In [None]:
data = get_stock_data("MSFT", "2010-01-01", "2024-11-01")

**Moving Averages:**
Moving Averages smooth price data by calculating the average of the last n
periods.

In [None]:
# Calculate Simple Moving Averages
data['SMA_20'] = data['Close'].rolling(window=20).mean()
data['SMA_50'] = data['Close'].rolling(window=50).mean()

# Display the data with new columns
print(data[['Close', 'SMA_20', 'SMA_50']].tail(10))



                                Close      SMA_20      SMA_50
Date                                                         
2024-10-18 00:00:00-04:00  418.160004  420.879500  419.381454
2024-10-21 00:00:00-04:00  418.779999  420.142999  419.651264
2024-10-22 00:00:00-04:00  427.510010  420.059999  420.079903
2024-10-23 00:00:00-04:00  424.600006  419.684500  420.306600
2024-10-24 00:00:00-04:00  424.730011  419.355501  420.479001
2024-10-25 00:00:00-04:00  428.149994  419.362001  420.621401
2024-10-28 00:00:00-04:00  426.589996  419.176501  420.783801
2024-10-29 00:00:00-04:00  431.950012  419.739502  420.992201
2024-10-30 00:00:00-04:00  432.529999  420.509502  421.146801
2024-10-31 00:00:00-04:00  406.350006  420.000002  420.791001


In [None]:
# EMA Calculation
data['EMA_20'] = data['Close'].ewm(span=20, adjust=False).mean()
print(data[['Close', 'EMA_20']].head())


                               Close     EMA_20
Date                                           
2010-01-04 00:00:00-05:00  23.347315  23.347315
2010-01-05 00:00:00-05:00  23.354860  23.348033
2010-01-06 00:00:00-05:00  23.211535  23.335034
2010-01-07 00:00:00-05:00  22.970144  23.300282
2010-01-08 00:00:00-05:00  23.128551  23.283927


Relative Strength Index (RSI)
RSI measures the strength of recent price movements.

In [None]:
def calculate_rsi(data, window=14):
    delta = data['Close'].diff(1)
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

data['RSI'] = calculate_rsi(data)
print(data[['Close', 'RSI']].tail())


                                Close        RSI
Date                                            
2024-10-25 00:00:00-04:00  428.149994  77.603045
2024-10-28 00:00:00-04:00  426.589996  69.734203
2024-10-29 00:00:00-04:00  431.950012  72.149186
2024-10-30 00:00:00-04:00  432.529999  76.349825
2024-10-31 00:00:00-04:00  406.350006  41.310792


**Bollinger Bands**
Bollinger Bands are constructed using a moving average and standard deviations.




In [None]:
def calculate_bollinger_bands(data, window=20, num_std=2):
    rolling_mean = data['Close'].rolling(window=window).mean()
    rolling_std = data['Close'].rolling(window=window).std()

    data['Bollinger_Upper'] = rolling_mean + (rolling_std * num_std)
    data['Bollinger_Lower'] = rolling_mean - (rolling_std * num_std)

calculate_bollinger_bands(data)
print(data[['Close', 'Bollinger_Upper', 'Bollinger_Lower']].tail())


                                Close  Bollinger_Upper  Bollinger_Lower
Date                                                                   
2024-10-25 00:00:00-04:00  428.149994       429.705796       409.018206
2024-10-28 00:00:00-04:00  426.589996       428.802542       409.550461
2024-10-29 00:00:00-04:00  431.950012       430.928506       408.550497
2024-10-30 00:00:00-04:00  432.529999       432.987693       408.031310
2024-10-31 00:00:00-04:00  406.350006       433.910563       406.089440


**MACD (Moving Average Convergence Divergence)**
MACD is the difference between short-term and long-term EMAs

In [None]:
def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data['Close'].ewm(span=short_window, adjust=False).mean()
    long_ema = data['Close'].ewm(span=long_window, adjust=False).mean()
    data['MACD'] = short_ema - long_ema
    data['Signal_Line'] = data['MACD'].ewm(span=signal_window, adjust=False).mean()

calculate_macd(data)
print(data[['Close', 'MACD', 'Signal_Line']].tail())


                                Close      MACD  Signal_Line
Date                                                        
2024-10-25 00:00:00-04:00  428.149994  0.733632    -0.457823
2024-10-28 00:00:00-04:00  426.589996  1.031302    -0.159998
2024-10-29 00:00:00-04:00  431.950012  1.680347     0.208071
2024-10-30 00:00:00-04:00  432.529999  2.215975     0.609652
2024-10-31 00:00:00-04:00  406.350006  0.521941     0.592109


In [None]:
import matplotlib.pyplot as plt

# Plot the Close price
plt.figure(figsize=(10, 6))
plt.subplot(2, 1, 1)
plt.plot(data['Close'], label='Close Price', color='black')
plt.title('Close Price')
plt.legend()

# Plot the MACD and Signal Line
plt.subplot(2, 1, 2)
plt.plot(data['MACD'], label='MACD Line', color='blue')
plt.plot(data['Signal_Line'], label='Signal Line', color='red')
plt.bar(data.index, data['MACD'] - data['Signal_Line'], label='Histogram', color='gray', alpha=0.5)
plt.title('MACD and Signal Line')
plt.legend()

plt.tight_layout()
plt.show()


<IPython.core.display.Javascript object>

**Average True Range (ATR)**
ATR measures market volatility.

In [None]:
def calculate_atr(data, window=14):
    high_low = data['High'] - data['Low']
    high_close = (data['High'] - data['Close'].shift(1)).abs()
    low_close = (data['Low'] - data['Close'].shift(1)).abs()

    tr = high_low.to_frame('HL').join(high_close.to_frame('HC')).join(low_close.to_frame('LC')).max(axis=1)
    atr = tr.rolling(window=window, min_periods=1).mean()
    return atr

data['ATR'] = calculate_atr(data)
print(data[['High', 'Low', 'Close', 'ATR']].tail())


                                 High         Low       Close       ATR
Date                                                                   
2024-10-25 00:00:00-04:00  432.519989  426.570007  428.149994  6.637852
2024-10-28 00:00:00-04:00  431.940002  426.299988  426.589996  6.505711
2024-10-29 00:00:00-04:00  433.170013  425.799988  431.950012  6.597855
2024-10-30 00:00:00-04:00  438.500000  432.100006  432.529999  6.757854
2024-10-31 00:00:00-04:00  416.160004  406.299988  406.350006  8.354283


**Stochastic Oscillator**

The Stochastic Oscillator compares the closing price to the price range over a certain period.



In [None]:
def calculate_stochastic_oscillator(data, window=14):
    high_max = data['High'].rolling(window=window).max()
    low_min = data['Low'].rolling(window=window).min()
    stochastic = ((data['Close'] - low_min) / (high_max - low_min)) * 100
    return stochastic

data['Stochastic'] = calculate_stochastic_oscillator(data)
print(data[['Close', 'Stochastic']].tail())


                                Close  Stochastic
Date                                             
2024-10-25 00:00:00-04:00  428.149994   82.053390
2024-10-28 00:00:00-04:00  426.589996   73.094380
2024-10-29 00:00:00-04:00  431.950012   94.623177
2024-10-30 00:00:00-04:00  432.529999   78.693777
2024-10-31 00:00:00-04:00  406.350006    0.155336


In [None]:
for lag in range(1, 6):  # Create lagged values for 1 to 5 days
    data[f'Lag_{lag}_Close'] = data['Close'].shift(lag)


On-Balance Volume (OBV):

Measures cumulative buying/selling pressure.
Formula:
Add the volume if the price closes higher than the previous day.
Subtract the volume if it closes lower.


In [None]:
def calculate_obv(data):
    obv = [0]
    for i in range(1, len(data)):
        if data['Close'][i] > data['Close'][i-1]:
            obv.append(obv[-1] + data['Volume'][i])
        elif data['Close'][i] < data['Close'][i-1]:
            obv.append(obv[-1] - data['Volume'][i])
        else:
            obv.append(obv[-1])
    return pd.Series(obv, index=data.index)

data['OBV'] = calculate_obv(data)


  if data['Close'][i] > data['Close'][i-1]:
  obv.append(obv[-1] + data['Volume'][i])
  elif data['Close'][i] < data['Close'][i-1]:
  obv.append(obv[-1] - data['Volume'][i])


In [None]:
data['ADL'] = ((data['Close'] - data['Low']) - (data['High'] - data['Close'])) / \
              (data['High'] - data['Low']) * data['Volume']
data['ADL'] = data['ADL'].cumsum()


In [None]:
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,SMA_20,SMA_50,EMA_20,...,Signal_Line,ATR,Stochastic,Lag_1_Close,Lag_2_Close,Lag_3_Close,Lag_4_Close,Lag_5_Close,OBV,ADL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-10-25 00:00:00-04:00,426.76001,432.519989,426.570007,428.149994,16899100,0.0,0.0,419.362001,420.621401,421.551165,...,-0.457823,6.637852,82.05339,424.730011,424.600006,427.51001,418.779999,418.160004,1530809900,6204649000.0
2024-10-28 00:00:00-04:00,431.660004,431.940002,426.299988,426.589996,14882400,0.0,0.0,419.176501,420.783801,422.031054,...,-0.159998,6.505711,73.09438,428.149994,424.730011,424.600006,427.51001,418.779999,1515927500,6191297000.0
2024-10-29 00:00:00-04:00,428.0,433.170013,425.799988,431.950012,17644100,0.0,0.0,419.739502,420.992201,422.975716,...,0.208071,6.597855,94.623177,426.589996,428.149994,424.730011,424.600006,427.51001,1533571600,6203100000.0
2024-10-30 00:00:00-04:00,437.440002,438.5,432.100006,432.529999,29749100,0.0,0.0,420.509502,421.146801,423.885648,...,0.609652,6.757854,78.693777,431.950012,426.589996,428.149994,424.730011,424.600006,1563320700,6177348000.0
2024-10-31 00:00:00-04:00,415.359985,416.160004,406.299988,406.350006,53971000,0.0,0.0,420.000002,420.791001,422.215587,...,0.592109,8.354283,0.155336,432.529999,431.950012,426.589996,428.149994,424.730011,1509349700,6123925000.0


Model Training and Hyperparameter tuning

In [None]:
features = [
    'SMA_20', 'SMA_50', 'EMA_20', 'RSI', 'MACD', 'Signal_Line',
    'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Stochastic',
    'Lag_1_Close', 'Lag_2_Close', 'Lag_3_Close', 'Lag_4_Close', 'Lag_5_Close',
    'OBV', 'ADL'
]
data['Target'] = (data['Close'].shift(-5) > data['Close']).astype(int)






In [None]:
data = data.dropna()


In [None]:
X = data[features]
y = data['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
print(f"Training size: {len(X_train)}, Testing size: {len(X_test)}")


Training size: 2947, Testing size: 737


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train
rf_model = RandomForestClassifier(n_estimators=100,class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate on the test set
from sklearn.metrics import classification_report, accuracy_score

y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.5345997286295794
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.38      0.43       333
           1       0.56      0.66      0.61       404

    accuracy                           0.53       737
   macro avg       0.52      0.52      0.52       737
weighted avg       0.53      0.53      0.53       737



In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

XGBoost Accuracy: 0.5223880597014925
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.30      0.36       333
           1       0.55      0.71      0.62       404

    accuracy                           0.52       737
   macro avg       0.50      0.50      0.49       737
weighted avg       0.51      0.52      0.50       737



In [None]:
import matplotlib.pyplot as plt

feature_importances = rf_model.feature_importances_
plt.barh(features, feature_importances)
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importance")
plt.show()



In [None]:
from xgboost import plot_importance

plot_importance(xgb_model, importance_type='weight')
plt.title("XGBoost Feature Importance")
plt.show()


<IPython.core.display.Javascript object>

In [None]:
import joblib

joblib.dump(rf_model, 'trading_model.pkl')


['trading_model.pkl']

In [None]:
model = joblib.load('trading_model.pkl')


Backtesting

In [None]:
pip install backtrader




In [None]:
import backtrader as bt

# Define Custom Data Feed
class CustomPandasData(bt.feeds.PandasData):
    lines = ('sma_20', 'sma_50', 'ema_20', 'rsi', 'macd', 'signal_line',
             'bollinger_upper', 'bollinger_lower', 'atr', 'stochastic', 'obv', 'adl')

    params = (
        ('sma_20', 'SMA_20'),
        ('sma_50', 'SMA_50'),
        ('ema_20', 'EMA_20'),
        ('rsi', 'RSI'),
        ('macd', 'MACD'),
        ('signal_line', 'Signal_Line'),
        ('bollinger_upper', 'Bollinger_Upper'),
        ('bollinger_lower', 'Bollinger_Lower'),
        ('atr', 'ATR'),
        ('stochastic', 'Stochastic'),
        ('obv', 'OBV'),
        ('adl', 'ADL'),
    )

# Define Strategy
class MLStrategy(bt.Strategy):
    def __init__(self):
        self.model = xgb_model  # Use the trained XGBoost model
        self.data_close = self.datas[0].close

    def next(self):
        features = [
            self.datas[0].sma_20[0], self.datas[0].sma_50[0], self.datas[0].ema_20[0],
            self.datas[0].rsi[0], self.datas[0].macd[0], self.datas[0].signal_line[0],
            self.datas[0].bollinger_upper[0], self.datas[0].bollinger_lower[0],
            self.datas[0].atr[0], self.datas[0].stochastic[0],
            self.data_close[-1], self.data_close[-2], self.data_close[-3],
            self.data_close[-4], self.data_close[-5],
            self.datas[0].obv[0], self.datas[0].adl[0]
        ]
        prediction = self.model.predict([features])[0]
        if prediction == 1 and not self.position:
            self.buy()
        elif prediction == 0 and self.position:
            self.sell()

# Initialize Backtrader
cerebro = bt.Cerebro()
cerebro.addstrategy(MLStrategy)

# Add Data Feed
data_feed = CustomPandasData(dataname=data)
cerebro.adddata(data_feed)

# Set Portfolio Parameters
cerebro.broker.set_cash(100000)
cerebro.broker.setcommission(commission=0.001)

# Run Backtest
print("Starting Portfolio Value: ${:.2f}".format(cerebro.broker.getvalue()))
cerebro.run()
print("Final Portfolio Value: ${:.2f}".format(cerebro.broker.getvalue()))

# Plot Results
cerebro.plot()


Starting Portfolio Value: $100000.00
Final Portfolio Value: $100701.27


[[<Figure size 640x480 with 16 Axes>]]

In [None]:
total_return = (cerebro.broker.getvalue() - 100000) / 100000
print(f"Total Return: {total_return:.2%}")



Total Return: 0.70%


In [None]:
daily_returns = data['Close'].pct_change()
sharpe_ratio = (daily_returns.mean() / daily_returns.std()) * (252**0.5)
print(f"Sharpe Ratio: {sharpe_ratio}")


Sharpe Ratio: 0.9018640926385995


In [None]:
cumulative_returns = (1 + daily_returns).cumprod()
peak = cumulative_returns.cummax()
drawdown = (cumulative_returns - peak) / peak
max_drawdown = drawdown.min()
print(f"Maximum Drawdown: {max_drawdown:.2%}")


Maximum Drawdown: -37.15%
