In [3]:
import sys
print(sys.path)

['/Users/pranuprakash/Library/Mobile Documents/com~apple~CloudDocs/pranup/Algorithmic Training/Project', '/', '/Users/pranuprakash/.vscode/extensions/ms-toolsai.jupyter-2022.8.1002431955/pythonFiles', '/Users/pranuprakash/.vscode/extensions/ms-toolsai.jupyter-2022.8.1002431955/pythonFiles/lib/python', '/Users/pranuprakash/opt/anaconda3/lib/python39.zip', '/Users/pranuprakash/opt/anaconda3/lib/python3.9', '/Users/pranuprakash/opt/anaconda3/lib/python3.9/lib-dynload', '', '/Users/pranuprakash/opt/anaconda3/lib/python3.9/site-packages', '/Users/pranuprakash/opt/anaconda3/lib/python3.9/site-packages/aeosa', '/Users/pranuprakash/opt/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/Users/pranuprakash/.ipython']


In [2]:
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import backtrader as bt
import quantstats as qs
import pyfolio as pf
import json
from sklearn.ensemble import RandomForestClassifier
import os
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')


In [36]:
import numpy as np

class Stock:
    def __init__(self, data):
        self.data = data
        self.label = []
    
    #preprocess
    #feature creation
    def factor(self):
        del self.data["Close"]
        self.data = self.data.fillna(method = "bfill")
        
        #return
        #>0,+1;<=0,-1
        self.data['label']=self.data.rolling(2).apply(lambda x:x.iloc[1]>x.iloc[0])['Adj Close']
        self.data['label']=self.data.label.shift(-1)
        
        self.data['volume'] = self.data['Volume']
        self.data['volume_change'] = self.data['Volume'].diff()

        # Volume Change (daily percentage change)
        self.data['volume_pct_change'] = self.data['Volume'].pct_change()

        # Volume Moving Averages
        self.data['vol_ma_5'] = self.data['Volume'].rolling(window=5).mean()
        self.data['vol_ma_10'] = self.data['Volume'].rolling(window=10).mean()
        self.data['vol_ma_20'] = self.data['Volume'].rolling(window=20).mean()  
        self.data['vol_ma_50'] = self.data['Volume'].rolling(window=50).mean()  
        self.data['vol_ma_200'] = self.data['Volume'].rolling(window=200).mean()  

        #moving average
        self.data["ma_5"] = self.data["Adj Close"].rolling(window = 5).mean()
        self.data["ma_10"] = self.data["Adj Close"].rolling(window = 10).mean()
        self.data["ma_20"] = self.data["Adj Close"].rolling(window = 20).mean()
        self.data["ma_50"] = self.data["Adj Close"].rolling(window = 50).mean()
        self.data["ma_200"] = self.data["Adj Close"].rolling(window = 200).mean()

        self.add_volume_oscillator()
        self.add_relative_volume()
        self.add_volume_relative_to_ma()
        self.add_volume_spikes()
        self.add_price_volume_trend()
        #self.add_vix_feature()
        self.add_spy_vix_ratio_feature()
        self.add_spy_iwm_ratio_feature()
        #self.add_spy_qqq_ratio_feature()
        #self.add_spy_dia_ratio_feature()
        #macd
        ema_short = self.data['Adj Close'].ewm(span=12, adjust=False, min_periods=12).mean()
        ema_long = self.data['Adj Close'].ewm(span=26, adjust=False, min_periods=26).mean()
        macd = ema_short - ema_long
        macd_s = macd.ewm(span=9, adjust=False, min_periods=9).mean()
        macd_h = macd - macd_s
        self.data['macd'] = macd
        self.data['macd_h'] = macd_h
        self.data['macd_s'] = macd_s
        
        self.data = self.data.dropna(how = "any")
        self.label = list(self.data["label"])
        del self.data["label"]
        del self.data["Adj Close"]
               
    def add_volume_oscillator(self):
        """Add Volume Oscillator feature."""
        short_term = 5
        long_term = 10
        self.data['vol_ma_short'] = self.data['Volume'].rolling(window=short_term).mean()
        self.data['vol_ma_long'] = self.data['Volume'].rolling(window=long_term).mean()
        self.data['volume_oscillator'] = self.data['vol_ma_short'] - self.data['vol_ma_long']

    def add_relative_volume(self, comparison_period=20):
        """Add Relative Volume feature."""
        # Calculate the average volume over the specified comparison period
        self.data['avg_volume'] = self.data['Volume'].rolling(window=comparison_period).mean()

        # Calculate Relative Volume
        self.data['relative_volume'] = self.data['Volume'] / self.data['avg_volume']

    def add_volume_relative_to_ma(self, period=50):
        """Add Volume Relative to Moving Average."""
        self.data['vol_relative_to_ma'] = self.data['Volume'] / self.data['Volume'].rolling(window=period).mean()

    def add_volume_spikes(self, threshold=2):
        """Add Volume Spikes."""
        self.data['vol_spike'] = self.data['Volume'] > self.data['Volume'].rolling(window=50).mean() * threshold

    def add_price_volume_trend(self):
        """Add Price-Volume Trend."""
        self.data['pvt'] = (self.data['Volume'] * self.data['Adj Close'].diff()).cumsum()

    def download_vix(self):
        """Download VIX data for the same date range as the stock data."""
        start_date = self.data.index.min().strftime('%Y-%m-%d')
        end_date = self.data.index.max().strftime('%Y-%m-%d')
        self.vix_data = yf.download("^VIX", start=start_date, end=end_date)['Close']

    def add_vix_feature(self):
        """Add VIX as a feature."""
        self.download_vix()  # Download VIX data
        # Merge VIX data into the stock data
        self.data = self.data.merge(self.vix_data, how='left', left_index=True, right_index=True, suffixes=('', '_VIX'))
        self.data.rename(columns={'Close_VIX': 'VIX'}, inplace=True)

    def download_spy(self):
        """Download SPY data for the same date range as the stock data."""
        start_date = self.data.index.min().strftime('%Y-%m-%d')
        end_date = self.data.index.max().strftime('%Y-%m-%d')
        self.spy_data = yf.download("SPY", start=start_date, end=end_date)['Adj Close']

    def add_spy_vix_ratio_feature(self):
        """Add feature of SPY price change / VIX price change."""
        self.download_spy()  # Download SPY data
        self.download_vix()  # Download VIX data

        # Calculate daily percentage change for SPY and VIX
        spy_pct_change = self.spy_data.pct_change()
        vix_pct_change = self.vix_data.pct_change()
        
        # Calculate the ratio of SPY change to VIX change
        self.data['SPY_VIX_ratio'] = spy_pct_change/ vix_pct_change
        self.data['SPY_VIX_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
        self.data['SPY_VIX_ratio'] = self.data['SPY_VIX_ratio'].fillna(method = "bfill")

    def download_etf_data(self, ticker, column_name):
        """Download ETF data for the same date range as the stock data."""
        start_date = self.data.index.min().strftime('%Y-%m-%d')
        end_date = self.data.index.max().strftime('%Y-%m-%d')
        etf_data = yf.download(ticker, start=start_date, end=end_date)['Adj Close']
        etf_pct_change = etf_data.pct_change()
        self.data[column_name] = etf_pct_change

    def add_spy_iwm_ratio_feature(self):
        """Add feature of SPY price change / IWM price change."""
        self.download_etf_data("SPY", "SPY_pct_change")
        self.download_etf_data("IWM", "IWM_pct_change")

        # Calculate the ratio of SPY change to IWM change
        self.data['SPY_IWM_ratio'] = self.data['SPY_pct_change'] / self.data['IWM_pct_change']
        self.data['SPY_IWM_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
        self.data['SPY_IWM_ratio'] = self.data['SPY_VIX_ratio'].fillna(method = "bfill")
    def add_spy_qqq_ratio_feature(self):
        """Add feature of SPY price change / QQQ price change."""
        self.download_etf_data("SPY", "SPY_pct_change")
        self.download_etf_data("QQQ", "QQQ_pct_change")
        self.data['SPY_QQQ_ratio'] = self.data['SPY_pct_change'] / self.data['QQQ_pct_change']
        self.data['SPY_QQQ_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
        self.data['SPY_QQQ_ratio'] = self.data['SPY_VIX_ratio'].fillna(method = "bfill")
    def add_spy_dia_ratio_feature(self):
        """Add feature of SPY price change / DIA price change."""
        self.download_etf_data("SPY", "SPY_pct_change")
        self.download_etf_data("DIA", "DIA_pct_change")
        self.data['SPY_DIA_ratio'] = self.data['SPY_pct_change'] / self.data['DIA_pct_change']
        self.data['SPY_DIA_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
        self.data['SPY_DIA_ratio'] = self.data['SPY_VIX_ratio'].fillna(method = "bfill")

    #standardize data
    
    # def standardize(self):
    #     scaler = StandardScaler()      
    #     self.data = scaler.fit_transform(self.data)

    def standardize(self):
        # Replace infinite values with NaN
        self.data.replace([np.inf, -np.inf], np.nan, inplace=True)

        # Optionally, fill NaN values with column means or another strategy
        self.data.fillna(self.data.mean(), inplace=True)

        # Now standardize the data
        scaler = StandardScaler()
        self.data = scaler.fit_transform(self.data)
    
    #normalize data
    def normalize(self):
        scaler = MinMaxScaler()      
        self.data = scaler.fit_transform(self.data)
    


In [37]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
import xgboost as xgb

# Assuming Stock class and list1 are defined as before

# Lists to store combined features and labels for all stocks
all_features_xgb = []
all_labels_xgb = []

# Download and preprocess data for each ticker in the list
yf_data = yf.download("SPY", start="2000-01-01", end="2021-11-12")
stock = Stock(yf_data)
stock.factor()        # Feature creation and preprocessing
stock.standardize()   # Standardizing the data
stock.normalize()     # Normalizing the data

# Append the features and labels to the respective lists
all_features_xgb.append(pd.DataFrame(stock.data))
all_labels_xgb.extend(stock.label)

X = pd.concat(all_features_xgb, ignore_index=True)
y = pd.Series(all_labels_xgb)

# Set a fixed random state for reproducibility
fixed_random_state = 42

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=fixed_random_state,shuffle=False)

# Initialize an XGBoost classifier with adjusted parameters
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=6,
    n_estimators=200,
    learning_rate=0.05,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=fixed_random_state
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
pred_y = xgb_model.predict(X_test)

# Calculating accuracy and precision
accuracy_xgb = accuracy_score(y_test, pred_y)
precision_xgb = precision_score(y_test, pred_y)
print("accuracy_xgb: ", accuracy_xgb)
print("precision_xgb: ", precision_xgb)
cm = confusion_matrix(y_test, pred_y)
print("Confusion Matrix:\n", cm)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
accuracy_xgb:  0.5266383781235267
precision_xgb:  0.5561005518087063
Confusion Matrix:
 [[210 724]
 [280 907]]


In [38]:
import yfinance as yf
import pandas as pd
from sklearn.model_selection import cross_validate, TimeSeriesSplit
from sklearn.metrics import make_scorer
import xgboost as xgb

# Assuming the Stock class is defined as before

# Download and preprocess data
yf_data = yf.download("SPY", start="2000-01-01", end="2021-11-12")
stock = Stock(yf_data)
stock.factor()        # Feature creation and preprocessing
stock.standardize()   # Standardizing the data
stock.normalize()     # Normalizing the data

# Create feature matrix X and target vector y
X = pd.DataFrame(stock.data)
y = pd.Series(stock.label)

# Initialize an XGBoost classifier with adjusted parameters
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=6,
    n_estimators=200,
    learning_rate=0.05,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=fixed_random_state
)

# Train the model
xgb_model.fit(X_train, y_train)

# Define the number of folds for cross-validation and scoring metrics
num_folds = 5
scoring_metrics = {'accuracy': make_scorer(accuracy_score), 
                   'precision': make_scorer(precision_score)}

# Use TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=num_folds)

# Perform cross-validation
cv_results = cross_validate(xgb_model, X, y, cv=tscv, scoring=scoring_metrics, return_train_score=True)

# Output the results
print(f"Cross-Validation Accuracy Scores: {cv_results['test_accuracy']}")
print(f"Mean CV Accuracy: {cv_results['test_accuracy'].mean()}")
print(f"Cross-Validation Precision Scores: {cv_results['test_precision']}")
print(f"Mean CV Precision: {cv_results['test_precision'].mean()}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
Cross-Validation Accuracy Scores: [0.53793884 0.53227633 0.45526614 0.5198188  0.53907135]
Mean CV Accuracy: 0.5168742921857304
Cross-Validation Precision Scores: [0.56825397 0.55379747 0.54491018 0.55936073 0.58277027]
Mean CV Precision: 0.561818523422599


In [39]:
import yfinance as yf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier

# Ensure this file exists in your directory
list1 = {"SPY"}
# Assuming Stock class and list1 are defined as before

# Lists to store combined features and labels for all stocks
all_features_rf = []
all_labels_rf = []

for tic in list1:
    # Download and preprocess data for each ticker in the list
    yf_data = yf.download(tic, start="2000-01-01", end="2021-11-12")
    
    # Skip if data is empty
    if yf_data.empty:
        continue

    stock = Stock(yf_data)
    stock.factor()        # Feature creation and preprocessing
    stock.standardize()   # Standardizing the data
    stock.normalize() 

    # Append the features and labels to the respective lists
    all_features_rf.append(pd.DataFrame(stock.data))
    all_labels_rf.extend(stock.label)

# Concatenating all features and labels into single datasets
X = pd.concat(all_features_rf)
y = pd.Series(all_labels_rf)

# Set a fixed random state for reproducibility
fixed_random_state = 42

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=fixed_random_state,shuffle=False)

# Initialize a Random Forest classifier with adjusted parameters
rf_model = RandomForestClassifier(
    n_estimators=150,    # Increased number of trees
    max_depth=10,        # Maximum depth of each tree
    min_samples_split=4, # Minimum number of samples required to split an internal node
    min_samples_leaf=2,  # Minimum number of samples required to be at a leaf node
    random_state=fixed_random_state  # Random state for reproducibility
)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
pred_y_rf = rf_model.predict(X_test)

# Calculating accuracy and precision for the combined dataset
accuracy_rf = accuracy_score(y_test, pred_y_rf)
precision_rf = precision_score(y_test, pred_y_rf)
print("accuracy_rf: ", accuracy_rf)
print("precision_rf: ", precision_rf)


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
accuracy_rf:  0.4926921263554927
precision_rf:  0.5459817729908865


In [40]:
import yfinance as yf
import pandas as pd
from sklearn.model_selection import cross_validate, TimeSeriesSplit
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier

# Ensure this file exists in your directory
list1 = {"SPY"}
# Assuming Stock class and list1 are defined as before

# Lists to store combined features and labels for all stocks
all_features_rf = []
all_labels_rf = []

for tic in list1:
    # Download and preprocess data for each ticker in the list
    yf_data = yf.download(tic, start="2000-01-01", end="2021-11-12")
    
    # Skip if data is empty
    if yf_data.empty:
        continue

    stock = Stock(yf_data)
    stock.factor()        # Feature creation and preprocessing
    stock.standardize()   # Standardizing the data
    stock.normalize()     # Normalizing the data

    # Append the features and labels to the respective lists
    all_features_rf.append(pd.DataFrame(stock.data))
    all_labels_rf.extend(stock.label)

# Concatenating all features and labels into single datasets
X = pd.concat(all_features_rf)
y = pd.Series(all_labels_rf)

# Initialize a Random Forest classifier with adjusted parameters
rf_model = RandomForestClassifier(
    n_estimators=150,    # Increased number of trees
    max_depth=10,        # Maximum depth of each tree
    min_samples_split=4, # Minimum number of samples required to split an internal node
    min_samples_leaf=2,  # Minimum number of samples required to be at a leaf node
    random_state=42      # Random state for reproducibility
)

# Fitting the rf_model to x_training and y_training data
rf_model.fit(X_train, y_train)

# Define the number of folds for cross-validation and scoring metrics
num_folds = 5
scoring_metrics = {'accuracy': make_scorer(accuracy_score), 
                   'precision': make_scorer(precision_score)}

# Use TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=num_folds)

# Perform cross-validation
cv_results = cross_validate(rf_model, X, y, cv=tscv, scoring=scoring_metrics, return_train_score=True)

# Output the results
print(f"Cross-Validation Accuracy Scores: {cv_results['test_accuracy']}")
print(f"Mean CV Accuracy: {cv_results['test_accuracy'].mean()}")
print(f"Cross-Validation Precision Scores: {cv_results['test_precision']}")
print(f"Mean CV Precision: {cv_results['test_precision'].mean()}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
Cross-Validation Accuracy Scores: [0.47678369 0.53907135 0.46092865 0.49490374 0.4801812 ]
Mean CV Accuracy: 0.49037372593431494
Cross-Validation Precision Scores: [0.56299213 0.55409357 0.54716981 0.61029412 0.58273381]
Mean CV Precision: 0.5714566870306336


In [41]:
ticker = pd.read_csv("tickers.csv")  # Ensure this file exists in your directory
list1 = list(ticker["Ticker"])
# Custom data loading class for backtrader
class PredictionsData(bt.feeds.PandasData):
    lines = ('predictions',)
    params = (('predictions', -1),)

def get_stock_predictions(ticker, model):
    yf_data = yf.download(ticker, start="2000-01-01", end="2021-11-12")
    if yf_data.empty:
        return None

    # Preprocess the data
    stock = Stock(yf_data.copy())
    stock.factor()
    stock.standardize()
    stock.normalize()

    # Splitting the processed data
    X = pd.DataFrame(stock.data)
    _, X_test = train_test_split(X, test_size=0.4, random_state=42)

    # Generate predictions for the test set
    test_predictions = model.predict(X_test)

    # Store predictions in a DataFrame with the same index as the original data
    predictions_series = pd.Series(index=yf_data.index)
    predictions_series[X_test.index] = test_predictions
    yf_data['predictions'] = predictions_series

    return yf_data

# Backtrader strategy class
class RFStrategy(bt.Strategy):
    def __init__(self):
        self.predicted = self.datas[0].predictions

    def next(self):
        if not self.position:
            if self.predicted[0] == 1 and self.broker.get_cash() > 100:
                self.buy()
        elif self.predicted[0] == 0 and self.getposition().size > 0:
            self.sell()

In [42]:
import pandas as pd

evaluation_metrics = {
    "rf_model": {
        "Accuracy": accuracy_rf, 
        "Precision": precision_rf
    },
    "xgb_model": {
        "Accuracy": accuracy_xgb, 
        "Precision": precision_xgb
    }
}

models = [rf_model, xgb_model]
model_names = ['rf_model', 'xgb_model']
top_tickers = {}  # Dictionary to store top two tickers for each model
ticker_data = pd.read_csv("tickers.csv")  # Ensure this file exists
list1 = list(ticker_data["Ticker"])

for model, model_name in zip(models, model_names):
    final_values = {}
    stock_predictions = {}
    for ticker in list1:
        processed_data = get_stock_predictions(ticker, model)
        if processed_data is not None:
            stock_predictions[ticker] = processed_data

    # Running backtest and storing final portfolio values
    for ticker, data in stock_predictions.items():
        cerebro = bt.Cerebro()
        cerebro.addstrategy(RFStrategy)
        data_feed = PredictionsData(dataname=data)
        cerebro.adddata(data_feed)
        cerebro.broker.set_cash(10000)
        cerebro.broker.setcommission(commission=0.001)
        cerebro.run()
        final_val = cerebro.broker.getvalue()
        final_values[ticker] = final_val

    # Identify top two tickers
    top_two = sorted(final_values, key=final_values.get, reverse=True)[:2]
    top_tickers[model_name] = top_two

combined_output_df = pd.DataFrame({
    "Model": ['Random Forest', 'XGBoost'],
    "Top_Ticker_1": [top_tickers[model][0] for model in model_names],
    "Top_Ticker_2": [top_tickers[model][1] for model in model_names],
    "Accuracy_1": [evaluation_metrics[model]["Accuracy"] for model in model_names],
    "Precision_1": [evaluation_metrics[model]["Precision"] for model in model_names],
    "Accuracy_2": [evaluation_metrics[model]["Accuracy"] for model in model_names],
    "Precision_2": [evaluation_metrics[model]["Precision"] for model in model_names]
})

# # Export to CSV
combined_output_df.to_csv('small_universe_results.csv', index=False)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [43]:
def generate_report_for_ticker(ticker, model, strategy_class, stock_predictions):
    cerebro = bt.Cerebro()
    cerebro.addstrategy(strategy_class)
    data_feed = PredictionsData(dataname=stock_predictions[ticker])
    cerebro.adddata(data_feed)
    cerebro.broker.set_cash(10000)
    cerebro.broker.setcommission(commission=0.001)
    cerebro.addanalyzer(bt.analyzers.TimeReturn, _name='time_return')
    strat = cerebro.run()
    daily_returns = strat[0].analyzers.time_return.get_analysis()
    returns_series = pd.Series(daily_returns)
    returns_series.index = pd.to_datetime(returns_series.index)

    # Generate and save reports
    qs.reports.html(returns_series, output=f'quantstats_{ticker}_{model}.html')

# Generate reports for top tickers of each model
for model_name, tickers in top_tickers.items():
    for ticker in tickers:
        generate_report_for_ticker(ticker, model_name, RFStrategy, stock_predictions)

In [44]:
def process_stock_data(data):
    if data.empty:
        return None, None

    stock = Stock(data)
    stock.factor()  # Feature creation and preprocessing
    stock.standardize()  # Standardizing the data

    # Check if data is still non-empty after standardization
    if stock.data.size == 0:  # Use .size for NumPy arrays
        return None, None

    stock.normalize()  # Normalizing the data

    X = pd.DataFrame(stock.data)
    y = pd.Series(stock.label)

    return X, y

In [45]:
def fetch_data(ticker):
    try:
        data = yf.download(ticker,start="2000-01-01", end="2021-11-12")
        if data.empty:
            return None
        else:
            return data
    except Exception as e:
        return None

In [46]:
ticker_df = pd.read_csv("tickers_nasd.csv")  
tickers = list(ticker_df["Symbol"])
results_rf = {}
results_xgb = {}

# Initialize dictionaries to store results
accuracy_results_rf = {}
accuracy_results_xgb = {}

# Initialize a dictionary to store combined accuracies
combined_accuracy_results = {}

for ticker in tickers:
    fetched_data = fetch_data(ticker)
    if fetched_data is None or fetched_data.empty:
        continue

    X, y = process_stock_data(fetched_data)
    if X is None or y is None or X.empty or len(y) < 2:
        continue

    # Ensure there's enough data to split
    if len(X) > 1:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

        # Random Forest Model
        rf_model.fit(X_train, y_train)
        accuracy_rf = accuracy_score(y_test, rf_model.predict(X_test))
        accuracy_results_rf[ticker] = accuracy_rf  # Store RF accuracy

        # XGBoost Model
        xgb_model.fit(X_train, y_train)
        accuracy_xgb = accuracy_score(y_test, xgb_model.predict(X_test))
        accuracy_results_xgb[ticker] = accuracy_xgb  # Store XGB accuracy

        # Combine accuracies (here, taking the average)
        combined_accuracy = (accuracy_rf + accuracy_xgb) / 2
        combined_accuracy_results[ticker] = combined_accuracy



# Sort and select top 10 tickers based on combined model accuracies
top_10_tickers_combined = sorted(combined_accuracy_results, key=combined_accuracy_results.get, reverse=True)[:10]

[*********************100%%**********************]  1 of 1 completed


1 Failed download:
['PIH']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


1 Failed download:
['FCCY']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['JOBS']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


1 Failed download:
['AVHI']: Exception("%ticker%: Data doesn't exist for startDate = 946702800, endDate = 1636693200")



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['ABEOW']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed


1 Failed download:
['ABIL']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


ValueError: Found array with 0 sample(s) (shape=(0, 32)) while a minimum of 1 is required by StandardScaler.

In [None]:
# Sort and select top 10 tickers based on combined model accuracies
top_10_tickers_combined = sorted(combined_accuracy_results, key=combined_accuracy_results.get, reverse=True)[:10]

In [None]:
# Create a DataFrame to display the tickers along with accuracies from both models
top_10_df = pd.DataFrame({
    'Ticker': top_10_tickers_combined,
    'Combined_Accuracy': [combined_accuracy_results.get(ticker, None) for ticker in top_10_tickers_combined],
    'RF_Accuracy': [accuracy_results_rf.get(ticker, None) for ticker in top_10_tickers_combined],
    'XGB_Accuracy': [accuracy_results_xgb.get(ticker, None) for ticker in top_10_tickers_combined]
})


# Convert the DataFrame to a CSV string and write to a file
csv_data = top_10_df.to_csv(index=False)
with open('top_10_combined_accuracy.csv', 'w') as file:
    file.write(csv_data)

In [None]:
stock_predictions = {}
for ticker in top_10_tickers_combined:
    predictions = get_stock_predictions(ticker, rf_model)  # Replace 'your_model' with the actual model
    if predictions is not None:
        stock_predictions[ticker] = predictions


In [None]:
for ticker in top_10_tickers_combined:
    cerebro = bt.Cerebro()
    cerebro.addstrategy(RFStrategy)
    data_feed = PredictionsData(dataname=stock_predictions[ticker])
    cerebro.adddata(data_feed)
    cerebro.broker.set_cash(10000)
    cerebro.broker.setcommission(commission=0.001)
    cerebro.addanalyzer(bt.analyzers.TimeReturn, _name='time_return')
    strat = cerebro.run()
    daily_returns = strat[0].analyzers.time_return.get_analysis()
    returns_series = pd.Series(daily_returns)
    returns_series.index = pd.to_datetime(returns_series.index)

    # Generate and save reports
    qs.reports.html(returns_series, output=f'quantstats_{ticker}.')