In [2]:
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import catboost as cb
import re

In [3]:
eth_ohlc = pd.read_csv('../data/transformed/ohlc/ethereum.csv')
eth_ohlc

Unnamed: 0,date,open,high,low,close,volume,volume_eth,market_cap,tmw_avg_high_close,tmw_percent_increase,...,october,november,december,monday,tuesday,wednesday,thursday,friday,saturday,sunday
0,2015-11-15,0.8912,0.9215,0.8750,0.9064,4.118000e+05,458365.0,6.720000e+07,0.93685,0.032503,...,0,1,0,0,0,0,0,0,0,1
1,2015-11-16,0.9062,0.9447,0.8920,0.9290,6.209000e+05,676442.0,6.860000e+07,1.02000,0.089216,...,0,1,0,1,0,0,0,0,0,0
2,2015-11-17,0.9249,1.0300,0.9058,1.0100,1.100000e+06,1183690.0,7.220000e+07,1.00000,-0.010000,...,0,1,0,0,1,0,0,0,0,0
3,2015-11-18,0.9900,1.0100,0.9405,0.9900,6.811000e+05,691994.0,7.360000e+07,0.98275,-0.007377,...,0,1,0,0,0,1,0,0,0,0
4,2015-11-19,0.9887,1.0100,0.9375,0.9555,4.435000e+05,455866.0,7.280000e+07,0.94030,-0.016165,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3057,2024-04-17,3083.0000,3119.0000,2926.0000,2984.0000,1.550000e+10,5093979.0,3.722000e+11,3075.50000,0.029751,...,0,0,0,0,0,1,0,0,0,0
3058,2024-04-18,2986.0000,3087.0000,2960.0000,3064.0000,1.350000e+10,4453302.0,3.706000e+11,3101.00000,0.011932,...,0,0,0,0,0,0,1,0,0,0
3059,2024-04-19,3061.0000,3117.0000,2879.0000,3085.0000,1.740000e+10,5675824.0,3.746000e+11,3160.50000,0.023889,...,0,0,0,0,0,0,0,1,0,0
3060,2024-04-20,3085.0000,3166.0000,3025.0000,3155.0000,8.900000e+09,2884086.0,3.776000e+11,3168.50000,0.004261,...,0,0,0,0,0,0,0,0,1,0


In [4]:
column_names = eth_ohlc.columns.tolist()
print(column_names)

['date', 'open', 'high', 'low', 'close', 'volume', 'volume_eth', 'market_cap', 'tmw_avg_high_close', 'tmw_percent_increase', 'tmw_1_0_percent_increase_binary', 'tmw_1_25_percent_increase_binary', 'tmw_1_5_percent_increase_binary', 'tmw_1_75_percent_increase_binary', 'tmw_2_0_percent_increase_binary', 'tmw_2_25_percent_increase_binary', 'tmw_2_5_percent_increase_binary', 'tmw_2_75_percent_increase_binary', 'tmw_3_0_percent_increase_binary', 'tmw_positive_percent_increase_binary', 'ema_2', 'rsi_2', 'sma_2', 'last_2_day_1_0_percent_increase_count', 'last_2_day_1_25_percent_increase_count', 'last_2_day_1_5_percent_increase_count', 'last_2_day_1_75_percent_increase_count', 'last_2_day_2_0_percent_increase_count', 'last_2_day_2_25_percent_increase_count', 'last_2_day_2_5_percent_increase_count', 'last_2_day_2_75_percent_increase_count', 'last_2_day_3_0_percent_increase_count', 'ema_5', 'rsi_5', 'sma_5', 'last_5_day_1_0_percent_increase_count', 'last_5_day_1_25_percent_increase_count', 'last_

## Make 10 random forest models for each of the 10 response variables. Do an analysis of precision scores (Positive Predicted Value, PPV) across thresholds.

### This is the backtesting function. Its like cross-validation, but for time series data. For a particular model, it returns a dataframe consisting of 'target' (what the actual value of the respone variable was) and 'probability', the predicted value of our model which signifies the proabality of a response of 1.

In [5]:
def get_backtested_predictions_df(data, predictors, model, response_var, start=1000, step=100):
    
    def predict(train, test, predictors, model):
        model.fit(train[predictors], train[response_var])
        probability = model.predict_proba(test[predictors])[:,1]
        proba_series = pd.Series(probability, index=test.index, name="probability")
        combined = pd.concat([test[response_var], test['tmw_percent_increase'], proba_series], axis=1)
        return combined

    all_predictions = []
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+100)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)

    data_with_predictions = pd.concat(all_predictions)
    
    return data_with_predictions

In [6]:
predictors = []
periods = [2, 5, 10, 25, 50, 100]
percent_increase_counts = ['1_0', '1_25', '1_5', '1_75', '2_0', '2_25', '2_5', '2_75', '3_0']
for period in periods:
    for indicator in ['ema', 'rsi', 'sma']:
        predictors.append(f'{indicator}_{period}')
    for percent in percent_increase_counts:
        predictors.append(f'last_{period}_day_{percent}_percent_increase_count')
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
predictors.extend(months)
weekdays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
predictors.extend(weekdays)

model = RandomForestClassifier(n_estimators=250, min_samples_split=200, random_state=1)

### Get backtested predictions for each of the 9 response variables we made, plus the 10th control response variable of > 0% increase tomorrow

In [8]:
list_of_dfs = []
intervals = [1 + i * 0.25 for i in range(int((3 - 1) / 0.25) + 1)] # 1-3, by .25
for i in intervals:
    df = get_backtested_predictions_df(eth_ohlc, predictors, model, f"tmw_{str(i).replace('.', '_')}_percent_increase_binary")
    list_of_dfs.append(df)

In [None]:
# the 10th response varaible of whehter tomorrows return is over 0.0%
list_of_dfs.append(get_backtested_predictions_df(eth_ohlc, predictors, model, f"tmw_positive_percent_increase_binary"))

In [37]:
list_of_dfs[2]

Unnamed: 0,tmw_1_5_percent_increase_binary,tmw_percent_increase,probability,pred_50.0,pred_50.25,pred_50.5,pred_50.75,pred_51.0,pred_51.25,pred_51.5,...,pred_67.75,pred_68.0,pred_68.25,pred_68.5,pred_68.75,pred_69.0,pred_69.25,pred_69.5,pred_69.75,pred_70.0
1000,0,0.006079,0.548271,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1001,0,-0.047685,0.517924,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1002,0,-0.013621,0.640777,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1003,1,0.047939,0.595422,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1004,1,0.037185,0.512278,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3052,0,-0.028227,0.507019,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3053,1,0.046368,0.569980,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
3054,0,0.010047,0.375886,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3055,0,0.000161,0.394011,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Plot PPV's (positive predictive value, precision score) for each of the 9 response variables along an interval

In [45]:
def plot_ppv_by_threshold(list_of_dfs, use_over_0_percent_target, min_true_count, threshold_start, threshold_end, threshold_step):
    # Initialize a Plotly figure
    fig = go.Figure()

    # Loop through each DataFrame and interval
    for df in list_of_dfs:
        filtered_thresholds = []
        filtered_ppvs = []
        filtered_hover_texts = []  # List to store hover text information only for true_count >= min_true_count

        # Use regular expressions to extract the number or word describing the interval
        first_column_name = df.columns[0]
        match = re.search(r'tmw_([\d_]+|\w+)_percent_increase_binary', first_column_name)
        if match:
            interval = match.group(1).replace('_', '.')  # Replace underscores with dots for floating point values
            if interval == 'positive':
                interval_label = 'Positive'
            else:
                interval_label = f'{interval}'
                
        thresholds = [x * threshold_step for x in range(int(threshold_start / threshold_step), int(threshold_end / threshold_step) + 1)]
        for threshold in thresholds:
            col_name = f'pred_{threshold}'
            df[col_name] = df['probability'] >= (threshold / 100)
            
            # Calculate the confusion matrix
            # response_col_name = f"tmw_{str(interval).replace('.', '_')}_percent_increase_binary"
            if use_over_0_percent_target:
                cm = confusion_matrix(df['tmw_percent_increase_binary'], df[col_name])
            else:
                response_col_name = df.columns[0]
                cm = confusion_matrix(df[response_col_name], df[col_name])

            tn, fp, fn, tp = cm.ravel()
            ppv = tp / (tp + fp) if (tp + fp) > 0 else 0

            # Calculate value counts for hover information
            value_counts = df[col_name].value_counts().to_dict()
            false_count = value_counts.get(False, 0)
            true_count = value_counts.get(True, 0)
            
            # Only include data points where true_count >= min_true_count
            if true_count >= min_true_count:
                filtered_thresholds.append(threshold)
                filtered_ppvs.append(ppv)
                hover_text = f"{interval}% Return Tomorrow<br>Threshold: {threshold}<br>PPV: {ppv:.4f}<br>0: {false_count}<br>1: {true_count}"
                filtered_hover_texts.append(hover_text)

        # Add a trace for each DataFrame, only if there are valid data points
        if filtered_ppvs:
            fig.add_trace(go.Scatter(
                x=filtered_thresholds,
                y=filtered_ppvs,
                mode='lines+markers',
                name=f'{interval_label}% Return Tomorrow',  # Using a dynamically generated name based on interval
                text=filtered_hover_texts,
                hoverinfo="text"
            ))

    # Update plot layout with a title for the legend
    fig.update_layout(
        title=f'Precision Score of Models Accross Different Threshold Ranges. (Filtered by sum(prediction=1) >= {min_true_count})',
        xaxis_title='Probability Threshold (%)',
        yaxis_title='Positive Predictive Value (PPV, Precision)',
        legend_title="Model With Response Variable",  # Setting the title of the legend
        hovermode="closest"  # Ensure hover-effect shows for the closest data point
    )

    # Show the plot
    fig.show()

#### A min_true_count = 100, means that over all the 2057 predictions made, 100 where predicted as 1
#### 2057/365 = 5.63 years
#### The model makes, on average, 100 / 5.63 = 17.8 predictions per year

In [46]:
# def plot_ppv_by_threshold(list_of_dfs, min_true_count, threshold_start, threshold_end, threshold_step):
plot_ppv_by_threshold(list_of_dfs, False, 100, 50, 70, 0.25)

### If we had no model, and just bought Ethereum every single day, we would achieve each return the following proportion of the time

In [63]:
columns_and_labels = [
    ('tmw_positive_percent_increase_binary', 'Positive'), ('tmw_1_0_percent_increase_binary', '1.0%'), ('tmw_1_25_percent_increase_binary', '1.25%'), ('tmw_1_5_percent_increase_binary', '1.5%'), ('tmw_1_75_percent_increase_binary', '1.75%'), ('tmw_2_0_percent_increase_binary', '2.0%'), ('tmw_2_25_percent_increase_binary', '2.25%'), ('tmw_2_75_percent_increase_binary', '2.75%'), ('tmw_3_0_percent_increase_binary', '3.0%')
]
short_names, proportions = [], []
for col_name, label in columns_and_labels:
    value_counts = eth_ohlc[col_name][1000:].value_counts(normalize=True)
    proportions.append(value_counts.get(1, 0))
    short_names.append(label)

for name, prop in zip(short_names, proportions):
    print(f'{name}: {prop}')

Positive: 0.6562955760816723
1.0%: 0.4720466699076325
1.25%: 0.43364122508507535
1.5%: 0.4025279533300924
1.75%: 0.3675255226057365
2.0%: 0.3354399611084103
2.25%: 0.31016042780748665
2.75%: 0.25911521633446766
3.0%: 0.24258629071463297


**Comparing the precision scores from the graph above with these naive baseline 'models' (just buying ethereum every day), we can see that the random forest model significantly outperforms the naive strategy by about 10%-20%, when a specifc return percentage is used as response variable. However, the model that predicts just a positive return is no better than the naive baseline model.**

### Adjust the precision scores so that any percent return over 0 is the target

In [47]:
for df in list_of_dfs:
    df['tmw_percent_increase_binary'] = (df['tmw_percent_increase'] > 0).astype(int)

In [48]:
plot_ppv_by_threshold(list_of_dfs, True, 100, 50, 70, 0.25)

**Compared to the baseline naive 'model' of a positive return occuring 0.6563 of the time, these actual models outperform that. However, the model that predicts a positive% return is not significantly better than the naive baseline model, expect slightly at the higher probability thresholds**



## Now let's try several different models and hyperparameter combinations, and find the best one.

### To reduce computational cost of going through different hyperparamters, probability thresholds, and response variable type we will focus on predicting just 1 predictor variable, 1.0% Return Tomorrow, using a single threshold probability of 0.55. 

In [7]:
predictors = []
periods = [2, 5, 10, 25, 50, 100]
percent_increase_counts = ['1_0', '1_25', '1_5', '1_75', '2_0', '2_25', '2_5', '2_75', '3_0']
for period in periods:
    for indicator in ['ema', 'rsi', 'sma']:
        predictors.append(f'{indicator}_{period}')
    for percent in percent_increase_counts:
        predictors.append(f'last_{period}_day_{percent}_percent_increase_count')
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
predictors.extend(months)
weekdays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
predictors.extend(weekdays)

### Random Forest Classifier

In [50]:
n_estimators_values = [100, 250, 500] # num bootstraps
min_samples_split_values = [50, 100, 200] # small=more overfit. 

results = []
for n_estimators in n_estimators_values:
    for min_samples_split in min_samples_split_values:
        model = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=1)
        
        # Get precision score
        df = get_backtested_predictions_df(eth_ohlc, predictors, model, "tmw_1_0_percent_increase_binary")
        df['prediction'] = df['probability'] >= 0.55
        score = precision_score(df['tmw_1_0_percent_increase_binary'], df['prediction'])
        
        # Store the results
        results.append({
            'n_estimators': n_estimators,
            'min_samples_split': min_samples_split,
            'precision_score': score,
            'num positive predictions': df['prediction'].value_counts()
        })

for result in results:
    print(result)

{'n_estimators': 100, 'min_samples_split': 50, 'precision_score': 0.5509259259259259}
{'n_estimators': 100, 'min_samples_split': 100, 'precision_score': 0.5793871866295265}
{'n_estimators': 100, 'min_samples_split': 200, 'precision_score': 0.6026936026936027}
{'n_estimators': 250, 'min_samples_split': 50, 'precision_score': 0.5756097560975609}
{'n_estimators': 250, 'min_samples_split': 100, 'precision_score': 0.5855614973262032}
{'n_estimators': 250, 'min_samples_split': 200, 'precision_score': 0.5986842105263158}
{'n_estimators': 500, 'min_samples_split': 50, 'precision_score': 0.5641646489104116}
{'n_estimators': 500, 'min_samples_split': 100, 'precision_score': 0.5846994535519126}
{'n_estimators': 500, 'min_samples_split': 200, 'precision_score': 0.5898305084745763}


### LASSO Regression

In [8]:
# model = LogisticRegression(penalty='l1', C=.1, solver='liblinear', random_state=1)  # error, failed to converge,
model = LogisticRegression(penalty='l1', C=.1, solver='liblinear', random_state=1, max_iter=1000)

df = get_backtested_predictions_df(eth_ohlc, predictors, model, "tmw_1_0_percent_increase_binary", start=1000, step=100)
df['prediction'] = df['probability'] >= 0.55
print(precision_score(df['tmw_1_0_percent_increase_binary'], df['prediction']))
df['prediction'].value_counts()

0.576271186440678


prediction
False    1880
True      177
Name: count, dtype: int64

In [92]:
model = LogisticRegression(penalty='l1', C=.1, solver='liblinear', random_state=1, max_iter=1000)
model.fit(eth_ohlc[predictors], eth_ohlc['tmw_1_0_percent_increase_binary'])

In [95]:
coefficients = model.coef_[0]
# Map coefficients to feature names
name_coef = {feature: coef for feature, coef in zip(predictors, coefficients)}

for feature, coef in name_coef.items():
    print(f"{feature}: {coef}")

ema_2: 0.0
rsi_2: -0.007193116707537571
sma_2: 0.0
last_2_day_1_0_percent_increase_count: 0.05553797490225255
last_2_day_1_25_percent_increase_count: 0.0
last_2_day_1_5_percent_increase_count: 0.0
last_2_day_1_75_percent_increase_count: 0.0
last_2_day_2_0_percent_increase_count: 0.0
last_2_day_2_25_percent_increase_count: 0.0
last_2_day_2_5_percent_increase_count: 0.02914822617291995
last_2_day_2_75_percent_increase_count: 0.0
last_2_day_3_0_percent_increase_count: 0.051143843010660474
ema_5: 0.0
rsi_5: 0.0
sma_5: 0.0
last_5_day_1_0_percent_increase_count: 0.0593598611771807
last_5_day_1_25_percent_increase_count: 0.0
last_5_day_1_5_percent_increase_count: 0.0
last_5_day_1_75_percent_increase_count: -0.07600133015743603
last_5_day_2_0_percent_increase_count: -0.05699880221238087
last_5_day_2_25_percent_increase_count: 0.0
last_5_day_2_5_percent_increase_count: 0.0
last_5_day_2_75_percent_increase_count: 0.0
last_5_day_3_0_percent_increase_count: 0.0625137930950976
ema_10: 0.0
rsi_10: 0

In [91]:
df['prediction'].value_counts()

prediction
False    1880
True      177
Name: count, dtype: int64

In [None]:
# Small values specify stronger regularization
C_values = [0.01, 0.1, 1, 10, 100]

results = []
for C in C_values:
    # LASSO logistic regression model
    model = LogisticRegression(penalty='l1', C=C, solver='liblinear', random_state=1)  # 'liblinear' solver supports L1 penalty
    
    # Get precision score
    df = get_backtested_predictions_df(eth_ohlc, predictors, model, "tmw_1_0_percent_increase_binary")
    df['prediction'] = df['probability'] >= 0.55
    score = precision_score(df['tmw_1_0_percent_increase_binary'], df['prediction'])
    
    # Store the results
    results.append({
        'C': C,
        'precision_score': score,
        'num positive predictions': df['prediction'].value_counts()
    })

for result in results:
    print(result)

### LightGBM

In [None]:
num_leaves_values = [15, 31, 45]
max_depth_values = [4, 6, 8]
learning_rate_values = [0.005, 0.01, 0.05]

results = []

# Loop through each combination of parameters
for num_leaves in num_leaves_values:
    for max_depth in max_depth_values:
        for learning_rate in learning_rate_values:
            # Create and train the model
            model = lgb.LGBMClassifier(num_leaves=num_leaves, max_depth=max_depth, learning_rate=learning_rate, random_state=1)
            
            # Get precision score
            df = get_backtested_predictions_df(eth_ohlc, predictors, model, "tmw_1_0_percent_increase_binary")
            df['prediction'] = df['probability'] >= 0.55
            score = precision_score(df['tmw_1_0_percent_increase_binary'], df['prediction'])
            
            # Store the results
            results.append({
                'num_leaves': num_leaves,
                'max_depth': max_depth,
                'learning_rate': learning_rate,
                'precision_score': score,
                'num positive predictions': df['prediction'].value_counts()
            })

for result in results:
    print(result)

In [86]:
for result in results:
    print(result)

{'num_leaves': 15, 'max_depth': 4, 'learning_rate': 0.005, 'precision_score': 0.5571428571428572, 'num positive predictions': prediction
False    1917
True      140
Name: count, dtype: int64}
{'num_leaves': 15, 'max_depth': 4, 'learning_rate': 0.01, 'precision_score': 0.584192439862543, 'num positive predictions': prediction
False    1766
True      291
Name: count, dtype: int64}
{'num_leaves': 15, 'max_depth': 4, 'learning_rate': 0.05, 'precision_score': 0.5466666666666666, 'num positive predictions': prediction
False    1607
True      450
Name: count, dtype: int64}
{'num_leaves': 15, 'max_depth': 6, 'learning_rate': 0.005, 'precision_score': 0.5534591194968553, 'num positive predictions': prediction
False    1898
True      159
Name: count, dtype: int64}
{'num_leaves': 15, 'max_depth': 6, 'learning_rate': 0.01, 'precision_score': 0.5667655786350149, 'num positive predictions': prediction
False    1720
True      337
Name: count, dtype: int64}
{'num_leaves': 15, 'max_depth': 6, 'learning

### CatBoost

In [35]:
model = cb.CatBoostClassifier(iterations=100, 
                                learning_rate=.005, 
                                depth=6,
                                verbose=False,
                                random_seed=1)

df = get_backtested_predictions_df(eth_ohlc, predictors, model, "tmw_1_0_percent_increase_binary", start=1000, step=100)
df['prediction'] = df['probability'] >= 0.53
precision_score(df['tmw_1_0_percent_increase_binary'], df['prediction'])

0.656934306569343

In [36]:
df['prediction'].value_counts()

prediction
False    1920
True      137
Name: count, dtype: int64

In [23]:
df['tmw_1_0_percent_increase_binary'].value_counts()

tmw_1_0_percent_increase_binary
0    1086
1     971
Name: count, dtype: int64

In [83]:
df['prediction'].value_counts()

prediction
False    2035
True       22
Name: count, dtype: int64

In [78]:
score

0.5540229885057472

In [84]:
iterations_values = [100, 300, 500]
learning_rate_values = [0.01, 0.05, 0.1]
depth_values = [3, 4, 6]

results = []

for iterations in iterations_values:
    for learning_rate in learning_rate_values:
        for depth in depth_values:
            model = cb.CatBoostClassifier(iterations=iterations, 
                                          learning_rate=learning_rate, 
                                          depth=depth,
                                          verbose=False, # Turn off verbose output for simplicity
                                          random_seed=1)

            df = get_backtested_predictions_df(eth_ohlc, predictors, model, "tmw_1_0_percent_increase_binary")
            df['prediction'] = df['probability'] >= 0.55
            score = precision_score(df['tmw_1_0_percent_increase_binary'], df['prediction'])
            
            results.append({
                'iterations': iterations,
                'learning_rate': learning_rate,
                'depth': depth,
                'precision_score': score,
                'num positive predictions': df['prediction'].value_counts()
            })

for result in results:
    print(result)

{'iterations': 100, 'learning_rate': 0.01, 'depth': 3, 'precision_score': 0.6363636363636364, 'num positive predictions': prediction
False    1969
True       88
Name: count, dtype: int64}
{'iterations': 100, 'learning_rate': 0.01, 'depth': 4, 'precision_score': 0.646551724137931, 'num positive predictions': prediction
False    1941
True      116
Name: count, dtype: int64}
{'iterations': 100, 'learning_rate': 0.01, 'depth': 6, 'precision_score': 0.6521739130434783, 'num positive predictions': prediction
False    1919
True      138
Name: count, dtype: int64}
{'iterations': 100, 'learning_rate': 0.05, 'depth': 3, 'precision_score': 0.6031746031746031, 'num positive predictions': prediction
False    1742
True      315
Name: count, dtype: int64}
{'iterations': 100, 'learning_rate': 0.05, 'depth': 4, 'precision_score': 0.5873015873015873, 'num positive predictions': prediction
False    1742
True      315
Name: count, dtype: int64}
{'iterations': 100, 'learning_rate': 0.05, 'depth': 6, 'preci

In [65]:
len(predictors)

91

## Check if the predictions from backtesting function get more accurate as time goes on (more data is used in each subsequent training set)