In [1]:
# (Re)load IPython's autoreload extension; %reload_ext is safe to run again on a live kernel.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell executes, so edits to the
# project's own modules are picked up without restarting the kernel.
%autoreload 2

In [2]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath('../'))
from src.hopsworks_connections import pull_data, upload_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
import joblib

# Print the kernel's working directory. The relative paths used in this notebook
# ('../' on sys.path, '../models/ohlc/...') are resolved against it.
current_working_directory = os.getcwd()
print("Current Working Directory:", current_working_directory)

  from .autonotebook import tqdm as notebook_tqdm


Current Working Directory: /Users/davydsadovskyy/GitBackedShit/crypto-prediction/notebooks


### Get feature/target data from hopsworks, so we can train a model

In [3]:
# Pull the transformed ETH OHLC data from Hopsworks.
# NOTE(review): the arguments are presumably (feature group name, version,
# feature view name, version) - confirm against src.hopsworks_connections.pull_data.
# The last row contains NA values for tomorrow's price and target, so it is
# omitted here by position.
eth_ohlc = pull_data('eth_ohlc_transformed', 1, 'eth_ohlc_transformed', 1).iloc[:-1]
eth_ohlc

Connected. Call `.close()` to terminate connection gracefully.




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (3.7) by running 'pip install hopsworks==3.7.*'



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/582805
Connected. Call `.close()` to terminate connection gracefully.
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/582805/fs/578628/fv/eth_ohlc_transformed/version/1
Finished: Reading data from Hopsworks, using ArrowFlight (1.63s) 




Unnamed: 0,date,open,high,low,close,volume,volume_eth,market_cap,tmw_avg_high_close,tmw_percent_increase,...,october,november,december,monday,tuesday,wednesday,thursday,friday,saturday,sunday
806,2015-11-15 00:00:00+00:00,0.8912,0.9215,0.8750,0.9064,4.118000e+05,458365.0,6.720000e+07,0.936850,0.032503,...,0,1,0,0,0,0,0,0,0,1
1225,2015-11-16 00:00:00+00:00,0.9062,0.9447,0.8920,0.9290,6.209000e+05,676442.0,6.860000e+07,1.020000,0.089216,...,0,1,0,1,0,0,0,0,0,0
2432,2015-11-17 00:00:00+00:00,0.9249,1.0300,0.9058,1.0100,1.100000e+06,1183690.0,7.220000e+07,1.000000,-0.010000,...,0,1,0,0,1,0,0,0,0,0
1968,2015-11-18 00:00:00+00:00,0.9900,1.0100,0.9405,0.9900,6.811000e+05,691994.0,7.360000e+07,0.982750,-0.007377,...,0,1,0,0,0,1,0,0,0,0
859,2015-11-19 00:00:00+00:00,0.9887,1.0100,0.9375,0.9555,4.435000e+05,455866.0,7.280000e+07,0.940300,-0.016165,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,2024-04-17 00:00:00+00:00,3083.0000,3119.0000,2926.0000,2984.0000,1.550000e+10,5093979.0,3.722000e+11,3075.500000,0.029751,...,0,0,0,0,0,1,0,0,0,0
2751,2024-04-18 00:00:00+00:00,2986.0000,3087.0000,2960.0000,3064.0000,1.350000e+10,4453302.0,3.706000e+11,3101.000000,0.011932,...,0,0,0,0,0,0,1,0,0,0
2793,2024-04-19 00:00:00+00:00,3061.0000,3117.0000,2879.0000,3085.0000,1.740000e+10,5675824.0,3.746000e+11,3160.500000,0.023889,...,0,0,0,0,0,0,0,1,0,0
2705,2024-04-20 00:00:00+00:00,3085.0000,3166.0000,3025.0000,3155.0000,8.900000e+09,2884086.0,3.776000e+11,3168.500000,0.004261,...,0,0,0,0,0,0,0,0,1,0


### Define the model and obtain a backtested precision score (precision = of all the days on which we predicted the price would go up, the proportion on which it actually went up the next day)

In [4]:
# Random forest used for both the backtest and the final fit.
model = RandomForestClassifier(
    n_estimators=250,        # number of trees in the forest
    min_samples_split=200,   # a node needs at least this many samples to be split
    random_state=1,          # fixed seed so repeated runs give the same forest
)

# Building blocks for the feature column names.
periods = [2, 5, 10, 25, 50, 100]
percent_increase_counts = ['1_0', '1_25', '1_5', '1_75', '2_0', '2_25', '2_5', '2_75', '3_0']
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
weekdays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

# Feature columns in a fixed order: for each look-back period the three
# indicator columns followed by the percent-increase count columns, then the
# month flags, then the weekday flags.
predictors = []
for period in periods:
    predictors += [f'{indicator}_{period}' for indicator in ('ema', 'rsi', 'sma')]
    predictors += [
        f'last_{period}_day_{percent}_percent_increase_count'
        for percent in percent_increase_counts
    ]
predictors += months + weekdays

# Target column the classifier is trained to predict.
response_var = 'tmw_1_0_percent_increase_binary'

# Probability cutoff used when turning predicted probabilities into predictions.
threshold = 0.55

# Row position at which the backtest starts. It is meant to grow with the model
# deployment number: for example, the second model will be trained 100 days
# later, so it should use start=1100. That way the precision score reported for
# the next model better reflects performance on more recent data.
start = 1000

In [5]:
def get_precision_score(data, model, predictors, response_var, threshold, start=1000, step=100):
    """Backtest ``model`` with an expanding window and return the pooled precision.

    Beginning at row position ``start``, the model is refit on every row before
    the split point and used to score the next ``step`` rows; the split point
    then advances by ``step`` until the data is exhausted. The out-of-sample
    probabilities of all windows are pooled, turned into predictions with
    ``probability >= threshold`` and scored with ``precision_score``.

    Rows are split purely by position, so ``data`` must already be in
    chronological order for the backtest to be meaningful.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column in ``predictors`` plus ``response_var``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is refit in
        place for every window and is left fitted on the last training window.
    predictors : list of str
        Feature column names.
    response_var : str
        Name of the binary target column.
    threshold : float
        Minimum predicted probability of the positive class for a row to count
        as a positive prediction.
    start : int, default 1000
        Number of leading rows used to train the first model.
    step : int, default 100
        Number of rows scored per window and the amount by which the training
        window grows between refits.

    Returns
    -------
    float
        Precision over all out-of-sample predictions.

    Raises
    ------
    ValueError
        If ``step`` is not positive, or if ``start`` leaves no rows to test on.
    """
    n_rows = data.shape[0]
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step}")
    if start >= n_rows:
        raise ValueError(
            f"start={start} leaves no rows to backtest on (data has {n_rows} rows)"
        )

    fold_frames = []
    for split_at in range(start, n_rows, step):
        train = data.iloc[:split_at]
        # The test window must be exactly `step` rows wide. A fixed width of 100
        # would re-score rows when step < 100 and skip rows when step > 100.
        test = data.iloc[split_at:split_at + step]

        model.fit(train[predictors], train[response_var])
        # Column 1 of predict_proba is the probability of the positive class.
        probability = model.predict_proba(test[predictors])[:, 1]

        fold_frames.append(
            pd.DataFrame(
                {'actual': test[response_var].to_numpy(), 'probability': probability},
                index=test.index,
            )
        )

    results = pd.concat(fold_frames)
    predicted_positive = results['probability'] >= threshold

    return precision_score(results['actual'], predicted_positive)

In [5]:
# Backtested precision of the configured model; also reported as the model's
# metric when it is uploaded.
ppv = get_precision_score(
    data=eth_ohlc,
    model=model,
    predictors=predictors,
    response_var=response_var,
    threshold=threshold,
    start=start,
)
ppv

0.5986842105263158

### Fit the model on all the data, then save it into a file as a serialized model
##### joblib serializes models so they can be saved to disk. Both pickle and joblib serialize Python objects into a byte stream and deserialize that byte stream back into Python objects. However, joblib is more efficient for objects that hold a lot of data in large arrays, which is why it is generally recommended for scikit-learn models.

In [6]:
MODEL_NAME = 'random_forest_ohlc'
MODEL_DIR = '../models/ohlc'

# Refit on every available row: the backtest leaves the model fitted only on
# its last training window.
model.fit(eth_ohlc[predictors], eth_ohlc[response_var])

# joblib.dump does not create missing parent directories, so make sure the
# target directory exists (e.g. on a fresh clone) before writing.
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(model, f'{MODEL_DIR}/{MODEL_NAME}.pkl')

['../models/ohlc/random_forest_ohlc.pkl']

### Upload the serialized model into hopsworks

In [8]:
# Date of the final row in the training data, with the timezone stripped and
# formatted as MM/DD/YYYY for the model description.
final_row_date = eth_ohlc['date'].iloc[-1]
last_date = pd.to_datetime(final_row_date).tz_localize(None)
trained_till = last_date.strftime('%m/%d/%Y')
trained_till

'04/19/2024'

In [10]:
# Signature reminder: upload_model(model_name, model_description, metric_type, metric_value)
# NOTE(review): taken from the original inline note - confirm against src.hopsworks_connections.
model_description = f"Predict {response_var}, Threshold {threshold}, Trained Until {trained_till}"
upload_model(MODEL_NAME, model_description, "PPV", ppv)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (3.7) by running 'pip install hopsworks==3.7.*'



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/582805
Connected. Call `.close()` to terminate connection gracefully.


Uploading: 100.000%|██████████| 1109337/1109337 elapsed<00:01 remaining<00:004.32it/s]
Model export complete: 100%|██████████| 6/6 [00:07<00:00,  1.20s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/582805/models/random_forest_ohlc/1



