In [161]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath('../'))
from src.hopsworks_connections import pull_data, upload_data
import pandas_ta as ta
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
import numpy as np




# Print the current working directory
current_working_directory = os.getcwd()
print("Current Working Directory:", current_working_directory)

Current Working Directory: /Users/davydsadovskyy/GitBackedShit/crypto-prediction/notebooks


### Get raw ethereum ohlc data

In [5]:
def get_cleaned_raw_data(filename):
    eth_ohlc_raw = pd.read_csv(f'../data/raw/ohlc/{filename}', parse_dates=True)
    def convert_value(value):
        """
        Converts a string value to a float. Removes $ signs, and converts
        billion (bn), million (m), and thousand (K) values to their numeric equivalents.
        """
        value = value.replace('$', '')  # Remove $ sign to simplify processing
        if value[-1].lower() == 'm':
            return float(value[:-1]) * 1_000_000
        elif value[-1].lower() == 'b':
            return float(value[:-1]) * 1_000_000_000
        elif value[-1].lower() == 'k':
            return float(value[:-1]) * 1_000
        elif value[-2:].lower() == 'bn':  # Handle 'bn' for billions
            return float(value[:-2]) * 1_000_000_000
        else:
            return float(value)

    eth_ohlc_raw.columns = [col.lower() for col in eth_ohlc_raw.columns]
    eth_ohlc_raw.rename(columns={'volume(eth)': 'volume_eth', 'market cap': 'market_cap'}, inplace=True)

    for col in ['open', 'high', 'low', 'close', 'volume', 'market_cap']:
        eth_ohlc_raw[col] = eth_ohlc_raw[col].apply(convert_value)
    eth_ohlc_raw['date'] = pd.to_datetime(eth_ohlc_raw['date'])

    return eth_ohlc_raw

In [153]:
df_raw = get_cleaned_raw_data('ethereum.csv')
df_raw = df_raw.iloc[::-1].reset_index(drop=True) # reverse order

### Add response variables - these will be used to also generate some predictors later

In [152]:
def add_response_variables(df):
    
    df_tranformed = df.copy()
    df_tranformed = df_tranformed.iloc[::-1].reset_index(drop=True) # reverse order

    # Create binary response variable for percent returns tomorrow
    df_tranformed["tmw_avg_high_close"] = (df_tranformed["high"].shift(-1) + df_tranformed["close"].shift(-1)) / 2
    df_tranformed['tmw_percent_increase'] = ((df_tranformed['tmw_avg_high_close'] - df_tranformed['close']) / df_tranformed['tmw_avg_high_close'])
    intervals = [1 + i * 0.25 for i in range(int((3 - 1) / 0.25) + 1)] # 1-3, by .25
    for i in intervals:
        threshold = i / 100
        column_name = f"tmw_{str(i).replace('.', '_')}_percent_increase_binary"
        df_tranformed[column_name] = (df_tranformed['tmw_percent_increase'] >= threshold).astype(int)

    return df_tranformed

In [24]:
df_with_response_vars = add_response_variables(df_raw)
len(df_with_response_vars)

3162

## Predictor Feature Engineering

### Adding simple predictors based on momentum of the price and seasonality: 

In [12]:
def add_simple_predictors(df_with_response_vars):
    
    df_transformed = df_with_response_vars.copy()

    ###### Predictor variables about the price momentum

    horizons = [2,5,10,25,50,100] 
    predictors = []
    for horizon in horizons:
        ### Exponential Moving Average (EMA) for past horizon
        ema_col = f"ema_{horizon}"
        df_transformed[ema_col] = df_transformed["close"] / ta.ema(df_transformed["close"], length=horizon)
        predictors.append(ema_col)

        ### Relative Strength Index (RSI) for past horizon
        rsi_col = f"rsi_{horizon}"
        df_transformed[rsi_col] = ta.rsi(df_transformed["close"], length=horizon)
        predictors.append(rsi_col)

        ### Simple Moving Average (SMA) for past horizon
        sma_col = f"sma_{horizon}"
        df_transformed[sma_col] = df_transformed["close"] / ta.sma(df_transformed["close"], length=horizon)
        predictors.append(sma_col)

        ### Sum of the number of days that had a certain percent increase for the past horizon
        columns = [
            # 'tmw_positive_percent_increase_binary',
            'tmw_1_0_percent_increase_binary',
            'tmw_1_25_percent_increase_binary',
            'tmw_1_5_percent_increase_binary',
            'tmw_1_75_percent_increase_binary',
            'tmw_2_0_percent_increase_binary',
            'tmw_2_25_percent_increase_binary',
            'tmw_2_5_percent_increase_binary',
            'tmw_2_75_percent_increase_binary',
            'tmw_3_0_percent_increase_binary'
        ]
        for col in columns:
            percent = col.split('tmw_')[1].split('_percent')[0]
            trend_col = f"last_{horizon}_day_{percent}_percent_increase_count"
            if col in df_transformed.columns:
                df_transformed[trend_col] = df_transformed[col].shift(1).rolling(horizon).sum()
            else:
                print('PREDICTORS NOT ADDED - You must run add_response_vars() first so that predictors can be created based on that info')
                break    

    ###### Predictor variables about seasonality/cycles

    month_names = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
    for i, month in enumerate(month_names):
        df_transformed[month] = (df_transformed['date'].dt.month == i).astype(int)

    days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    for i, day in enumerate(days):
        df_transformed[day] = (df_transformed['date'].dt.dayofweek == i).astype(int)

    return df_transformed

In [128]:
df_with_response_and_simple_preds = add_simple_predictors(df_with_response_vars)
len(df_with_response_and_simple_preds)

3162

In [130]:
# Drop NA values
df_with_response_and_simple_preds = df_with_response_and_simple_preds.dropna()
len(df_with_response_and_simple_preds)

3061

### Adding predictors with insight from the paper "101 Formulaic Alphas, by Zura Kakushadze, 2015"

In [155]:
##### Add columns that will be inputs to the 101 formulaic alphas the author defines

df_with_advanced_alphas_predictors = df_raw.copy()

# adv{d} = average daily dollar volume for the past d days
periods = [2, 5, 10, 25, 50, 100]
for period in periods:

    # Calculate adv{d} = average daily dollar volume for the past d days
    column_name = f'adv_{period}'
    df_with_advanced_alphas_predictors[column_name] = (df_with_advanced_alphas_predictors['volume'] 
                                                         * df_with_advanced_alphas_predictors['close']).rolling(window=period).mean()
    
    # Calculate Volume-Weighted Average Price (VWAP)    
    column_name = f'vwap_{period}'
    df_with_advanced_alphas_predictors[column_name] = (
        df_with_advanced_alphas_predictors['close'] * df_with_advanced_alphas_predictors['volume']
    ).rolling(window=period).sum() / df_with_advanced_alphas_predictors['volume'].rolling(window=period).sum()

In [162]:
##### Functions necessary for transforming input into alphas (predictors)

def rank(x, ascending=False):
    return df_with_advanced_alphas_predictors[x].rank(ascending=ascending)

def delta(x, days):
    return df_with_advanced_alphas_predictors[x] - df_with_advanced_alphas_predictors[x].shift(days)

def correlation(x, y, d):
    return df_with_advanced_alphas_predictors[x].rolling(window=d).corr(df_with_advanced_alphas_predictors[y])

def log(x):
    return np.log(x)

In [163]:
# Alpha#2: (-1 * correlation(rank(delta(log(volume), 2)), rank(((close - open) / open)), 6))
df_with_advanced_alphas_predictors["alpha_2"] = -1 * correlation(rank(delta(log('volume'), 2)), 
                                                      rank(((df_with_advanced_alphas_predictors['close'] - 
                                                             df_with_advanced_alphas_predictors['open']) / 
                                                             df_with_advanced_alphas_predictors['open'])), 6)

df_with_advanced_alphas_predictors["alpha_2"]



KeyError: "None of [Index([ 12.00944930402536, 13.421282081782097, 13.184774637525726,\n         12.9123828126164,  14.22097566607244, 14.603967918328545,\n       15.226497531674536, 15.341566861459324,  14.77102200299171,\n       15.096444403426338,\n       ...\n       23.693680302516114, 24.004177052734065,  23.85876005287556,\n       23.662427759012008, 23.562344300455024, 23.464105860871612,\n       23.325955522390796, 23.579736043166893, 22.909317113684505,\n        22.82739999121662],\n      dtype='float64', length=3162)] are in the [columns]"

### Get set of predictors that have low correlation

In [86]:
predictors = []
periods = [2, 5, 10, 25, 50, 100]
percent_increase_counts = ['1_0', '1_25', '1_5', '1_75', '2_0', '2_25', '2_5', '2_75', '3_0']
for period in periods:
    for indicator in ['ema', 'rsi', 'sma']:
        predictors.append(f'{indicator}_{period}')
    for percent in percent_increase_counts:
        predictors.append(f'last_{period}_day_{percent}_percent_increase_count')
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
predictors.extend(months)
weekdays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
predictors.extend(weekdays)
len(predictors)

91

In [110]:
# returns a list of noncorrelated predictors
def drop_predictors_with_high_correlation(data, predictors_list, correlation_threshold):

    corr_matrix = data[predictors_list].corr()

    # Threshold for high correlation
    high_corr_threshold = correlation_threshold

    high_corr_pairs = []
    for col in corr_matrix.columns:
        for row in corr_matrix.index:
            if col != row and corr_matrix.at[row, col] > high_corr_threshold:
                # Sort the pair to avoid duplicates like (A, B) and (B, A)
                pair = tuple(sorted([row, col]))
                if pair not in high_corr_pairs:
                    high_corr_pairs.append(pair)

    # Identify and remove correlation pairs if the second element occurs as a first element in the list of all pairs
    first_elements = [pair[0] for pair in high_corr_pairs]
    filtered_pairs = [pair for pair in high_corr_pairs if pair[1] not in first_elements]

    # Find the unique first elements of all the pairs
    predictors_high_cor = list(set(pair[0] for pair in filtered_pairs))

    print("original number of predictors in model: ", len(predictors))
    print("number of unique highly correlated predictors to be dropped from model: ", len(predictors_high_cor))

    # Removing elements from predictors that are in predictors_high_cor
    predictors_low_cor = [x for x in predictors if x not in predictors_high_cor]
    len(predictors_low_cor)

    print("number of predictors remaining in model: ", len(predictors_low_cor))

    return predictors_low_cor

In [139]:
predictors_low_cor = drop_predictors_with_high_correlation(df_with_response_and_simple_preds, predictors, 0.85)
print('\n', predictors_low_cor)

original number of predictors in model:  91
number of unique highly correlated predictors to be dropped from model:  32
number of predictors remaining in model:  59

 ['rsi_2', 'sma_2', 'last_2_day_1_0_percent_increase_count', 'last_2_day_1_25_percent_increase_count', 'last_2_day_1_5_percent_increase_count', 'last_2_day_1_75_percent_increase_count', 'last_2_day_2_0_percent_increase_count', 'last_2_day_3_0_percent_increase_count', 'rsi_5', 'sma_5', 'last_5_day_1_0_percent_increase_count', 'last_5_day_1_25_percent_increase_count', 'last_5_day_1_5_percent_increase_count', 'last_5_day_1_75_percent_increase_count', 'last_5_day_2_0_percent_increase_count', 'last_5_day_3_0_percent_increase_count', 'rsi_10', 'last_10_day_1_0_percent_increase_count', 'last_10_day_1_25_percent_increase_count', 'last_10_day_1_5_percent_increase_count', 'last_10_day_1_75_percent_increase_count', 'last_10_day_3_0_percent_increase_count', 'last_25_day_1_0_percent_increase_count', 'last_25_day_1_25_percent_increase_c

## Train Model

In [123]:
def get_precision_score(data, model, predictors, response_var, threshold, start=1000, step=100):
    
    def predict(train, test, predictors, model):
        model.fit(train[predictors], train[response_var])
        probability = model.predict_proba(test[predictors])[:,1]
        proba_series = pd.Series(probability, index=test.index, name="probability")
        combined = pd.concat([test[response_var], test['tmw_percent_increase'], proba_series], axis=1)
        return combined

    all_predictions = []
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+100)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)

    data_with_predictions = pd.concat(all_predictions)

    col_name = f'pred_{threshold}'
    data_with_predictions[col_name] = data_with_predictions['probability'] >= threshold

    ppv = precision_score(data_with_predictions[response_var], data_with_predictions[col_name])
    
    return ppv

In [140]:
model = RandomForestClassifier(n_estimators=250, min_samples_split=200, random_state=1)
response_var = 'tmw_1_0_percent_increase_binary'
threshold = 0.55

ppv = get_precision_score(df_with_response_and_simple_preds, model, predictors_low_cor, response_var, threshold)
ppv

0.6113074204946997