In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt # Fast data reading/writing
import seaborn as sns
import matplotlib.pyplot as plt

We need a newer sklearn version:

In [None]:
!pip install scikit-learn==1.0.1

First, we load the asset details so that we can load the data in the order of their asset id. Filenames are based on asset names.

In [None]:
# Asset metadata keyed by Asset_ID.
asset_details = pd.read_csv(
    '/kaggle/input/g-research-crypto-forecasting/asset_details.csv',
    index_col='Asset_ID',
)
# Asset names ordered by ascending asset id.
names = asset_details.sort_index()['Asset_Name'].to_numpy()

In [None]:
# Load the Bitcoin feature file, drop the stored 'index' column and index the rows by timestamp.
df = dt.fread("/kaggle/input/crypto-challenge-mlii-project-feature-eng-2/bitcoin.jay").to_pandas()
df = df.drop(columns='index').set_index('timestamp')

# Show the rows that contain at least one missing value.
# A 1-D per-row mask selects each affected row exactly once.
df[df.isnull().any(axis=1)]

Since the dataframes of all assets are too big to be stored, trained on and evaluated at once, we define a function that loads, trains and evaluates the models for a single asset per call:

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr 
from glob import iglob
from datetime import timedelta

importances_dict = {} # Maps feature name -> list of importance values; cv_evaluate appends one value per trained fold model
all_preds = [] # Intended to hold the out-of-fold predictions of every asset (later stacked into a dataframe)
all_trues = [] # Intended to hold the matching true target values, excluding ones that we have no predictions for (the first fold)
asset_ids = [] # Intended to hold numpy arrays filled with the asset id for each prediction, we use it to get the weights later

def cv_evaluate(asset_name, used_features, ensemble=False, global_weight=None, num_folds=5):
    """Train and evaluate one LightGBM regressor per chronological fold for an asset.

    The asset's ``.jay`` feature file is loaded, split with ``TimeSeriesSplit``
    (fixed-size test windows with a gap between train and test), and a fresh
    ``LGBMRegressor`` is fitted on every fold. In-fold and out-of-fold Pearson
    correlations are printed, together with the feature importances.

    Side effects on module-level state:
      * ``importances_dict`` receives one importance value per feature per fold.
      * For a single asset (not ``'all'``), the out-of-fold predictions, the
        matching true targets and an array of the asset id are appended to
        ``all_preds``, ``all_trues`` and ``asset_ids``. The asset id is looked
        up by name (case-insensitively) in the global ``asset_details`` frame.

    Parameters
    ----------
    asset_name : str
        Asset name used to build the file name (lower-cased, spaces replaced
        by underscores), or ``'all'`` to concatenate every ``.jay`` file.
    used_features : list of str
        Feature columns fed to the model.
    ensemble : bool, default False
        If True, a second model is fitted on the rows of all assets that fall
        inside the fold's training time range, and predictions are blended as
        ``global_weight * global + (1 - global_weight) * per_asset``.
    global_weight : float or None, default None
        Blend weight of the global model. Required when ``ensemble`` is True.
    num_folds : int, default 5
        Number of time-series splits.

    Returns
    -------
    list
        The per-asset models, one per fold, in fold order.

    Raises
    ------
    ValueError
        If ``ensemble`` is True and ``global_weight`` is None.
    """
    # Fail before any expensive loading/training instead of with a TypeError at prediction time.
    if ensemble and global_weight is None:
        raise ValueError("global_weight must be provided when ensemble=True")

    data_dir = "/kaggle/input/crypto-challenge-mlii-project-feature-eng-2"

    if asset_name != 'all':
        df = dt.fread(f"{data_dir}/{asset_name.lower().replace(' ', '_')}.jay").to_pandas() # Load asset data
    else:
        # NOTE(review): the concatenated frame is not re-sorted by timestamp, so the
        # folds below follow file order rather than time order — confirm this is intended.
        df = pd.concat([dt.fread(filename).to_pandas() for filename in iglob(f"{data_dir}/*.jay")])

    df.drop('index', axis=1, inplace=True)
    df.set_index('timestamp', inplace=True)
    X, y = df.drop('Target', axis=1)[used_features], df.Target

    if ensemble:
        # Build one frame with every asset, indexed by (timestamp, asset_name) and sorted,
        # so that a time range can be sliced on the first index level.
        all_df = []
        for filename in iglob(f"{data_dir}/*.jay"):
            asset_df = dt.fread(filename).to_pandas()
            asset_df['asset_name'] = filename.split('/')[-1].split('.')[0]
            all_df.append(asset_df)
        all_df = pd.concat(all_df)
        all_df.drop('index', axis=1, inplace=True)
        multindex = pd.MultiIndex.from_frame(all_df[['timestamp', 'asset_name']])
        all_df.set_index(multindex, inplace=True)
        all_df.sort_index(inplace=True)
        X_all, y_all = all_df.drop('Target', axis=1)[used_features], all_df.Target

    # Get chronological folds using time series split
    tss = TimeSeriesSplit(n_splits=num_folds, gap=15, test_size=90*24*60) # 90 days, 15 min gap
    split_indices = list(tss.split(X)) # Split returns generator, convert to list so that we don't have to worry about emptying it

    params = {'lambda_l1': 0.004498875792752676, 'lambda_l2': 0.03243290696956152, 'num_leaves': 60,
              'max_depth': 6, 'min_data_in_leaf': 2496, 'learning_rate': 0.18502752618241153, 'n_estimators': 100,
              'boosting_type': 'goss', 'random_state': 1}

    if ensemble:
        global_model = LGBMRegressor(**params)

    models = []

    print(f'================================={asset_name.upper()}=================================')
    corrs = []
    asset_trues = [] # Same as all_trues but this is for current asset
    asset_preds = [] # Same as all_preds but this is for current asset
    for j, (train_indices, test_indices) in enumerate(split_indices):
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        # The split yields positions, so the target must be indexed positionally as well.
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        begin_train, begin_test = X_train.index[0], X_test.index[0]
        end_train, end_test = X_train.index[-1], X_test.index[-1]

        print(f'Fold {j+1}\nTrain set: {begin_train} - {end_train}\nTest set: {begin_test}-{end_test}')

        # Training the model
        model = LGBMRegressor(**params)

        print(f'Training fold {j+1}...')
        model.fit(X_train, y_train)
        models.append(model)

        if ensemble:
            # Refit the global model on every asset's rows inside this fold's training time range.
            X_train_all = X_all.loc[begin_train:end_train]
            y_train_all = y_all.loc[begin_train:end_train]
            global_model.fit(X_train_all, y_train_all)

        print(f'Feature importances: {list(sorted(zip(model.feature_name_, model.feature_importances_), key=lambda x: x[1], reverse=True))}')

        # Record this fold's importance of every feature (a new list is created on first sight).
        for feature_name, importance in zip(model.feature_name_, model.feature_importances_):
            importances_dict.setdefault(feature_name, []).append(importance)

        print(f'Predicting fold {j+1}...')
        if ensemble:
            train_pred = global_weight * global_model.predict(X_train) + (1-global_weight) * model.predict(X_train)
        else:
            train_pred = model.predict(X_train)

        corr, p = pearsonr(y_train, train_pred)
        print(f'Correlation for fold {j+1}: {corr} \t p-value: {p}')

        print(f'Predicting out of fold {j+1}...')
        if ensemble:
            y_pred = global_weight * global_model.predict(X_test) + (1-global_weight) * model.predict(X_test)
        else:
            y_pred = model.predict(X_test)

        asset_preds.append(y_pred)
        asset_trues.append(y_test)

        corr, p = pearsonr(y_test, y_pred)
        corrs.append(corr)
        print(f'Correlation for OOF {j+1}: {corr} \t p-value: {p}')

    print(f'\n\nMEAN OOF CORRELATION FOR {asset_name.upper()}: {sum(corrs)/len(corrs)}\n\n')

    if asset_name != 'all' and asset_preds:
        # Publish the out-of-fold results to the module-level accumulators that the
        # evaluation cells read. The three lists are only extended together so that
        # they always stay aligned.
        name_matches = asset_details.index[asset_details['Asset_Name'].str.lower() == asset_name.lower()]
        if len(name_matches) == 0:
            print(f'No asset id found for {asset_name}; its predictions are not added to the evaluation data.')
        else:
            oof_preds = np.concatenate([np.asarray(fold_pred) for fold_pred in asset_preds])
            oof_trues = np.concatenate([np.asarray(fold_true) for fold_true in asset_trues])
            all_preds.append(oof_preds)
            all_trues.append(oof_trues)
            asset_ids.append(np.full(len(oof_preds), name_matches[0]))

    return models

We now use the function to load and train all the models iteratively:

In [None]:
# Feature columns fed to every model.
used_features = [
    'MACD_crossover_norm',
    'stochastic_crossover',
    'RSI',
    'log_ret1',
    'log_ret30',
    'log_ret240',
    'log_ret1440',
    'mfi',
]

# One entry per asset below, each entry being the list of fold models returned by cv_evaluate.
asset_models = []
for name in ('Bitcoin', 'Cardano', 'Maker', 'Tron'):
    fold_models = cv_evaluate(name, used_features, ensemble=True, global_weight=0.5)
    asset_models.append(fold_models)

In [None]:
import shap

# Model of the last fold for the first trained asset (Bitcoin).
bitcoin_last = asset_models[0][-1]

# First 60*24*30 rows of the Bitcoin features
# (30 days if rows are one minute apart — assumption, confirm).
data = (
    dt.fread("/kaggle/input/crypto-challenge-mlii-project-feature-eng-2/bitcoin.jay")
    .to_pandas()[used_features]
    .iloc[:60*24*30]
)

bitcoin_explainer = shap.TreeExplainer(bitcoin_last)
shap_values = bitcoin_explainer.shap_values(data)
shap.summary_plot(shap_values, data)



In [None]:
# SHAP summary for the last-fold model of the second trained asset (Cardano).
# NOTE(review): `data` is the Bitcoin feature sample built in the previous SHAP cell,
# so this model is explained on Bitcoin rows — confirm that is intended.
cardano_last = asset_models[1][-1]
cardano_explainer = shap.TreeExplainer(cardano_last)
shap_values = cardano_explainer.shap_values(data)
shap.summary_plot(shap_values, data)

In [None]:
# SHAP summary for the last-fold model of the third trained asset (Maker).
# NOTE(review): `data` is the Bitcoin feature sample built in the first SHAP cell,
# so this model is explained on Bitcoin rows — confirm that is intended.
maker_last = asset_models[2][-1]
maker_explainer = shap.TreeExplainer(maker_last)
shap_values = maker_explainer.shap_values(data)
shap.summary_plot(shap_values, data)

In [None]:
# SHAP summary for the last-fold model of the fourth trained asset (Tron).
# NOTE(review): `data` is the Bitcoin feature sample built in the first SHAP cell,
# so this model is explained on Bitcoin rows — confirm that is intended.
tron_last = asset_models[3][-1]
tron_explainer = shap.TreeExplainer(tron_last)
shap_values = tron_explainer.shap_values(data)
shap.summary_plot(shap_values, data)

Visualizing the feature importances with a boxplot:

In [None]:
# Reshape to long format: one row per (feature, recorded importance value).
importances = pd.DataFrame(importances_dict).melt(var_name='Feature', value_name='Importance')
print(importances)

# One horizontal box per feature on a tall canvas.
fig, ax = plt.subplots(figsize=(10, 30))
sns.boxplot(data=importances, x='Importance', y='Feature', ax=ax)

We create a dataframe with ids and target predictions so we can map them all to weights:

In [None]:
# Flatten the per-asset arrays into one column each, then attach the
# asset_details columns by looking up each row's asset id.
eval_columns = {
    'Id': np.hstack(asset_ids),
    'True': np.hstack(all_trues),
    'Prediction': np.hstack(all_preds),
}
eval_df = pd.DataFrame(eval_columns).join(asset_details, on='Id')

We take a quick look at the evaluation dataframe and then define the weighted correlation metric:

In [None]:
# Preview the evaluation frame, requesting up to its first 800,000 rows.
# NOTE(review): the rendered table is presumably truncated by the pandas display limits — confirm.
eval_df.head(800000)

In [None]:
def wcorr(y_true, y_pred, weights): # Adapted from the discussion post 'Evaluation Metric Clarification'
    """Return the weighted Pearson correlation between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true, y_pred : array-like of float
        True and predicted values, element-wise paired.
    weights : array-like of float
        Weight of every pair, same length as the values.

    Returns
    -------
    float
        Weighted covariance divided by the square root of the product of the
        weighted variances. The result is not finite when either input has
        zero weighted variance.
    """
    # Accept lists/tuples/Series alike; values are paired by position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    weights = np.asarray(weights, dtype=float)

    sum_w = np.sum(weights)
    mean_true = np.sum(y_true * weights) / sum_w
    mean_pred = np.sum(y_pred * weights) / sum_w

    dev_true = y_true - mean_true
    dev_pred = y_pred - mean_pred
    var_true = np.sum(weights * np.square(dev_true)) / sum_w
    var_pred = np.sum(weights * np.square(dev_pred)) / sum_w

    # Centred covariance: consistent with the variances above and free of the
    # cancellation that E[xy] - E[x]E[y] suffers when the means are large.
    cov = np.sum(weights * dev_true * dev_pred) / sum_w
    corr = cov / np.sqrt(var_true * var_pred)

    return corr

Calculating the metric (probably more representative with a standard training size) 

In [None]:
# Weighted correlation of all out-of-fold predictions, weighted by each asset's 'Weight'.
weighted_corr = wcorr(eval_df['True'], eval_df['Prediction'], eval_df['Weight'])
print(weighted_corr)