#### References: 

#### (1) [Tutorial to the G-Research Crypto Competition](https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition)
#### (2) [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)
#### (3) [Recreating Target](https://www.kaggle.com/alexfir/recreating-target)
#### (4) [Starter LGBM Pipeline](https://www.kaggle.com/julian3833/g-research-starter-lgbm-pipeline)
#### (5) [How to submit lagged features](https://www.kaggle.com/tomforbes/gresearch-submitting-lagged-features-via-api)

## Import and load dfs

In [None]:
import pandas as pd
import time
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import gresearch_crypto
from pathlib import Path
import gc
import shutil
from itertools import product

# Locations of the competition input files on Kaggle.
TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

def read_csv_strict(file_name='/kaggle/input/g-research-crypto-forecasting/train.csv'):
    """Load a training CSV and keep only rows dated before 2021-06-10.

    A ``datetime`` column is derived from the ``timestamp`` column, which is
    interpreted as seconds since the Unix epoch.

    Args:
        file_name: path of the CSV file; it must contain a ``timestamp`` column.

    Returns:
        The loaded frame with the extra ``datetime`` column, restricted to
        rows strictly earlier than ``2021-06-10 00:00:00``.
    """
    frame = pd.read_csv(file_name)
    frame['datetime'] = pd.to_datetime(frame['timestamp'], unit='s')
    # embargo by 3 days (per the original author): everything from the cutoff on is dropped
    before_cutoff = frame['datetime'] < '2021-06-10 00:00:00'
    return frame[before_cutoff]

In [None]:
# Load the training data from the default path; read_csv_strict adds a
# `datetime` column and drops rows dated 2021-06-10 or later.
df_train = read_csv_strict()

In [None]:
# Asset metadata, ordered by Asset_ID so positional lookups line up with the IDs.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV)
df_asset_details = df_asset_details.sort_values("Asset_ID")
# Bare expression: shows the table as the notebook cell output.
df_asset_details

## Training

In [None]:
### fill gaps in timestamps

# Timestamps that actually occur in the raw data; used at the end to drop
# grid points that no asset has at all.
ts = df_train.timestamp.unique().tolist()

# For each asset build a complete 60-second grid from its first to its last
# timestamp. Only Asset_ID is padded forward here; the price columns come back
# through the left merge below and stay NaN on inserted rows.
# (clearly target is not calculated with method='pad' on price variables)
dfs = [
    g.reindex(range(g.index[0], g.index[-1]+60, 60), method='pad').reset_index()
    for name, g in df_train[['timestamp', 'Asset_ID']].set_index('timestamp').groupby('Asset_ID')
]

# Stable sort keeps the per-asset order inside each timestamp.
minute_grid = pd.concat(dfs).sort_values('timestamp', kind='mergesort').reset_index(drop=True)
df_train = minute_grid.merge(df_train, how='left', on =['Asset_ID', 'timestamp'])
del minute_grid
df_train = df_train.loc[df_train.timestamp.isin(ts)].reset_index(drop=True)

del dfs, ts
gc.collect()

In [None]:
### start lag target calculations here

def log_return(series, periods=1):
    """Return the log-return of `series` over `periods` steps.

    Each element is ``log(series[t]) - log(series[t - periods])``; the first
    `periods` elements have no predecessor and are NaN.
    """
    log_values = np.log(series)
    return log_values.diff(periods=periods)

# Per-asset 15- and 16-step log returns of Close.
# `transform` is used instead of `apply`: it always returns a result aligned
# with df_train's own index, whereas `groupby(...).apply` prepends the group
# key to the index on pandas >= 2.0 and the column assignment then fails.
close_by_asset = df_train.groupby('Asset_ID')['Close']
df_train['lr15'] = close_by_asset.transform(lambda x: log_return(x, 15))
df_train['lr16'] = close_by_asset.transform(lambda x: log_return(x, 16))
del close_by_asset

# 'm' is the weight-averaged lr15 across assets at each timestamp. The
# groupby sum skips NaN, while the denominator is always the total weight of
# all assets in df_asset_details.
df_train = df_train.merge(df_asset_details[['Asset_ID','Weight']], how='left', on = 'Asset_ID')
df_train['m'] = df_train['lr15']*df_train['Weight']
df_train['m'] = df_train.groupby('timestamp')['m'].transform('sum') / np.sum(df_asset_details['Weight'])

# Weight was only needed to build 'm'.
df_train = df_train.drop('Weight', axis=1)
gc.collect()

In [None]:
def log_return(series, periods=1):
    """Log-return of `series` over a lag of `periods` rows.

    Takes the natural log of the input and differences it with the value
    `periods` rows earlier; leading rows without a predecessor are NaN.
    """
    logged = np.log(series)
    return logged.diff(periods=periods)

# # A utility function to build features around lags.
# def get_features_hist(df, row=False):
    
#     ### features to consider. See potential features...
#     ### note that we predicting returns 15 minutes ahead. This minutes price data is therefore not sufficient. We must roll the variables, 15, 30, 90, 250, 1250
       
#     df_feat = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Count', 'VWAP', 'lr15', 'm', 'lr16', 'Asset_ID']].copy()

#     if df_feat.shape[0]<3750:
#         df_feat['beta_num'] = np.nan
#         df_feat['m2'] = np.nan
#     else:
#         df_feat['beta_num'] = (df_feat['lr15']*df_feat['m']).rolling(3750).mean().values
#         df_feat['m2'] = (df_feat['m']*df_feat['m']).rolling(3750).mean().values
        
#     if row:
#         # first .iloc as far back as we need, compute feature, then downsize until .iloc[-1]
#         df_feat = df_feat.iloc[-1]
#         # Grab columns to divide
#         divcols = ['Open', 'High', 'Low', 'Close', 'VWAP']
#         # divide every element of divcols with every element of divcols
#         df_feat = pd.concat([df_feat, pd.Series((df_feat[divcols].to_numpy()[:,np.newaxis] / df_feat[divcols].to_numpy()).reshape(len(divcols) * len(divcols)),
#                                         index=map('/'.join, (product(divcols, divcols)))).drop(['Open/Open', 'High/High', 'Low/Low', 'Close/Close', 'VWAP/VWAP'])])
#     else:
#         # Grab columns to divide
#         divcols = ['Open', 'High', 'Low', 'Close', 'VWAP']
#         # divide every element of divcols with every element of divcols
#         df_feat = pd.concat([df_feat.reset_index(drop=True), pd.DataFrame((df_feat[divcols].to_numpy()[..., None] / df_feat[divcols].to_numpy()[:, None]
#                                                                            ).reshape((len(df_feat),len(divcols) * len(divcols))),
#                                                                             columns=map('/'.join, (product(divcols, divcols))))], axis=1)
#         df_feat = df_feat.drop(['Asset_ID','Open/Open', 'High/High', 'Low/Low', 'Close/Close', 'VWAP/VWAP'], axis=1)
        
#     df_feat['beta'] = np.nan_to_num(df_feat['beta_num'] / df_feat['m2'], nan=0., posinf=0., neginf=0.)
#     df_feat['target_lag'] = df_feat['lr15'] - df_feat['beta']*df_feat['m']
#     df_feat['volume2count'] = df_feat['Volume'] / (df_feat['Count'] + 1)
    
#     return df_feat.replace([np.inf, -np.inf], np.nan)



# A utility function to build features around lags.
def get_features_hist(df, row=False):
    """Build model features from one asset's price history.

    Args:
        df: frame holding at least the columns Open, High, Low, Close,
            Volume, Count, VWAP, lr15, m, lr16 and Asset_ID.
        row: when True only the last row is turned into features and a
            Series (still containing ``Asset_ID``) is returned; when False a
            frame with one feature row per input row is returned and
            ``Asset_ID`` is dropped.

    Returns:
        Features in a fixed column order, with +/-inf replaced by NaN.
    """
    base_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Count', 'VWAP', 'lr15', 'm', 'lr16', 'Asset_ID']
    feats = df[base_cols].copy()

    # Rolling numerator/denominator of the beta of lr15 against m.
    # A 3750-row window needs at least 3750 rows; otherwise both stay NaN.
    if feats.shape[0] >= 3750:
        feats['beta_num'] = (feats['lr15'] * feats['m']).rolling(3750).mean().values
        feats['m2'] = (feats['m'] * feats['m']).rolling(3750).mean().values
    else:
        feats['beta_num'] = np.nan
        feats['m2'] = np.nan

    if row:
        # Only the latest observation is needed; from here on `feats` is a Series.
        feats = feats.iloc[-1]
    # Reduce across Close/Open: along the Series itself, or across columns of the frame.
    reduce_axis = 0 if row else 1

    body_top = feats[['Close', 'Open']].max(axis=reduce_axis)
    feats['upper_shadow'] = feats['High'] / body_top
    body_bottom = feats[['Close', 'Open']].min(axis=reduce_axis)
    feats['lower_shadow'] = body_bottom / feats['Low']

    if not row:
        feats = feats.drop('Asset_ID', axis=1)

    # beta = E[lr15*m] / E[m*m]; undefined or infinite ratios fall back to 0.
    raw_beta = feats['beta_num'] / feats['m2']
    feats['beta'] = np.nan_to_num(raw_beta, nan=0., posinf=0., neginf=0.)
    # lr15 with its market component (beta * m) removed.
    feats['target_lag'] = feats['lr15'] - feats['beta'] * feats['m']

    # Simple price/volume ratios.
    feats['open2close'] = feats['Close'] / feats['Open']
    feats['high2low'] = feats['High'] / feats['Low']
    feats['volume2count'] = feats['Volume'] / (feats['Count'] + 1)
    feats['close2vwap'] = feats['Close'] / feats['VWAP']

    return feats.replace([np.inf, -np.inf], np.nan)

# this function prevents leakage of data into other splits, not within split...
# Adapted from numerai tournament
def get_time_series_cross_val_splits(data, cv = 5, embargo = 3750):
    """Create embargoed, contiguous time-series cross-validation splits.

    The unique timestamps of `data` (in order of appearance) are cut into
    `cv` equally sized test folds; leftover timestamps at the end are added
    to the last fold. For each test fold the training set is every timestamp
    outside the fold's [min, max] range that is also more than `embargo`
    minutes away from both ends of that range.

    Args:
        data: frame with a ``timestamp`` column expressed in seconds.
        cv: number of folds.
        embargo: gap, in minutes, kept between the train and test timestamps.

    Returns:
        A zip iterator of ``(train_timestamps, test_timestamps)`` pairs.
    """
    stamps = data['timestamp'].unique()
    fold_size = len(stamps) // cv
    test_splits = [stamps[k * fold_size:(k + 1) * fold_size] for k in range(cv)]

    # Timestamps left over when the count is not divisible by cv go to the last fold.
    leftover = len(stamps) - fold_size * cv
    if leftover > 0:
        test_splits[-1] = np.append(test_splits[-1], stamps[-leftover:])

    # Timestamps are in seconds while the embargo is given in minutes.
    embargo_sec = 60 * embargo

    train_splits = []
    for held_out in test_splits:
        fold_end = int(np.max(held_out))
        fold_start = int(np.min(held_out))
        kept = []
        for stamp in stamps:
            moment = int(stamp)
            if fold_start <= moment <= fold_end:
                # inside the test range
                continue
            if abs(moment - fold_end) > embargo_sec and abs(moment - fold_start) > embargo_sec:
                kept.append(stamp)
        train_splits.append(kept)

    # convenient way to iterate over train and test splits
    return zip(train_splits, test_splits)

# Folder (relative to the working directory) where trained models are pickled.
MODEL_FOLDER = "models"

def save_model(model, name):
    """Pickle `model` to ``MODEL_FOLDER/<name>.pkl``.

    The folder is created first if it does not exist. A failure to create it
    is no longer swallowed: previously the error was ignored and surfaced
    later as a less informative failure inside ``pd.to_pickle``.

    Args:
        model: any picklable object.
        name: file stem of the pickle (without the ``.pkl`` extension).

    Raises:
        OSError: if the folder cannot be created or the file cannot be written.
    """
    Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
    pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl")


def load_model(name):
    """Load a pickled model by name, or return False when none is found.

    The local ``MODEL_FOLDER`` is checked first, then the same folder inside
    the ``lgbm-embargocv-weightedpearson-lagtarget`` Kaggle input dataset.

    Args:
        name: file stem of the pickle (without the ``.pkl`` extension).

    Returns:
        The unpickled object, or ``False`` if neither file exists.
    """
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    local_file = f"{MODEL_FOLDER}/{name}.pkl"
    if Path(local_file).is_file():
        return pd.read_pickle(local_file)

    dataset_file = f"/kaggle/input/lgbm-embargocv-weightedpearson-lagtarget/{MODEL_FOLDER}/{name}.pkl"
    if Path(dataset_file).is_file():
        return pd.read_pickle(dataset_file)

    return False


def get_model_and_valid_preds_per_asset(df, asset_id):
    """Train (or load) one LightGBM model per CV split for a single asset.

    Features come from ``get_features_hist``; the label is ``df['Target']``.
    For every embargoed time-series split a model is loaded from disk if it
    exists, otherwise trained and saved. Out-of-sample predictions are
    written into a ``preds`` column; when a split's predictions correlate
    negatively with the target on its test part, they are sign-flipped and
    the model name is recorded.

    Args:
        df: rows of one asset, with ``Target`` and ``timestamp`` columns plus
            the columns required by ``get_features_hist``.
        asset_id: identifier used in the saved model names.

    Returns:
        ``(preds, neglist)``: the out-of-sample predictions as a Series and
        the names of ALL split models whose predictions were sign-flipped.
    """
    df_proc = get_features_hist(df.copy())
    df_proc['y'] = df['Target'].values
    df_proc['timestamp'] = df['timestamp'].values
    feature_cols = df_proc.drop(['timestamp', 'y'], axis=1).columns
    # Feature importances averaged over the splits.
    impfeats = np.zeros(len(feature_cols))

    model_params = {"n_estimators": 100,
                    "learning_rate": 0.01,
                    "max_depth": 4,
                    "num_leaves": (2 ** 4 - 1),
                    "colsample_bytree": 0.8,
                    "subsample": 0.8}

    ncv = 5
    train_test_zip = get_time_series_cross_val_splits(df_proc, cv = ncv)

    # Must be created once, before the loop: it was previously reset on every
    # iteration, so only a flip on the LAST split ever reached the caller and
    # earlier flipped models were not negated at inference time.
    neglist = []

    # get out of sample training preds via embargoed time series cross validation
    print("entering time series cross validation loop")
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        print(f"doing split {split+1} out of {ncv}")
        train_split, test_split = train_test_split
        train_split_index = df_proc['timestamp'].isin(train_split)
        test_split_index = df_proc['timestamp'].isin(test_split)

        # train a model on the training split (and save it for future use)
        split_model_name = f"model_asset{asset_id}_split{split+1}_cv{ncv}"
        split_model = load_model(split_model_name)
        if not split_model:
            print(f"training model: {split_model_name}")
            split_model = LGBMRegressor(**model_params)
            split_model.fit(df_proc.loc[train_split_index, feature_cols], df_proc.loc[train_split_index,'y'])
            save_model(split_model, split_model_name)

        impfeats += split_model.feature_importances_ / ncv
        # now we can predict on the test part of the split, using the feature
        # names the model was fitted with (a loaded model may predate feature_cols)
        expected_features = split_model.booster_.feature_name()
        print(f"predicting {split_model_name}")
        df_proc.loc[test_split_index, 'preds'] = split_model.predict(df_proc.loc[test_split_index, expected_features])

        # lets try adjusting for negative correlations... simple pearson for now
        # NOTE(review): the flip decision uses the test-split target itself.
        check_corrs = df_proc.loc[test_split_index, 'preds'].corr(df_proc.loc[test_split_index, 'y'])
        print(check_corrs)
        if check_corrs<0:
            df_proc.loc[test_split_index, 'preds'] = -df_proc.loc[test_split_index, 'preds']
            neglist.append(split_model_name)

        # sense check: number of NaN predictions in this test split
        print(np.count_nonzero(np.isnan(df_proc.loc[test_split_index, 'preds'])))

    # sense check: number of NaN predictions overall
    print(np.count_nonzero(np.isnan(df_proc['preds'])))
    # To Do:  add neutralization of most riskiest features. 10  should be fine...
    display(pd.DataFrame({'Value': impfeats, 'Feature': feature_cols}).sort_values(by="Value", ascending=False).head(5))

    return df_proc['preds'], neglist

In [None]:
cv = 5
# A single representative model file is used as the "models already exist" probe,
# either in the Kaggle input dataset (path) or in the local working folder (path2).
path = Path(f"/kaggle/input/lgbm-embargocv-weightedpearson-lagtarget/{MODEL_FOLDER}/model_asset13_split1_cv{cv}.pkl")
path2 = Path(f"{MODEL_FOLDER}/model_asset13_split1_cv{cv}.pkl")
if path2.is_file() or path.is_file():
    print("models already exist")
    # Compare the features the stored model was fitted with against the
    # features the current get_features_hist produces.
    expected_features = load_model(f"model_asset13_split1_cv{cv}").booster_.feature_name()
    acond = df_train["Asset_ID"] == 0
    feature_cols = get_features_hist(df_train.loc[acond].iloc[-3750:]).columns
    if set(expected_features) != set(feature_cols):
        print("New features are available! Might want to retrain the model.")
        # The local folder may not exist when the models were found only in
        # the Kaggle input dataset; ignore_errors avoids a FileNotFoundError.
        # NOTE(review): this only removes local models; nothing is retrained
        # in this run.
        shutil.rmtree(MODEL_FOLDER, ignore_errors=True)
else:
    weights=[]
    corrdf = pd.DataFrame()
    neglist = []
    # Train per-asset models and collect their out-of-sample predictions.
    for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
        print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
        acond = df_train["Asset_ID"] == asset_id
        val_preds, neglist_asset = get_model_and_valid_preds_per_asset(df_train.loc[acond], asset_id)
        neglist += neglist_asset
        val_df = pd.DataFrame(val_preds.values, columns=['preds'])
        val_df['Target'] = df_train.loc[acond, 'Target'].values
        val_df['timestamp'] = df_train.loc[acond, 'timestamp'].values
        val_df['Weight'] = df_asset_details.loc[df_asset_details["Asset_ID"] == asset_id, 'Weight'].values[0]
        corrdf = pd.concat([corrdf, val_df])

    # Persist the names of sign-flipped models, one per line, for inference.
    with open('neglist.txt', 'w') as filehandle:
        for listitem in neglist:
            filehandle.write('%s\n' % listitem)

    # Cross-asset means of the predictions at each timestamp (weighted and plain).
    corrdf['wpreds'] = corrdf['Weight']*corrdf['preds']
    corrdf['wpreds_mean'] = corrdf.groupby('timestamp')['wpreds'].transform('sum') / corrdf.groupby('timestamp')['Weight'].transform('sum')
    corrdf['preds_mean'] = corrdf.groupby('timestamp')['preds'].transform('mean')

    corrdf = corrdf.dropna()
    print(corrdf.shape)

    # adjusting predictions: blends of the raw prediction with the cross-asset means
    corrdf['preds2'] = 0.8*corrdf['preds'] + 0.2*corrdf['preds_mean']
    corrdf['preds3'] = 0.9*corrdf['preds'] + 0.1*corrdf['preds_mean']
    corrdf['preds4'] = 0.8*corrdf['preds'] + 0.2*corrdf['wpreds_mean']
    corrdf['preds5'] = 0.9*corrdf['preds'] + 0.1*corrdf['wpreds_mean']

    def weighted_pearson(a,b):
        """Pearson correlation of `a` and `b` weighted by corrdf's Weight column.

        `a` and `b` must have the same length as ``corrdf`` at call time
        (the weights are read from the enclosing ``corrdf``).
        """
        w = corrdf.Weight.values
        sumw = w.sum()
        ma = (w*a).sum() / sumw
        mb = (w*b).sum() / sumw

        # updated in line with competition guidelines
        covab = (a*b*w).sum() / sumw - ma*mb #(w*(a-ma)*(b-mb)).sum()/sumw
        covaa = (w*(a-ma)*(a-ma)).sum() / sumw
        covbb = (w*(b-mb)*(b-mb)).sum() / sumw

        return covab/np.sqrt(covaa*covbb)

    display(corrdf[['Target','preds', 'preds2', 'preds3', 'preds4', 'preds5']].corr(method=weighted_pearson))

In [None]:
# Check the model interface: predict for the asset of the last training row
# by averaging the predictions of its cv split models.
rowtrain = df_train.iloc[-1]
same_asset = df_train.Asset_ID==rowtrain['Asset_ID']
rowhist = df_train.loc[same_asset].iloc[-3750:]
x = get_features_hist(rowhist, row=True).drop('Asset_ID')
cv = 5
y_pred = 0
for split_number in range(1, cv + 1):
    split_model_name = f"model_asset{rowtrain['Asset_ID']}_split{split_number}_cv{cv}"
    split_model = load_model(split_model_name)
    y_pred += split_model.predict([x])[0] / cv
print(y_pred)

# Read the names of the sign-flipped models back into a list. The Kaggle
# input copy is used only when the models exist there and not locally;
# in every other case the local neglist.txt is read.
if path.is_file() and not path2.is_file():
    neglist_file = '/kaggle/input/lgbm-embargocv-weightedpearson-lagtarget/neglist.txt'
else:
    neglist_file = 'neglist.txt'

with open(neglist_file, 'r') as filehandle:
    # each line ends with a linebreak, which is the last character of the string
    neglist = [line[:-1] for line in filehandle]

# Free the training data before the submission loop.
del x, df_train, split_model, rowtrain, rowhist
gc.collect()

In [None]:
# Competition API environment and its test-batch iterator.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Rolling buffer of test rows; sized in rows as (rows per asset) * (number of assets).
history = pd.DataFrame()
lenassets = len(df_asset_details)
histmax = lenassets * 3750   # rows kept in the buffer
retmax = lenassets * 16      # rows needed before the 15/16-step returns exist

# Total weight over all assets (denominator of the market return 'm').
sumweights = np.sum(df_asset_details['Weight'])

# Load every split model once, keyed by its name; a missing model is stored as False.
cv = 5
rangecv = range(cv)
split_models = {
    f"model_asset{r}_split{split+1}_cv{cv}": load_model(f"model_asset{r}_split{split+1}_cv{cv}")
    for r in range(14)
    for split in rangecv
}

def pred_func(df):
    """Predict the target for the latest row of one asset's history.

    Features are built from the last row of `df`; the five split models of
    that asset (looked up in ``split_models``) each predict once, a model
    listed in ``neglist`` contributes with its sign flipped, and the sum is
    divided by the global ``cv``.

    Args:
        df: history rows of a single asset, as required by ``get_features_hist``.

    Returns:
        The combined prediction as a scalar.
    """
    features = get_features_hist(df, row=True)
    asset_id = int(features.Asset_ID)
    features = features.drop('Asset_ID')

    total = None
    for split_number in range(1, 6):
        model_name = f"model_asset{asset_id}_split{split_number}_cv5"
        sign = -1 if model_name in neglist else 1
        term = sign*split_models[model_name].predict([features])[0]
        # accumulate left to right, starting from the first term itself
        total = term if total is None else total + term

    return total / cv

## Predict & submit


In [None]:
# Main submission loop: for every test batch, extend the rolling history,
# compute the lagged-return features for the batch, predict per asset and
# hand the predictions back to the competition API.
for (df_test, df_pred) in iter_test:
#     t0 = time.time()

    # filling gaps with missing assets: the batch is reindexed to every Asset_ID in
    # df_asset_details, so each batch appends exactly one row per asset to the
    # history (all-NaN for assets absent from the batch).
    history = pd.concat([history, df_test[['Asset_ID', 'Close', 'row_id', 'Open', 'High', 'Low', 'Volume', 'Count', 
                                           'VWAP']].set_index('Asset_ID').reindex(df_asset_details['Asset_ID'].values).reset_index()], ignore_index=True)
    
    # Boolean masks over the history: rows belonging to this batch, and all rows
    # of the assets present in this batch.
    test_rows = history.row_id.isin(df_test.row_id)
    test_assets = history.Asset_ID.isin(df_test.Asset_ID)
    # Weights of the assets present in this batch, in df_asset_details order.
    weights = df_asset_details.loc[df_asset_details.Asset_ID.isin(df_test.Asset_ID), 'Weight'].values
       
    # Length BEFORE truncation; it is the value compared against retmax below.
    lenhist = history.shape[0]
    
    # ensuring sufficient history for each asset (keep only the newest histmax rows)
    # NOTE(review): the masks above were built before this truncation and are
    # applied afterwards via .loc, which presumably relies on index alignment — confirm.
    if lenhist>histmax:
        history = history.iloc[-histmax:]

#     # because 'm' requires the whole group of assets, it must be computed prior to the loop below... 
    history_test = history.loc[test_rows, ['Asset_ID', 'Close']].copy()
    
    # Not enough history yet for a 15/16-step return: leave them undefined.
    if lenhist<(retmax+1):
        history_test['lr15'] = np.nan
        history_test['lr16'] = np.nan
    else:
        # .take([-16,-17]).loc[history_test.Asset_ID]
        # Per asset, pick the Close at positions -16 and -17 of its history; the
        # current row is at -1, so these are 15 and 16 rows back.
        takes = history.loc[test_assets].groupby('Asset_ID')['Close'].take([-16,-17]).groupby(level=0)
        history_test['lr15'] = np.log(history_test['Close'].values / takes.take([0]).values)
        history_test['lr16'] = np.log(history_test['Close'].values / takes.take([1]).values)
        
    # Weighted lr15 of the batch divided by sumweights (defined outside this loop).
    # NOTE(review): this is a plain numpy sum, so one NaN lr15 makes 'm' NaN for
    # the whole batch — verify this is intended.
    history_test['m'] = (history_test['lr15'].values*weights).sum() / sumweights
    # Write the computed columns back into the batch rows of the history.
    history.loc[test_rows, ['lr15', 'm', 'lr16']] = history_test[['lr15', 'm', 'lr16']].values
    
    # One prediction per asset from its own history, reordered to match df_test's rows.
    df_pred['Target'] = history.loc[test_assets].groupby('Asset_ID').apply(pred_func).reindex(df_test['Asset_ID'].values).values
    
    # try average by weight instead, but careful as some assets are not available
#     df_pred.loc['Target'] = 0.8*df_pred.loc['Target'] + 0.2*df_pred.loc['Target'].mean()

    # Send submissions
    env.predict(df_pred)
#     print(time.time()-t0)