In [None]:
## The main model from: https://www.kaggle.com/code/sugghi/training-3rd-place-solution
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gresearch_crypto
import time
import datetime

import pickle
import gc

from tqdm import tqdm

# Number of folds used by the embargoed time-series cross-validation.
n_fold = 7
# Single seed reused by every randomness-related entry of `params`.
seed0 = 8586
# Flag inherited from the source notebook; no visible cell reads it.
use_supple_for_train = False

# If True, the period used to evaluate Public LB will not be used for training.
# Set to False on final submission.
not_use_overlap_to_train = True
# Directory and file name of the input price data (concatenated when the CSV is read).
csv_path = '/kaggle/input/stock-pred2/'
csv_file = 'stock.csv'

In [None]:
# Look-back windows, counted in rows of the merged frame, used by get_features
# for the rolling-mean and lagged-return features.
lags = [10,30,60,300,900]

In [None]:
# LightGBM parameters passed unchanged to lgb.train for every asset and fold.
# Every seed-type entry reuses seed0 so that runs are repeatable.
params = {
    # Early stopping patience, in boosting rounds.
    'early_stopping_rounds': 100,
    'objective': 'regression',
    'metric': 'rmse',
#    'metric': 'corr',
#     'metric': 'None',
    'boosting_type': 'gbdt',
    'max_depth': 5,
    'verbose': -1,
    'max_bin':600,
    'min_data_in_leaf':50,
    'learning_rate': 0.03,
    # Row subsampling: 70% of the rows, redrawn every iteration.
    'subsample': 0.7,
    'subsample_freq': 1,
    'feature_fraction': 1,
    'lambda_l1': 0.5,
    'lambda_l2': 2,
    'seed':seed0,
    'feature_fraction_seed': seed0,
    'bagging_fraction_seed': seed0,
    'drop_seed': seed0,
    'data_random_seed': seed0,
    'extra_trees': True,
    'extra_seed': seed0,
    'zero_as_missing': True,
    # With several metrics reported, only the first one drives early stopping.
    "first_metric_only": True
         }

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * signed-integer columns -> the narrowest of int8/int16/int32/int64 whose
      range contains the column (bounds are inclusive);
    * floating-point columns -> float16 or float32 when the value range fits
      strictly inside the type's range, otherwise float64;
    * ``object`` columns     -> ``category``;
    * every other dtype (bool, unsigned integer, datetime, timedelta,
      categorical, pandas extension dtypes, ...) is left untouched.  These
      used to be pushed through the float branch, which turned booleans into
      float16 and raised ``TypeError`` on datetime/categorical columns.

    Note that float16 keeps only about three significant decimal digits, so
    the float downcast trades precision for memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns have a meaningful
        # numeric range to test against; anything else is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            # No break (e.g. empty column, min/max are NaN): leave as is.
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max are NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Read only the three columns used downstream. `usecols` selects columns but
# keeps them in the order in which they appear in the file.
df = pd.read_csv(csv_path+csv_file, usecols=['date','stock_id', 'close'])

In [None]:
import re

# Map the raw CSV headers onto the names used by the rest of the notebook.
# Renaming by label instead of by position keeps the mapping correct whatever
# order the columns have in the file; the selection fixes the final order.
df = df.rename(columns={"stock_id": "Asset_ID", "date": "timestamp", "close": "Close"})
df = df[["Asset_ID", "timestamp", "Close"]]

# Keep the first run of digits of every raw id and store it as an integer.
# One vectorised call replaces the row-by-row chained assignment
# (`df[col][i] = ...`), which is slow and is silently ignored under pandas
# copy-on-write. The astype(str) makes the cell safe to re-run.
df["Asset_ID"] = df["Asset_ID"].astype(str).str.extract(r"(\d+)", expand=False).astype(int)

# Whole-column assignment so the column really takes the datetime64 dtype;
# `df.loc[:, col] = ...` may keep the old object dtype on recent pandas.
df["timestamp"] = pd.to_datetime(df["timestamp"])

# **Create Target**

* **Select 14 Stocks**

In [None]:
# Order rows by stock id; the stable sort keeps the relative order of the
# rows that belong to one stock.
df = df.sort_values('Asset_ID', kind='stable')
print(df[:100])

# Real ids of the 14 smallest-numbered stocks, ascending. Position k in this
# list is the stock that gets the compact id k below.
id_list = [int(raw_id) for raw_id in sorted(df['Asset_ID'].unique())[:14]]
print(id_list) ## Stocks' real id names

# Keep only those stocks and relabel them 0..13 with a single mapping.
# The previous loop rewrote the column while iterating over it and skipped a
# stock whose real id is 0, which then collided with the first relabelled one.
df = df.loc[df['Asset_ID'].isin(id_list)].copy()
df['Asset_ID'] = df['Asset_ID'].map({raw_id: new_id for new_id, raw_id in enumerate(id_list)})

In [None]:
# Chronological order. The sort is stable, so rows sharing a timestamp keep
# their previous relative order.
df = df.sort_values('timestamp', kind='stable')

# Target = simple return to the next row of the same stock:
#     Close[t+1] / Close[t] - 1
# The last row of every stock has no successor and therefore gets NaN.
# groupby().shift(-1) does this for all stocks at once instead of rebuilding
# the same boolean mask three times per stock.
df['Target'] = df.groupby('Asset_ID')['Close'].shift(-1) / df['Close'] - 1

In [None]:
# Display the frame, which now carries the Target column.
df

# Preprocessing

In [None]:
# Alias for the long-format frame (same object, no copy is made).
df_train = df
# Preview the first rows. The earlier `df.head` had no call parentheses, so it
# only referenced the method and displayed nothing.
df_train.head()

In [None]:
# First 14 rows of the long-format frame.
df_train[:14]

In [None]:
%%time
# Build the wide frame: one row per timestamp and one Close_<id>/Target_<id>
# column pair per asset.
# The empty frame is seeded with df_train's column names so that 'Close' and
# 'Target' already exist on the left side of every merge; because the names
# overlap, pandas applies the right-hand suffix "_<id>" to the incoming columns.
train_merged = pd.DataFrame()
train_merged[df_train.columns] = 0
for id in tqdm( range(14) ):
    train_merged = train_merged.merge(df_train.loc[df_train["Asset_ID"] == id, ['timestamp', 'Close','Target']].copy(), on="timestamp", how='outer',suffixes=['', "_"+str(id)])
        
# Drop the placeholder columns (everything from df_train except the merge key).
train_merged = train_merged.drop(df_train.columns.drop("timestamp"), axis=1)
train_merged = train_merged.sort_values('timestamp', ascending=True)
display(train_merged.head())

In [None]:
# forward fill
# Set an upper limit on the number of fills, since there may be long term gaps.
for id in range(14):
    # Carry the last known close forward over at most 400 consecutive missing
    # rows. Series.ffill replaces fillna(method='ffill'), which recent pandas
    # releases deprecate.
    train_merged[f'Close_{id}'] = train_merged[f'Close_{id}'].ffill(limit=400)
    # Missing targets become 0.0. With a value fill, `limit` caps the total
    # number of entries filled in the column, not the length of a gap.
    train_merged[f'Target_{id}'] = train_merged[f'Target_{id}'].fillna(0.0, limit=400)
# Keep the history from 2010 onwards.
train_merged = train_merged[train_merged["timestamp"] >= "2010-01-01"]
train_merged

# Feature Engineering

**Six Types of Features:**
* **log_close/mean_n_idy:** divide today's close price of the stock whose Asset_ID is y by the average of that stock's close price over the past n days, then take the logarithm.
* **log_return_n_idy:** divide today's close price of the stock whose Asset_ID is y by its close price n days earlier, then take the logarithm.
* **mean_close/mean_n:** today's average of log_close/mean_n_idy over all stocks.
* **mean_log_returns_n:** today's average of log_return_n_idy over all stocks.
* **log_close/mean_n-mean_close/mean_n_idy:** subtract mean_close/mean_n from log_close/mean_n_idy.
* **log_return_n-mean_log_returns_n_idy:** subtract mean_log_returns_n from log_return_n_idy.


In [None]:
def get_features(df, train=True, n_assets=14, lag_list=None,
                 valid_window="2021-03-12", oldest_use_window="2017-01-12"):
    """Build the lag features for every asset and return them in a new frame.

    For each asset ``id`` and each window ``lag`` the result contains

    * ``log_close/mean_{lag}_id{id}``: log(close / mean of the last ``lag`` closes),
    * ``log_return_{lag}_id{id}``: log(close / close ``lag`` rows earlier),

    plus, per window, the cross-asset averages ``mean_close/mean_{lag}`` and
    ``mean_log_returns_{lag}`` and each asset's deviation from them.

    Rows that do not yet have ``lag`` rows of history get NaN. The previous
    ``np.roll`` formulation wrapped around instead, so the first ``lag`` rows
    of the return features were computed against prices from the *end* of
    the series, and the rolling-mean features divided by a padding value of 1.

    The input frame is not modified; all new columns are attached with a
    single concat instead of one column insertion at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with a ``timestamp`` column and ``Close_{id}`` columns.
    train : bool, default True
        When true, a ``train_flg`` column is added (1 before ``valid_window``,
        0 from it onwards), the ``Close_{id}`` columns are dropped and rows
        older than ``oldest_use_window`` are removed.
    n_assets : int, default 14
        Number of assets, i.e. ``Close_0`` .. ``Close_{n_assets - 1}``.
    lag_list : sequence of int, optional
        Windows to use; defaults to the notebook-level ``lags``.
    valid_window, oldest_use_window : str
        Date boundaries used in training mode.

    Returns
    -------
    pandas.DataFrame
    """
    if lag_list is None:
        lag_list = lags

    # Insertion order of this dict is the column order of the result.
    new_cols = {}
    if train:
        new_cols['train_flg'] = pd.Series(
            np.where(df['timestamp'] >= valid_window, 0, 1), index=df.index)

    for asset in range(n_assets):
        close = df[f'Close_{asset}'].astype(float)
        for lag in lag_list:
            new_cols[f'log_close/mean_{lag}_id{asset}'] = np.log(close / close.rolling(lag).mean())
            new_cols[f'log_return_{lag}_id{asset}'] = np.log(close / close.shift(lag))

    for lag in lag_list:
        # Cross-asset averages; NaNs of individual assets are skipped.
        mean_close = pd.concat(
            [new_cols[f'log_close/mean_{lag}_id{asset}'] for asset in range(n_assets)],
            axis=1).mean(axis=1)
        mean_return = pd.concat(
            [new_cols[f'log_return_{lag}_id{asset}'] for asset in range(n_assets)],
            axis=1).mean(axis=1)
        new_cols[f'mean_close/mean_{lag}'] = mean_close
        new_cols[f'mean_log_returns_{lag}'] = mean_return
        for asset in range(n_assets):
            new_cols[f'log_close/mean_{lag}-mean_close/mean_{lag}_id{asset}'] = (
                new_cols[f'log_close/mean_{lag}_id{asset}'] - mean_close)
            new_cols[f'log_return_{lag}-mean_log_returns_{lag}_id{asset}'] = (
                new_cols[f'log_return_{lag}_id{asset}'] - mean_return)

    # Replace (rather than duplicate) any column that already exists in df.
    base = df.drop(columns=[name for name in new_cols if name in df.columns])
    out = pd.concat([base, pd.DataFrame(new_cols, index=df.index)], axis=1)

    if train:
        out = out.drop(columns=[f'Close_{asset}' for asset in range(n_assets)])
        out = out[out['timestamp'] >= oldest_use_window]

    return out

In [None]:
%%time
# Build the training feature frame (train=True by default) and display it.
feat = get_features(train_merged)
feat

In [None]:
# define features for LGBM
# Columns that must not be fed to the model: the time key, the
# train/validation flag and every per-asset target.
not_use_features_train = ['timestamp', 'train_flg'] + [f'Target_{id}' for id in range(14)]

# Everything else in `feat` is a model input.
features = list(feat.columns.drop(not_use_features_train))
# display(features)  
len(features)

In [None]:
# Drop the notebook-level names of the intermediate frames and ask the
# garbage collector to run.
del train_merged
del df_train
gc.collect()

# Training

In [None]:
# define the evaluation metric
def correlation(a, train_data):
    """Custom LightGBM evaluation function: Pearson correlation.

    ``a`` holds the predictions and ``train_data.get_label()`` the true
    values; both are flattened before use. Returns the triple
    ``('corr', value, True)``, i.e. metric name, score and a flag saying
    that higher is better.
    """
    preds = np.ravel(a)
    labels = np.ravel(train_data.get_label())
    n_obs = len(preds)

    pred_mean = np.sum(preds) / n_obs
    label_mean = np.sum(labels) / n_obs
    pred_var = np.sum(np.square(preds - pred_mean)) / n_obs
    label_var = np.sum(np.square(labels - label_mean)) / n_obs

    # E[ab] - E[a]E[b]
    covariance = np.sum((preds * labels)) / n_obs - pred_mean * label_mean

    return 'corr', covariance / np.sqrt(pred_var * label_var), True

# For CV score calculation
def corr_score(pred, valid):
    """Return the Pearson correlation between ``pred`` and ``valid``.

    Uses population (divide-by-n) moments; both inputs must have the same
    length and support element-wise arithmetic.
    """
    n_obs = len(pred)

    pred_mean = np.sum(pred) / n_obs
    valid_mean = np.sum(valid) / n_obs

    pred_var = np.sum(np.square(pred - pred_mean)) / n_obs
    valid_var = np.sum(np.square(valid - valid_mean)) / n_obs

    # E[xy] - E[x]E[y]
    covariance = np.sum((pred * valid)) / n_obs - pred_mean * valid_mean

    return covariance / np.sqrt(pred_var * valid_var)

# For CV score calculation
def wcorr_score(pred, valid, weight):
    """Return the weighted Pearson correlation between ``pred`` and ``valid``.

    Every moment is a weighted average with ``weight`` as the per-element
    weights, normalised by the total weight.
    """
    total_weight = np.sum(weight)

    pred_mean = np.sum(pred * weight) / total_weight
    valid_mean = np.sum(valid * weight) / total_weight

    pred_var = np.sum(weight * np.square(pred - pred_mean)) / total_weight
    valid_var = np.sum(weight * np.square(valid - valid_mean)) / total_weight

    # Weighted E[xy] - E[x]E[y]
    covariance = np.sum((pred * valid * weight)) / total_weight - pred_mean * valid_mean

    return covariance / np.sqrt(pred_var * valid_var)

* **Define a new objective function for position management**
* *The same effect can also be achieved by passing per-sample weights to `lgb.Dataset()`.*

In [None]:
def weight_rmse(preds, train_data, alpha1 = 1., alpha2 = 1.5):
    """Custom objective: squared error whose weight depends on the label sign.

    Samples with a positive label are weighted by ``alpha1`` and all other
    samples by ``alpha2``. Returns the per-sample gradient and hessian of
    ``weight * (pred - label)**2`` with respect to the prediction.
    """
    labels = train_data.get_label()
    residual = preds - labels
    sample_weight = np.where(labels > 0, alpha1, alpha2)
    grad = 2 * sample_weight * residual
    hess = 2 * sample_weight
    return grad, hess
            
        
        

* *from: https://blog.amedama.jp/entry/lightgbm-cv-feature-importance 
(used in nyanp's Optiver solution)*

In [None]:
# from: https://blog.amedama.jp/entry/lightgbm-cv-feature-importance
# (used in nyanp's Optiver solution)
def plot_importance(importances, features_names = features, PLOT_TOP_N = 20, figsize=(10, 10)):
    """Box-plot the per-fold importances of the most important features.

    Parameters
    ----------
    importances : array-like, shape (n_folds, n_features)
        One row of feature importances per trained model.
    features_names : list of str
        Column labels for ``importances``. This argument is now actually
        used; previously the global ``features`` was applied whatever the
        caller passed.
    PLOT_TOP_N : int
        Number of features shown, ranked by median importance across folds.
    figsize : tuple
        Matplotlib figure size.
    """
    importance_df = pd.DataFrame(data=importances, columns=features_names)
    # Rank features by their median importance over the folds.
    sorted_indices = importance_df.median(axis=0).sort_values(ascending=False).index
    sorted_importance_df = importance_df.loc[:, sorted_indices]
    plot_cols = sorted_importance_df.columns[:PLOT_TOP_N]
    _, ax = plt.subplots(figsize=figsize)
    ax.grid()
    ax.set_xscale('log')
    ax.set_ylabel('Feature')
    ax.set_xlabel('Importance')
    sns.boxplot(data=sorted_importance_df[plot_cols],
                orient='h',
                ax=ax)
    plt.show()

In [None]:
# from: https://www.kaggle.com/code/nrcjea001/lgbm-embargocv-weightedpearson-lagtarget/
def get_time_series_cross_val_splits(data, cv = None, embargo = 3):
    """Split the timestamps of ``data`` into contiguous folds with an embargo.

    The distinct timestamps are sorted and cut into ``cv`` consecutive test
    blocks of equal size; the last block also receives the remainder. For
    each test block the training timestamps are all timestamps lying more
    than ``embargo`` days before the block's first or after its last
    timestamp, so no training row sits inside or next to the test period.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``timestamp`` column.
    cv : int, optional
        Number of folds; defaults to the notebook-level ``n_fold``.
    embargo : int or float, default 3
        Gap, in days, kept free on both sides of every test block.

    Returns
    -------
    zip
        One-shot iterator of ``(train_timestamps, test_timestamps)`` pairs.

    Raises
    ------
    ValueError
        If there are fewer distinct timestamps than folds.
    """
    if cv is None:
        cv = n_fold

    # Sort explicitly: contiguous folds must not depend on the row order.
    all_train_timestamps = np.sort(data['timestamp'].unique())
    len_split = len(all_train_timestamps) // cv
    if len_split == 0:
        raise ValueError(
            f"cannot build {cv} folds from {len(all_train_timestamps)} distinct timestamps")

    test_splits = [all_train_timestamps[i * len_split : (i + 1) * len_split] for i in range(cv)]
    # The last test split takes every remaining timestamp, in case the number
    # of timestamps is not divisible by cv.
    test_splits[-1] = all_train_timestamps[(cv - 1) * len_split:]

    embargo_width = pd.to_timedelta(embargo, "D").to_timedelta64()
    train_splits = []
    for test_split in test_splits:
        earliest_excluded = test_split.min() - embargo_width
        latest_excluded = test_split.max() + embargo_width
        # Strict comparisons: a timestamp exactly `embargo` days away is dropped.
        usable = (all_train_timestamps < earliest_excluded) | (all_train_timestamps > latest_excluded)
        train_splits.append(all_train_timestamps[usable])

    return zip(train_splits, test_splits)

In [None]:
def get_Xy_and_model_for_asset(df_proc, asset_id):
    """Train one LightGBM model per CV fold for ``asset_id``.

    Rows without a target are dropped, and, when ``not_use_overlap_to_train``
    is set, so are the rows flagged ``train_flg == 0``. Every fold's model is
    pickled to ``trained_model_id{asset_id}_fold{split}.pkl`` in the working
    directory and the gain importances of all folds are plotted.

    Parameters
    ----------
    df_proc : pandas.DataFrame
        Feature frame containing ``timestamp``, ``train_flg``, the columns
        listed in ``features`` and ``Target_{asset_id}``.
    asset_id : int
        Asset whose target is modelled.

    Returns
    -------
    (list, list)
        Out-of-fold predictions and the matching true targets, concatenated
        in fold order.
    """
    target_col = f'Target_{asset_id}'
    df_proc = df_proc.loc[df_proc[target_col].notna()]
    if not_use_overlap_to_train:
        df_proc = df_proc.loc[df_proc['train_flg'] == 1]

    # EmbargoCV
    train_test_zip = get_time_series_cross_val_splits(df_proc, cv = n_fold, embargo = 3)
    print("entering time series cross validation loop")
    importances = []
    oof_pred = []
    oof_valid = []

    for split, (train_split, test_split) in enumerate(train_test_zip):
        gc.collect()

        print(f"doing split {split+1} out of {n_fold}")
        train_split_index = df_proc['timestamp'].isin(train_split)
        test_split_index = df_proc['timestamp'].isin(test_split)

        X_train = df_proc.loc[train_split_index, features]
        y_train = df_proc.loc[train_split_index, target_col].values
        X_valid = df_proc.loc[test_split_index, features]
        y_valid = df_proc.loc[test_split_index, target_col].values

        train_dataset = lgb.Dataset(X_train, label = y_train, feature_name = features)
        val_dataset = lgb.Dataset(X_valid, y_valid, feature_name = features)

        print(f"number of train data: {len(X_train)}")
        print(f"number of val data:   {len(X_valid)}")

        model = lgb.train(params = params,
                          train_set = train_dataset, 
                          valid_sets=[train_dataset, val_dataset],
                          valid_names=['tr', 'vl'],
                          num_boost_round = 5000,
                          verbose_eval = 100,     
                          feval = correlation,
                          #fobj = weight_rmse
                         )
        importances.append(model.feature_importance(importance_type='gain'))

        file = f'trained_model_id{asset_id}_fold{split}.pkl'
        # Context manager so the handle is flushed and closed right away.
        with open(file, 'wb') as model_file:
            pickle.dump(model, model_file)
        print(f"Trained model was saved to 'trained_model_id{asset_id}_fold{split}.pkl'")
        print("")

        oof_pred += list(model.predict(X_valid))
        oof_valid += list(y_valid)

    plot_importance(np.array(importances),features, PLOT_TOP_N = 20, figsize=(10, 5))

    return oof_pred, oof_valid

In [None]:
# Per-asset out-of-fold correlation, filled in by the loop below.
oof = [[] for _ in range(14)]

# Out-of-fold results of all assets pooled together.
all_oof_pred, all_oof_valid, all_oof_weight = [], [], []
# Equal weight for each of the 14 assets.
weight_0 = [1/14] * 14

for asset_id, asset_name in enumerate(id_list[:14]):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    oof_pred, oof_valid = get_Xy_and_model_for_asset(feat, asset_id)
    weight_temp = float(weight_0[asset_id])

    all_oof_pred.extend(oof_pred)
    all_oof_valid.extend(oof_valid)
    all_oof_weight.extend([weight_temp] * len(oof_pred))

    oof[asset_id] = corr_score(np.array(oof_pred), np.array(oof_valid))

    print(f'OOF corr score of {asset_name} (ID={asset_id}) is {oof[asset_id]:.5f}. (Weight: {float(weight_temp):.5f})')
    print('')
    print('')

In [None]:
# ls -lh
# Display the feature frame.
feat

In [None]:
# Weighted average of the per-asset OOF correlations (weights from weight_0).
woof = sum(oof[id] * weight_0[id] for id in range(14)) / sum(weight_0)

print(f'OOF corr scores are;')
for oof_score in oof:
    print(f'      {oof_score:.5f}')
print(f'  simple average corr score: {np.mean(oof):.5f}.')
print(f'weighted average corr score: {woof:.5f}.')
print(f'')

# Single weighted correlation over the pooled predictions of all assets.
all_oof_wcorr = wcorr_score(np.array(all_oof_pred), np.array(all_oof_valid), np.array(all_oof_weight))
print(f'        weighted corr score: {all_oof_wcorr:.5f}.')

In [None]:
# Rows flagged train_flg == 0, i.e. those on or after get_features' validation date.
feat.loc[feat["train_flg"]==0]

# Position management

In [None]:
# Hold-out rows (train_flg == 0). Copied so that the prediction columns can
# be added without writing into a slice of `feat`.
data = feat.loc[feat["train_flg"]==0].copy()
holdout_X = data[features]

# One column per asset, one row per fold: correlation of that fold's model
# with the realised targets on the hold-out period.
df_corr = pd.DataFrame()
for asset_id in range(14):
    # Collected across ALL folds. The list used to be re-created inside the
    # fold loop, so only the last fold's prediction survived.
    model_pred = []
    pred_corr_list = []
    for i in range(n_fold):
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # written by this notebook.
        with open(f'./trained_model_id{asset_id}_fold{i}.pkl', "rb") as res:
            model = pickle.load(res)
        pred = model.predict(holdout_X)
        model_pred.append(pred)
        pred_corr_list.append(corr_score(data[f'Target_{asset_id}'], pred))
    df_corr[f'pred_corr_{asset_id}'] = pred_corr_list
    # Ensemble = mean of the fold models, which keeps the prediction on the
    # same scale as a single model's output.
    pred_list = np.mean(model_pred, axis = 0)
    data[f'pred_{asset_id}'] = pred_list

In [None]:
# Column lists: the time key followed by one column per asset.
pred_feat = ['timestamp'] + [f'pred_{asset_id}' for asset_id in range(14)]
fact_feat = ['timestamp'] + [f'Target_{asset_id}' for asset_id in range(14)]

# Predicted and realised returns of the hold-out period, timestamp first.
pred = data[pred_feat]
fact = data[fact_feat]

In [None]:
## Define Value Function
def Value_pred(p, p_yesterday, time, alpha = 0.005):
    """Value of holding positions ``p`` at ``time`` under the predicted returns.

    Sum over assets of ``p * predicted_return - alpha * (p - p_yesterday)**2``,
    where the predicted returns are the row(s) of ``pred`` whose timestamp
    equals ``time``.
    """
    predicted_returns = pred.loc[pred['timestamp']==time].iloc[:,1:]
    change_penalty = alpha * np.square(p-p_yesterday)
    per_asset_value = np.array(p*predicted_returns - change_penalty)
    return np.sum(per_asset_value, axis = None)
def Value_fact(p, p_yesterday, time, alpha = 0.005):
    """Value of holding positions ``p`` at ``time`` under the realised returns.

    Sum over assets of ``p * realised_return - alpha * (p - p_yesterday)**2``,
    where the realised returns are the row(s) of ``fact`` whose timestamp
    equals ``time``.
    """
    realised_returns = fact.loc[fact['timestamp']==time].iloc[:,1:]
    change_penalty = alpha * np.square(p-p_yesterday)
    per_asset_value = np.array(p*realised_returns - change_penalty)
    return np.sum(per_asset_value, axis = None)
def diff_Value_pred(p, p_yesterday, time, alpha = 0.005):
    """Sum of ``predicted_return - 2 * alpha * (p - p_yesterday)`` at ``time``.

    The summand is the derivative of Value_pred's per-asset term with respect
    to ``p``.
    NOTE(review): np.sum is applied to a DataFrame here, so whether the result
    is one scalar or one value per column depends on pandas — confirm the
    intended reduction. No visible cell calls this function.
    """
    return np.sum(pred.loc[pred['timestamp']==time].iloc[:,1:] - alpha * 2*(p-p_yesterday))
## According to derivative of Value_pred(p)
def get_p(p_yesterday, time, alpha = 0.005):
    """Return the position vector for ``time`` given yesterday's positions.

    Takes the stationary point of Value_pred with respect to ``p``, sets
    negative entries to zero via ``(p + |p|) / 2`` and rescales the result so
    that the positions sum to (almost) one.
    """
    predicted_returns = np.array(pred.loc[pred['timestamp']==time].iloc[:,1:])
    raw_position = (predicted_returns + 2*alpha*p_yesterday)/(2*alpha)
    long_only = 1/2*(raw_position + abs(raw_position))
    # The small constant avoids a division by zero when every entry is zero.
    normalised = long_only/(long_only.sum()+1e-6)
    return normalised[0]

In [None]:
## Decide position strategy on value functions above
## Create a dataframe to display the relationship between our position strategy and income return
p_V_feat = ['timestamp', 'Value_p', 'Value_f'] + [f'position_{asset_id}' for asset_id in range(14)]

# Back-test window: [backtest_start, backtest_end).
backtest_start = pd.to_datetime('2021-03-12')
backtest_end = pd.to_datetime('2022-06-08')

# Walk over the timestamps that actually exist in `pred` inside the window.
# The previous `while` loop looked up "the next timestamp after today" and
# raised IndexError when the data ended before backtest_end or when the start
# date itself had no row; its loop variable also overwrote the imported
# `time` module.
in_window = (pred['timestamp'] >= backtest_start) & (pred['timestamp'] < backtest_end)
trading_days = pd.DatetimeIndex(pred.loc[in_window, 'timestamp']).unique().sort_values()

# Start from the equal-weight portfolio.
p_yesterday = np.array(weight_0)
backtest_rows = []
for day in trading_days:
    p = get_p(p_yesterday, day, alpha = 0.005)
    Value_p = Value_pred(p, p_yesterday, day, alpha = 0.005)
    Value_f = Value_fact(p, p_yesterday, day, alpha = 0.005)
    backtest_rows.append([day, Value_p, Value_f, *p])
    p_yesterday = p

# Build the frame once instead of appending one row per iteration. As before,
# it is indexed by the day and also keeps the day in a 'timestamp' column.
p_V = pd.DataFrame(backtest_rows, columns = p_V_feat,
                   index = pd.DatetimeIndex([row[0] for row in backtest_rows]))

In [None]:
# Display the per-day positions and values.
p_V

In [None]:
## Display achieved positive return days
# i.e. the rows whose realised value Value_f is greater than zero.
p_V.loc[p_V["Value_f"]>0]

# **Return Rate every 10 days (compound interest)**

In [None]:
# First timestamp of the back-test; reference point for rr_cal's date label.
time_0 = p_V.index[0]
def rr_cal(days):
    """Compound the realised values of the first ``days`` rows of ``p_V``.

    Returns ``(date, rr)``: ``rr`` is the product of ``1 + Value_f`` over the
    first ``days`` rows, skipping missing values, and ``date`` is ``time_0``
    plus ``days`` calendar days.
    NOTE(review): the rows are counted by position while the label is a
    calendar offset — confirm that this pairing is intended.
    """
    growth = 1
    for stamp in p_V.index[:days]:
        daily_value = p_V.loc[stamp,"Value_f"]
        if not pd.isnull(daily_value):
            growth = growth*(1+daily_value)
    return (pd.to_timedelta(days,"D")+time_0, growth)

In [None]:
# Compounded return after 9, 19, ..., 299 rows of the back-test.
# rr_cal is evaluated once per horizon (it used to be called twice), and the
# frame is built in one go. The earlier `df_rr.set_index = ["timestamp"]`
# replaced the DataFrame's set_index method with a list instead of setting an
# index, so it is gone; as before, the rows are labelled by rr_cal's date.
rr_rows = [rr_cal(days) for days in range(9,300,10)]
df_rr = pd.DataFrame(rr_rows, columns = ["timestamp","rr"],
                     index = [date for date, _ in rr_rows])

In [None]:
# Compounded return series, labelled by date.
df_rr["rr"]

In [None]:
# Correcting the cross-validation gave the biggest improvement.