In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import time
import gc

# from sklearn.linear_model import RidgeCV, Ridge
# from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor
# from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats.stats import pearsonr
from tqdm import tqdm

import gresearch_crypto

# Suppress all warnings for the rest of the notebook
import warnings
warnings.simplefilter("ignore")

# Show every column when a DataFrame is displayed
pd.set_option("display.max_columns", None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load Data

In [None]:
def reduce_mem_usage(df):
    """Downcast numeric columns in place to the smallest dtype that holds their range.

    Signed-integer columns are narrowed to int8/int16/int32/int64 and float
    columns to float16/float32 (falling back to float64), chosen from each
    column's minimum and maximum. Every other dtype (object, bool, unsigned
    int, datetime, category and other pandas extension dtypes) is left as is.

    Note: the float choice looks only at the value *range*. float16 keeps
    roughly three significant decimal digits, so downcast float columns can
    lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be used with ``DataFrame.pipe``.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the NumPy dtype kind instead of the dtype's string prefix:
        # the prefix test treated bool/uint/datetime columns as floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "i":
            # Inclusive bounds: a column whose max is exactly 127 still fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
df_asset_details = pd.read_csv(r"/kaggle/input/g-research-crypto-forecasting/asset_details.csv").sort_values("Asset_ID")
df_asset_details

In [None]:
# val = df_asset_details.loc[ df_asset_details["Asset_ID"] == 3, "Asset_Name"]
# val.values[0]

In [None]:
def read_csv_strict(file_name=r"/kaggle/input/g-research-crypto-forecasting/train.csv",
                    min_datetime="2020-05-13 00:00:00"):
    """Load a competition CSV, shrink its dtypes and keep only recent rows.

    The file is read with ``pd.read_csv`` and passed through
    ``reduce_mem_usage``. A ``datetime`` column is then derived from
    ``timestamp`` (interpreted as seconds since the epoch) and rows older
    than ``min_datetime`` are discarded.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to load. It must contain a ``timestamp`` column.
    min_datetime : str or datetime-like or None
        Inclusive lower bound on ``datetime``. Pass ``None`` to keep every
        row, or e.g. ``"2019-06-13 00:00:00"`` for a longer history.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with the extra ``datetime`` column; the original row
        labels are kept (the index is not reset).
    """
    df = pd.read_csv(file_name).pipe(reduce_mem_usage)
    df["datetime"] = pd.to_datetime(df["timestamp"], unit="s")
    if min_datetime is not None:
        df = df[df["datetime"] >= min_datetime]
    return df

In [None]:
# Main training history. This previously loaded supplemental_train.csv, the same file
# as df_supp_train, so the later concat contained every supplemental row twice.
df_train = read_csv_strict(file_name=r"/kaggle/input/g-research-crypto-forecasting/train.csv")
df_train.head()

In [None]:
df_supp_train = read_csv_strict(file_name=r"/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv")
df_supp_train.head()

In [None]:
df_test = read_csv_strict(file_name=r"/kaggle/input/g-research-crypto-forecasting/example_test.csv")
df_test.head()

In [None]:
print(df_train["timestamp"].min(), df_train["timestamp"].max())

In [None]:
print(df_supp_train["timestamp"].min(), df_supp_train["timestamp"].max())

In [None]:
# Each input keeps the row labels of its own CSV, so the same label occurs in several of
# them. ignore_index=True gives the combined frame a fresh unique index, which keeps later
# label-based operations (e.g. DataFrame.drop by index) from touching unrelated rows.
df_train = pd.concat([df_train, df_supp_train, df_test], ignore_index=True)
df_train.head()

In [None]:
df_train.shape, df_supp_train.shape, df_test.shape

## Outliers

In [None]:
## Negative volume implies lack of liquidity in the market
df_train[df_train.Volume < 0]

Negative volume indicates that the market lacked liquidity for these instances; in other words, the bid and ask prices failed to overlap.

## Check for NaNs

In [None]:
## https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/286778
## https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition?scriptVersionId=84976518&cellId=51

# df_train[(df_train.Target.isna()) & (df_train.Asset_ID==8)]
df_train[df_train.Target.isna()].groupby("Asset_ID")["timestamp"].count()

In [None]:
df_train[(df_train.Target.isna()) & (df_train.Asset_ID==8)].describe(percentiles=[0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

## Sort by Asset and Timestamp

In [None]:
# Group rows by asset, chronological within each asset (ascending is the default order).
df_train = df_train.sort_values(by=["Asset_ID", "timestamp"])

df_train.head(20)

## Dealing with missing data

In [None]:
df_train.info()

In [None]:
# Count the NaNs per column once and report them both as totals and as a share of all rows.
missing_per_column = df_train.isna().sum()
row_count = df_train.shape[0]

print("total missing: ")
print(missing_per_column)
print("----------------------------")
print("percent missing: ")
print(missing_per_column * 100. / row_count)

In [None]:
df_train.head()

## Ffill for each group for Target missing value

In [None]:
## https://stackoverflow.com/questions/16345583/fill-in-missing-pandas-data-with-previous-non-missing-value-grouped-by-key
# Forward-fill gaps within each asset's own time-ordered history, one column at a time.
for _gap_col in ("Target", "VWAP"):
    df_train[_gap_col] = (
        df_train
        .sort_values(["Asset_ID", "timestamp"])
        .groupby("Asset_ID")[_gap_col]
        .ffill()
    )
df_train.head(20)

In [None]:
df_train.isna().sum()

In [None]:
df_train[ (df_train["Target"].isna()) & (df_train["Asset_ID"] == 10)]

In [None]:
df_train["Target"] = df_train["Target"].fillna(0)
# df_train["VWAP"] = df_train["VWAP"].fillna(0)

In [None]:
df_train[ (df_train["Target"].isna()) & (df_train["Asset_ID"] == 10)]

In [None]:
df_train["Target"].isna().sum()

## Utility Functions

In [None]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the element-wise larger of ``Close`` and ``Open``."""
    body_top = np.maximum(df["Close"], df["Open"])
    return df["High"] - body_top

def lower_shadow(df):
    """Return the element-wise smaller of ``Close`` and ``Open`` minus ``Low``."""
    body_bottom = np.minimum(df["Close"], df["Open"])
    return body_bottom - df["Low"]

## notebook: crypto-prediction-technical-analysis-features
def SM_A_M(df, colname, n):
    """Return the rolling mean and rolling median of ``df[colname]`` over ``n`` rows.

    Both results are Series aligned with ``df``; the first ``n - 1`` entries
    are NaN because the window is not yet full.
    """
    window = df[colname].rolling(window=n)
    return window.mean(), window.median()

def EMA1(df, colname, n):
    """Exponentially weighted mean of ``df[colname]`` with smoothing factor 2 / (n + 1).

    Reference: https://qiita.com/MuAuan/items/b08616a841be25d29817
    """
    smoothing = 2/(n+1)
    series = df[colname]
    return series.ewm(alpha=smoothing).mean()

def MACD(df, colname, span1=12, span2=26, span3=9):
    """Compute a percentage MACD line and its signal line for ``df[colname]``.

    The MACD line is ``100 * (EMA(span1) - EMA(span2)) / EMA(span2)`` and the
    signal line is the exponentially weighted mean of the MACD line with
    smoothing factor ``2 / (span3 + 1)``.

    Reference:
    https://www.learnpythonwithrune.org/pandas-calculate-the-moving-average-convergence-divergence-macd-for-a-stock/

    Returns
    -------
    tuple of pandas.Series
        ``(macd, signal)``.
    """
    fast_ema = EMA1(df, colname, span1)
    slow_ema = EMA1(df, colname, span2)

    macd = 100 * (fast_ema - slow_ema) / slow_ema
    signal_alpha = 2./(span3+1)
    signal = macd.ewm(alpha=signal_alpha).mean()

    return macd, signal

def BollingerBand(df, colname, window, no_of_std):
    """Return the upper band, lower band and rolling std of ``df[colname]``.

    The bands sit ``no_of_std`` rolling standard deviations above and below
    the rolling mean, both computed over ``window`` rows.
    """
    roller = df[colname].rolling(window=window)
    center = roller.mean()
    std = roller.std()

    upper = center + no_of_std * std
    lower = center - no_of_std * std
    return upper, lower, std

def rsiFunc(df, colname, n=14):
    """Relative Strength Index of ``df[colname]`` with smoothing period ``n``.

    The average gain and loss are seeded from the first ``n + 1`` price
    differences; that seed RSI fills the first ``n`` outputs. Each later
    value updates the averages recursively with weight ``1 / n``.

    Returns
    -------
    numpy.ndarray
        Array with the same length and dtype as the price column.
    """
    prices = df[colname].values
    deltas = np.diff(prices)

    # Seed the running averages from the leading differences.
    seed = deltas[:n+1]
    up = seed[seed>=0].sum()/n
    down = -seed[seed<0].sum()/n

    rsi = np.zeros_like(prices)
    rsi[:n] = 100. - 100./(1. + up/down)

    for i in range(n, len(prices)):
        delta = deltas[i-1]  # deltas is one element shorter than prices
        gain, loss = (delta, 0.) if delta>0 else (0., -delta)

        up = (up*(n-1) + gain)/n
        down = (down*(n-1) + loss)/n

        rsi[i] = 100. - 100./(1. + up/down)

    return rsi

## https://stackoverflow.com/questions/42138357/pandas-rolling-slope-calculation
def calc_slope(x):
    """Return the slope of the least-squares line fitted to ``x`` against positions 0..len(x)-1."""
    positions = np.arange(len(x))
    coefficients = np.polyfit(positions, x, 1)
    return coefficients[0]

In [None]:
## set min_periods=2 to allow subsets less than 60.
## use [4::5] to select the results you need.
# result = data.rolling(60, min_periods=2).apply(calc_slope)[4::5]

## Feature Engineering

In [None]:
def _float32_filled(values, fill_value):
    """Cast a Series to float32 and replace its NaNs with ``fill_value`` (scalar or aligned Series)."""
    return values.astype(np.float32).fillna(fill_value)


def get_feat(df):
    """Build the model features and return them as a new, memory-reduced frame.

    Added columns:
      * ``High-Low`` and ``Close-Open`` price spreads, ``dayofweek`` and
        ``weekofyear`` taken from ``datetime``;
      * percentage MACD and signal line of ``Close``;
      * for each window n in (5, 10, 15): ``Target`` shifted by n rows, and
        the rolling mean, rolling median and exponentially weighted mean of
        ``Close`` and ``Volume``;
      * for ``Close`` with n in (10, 15): RSI, rolling std and Bollinger bands.

    ``datetime``, ``VWAP``, ``Open``, ``High`` and ``Low`` are dropped. NaNs
    left by incomplete windows are filled per feature (see inline comments).
    The input frame is not modified.

    NOTE(review): every shift/rolling/ewm here runs over the frame's row
    order with no per-asset grouping, so on a frame holding several assets
    the windows span the boundary between one asset and the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``datetime``, ``VWAP``, ``Open``, ``High``, ``Low``,
        ``Close``, ``Volume`` and ``Target``.

    Returns
    -------
    pandas.DataFrame
        Feature frame after ``reduce_mem_usage``.
    """
    # These depend on columns dropped just below, so compute them first and
    # attach them to the new frame; this leaves the caller's frame untouched.
    high_low = df["High"] - df["Low"]
    close_open = df["Close"] - df["Open"]
    dayofweek = df["datetime"].dt.dayofweek.astype(np.int8)
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week replaces it.
    weekofyear = df["datetime"].dt.isocalendar().week.astype(np.int8)

    df = df.drop(["datetime", "VWAP", "Open", "High", "Low"], axis=1)
    df["High-Low"] = high_low
    df["Close-Open"] = close_open
    df["dayofweek"] = dayofweek
    df["weekofyear"] = weekofyear
    gc.collect()

    macd, signal = MACD(df=df, colname="Close")
    df["MACD_Close_macd"] = macd.astype(np.float32)
    df["MACD_Close_signal"] = signal.astype(np.float32)

    arr_n = [5, 10, 15]
    arr_cols = ["Close", "Volume"]
    no_of_std = 2.5

    for colname in arr_cols:
        print("colname = " + colname)
        for n in arr_n:
            print("n = " + str(n))
            suffix = "_" + str(n)

            # The lag depends only on n, so build it once rather than once per column.
            lag_col = "Target_lag" + suffix
            if lag_col not in df.columns:
                # The first n rows have no lagged value; fall back to the current Target.
                df[lag_col] = df["Target"].shift(n).fillna(df["Target"])

            print("SMA")
            mean, median = SM_A_M(df=df, colname=colname, n=n)
            # Incomplete windows fall back to the raw column value.
            df["SM_" + colname + "_mean" + suffix] = _float32_filled(mean, df[colname])
            df["SM_" + colname + "_median" + suffix] = _float32_filled(median, df[colname])

            print("EMA1")
            # Use the local frame; this previously read the global df_train.
            ewmean = EMA1(df=df, colname=colname, n=n)
            df["EWM_" + colname + "_mean" + suffix] = _float32_filled(ewmean, 1)

            # RSI and Bollinger bands are built for the price column only,
            # and only for the 10- and 15-row windows.
            if colname in ['VWAP', 'Volume']:
                continue

            if n == 5:
                continue

            print("RSI")
            rsi = pd.Series(rsiFunc(df=df, colname=colname, n=n), index=df.index)
            df["RSI" + suffix] = _float32_filled(rsi, 1)

            print("BollingerBand")
            bb_high, bb_low, std = BollingerBand(df=df, colname=colname, window=n, no_of_std=no_of_std)
            df["SM_" + colname + "_std" + suffix] = _float32_filled(std, 0)
            df["SM_" + colname + "_BB_High" + suffix] = _float32_filled(bb_high, df[colname])
            df["SM_" + colname + "_BB_Low" + suffix] = _float32_filled(bb_low, df[colname])

            gc.collect()

    df = reduce_mem_usage(df)

    return df

In [None]:
df_train.describe()

In [None]:
df_train = get_feat(df=df_train)
df_train.memory_usage(deep=True)

In [None]:
df_train.info(verbose=False, memory_usage='deep')

In [None]:
gc.collect()

In [None]:
import memory_profiler
m1 = memory_profiler.memory_usage()
m1

In [None]:
df_train.head(10)

In [None]:
gc.collect()

In [None]:
# Rows that came from example_test.csv are the only ones carrying group_num / row_id.
# Split with a boolean mask instead of DataFrame.drop(index labels): the frame can hold
# repeated index labels, and a label-based drop would also remove training rows that
# happen to share a label with a test row.
is_test_row = df_train["group_num"].notna() | df_train["row_id"].notna()

# .copy() makes df_test an independent frame so later column edits do not act on a slice.
df_test = df_train.loc[is_test_row].copy()
print(df_test.shape)

gc.collect()

df_train = df_train.loc[~is_test_row]
print(df_train.shape)

# Test rows must not carry the label; training rows do not need the submission keys.
df_test = df_test.drop(columns=["Target"])
df_train = df_train.drop(columns=["group_num", "row_id"])

df_train.shape, df_test.shape

In [None]:
df_train.head()

In [None]:
df_train.isna().sum()

## Training with LGBM

In [None]:
# Time series cross validation (5 folds by default)
def timecv_model(model, X, y, n_splits=5):
    """Score ``model`` with ``TimeSeriesSplit`` cross validation.

    For each split the model is refit on the training rows and the Pearson
    correlation between its predictions and the held-out targets is recorded.
    The same ``model`` instance is refit on every fold.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` returning a fitted estimator that has ``predict(X)``.
    X : pandas.DataFrame
        Feature rows, assumed to already be in time order (rows are split by position).
    y : pandas.Series
        Targets, positionally aligned with ``X``.
    n_splits : int, default 5
        Number of folds passed to ``TimeSeriesSplit``.

    Returns
    -------
    list of float
        One Pearson correlation coefficient per fold, in fold order.
    """
    tfold = TimeSeriesSplit(n_splits=n_splits)
    pcc_list = []
    # total= lets tqdm draw a real progress bar; split() is a generator with no len().
    for train_index, test_index in tqdm(tfold.split(X), total=n_splits):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf = model.fit(X_train, y_train)
        pred = clf.predict(X_test)
        pcc_list.append(pearsonr(pred, y_test)[0])

    return pcc_list

def cv_result(model, X, y):
    """Cross-validate ``model`` with ``timecv_model`` and print per-fold and average PCC."""
    model_name = model.__class__.__name__
    fold_scores = timecv_model(model, X, y)

    for fold_idx, fold_pcc in enumerate(fold_scores):
        print(f'{fold_idx}th fold: {model_name} PCC: {fold_pcc:.4f}')

    average_pcc = np.mean(fold_scores)
    print(f'\n{model_name} average PCC: {average_pcc:.4f}')

In [None]:
def train_lgb(model, X, y):
    """Fit ``model`` on ``(X, y)``, print its in-sample Pearson correlation and return it.

    The correlation is computed on the same rows the model was fitted on, so
    it measures fit to the training data only.
    """
    print("model fitting ...")
    fitted = model.fit(X, y)

    print("predicting ...")
    in_sample_pred = fitted.predict(X)
    pcc = pearsonr(in_sample_pred, y)[0]
    print("pcc = ", str(pcc))

    gc.collect()
    return model

In [None]:
# train = df_train[df_train['timestamp'] < 1630432800] ## original train data
# valid = df_train[df_train['timestamp'] >= 1630432800] ## supplementary data

# train.drop(['timestamp', 'Asset_ID'], axis = 1, inplace = True)
# valid.drop(['timestamp', 'Asset_ID'], axis = 1, inplace = True)

# X_train = train.drop(['Target'], axis = 1)
# y_train = train['Target']
# X_valid = valid.drop(['Target'], axis = 1)
# y_valid = valid['Target']

# print(X_train.shape)
# print(X_valid.shape)

In [None]:
# lgb_model = LGBMRegressor(n_estimators = 420,
#                           max_depth = 17,
#                           num_leaves = 36,
#                           learning_rate = 0.09786397238905313, 
#                           min_child_samples = 49, 
#                           colsample_bytree=0.7,
#                           subsample = 0.8683175057718733,
#                           seed = 0)

## 'colsample_bytree': 0.4000480946836777, 'max_depth': 17, 'num_leaves': 35, 'learning_rate': 0.06891972238739223, 'n_estimators': 576, 'min_child_samples': 47, 'subsample': 0.6058272745943716

# LightGBM regressor with fixed hyperparameters (presumably taken from an earlier tuning run).
# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq` > 0, and
# none is set here, so row subsampling may have no effect - confirm against the LightGBM docs.
lgb_model = LGBMRegressor(n_estimators = 576,
                          max_depth = 17,
                          num_leaves = 35,
                          learning_rate = 0.06891972238739223, 
                          min_child_samples = 47, 
                          colsample_bytree=0.4000480946836777,
                          subsample = 0.6058272745943716,
                          seed = 42)

# cv_result(lgb_model, X, y)
# lgb_model.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_valid, y_valid)],
#              verbose = 0, early_stopping_rounds = 50)
# pred = model.predict(X_valid)
# pcc = pearsonr(pred, y_valid)[0]
# print(pcc)
# Label and feature matrix: every remaining column except the raw timestamp and the label.
y = df_train["Target"]
X = df_train.drop(['timestamp', "Target"], axis=1)

gc.collect()

# Fit on all training rows; train_lgb prints the in-sample PCC and returns the fitted model.
model = train_lgb(lgb_model, X, y)
model

In [None]:
def get_prediction(row):
    """Print and return the first ``Target`` in the global ``df_test`` matching ``row``.

    A match is a ``df_test`` row with the same ``timestamp`` and ``Asset_ID``
    as ``row``. Raises ``IndexError`` when nothing matches.
    """
    same_time = df_test["timestamp"] == row["timestamp"]
    same_asset = df_test["Asset_ID"] == row["Asset_ID"]
    matches = df_test.loc[(same_time & same_asset), "Target"]

    first_match = matches.values[0]
    print(first_match)
    return first_match

## Hyperparameter Tuning with Optuna

In [None]:
# train = df_train[df_train['timestamp'] < '2020-09-20']
# valid = df_train[df_train['timestamp'] >= '2020-09-20']

# train.drop(['timestamp', 'Asset_ID'], axis = 1, inplace = True)
# valid.drop(['timestamp', 'Asset_ID'], axis = 1, inplace = True)

# X_train = train.drop(['Target'], axis = 1)
# y_train = train['Target']
# X_valid = valid.drop(['Target'], axis = 1)
# y_valid = valid['Target']

# print(X_train.shape)
# print(X_valid.shape)

In [None]:
# from optuna.samplers import TPESampler
# import optuna

# sampler = TPESampler(seed = 0)

# def objective(trial):
#     params = {
#         'objective': 'regression',
#         'verbose': -1,
#         'max_depth': trial.suggest_int('max_depth',5, 20),
#         'num_leaves': trial.suggest_int('num_leaves', 10, 40),
#         'learning_rate': trial.suggest_float("learning_rate", 1e-5, 0.1),
#         'n_estimators': trial.suggest_int('n_estimators', 500, 2500),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
#         'subsample': trial.suggest_float('subsample', 0.4, 1)}
    
#     model = LGBMRegressor(**params)
#     model.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_valid, y_valid)],
#              verbose = 0, early_stopping_rounds = 50)
#     pred = model.predict(X_valid)
#     pcc = pearsonr(pred, y_valid)[0]
#     return pcc

# study_model = optuna.create_study(direction = 'maximize', sampler = sampler)
# study_model.optimize(objective, n_trials = 20) 

In [None]:
# # select best trial and parameter
# trial = study_model.best_trial
# best_params = trial.params

# print('Best params from optuna: \n', best_params)

In [None]:
# optuna.visualization.plot_optimization_history(study_model)

In [None]:
# optuna.visualization.plot_slice(study_model)

In [None]:
# optuna.visualization.plot_param_importances(study_model)

In [None]:
# opt_model = LGBMRegressor(**best_params)

# cv_result(opt_model, X, y)

## Predict and Submit

In [None]:
df_pred = pd.read_csv(r"/kaggle/input/g-research-crypto-forecasting/example_test.csv")
df_pred.head()

In [None]:
# Load the sample submission itself; this previously re-read example_test.csv.
df_sample_sub = pd.read_csv(r"/kaggle/input/g-research-crypto-forecasting/example_sample_submission.csv")
df_sample_sub.head()

In [None]:
df_test.columns

In [None]:
# Select exactly the columns the model was fitted on, in the same order. Dropping a fixed
# list of columns broke on re-run, because the Target column written below would then be
# fed back to the model as an extra feature.
X_test = df_test[X.columns]
df_test["Target"] = model.predict(X_test)
df_test.head()

In [None]:
df_test.head()

In [None]:
# Attach each prediction to its submission row with one keyed merge instead of scanning
# df_pred once per test row (and printing a whole Series each time).
key_cols = ["group_num", "row_id"]

pred_lookup = (
    df_test[key_cols + ["Target"]]
    # The keys are floats in df_test; cast them back to integers to match df_pred.
    .astype({key: "int64" for key in key_cols})
    # If a key repeats, keep its last prediction, as a row-by-row overwrite would.
    .drop_duplicates(subset=key_cols, keep="last")
)

df_pred = (
    df_pred
    .drop(columns=["Target"], errors="ignore")  # lets the cell be re-run safely
    .merge(pred_lookup, on=key_cols, how="left")
)
# Rows without a matching prediction keep the neutral default of 0.0.
df_pred["Target"] = df_pred["Target"].fillna(0.0)

In [None]:
df_pred