## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm

from sklearn.linear_model import Ridge
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

import gresearch_crypto

# Silence all warnings for the rest of the notebook
import warnings
warnings.simplefilter("ignore")

# Show every DataFrame column when a frame is displayed (no column truncation)
pd.set_option("display.max_columns", None)

## Load Data

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to smaller dtypes to save memory.

    Each NumPy integer or float column is replaced (in ``df`` itself) by the
    narrowest candidate dtype whose representable range contains the column's
    min and max. Range checks are inclusive, so a column spanning exactly
    -128..127 becomes int8. Columns are only ever narrowed, never widened.
    Columns of any other dtype (object, bool, datetime, extension dtypes, ...)
    and empty columns are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. It is modified in place and also returned.
    use_float16 : bool, default True
        Whether float16 is an allowed target for float columns. float16 only
        checks the value *range*; it keeps roughly 3 significant decimal
        digits, so pass ``False`` when precision matters.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    if use_float16:
        float_candidates = (np.float16, np.float32, np.float64)
    else:
        float_candidates = (np.float32, np.float64)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        # Only plain NumPy numeric dtypes are handled; min()/max() of an empty
        # column is NaN, so there is nothing to decide for it either.
        if not isinstance(col_type, np.dtype) or series.empty:
            continue

        if col_type.kind in "iu":
            candidates, limits_of = int_candidates, np.iinfo
            # Python ints avoid signed/unsigned promotion surprises when
            # comparing against the candidate limits.
            c_min, c_max = int(series.min()), int(series.max())
        elif col_type.kind == "f":
            candidates, limits_of = float_candidates, np.finfo
            c_min, c_max = series.min(), series.max()
        else:
            continue

        for candidate in candidates:
            # Candidates are ordered by size: once they are no smaller than
            # the current dtype, no saving is possible.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # NaN or inf bounds fail both comparisons, leaving the column as is.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = series.astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    if start_mem > 0:
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Load the per-asset metadata table and order it by Asset_ID; the training loop
# further down iterates over its "Asset_ID" and "Asset_Name" columns.
df_asset_details = pd.read_csv(r"../input/g-research-crypto-forecasting/asset_details.csv").sort_values("Asset_ID")
df_asset_details

In [None]:
def read_csv_strict(file_name="/kaggle/input/g-research-crypto-forecasting/train.csv",
                    cutoff="2021-06-13 00:00:00"):
    """Read the training CSV, shrink its dtypes and keep only rows before ``cutoff``.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read. It must contain a ``timestamp`` column
        holding epoch seconds.
    cutoff : str or datetime-like or None, default "2021-06-13 00:00:00"
        Exclusive upper bound on the derived ``datetime`` column. Pass ``None``
        to keep every row.
        NOTE(review): the default date presumably keeps the training data
        clear of the evaluation period -- confirm against the competition
        rules.

    Returns
    -------
    pd.DataFrame
        The memory-reduced frame with an added ``datetime`` column, filtered
        to ``datetime < cutoff`` unless ``cutoff`` is ``None``.
    """
    df = pd.read_csv(file_name).pipe(reduce_mem_usage)
    # Added after reduce_mem_usage so the datetime column is never downcast.
    df["datetime"] = pd.to_datetime(df["timestamp"], unit="s")
    if cutoff is None:
        return df
    return df[df["datetime"] < cutoff]

In [None]:
# Load the training data with the default path and date cut-off of
# read_csv_strict(), then display it.
df_train = read_csv_strict()
df_train

# Training

## Utility functions to train a model for one asset

In [None]:
# technical indicators
def RSI(close: pd.DataFrame, period: int = 14) -> pd.Series:
    # Adapted from https://gist.github.com/jmoz/1f93b264650376131ed65875782df386
    """Relative Strength Index of a price series.

    Based on https://github.com/peerchemist/finta with the fix discussed at
    https://www.tradingview.com/wiki/Talk:Relative_Strength_Index_(RSI)

    RSI is a momentum oscillator measuring the speed and change of price
    movements. It ranges from 0 to 100; following Wilder, readings above 70
    are traditionally read as overbought and below 30 as oversold. Divergences,
    failure swings and centerline crossovers can also be used as signals, and
    RSI can help identify the general trend.

    Gains and losses are the positive and negative parts of ``close.diff()``,
    each smoothed with an exponentially weighted mean using
    ``com=period - 1`` and ``min_periods=period``. The result is
    ``100 - 100 / (1 + avg_gain / avg_loss)``.
    """
    change = close.diff()

    # Split the step-to-step change into its gain part and (absolute) loss part.
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    smoothing = {"com": period - 1, "min_periods": period}
    avg_gain = gains.ewm(**smoothing).mean()
    avg_loss = losses.ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    return pd.Series(100 - (100 / (1 + relative_strength)))

def EMA1(x, n):
    """Exponential moving average of ``x`` with smoothing factor ``2 / (n + 1)``.

    ``x`` is wrapped in a ``pd.Series`` and the result is returned as a Series.
    Reference: https://qiita.com/MuAuan/items/b08616a841be25d29817
    """
    smoothing_factor = 2 / (n + 1)
    values = pd.Series(x)
    return values.ewm(alpha=smoothing_factor).mean()

def MACD(close : pd.DataFrame, span1=12, span2=26, span3=9):
    """Percentage MACD line and its signal line.

    The MACD line is the gap between the ``span1`` and ``span2`` exponential
    moving averages of ``close``, expressed as a percentage of the ``span2``
    average. The signal line is the ``span3`` exponential moving average of
    the MACD line.

    Reference:
    https://www.learnpythonwithrune.org/pandas-calculate-the-moving-average-convergence-divergence-macd-for-a-stock/

    Returns
    -------
    tuple
        ``(macd, signal)``, both as returned by ``EMA1`` arithmetic.
    """
    fast_ema = EMA1(close, span1)
    slow_ema = EMA1(close, span2)

    spread = fast_ema - slow_ema
    macd_line = 100 * spread / slow_ema
    signal_line = EMA1(macd_line, span3)

    return macd_line, signal_line

In [None]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``."""
    body_top = np.maximum(df["Close"], df["Open"])
    return df["High"] - body_top

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``."""
    body_bottom = np.minimum(df["Close"], df["Open"])
    return body_bottom - df["Low"]

# A utility function to build features from the original df.
# It works for single rows too, so we can reuse it at prediction time.
def get_features(df, row=False):
    """Build the model feature set from raw candle data.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Candle data with the fields ``timestamp``, ``Count``, ``Open``,
        ``High``, ``Low``, ``Close``, ``Volume`` and ``VWAP``. Pass a
        DataFrame (many rows) with ``row=False`` or a single row as a Series
        with ``row=True``.
    row : bool, default False
        Set to ``True`` when ``df`` is a single row (a Series).

    Returns
    -------
    pd.DataFrame or pd.Series
        A copy of ``df`` restricted to the raw ``keys`` columns followed by
        the engineered ``features`` columns, in that order.

    Notes
    -----
    In row mode MACD is computed from a single observation, so it carries no
    price history; it is reduced to a scalar so that every entry of the
    returned row is a scalar.
    """
    features = []
    keys = ["Count", "Open", "High", "Low", "Close", "Volume", "VWAP"]
    ohlc = ["Open", "High", "Low", "Close"]

    df_feat = df.copy()
    df_feat["Upper_Shadow"] = upper_shadow(df_feat)
    df_feat["Lower_Shadow"] = lower_shadow(df_feat)
    features += ["Upper_Shadow", "Lower_Shadow",]

    # Price ratios and spreads within the candle
    df_feat["Close/Open"] = df_feat["Close"] / df_feat["Open"]
    df_feat["Close-Open"] = df_feat["Close"] - df_feat["Open"]
    df_feat["High-Low"] = df_feat["High"] - df_feat["Low"]
    df_feat["High/Low"] = df_feat["High"] / df_feat["Low"]
    features += ["Close/Open", "Close-Open", "High-Low", "High/Low",]

    # A Series (row mode) is reduced over its only axis; a DataFrame per row.
    if row:
        df_feat["Mean"] = df_feat[ohlc].mean()
    else:
        df_feat["Mean"] = df_feat[ohlc].mean(axis=1)
    df_feat["High/Mean"] = df_feat["High"] / df_feat["Mean"]
    df_feat["Low/Mean"] = df_feat["Low"] / df_feat["Mean"]
    # +1 in the denominator avoids dividing by a zero trade count.
    df_feat["Volume/Count"] = df_feat["Volume"] / (df_feat["Count"] + 1)
    features += ["Mean", "High/Mean", "Low/Mean", "Volume/Count",]

    # Possible seasonality / datetime features (unlikely to be meaningful,
    # given the very short time frames).
    # TODO: add cyclical features for seasonality.
    # NOTE: hour/dayofweek/day are computed but not appended to `features`,
    # so the final column selection below drops them.
    times = pd.to_datetime(df["timestamp"], unit="s")
    if row:
        df_feat["hour"] = times.hour
        df_feat["dayofweek"] = times.dayofweek
        df_feat["day"] = times.day
    else:
        df_feat["hour"] = times.dt.hour
        df_feat["dayofweek"] = times.dt.dayofweek
        df_feat["day"] = times.dt.day

    if row:
        df_feat["Median"] = df_feat[ohlc].median()
    else:
        df_feat["Median"] = df_feat[ohlc].median(axis=1)
    df_feat["High/Median"] = df_feat["High"] / df_feat["Median"]
    df_feat["Low/Median"] = df_feat["Low"] / df_feat["Median"]
    features += ["Median", "High/Median", "Low/Median",]

    df_feat["Log_n_Close"] = np.log(df_feat["Close"])
    features += ["Log_n_Close",]

    for col in ["Open", "High", "Low", "Close", "VWAP"]:
        df_feat[f"Log_1p_{col}"] = np.log1p(df_feat[col])
        features += [f"Log_1p_{col}",]

    # Disabled experiments, kept as a reminder: 26-period base line
    # ((rolling max High + rolling min Low) / 2), 9-period conversion line
    # (same formula, window 9) and RSI(Close, 14).

    # MACD
    macd, macd_signal = MACD(df_feat["Close"], 12, 26, 9)
    if row:
        # MACD() wraps the scalar Close in a one-element Series; unwrap it so
        # the feature row holds plain scalars that a model can consume.
        macd, macd_signal = macd.iloc[-1], macd_signal.iloc[-1]
    df_feat["MACD"] = macd
    df_feat["MACD_signal"] = macd_signal
    features += ["MACD", "MACD_signal",]

    df_feat = df_feat[keys + features]

    return df_feat

In [None]:
def get_Xy_and_model_for_asset(df_train, asset_id):
    """Build features for one asset and fit a scaled Ridge model on them.

    Rows of ``df_train`` with ``Asset_ID == asset_id`` are turned into
    features with ``get_features``; rows containing NaN or +/-inf in any
    feature or in ``Target`` are dropped. The data is split into 4 contiguous
    folds (no shuffling, because the rows are in time order). For each fold a
    ``RobustScaler`` + ``Ridge(alpha=1.0)`` pipeline is fitted on the training
    part and scored (R^2) on the held-out part; the scores are printed.

    The scaler is part of the returned model, so callers pass *unscaled*
    feature rows to ``model.predict`` and get the same scaling that was used
    during training.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training data containing ``Asset_ID``, ``Target`` and the columns
        required by ``get_features``.
    asset_id : int
        Asset to train for.

    Returns
    -------
    tuple
        ``(tr_x, tr_y, model)`` of the last fold: the unscaled training
        feature matrix (NumPy array), its targets, and the fitted pipeline.
    """
    df = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    df_proc = get_features(df)
    df_proc["y"] = df["Target"]
    df_proc = df_proc.replace([np.inf, -np.inf], np.nan).dropna(how="any")

    X = df_proc.drop("y", axis=1)
    y = df_proc["y"]

    # -----------------------------------
    # K-Fold (contiguous, unshuffled)
    # -----------------------------------
    # random_state must stay unset: scikit-learn rejects it when shuffle=False.
    kf = KFold(n_splits=4, shuffle=False)
    fold_scores = []
    for tr_idx, va_idx in kf.split(X, y):
        # Plain arrays keep fit/predict inputs consistent with callers that
        # pass feature rows without column names.
        tr_x, va_x = X.iloc[tr_idx].to_numpy(), X.iloc[va_idx].to_numpy()
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]

        # Scaling and the linear model travel together so that prediction
        # applies the scaling learned on this fold's training data.
        model = make_pipeline(RobustScaler(), Ridge(alpha=1.0))
        model.fit(tr_x, tr_y)
        fold_scores.append(model.score(va_x, va_y))

    print("  validation R^2 per fold: " + ", ".join(f"{score:.4f}" for score in fold_scores))

    return tr_x, tr_y, model

## Loop over all assets

In [None]:
%%time
# Per-asset training results, keyed by Asset_ID.
Xs, ys, models = {}, {}, {}

asset_pairs = zip(df_asset_details["Asset_ID"], df_asset_details["Asset_Name"])
for asset_id, asset_name in asset_pairs:
    print(f"Training model for  {asset_name:<16} (ID={asset_id:<2})")
    fitted = get_Xy_and_model_for_asset(df_train, asset_id)
    Xs[asset_id], ys[asset_id], models[asset_id] = fitted

In [None]:
%%time
# Check the model interface: build single-row features from one training row
# and run them through the model stored for asset 0.
sample_row = df_train.iloc[1]
x = get_features(sample_row, row=True)
y_pred = models[0].predict([x])
y_pred[0]

# Predict & submit

References: [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)

Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

```python
import pdb; pdb.set_trace()
```

See [Python Debugging With Pdb](https://realpython.com/python-debugging-pdb/) if you want to try pdb but are not yet familiar with it.


In [None]:
# Create the competition environment and its test-batch iterator.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for batch_no, (df_test, df_pred) in enumerate(iter_test):
    is_first_batch = batch_no == 0

    for row_label, row in df_test.iterrows():
        # Pick the model trained for this row's asset and predict from
        # single-row features.
        asset_model = models[row["Asset_ID"]]
        x_test = get_features(row, row=True)
        y_pred = asset_model.predict([x_test])[0]

        # Write the prediction into the submission row with the same row_id.
        same_row = df_pred["row_id"] == row["row_id"]
        df_pred.loc[same_row, "Target"] = y_pred

        # Print just one sample row to get a feeling of what it looks like
        if is_first_batch and row_label == 0:
            display(x_test)

    # Display the first prediction dataframe
    if is_first_batch:
        display(df_pred)

    # Send submissions
    env.predict(df_pred)