In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb 
import sklearn
import os
import json
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import logging
import json
from tqdm.notebook import tqdm
import seaborn as sns
SKIPS = []

In [None]:
!pip install talib-binary

In [None]:
import talib

In [None]:
# Single-parameter features: feature function name -> (start, stop) of the
# look-back periods to scan. The pair is passed to range(start, stop) by
# get_best_timeperiod_for_one_asset, so `stop` itself is never evaluated.
# Each key must match the name of a feature function defined in this notebook.
configs_1 = {
    "RET": (1, 100),
    "STD": (2, 100),
    "RET_STD": (2, 100),
    "RSI": (2, 100),
    "ATR": (2, 100),
    "MFI": (2, 100),
    "VOL": (2, 100),
    "TRIX": (2, 100),
    "EFFICIENCY": (10, 100),
    "EI": (10, 100)
}

# Two-parameter features: feature function name ->
# [(start, stop, step) for the first ("fast") argument,
#  (start, stop, step) for the second ("slow") argument].
# Each triple is passed to range(); only combinations with fast < slow are
# evaluated by get_best_timeperiod_for_one_asset.
configs_2 = {
    "MACD_HIST": [(2, 50, 10), (3, 200, 10)],
    "DEMA": [(2, 50, 2), (3, 200, 5)],
    "MACD": [(2, 50, 2), (3, 200, 5)],
    "EVENT_MACROSS": [(1, 50, 2), (3, 200, 5)],
}

In [None]:
# Load the prepared training frame for all assets.
total_df = pd.read_feather(
    "../input/pipeline-building-dataprepare/train.feather"
)
# `timestamp` is interpreted as seconds since the Unix epoch (unit="s").
total_df["datetime"] = pd.to_datetime(total_df["timestamp"], unit="s")

In [None]:
# All features
def RET(df, n):
    """Return the ``n``-row percentage change of the ``Close`` column.

    The first ``n`` rows have no earlier reference value and are NaN.
    """
    close = df['Close']
    return close.pct_change(n)

def STD(df, n):
    """Return the rolling standard deviation of one-row ``Close`` returns.

    The window covers ``n`` rows; rows without a full window are NaN.
    """
    one_step_returns = df['Close'].pct_change(1)
    window = one_step_returns.rolling(n)
    return window.std()

def RET_STD(df, n):
    """Return the element-wise product of ``RET(df, n)`` and ``STD(df, n)``."""
    period_return = RET(df, n)
    return_volatility = STD(df, n)
    return period_return * return_volatility

def RSI(df, n):
    """Return ``talib.RSI`` of the ``Close`` column with period ``n``."""
    close = df['Close']
    return talib.RSI(close, n)

def ATR(df, n):
    """Return ``talib.ATR`` of the ``High``/``Low``/``Close`` columns with period ``n``."""
    high = df["High"]
    low = df.Low
    close = df.Close
    return talib.ATR(high, low, close, n)

def MFI(df, n):
    """Return ``talib.MFI`` of ``High``/``Low``/``Close``/``Volume`` with period ``n``."""
    high, low = df['High'], df['Low']
    close, volume = df['Close'], df['Volume']
    return talib.MFI(high, low, close, volume, n)

def VOL(df, n):
    """Return the rolling root-mean-square of one-row ``Close`` returns.

    Unlike ``STD`` the returns are not de-meaned: the result is the square
    root of the ``n``-row rolling mean of squared returns.
    """
    squared_returns = df['Close'].pct_change(1) ** 2
    mean_square = squared_returns.rolling(n).mean()
    return np.sqrt(mean_square)

def TRIX(df, n):
    """Return ``talib.TRIX`` of the ``Close`` column with period ``n``."""
    close = df['Close']
    return talib.TRIX(close, n)

def MACD(df, fast, slow):
    """Return the first output of ``talib.MACD`` for the ``Close`` column.

    ``fast`` and ``slow`` are passed as the second and third positional
    arguments; the signal period is left at talib's default.
    """
    outputs = talib.MACD(df.Close, fast, slow)
    return outputs[0]

def MACD_HIST(df, fast, slow):
    """Return the third output of ``talib.MACD`` for the ``Close`` column.

    ``fast`` and ``slow`` are passed as the second and third positional
    arguments; the signal period is left at talib's default.
    """
    outputs = talib.MACD(df.Close, fast, slow)
    return outputs[2]

def DEMA(df, n1, n2):
    """Return the log ratio of two simple moving averages of ``Close``.

    Computes ``log(SMA(n1) / SMA(n2))``. Note that, despite the name, this is
    a difference of (log) simple moving averages, not a double exponential
    moving average.
    """
    close = df['Close']
    first_avg = close.rolling(n1).mean()
    second_avg = close.rolling(n2).mean()
    return np.log(first_avg / second_avg)

def EFFICIENCY(df, n):
    """Return the net ``Close`` move over ``n`` rows divided by the path length.

    The numerator is ``Close - Close.shift(n)``; the denominator is the sum of
    absolute one-row ``Close`` changes over the same ``n`` rows. A window with
    no price movement yields 0/0 = NaN.
    """
    close = df.Close
    net_move = close.diff(n)
    path_length = close.diff(1).abs().rolling(n).sum()
    return net_move / path_length

def EI(df, n):
    """Return the ratio of distance-to-rolling-high over distance-to-rolling-low.

    Numerator: ``n``-row rolling max of ``High`` minus ``Close``.
    Denominator: ``Close`` minus the ``n``-row rolling min of ``Low``.
    The ratio is clipped to [-100, 100], which also bounds the infinities
    produced when the denominator is zero.
    """
    close = df.Close
    dist_to_low = close - df.Low.rolling(n).min()
    dist_to_high = df.High.rolling(n).max() - close
    return np.clip(dist_to_high / dist_to_low, -100, 100)

def EVENT_BIGGER_VOLUME(df):
    """Flag rows whose ``Volume`` exceeds the previous row's ``Volume``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Volume`` column.

    Returns
    -------
    pandas.Series
        1.0 where ``Volume`` is strictly greater than the previous row's
        value, 0.0 where it is not, and NaN where no comparison is possible
        (the first row, or a missing volume on either side).
    """
    volume = df.Volume
    previous = volume.shift(1)
    flag = (volume > previous).astype(float)
    # A comparison against NaN evaluates to False; mask those rows so an
    # undefined comparison is not reported as "event did not happen".
    return flag.where(volume.notna() & previous.notna())


# Backward-compatible alias for the original, misspelled name.
EVEMT_BIGGER_VOLUME = EVENT_BIGGER_VOLUME

def EVENT_MACROSS(df, fast, slow):
    """Flag rows where the fast moving average of ``Close`` is above the slow one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Close`` column.
    fast, slow : int
        Window lengths (in rows) of the two simple moving averages.

    Returns
    -------
    pandas.Series
        1.0 where ``SMA(fast) > SMA(slow)``, 0.0 where it is not, and NaN
        while either moving average is still undefined (warm-up rows).
    """
    fast_s = df.Close.rolling(fast).mean()
    slow_s = df.Close.rolling(slow).mean()
    above = (fast_s > slow_s).astype(float)
    # Comparisons involving NaN are False, so without this mask every warm-up
    # row would be reported as 0.0 instead of "undefined".
    return above.where(fast_s.notna() & slow_s.notna())

In [None]:
def _select_robust_period(corr, rank=3):
    """Pick a parameter with a strong, but not the strongest, correlation.

    To limit overfitting the single best parameter is skipped on purpose: the
    candidates are ranked by correlation (descending when the mean correlation
    is positive, ascending otherwise) and the entry at position ``rank`` is
    returned.

    Parameters
    ----------
    corr : pandas.Series
        Correlation with the target, indexed by the candidate parameter(s).
    rank : int, default 3
        Zero-based position in the ranking to select. Clamped to the last
        valid entry when fewer candidates are available.

    Returns
    -------
    tuple
        ``(label, correlation, is_momentum)`` where ``label`` is the index
        label of the chosen candidate, ``correlation`` is a Python float and
        ``is_momentum`` is True when the mean correlation is positive.

    Raises
    ------
    ValueError
        If every correlation is NaN.
    """
    valid = corr.dropna()
    if valid.empty:
        raise ValueError("No valid correlation found for {}".format(corr.name))
    is_momentum = bool(valid.mean() > 0)
    ranked = valid.sort_values(ascending=not is_momentum)
    position = min(rank, len(ranked) - 1)
    return ranked.index[position], float(ranked.iloc[position]), is_momentum


def get_best_timeperiod_for_one_asset(ASSET):
    """Scan feature parameters for one asset and persist the selected ones.

    For every feature in ``configs_1`` (one parameter) and ``configs_2``
    (two parameters, fast < slow) the feature is computed for each candidate
    parameter and correlated with the ``Target`` column. One candidate per
    feature is chosen with ``_select_robust_period``; the correlation profile
    is plotted and the result is written to ``./best_period_<ASSET>.json``.

    Parameters
    ----------
    ASSET : str
        Value of ``Asset_ID`` selecting the rows of ``total_df`` to analyse.

    Returns
    -------
    dict
        ``{feature_name: {"timeperiod": ..., "correlation": float}}``, the
        same mapping that is written to the JSON file. ``timeperiod`` is an
        int for ``configs_1`` features and the string form of a
        ``(fast, slow)`` tuple for ``configs_2`` features.

    Raises
    ------
    ValueError
        If a feature has no valid correlation for any candidate parameter.
    """
    # Earliest date kept for selected assets; rows before it are discarded.
    start_dates = {
        "Maker": "2020-08-04",
        "Monero": "2018-11-05",
        "Stellar": "2018-07-14",
    }

    # .ffill() replaces the deprecated fillna(method="ffill").
    df = total_df[total_df.Asset_ID == ASSET].ffill().dropna()
    df = df.set_index("datetime")
    if ASSET in start_dates:
        df = df.loc[start_dates[ASSET]:]
    df = df.drop(columns="timestamp")
    target = df['Target']
    best_period = {}

    for func_name, (start, stop) in configs_1.items():
        # Resolve the feature function by name without evaluating arbitrary code.
        func = globals()[func_name]
        label = ASSET + "-" + func_name
        corr_s = pd.Series(
            {n: func(df, n).corr(target) for n in tqdm(range(start, stop))},
            name=label,
        )
        t, corr_value, is_momentum = _select_robust_period(corr_s)
        if is_momentum:
            print("Momentum Feature. Best Period: {}".format(t))
        else:
            print("Mean-Revertion Feature. Best Period: {}".format(t))
        # Store the correlation as a float: int() would truncate any value in
        # (-1, 1) to 0 and discard the information.
        best_period[func_name] = {"timeperiod": int(t), "correlation": corr_value}
        corr_s.plot(figsize=(15, 5), title=label)
        plt.show()
        print(corr_s.sort_values().head(5))
        print(corr_s.sort_values().tail(5))

    for func_name, (fast_range, slow_range) in configs_2.items():
        func = globals()[func_name]
        fs_params = [
            (fast, slow)
            for fast in range(*fast_range)
            for slow in range(*slow_range)
            if fast < slow
        ]
        fs_corr = pd.Series(
            [func(df, fast, slow).corr(target) for fast, slow in tqdm(fs_params)],
            index=pd.MultiIndex.from_tuples(fs_params),
            name=ASSET + "-" + func_name,
        )
        t, corr_value, is_momentum = _select_robust_period(fs_corr)
        # Index labels may hold NumPy integers, whose repr inside a tuple is
        # "np.int64(2)" on NumPy >= 2; convert so the stored string stays "(2, 3)".
        period = tuple(int(v) for v in t)
        if is_momentum:
            print("Momentum Feature. Best Period: {}".format(period))
        else:
            print("Mean-Revertion Feature. Best Period: {}".format(period))
        best_period[func_name] = {"timeperiod": str(period), "correlation": corr_value}
        print(fs_corr.sort_values().head(5))
        print(fs_corr.sort_values().tail(5))
        fig, ax = plt.subplots(figsize=(15, 15))
        sns.heatmap(fs_corr.unstack(), ax=ax)
        ax.set_title(fs_corr.name)
        plt.show()

    with open("./best_period_{}.json".format(ASSET), "w") as f:
        json.dump(best_period, f)

    return best_period

In [None]:
# Run the parameter scan for every asset that is not explicitly skipped.
for asset in total_df.Asset_ID.unique():
    if asset not in SKIPS:
        get_best_timeperiod_for_one_asset(asset)