# Description
When making a baseline prediction, it is often good to follow a conservative strategy: the "it will be the same" rule. But what exactly stays the same — some value, the derivative of some value, or something else — depends on the task.

We are trying to "forecast short-term returns" (more specifically, the returns of crypto assets over the next 15 minutes), so let's calculate what "short-term return" we would have received if we had bought the currency a "short term" ago — say, 10 to 20 minutes ago (averaging over time seems to be a good idea because of the high volatility). So let's calculate `Z = log(Close / Mean(Close_lag_10_to_20_minutes))` and the correlation of Z with the target value.

There are three options:
1. Positive correlation between Z and target. It may be interpreted as a "tendency to keep growing or falling".
2. Negative correlation between Z and target. It may be interpreted as a "tendency to smooth out growth or decline with an opposite movement in the immediate future".
3. No correlation between Z and Target. 

It's a good place to check your intuition: which option seems most probable?

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import math
import gresearch_crypto

# Price column used by get_preprocess_data to build the forward log-return 'r'.
PRICE_COLUMN = "Close"
# Directory that holds asset_details.csv and train.csv.
INPUT_PATH = "../input/g-research-crypto-forecasting"
# Number of equal-width timestamp folds requested from get_preprocess_data.
FOLDS = 15 # approx. 3 months (presumably the length of one fold -- TODO confirm)

# Passed to get_df_rolling_simple: WINDOW is the rolling-mean length in rows,
# SHIFT is how many minutes the rolling mean is lagged.
WINDOW = 10 
SHIFT = 10

def get_preprocess_data(df_data, df_assets, folds=10):
    """Add a time index, asset weights, a forward log-return and fold ids.

    Args:
        df_data: frame with at least 'timestamp' (seconds since epoch),
            'Asset_ID' and the PRICE_COLUMN column. It is modified in place:
            the 'Time' and 'w' columns are added to it.
        df_assets: frame with 'Asset_ID' and 'Weight' columns. Only rows of
            df_data whose Asset_ID appears here are kept.
        folds: number of equal-width timestamp intervals used for 'fold_id'.

    Returns:
        A new frame indexed and sorted by 'Time' with the extra columns
        'w' (asset weight), 'p1' / 'p16' (price 1 and 16 minutes ahead),
        'r' = log(p16 / p1) and 'fold_id' (0 .. folds - 1).
    """
    df_data['Time'] = pd.to_datetime(df_data['timestamp'], unit='s')
    df_data['w'] = df_data['Asset_ID'].map(
        df_assets.set_index(keys='Asset_ID')['Weight'])

    chunks = []
    for asset_id in df_assets['Asset_ID']:
        df_asset = df_data[df_data['Asset_ID'] == asset_id].copy()
        df_asset.sort_values(by='Time', inplace=True)
        df_asset.set_index(keys='Time', inplace=True)
        # Shifting the index back by k minutes and assigning by index puts the
        # price observed k minutes later onto the current row ('min' is the
        # non-deprecated spelling of the old 'T' offset alias).
        df_asset['p1'] = df_asset[PRICE_COLUMN].shift(freq='-1min')
        df_asset['p16'] = df_asset[PRICE_COLUMN].shift(freq='-16min')
        df_asset['r'] = np.log(df_asset['p16'] / df_asset['p1'])
        df_asset.reset_index(inplace=True)
        chunks.append(df_asset)

    df_data = pd.concat(chunks)
    df_data.sort_values(by='Time', inplace=True)
    df_data.set_index(keys='Time', inplace=True)

    min_value = df_data["timestamp"].min()
    span = df_data["timestamp"].max() - min_value
    if span > 0:
        step = span / folds
        # The "- 1" keeps the largest timestamp inside the last fold
        # (folds - 1); the smallest one truncates from -1/step to 0.
        df_data["fold_id"] = (
            (df_data["timestamp"] - min_value - 1) / step).astype(np.int32)
    else:
        # Zero-width time span (or no rows): everything belongs to fold 0
        # instead of dividing by zero.
        df_data["fold_id"] = np.int32(0)

    return df_data

def get_df_rolling_simple(df, df_assets, field="Close", new_field="Close_RMEAN", 
                          window=10, shift=10):
    """Add a lagged per-asset rolling mean of one column.

    For every asset listed in df_assets the mean of `field` over the last
    `window` rows of that asset is computed and then moved forward in time
    by `shift` minutes, so the value stored at time t is the rolling mean
    that ended at t - shift minutes. Rows without a full window, or without
    a row exactly `shift` minutes earlier, get NaN.

    The lag is useful for target-based features: one should not look at
    values of the target 15 or fewer steps back.

    Args:
        df: frame whose index is a DatetimeIndex named 'Time' and which has
            'Asset_ID' and `field` columns. It is not modified. The aligned
            assignment assumes timestamps are unique within each asset.
        df_assets: frame with an 'Asset_ID' column; rows of df for other
            assets are dropped from the result.
        field: name of the column to average.
        new_field: name of the column that receives the lagged mean.
        window: rolling window length in rows (also used as min_periods).
        shift: lag in minutes applied to the rolling mean.

    Returns:
        A new frame with all listed assets concatenated, sorted and indexed
        by 'Time', containing the additional `new_field` column.
    """
    chunks = []
    for asset_id in df_assets["Asset_ID"]:
        df_asset = df[df["Asset_ID"] == asset_id].copy()
        rolling_mean = df_asset[field].rolling(
            window=window, min_periods=window, center=False).mean()
        # shift(freq=...) moves the index labels, not the values; assigning
        # the result back aligns on the index, which produces the time lag.
        # 'min' replaces the deprecated 'T' offset alias.
        df_asset[new_field] = rolling_mean.shift(freq=f"{shift}min")
        df_asset.reset_index(inplace=True)
        chunks.append(df_asset)
    df_result = pd.concat(chunks)
    df_result.sort_values(by='Time', inplace=True)
    df_result.set_index(keys='Time', inplace=True)
    return df_result
    
def corr(a, b, w):
    """Return the weighted Pearson correlation of a and b under weights w.

    Each input is centred on its own weighted average; the weighted
    covariance of the two is then divided by the square root of the product
    of the two weighted variances.
    """
    weight_total = np.sum(w)
    centred_a = a - np.average(a, weights=w)
    centred_b = b - np.average(b, weights=w)

    def weighted_product_mean(u, v):
        # Weighted mean of the element-wise product of two centred inputs.
        return np.sum(w * u * v) / weight_total

    covariance = weighted_product_mean(centred_a, centred_b)
    variance_a = weighted_product_mean(centred_a, centred_a)
    variance_b = weighted_product_mean(centred_b, centred_b)
    return covariance / np.sqrt(variance_a * variance_b)

# Load the asset table and the training data, then derive the feature Z.
gc.collect()
df_assets: pd.DataFrame = pd.read_csv(
    os.path.join(INPUT_PATH, "asset_details.csv")).sort_values("Asset_ID")
# Explicit dtypes for the CSV columns (the 'Count' key used to be listed twice).
df_train: pd.DataFrame = pd.read_csv(
    os.path.join(INPUT_PATH, "train.csv"),
    dtype={'timestamp': 'int64', 'Asset_ID': 'int8', 'Count': 'int32',
           'row_id': 'int32', 'Open': 'float64',
           'High': 'float64', 'Low': 'float64', 'Close': 'float64',
           'Volume': 'float64', 'VWAP': 'float64', 'Target': 'float64'})
df_train = get_preprocess_data(df_train, df_assets, folds=FOLDS)
df_train = get_df_rolling_simple(df_train, df_assets, window=WINDOW, shift=SHIFT)
# Z: log-ratio of the current close to the lagged rolling mean of the close.
df_train["Z"] = np.log(df_train["Close"] / df_train["Close_RMEAN"])

In [None]:
# Weighted correlation of Target with each feature for every (fold, asset)
# combination; None stands for "all folds" / "all assets".
corrs = []
fold_ids = list(df_train["fold_id"].unique()) + [None]
asset_ids = list(df_assets["Asset_ID"].unique()) + [None]
for field in ["Z"]:
    # Rows where both the target and the feature are present.
    valid = df_train["Target"].notna() & df_train[field].notna()
    for fold_id in fold_ids:
        fold_mask = valid
        if fold_id is not None:
            fold_mask = valid & (df_train["fold_id"] == fold_id)
        for asset_id in asset_ids:
            mask = fold_mask
            if asset_id is not None:
                mask = fold_mask & (df_train["Asset_ID"] == asset_id)
            # Count selected rows without materialising the filtered frame.
            records = int(mask.sum())
            # Skip combinations with too few rows to give a meaningful score.
            if records > 10000:
                score = corr(df_train.loc[mask, "Target"],
                             df_train.loc[mask, field],
                             df_train.loc[mask, "w"])
                corrs.append({"field": field, "fold_id": fold_id, "Asset_ID": asset_id, 
                              "score": score, "records": records})
df_corr = pd.DataFrame(corrs)
#print(df_corr[ (df_corr["Asset_ID"].isna()) ])

# Score for each asset (correlation of target and Z calculated for each asset over the entire time interval)

In [None]:
# Rows with an asset but no fold: one score per asset over all folds.
# (Named mask instead of rebinding the builtin `filter`.)
per_asset = df_corr["fold_id"].isna() & df_corr["Asset_ID"].notna()
asset_scores = df_corr[per_asset].sort_values("score").join(
    df_assets.set_index("Asset_ID"), on="Asset_ID")
print(asset_scores[["Asset_ID", "Asset_Name", "score", "records"]])
#df_corr[per_asset]["score"].hist()


Only one asset shows a notable positive correlation: Maker. One can speculate whether it differs from the other coins in some fundamental way (Maker has a tricky description), but for such a small correlation it may be a coincidence.

# Score by folds (weighted correlation of target and Z calculated for each fold)

In [None]:
# Rows with a fold but no asset: one score per fold over all assets.
# (Named mask instead of rebinding the builtin `filter`.)
per_fold = df_corr["fold_id"].notna() & df_corr["Asset_ID"].isna()
print(df_corr[per_fold].sort_values("fold_id")[["fold_id", "score", "records"]])

# Score

In [None]:
# Missing Z values are replaced by the column mean so that every row with a
# target takes part in the overall score.
df_train.fillna({"Z": df_train["Z"].mean()}, inplace=True)
has_target = df_train["Target"].notna()
# The prediction is -Z. Select columns with .loc so that only the three
# needed columns are filtered rather than the whole frame three times.
score = corr(df_train.loc[has_target, "Target"],
             -1 * df_train.loc[has_target, "Z"],
             df_train.loc[has_target, "w"])
print(f'Total score: {score:.5f}')

# Submit simplified solution
Do not average past prices — simply take the value from 15 minutes back and calculate `-1 * log(Close / Close_lag_15_minutes)`.

In [None]:
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

reverse = True
sign = -1 if reverse else 1
lag_seconds = 15 * 60
# Close price seen so far, keyed by (timestamp, Asset_ID).
prices = {}
for df_test, df_pred in iter_test:
    # itertuples keeps each column's dtype (iterrows upcasts the integer
    # timestamp / Asset_ID to float before they are used as keys).
    for row in df_test.itertuples(index=False):
        prices[(row.timestamp, row.Asset_ID)] = row.Close
        prev_close = prices.get((row.timestamp - lag_seconds, row.Asset_ID))
        # Default prediction is 0. The lagged log-return is used only when
        # both prices are positive; a NaN price fails these comparisons, so
        # math.log can neither raise nor produce NaN.
        target = 0
        if prev_close is not None and prev_close > 0 and row.Close > 0:
            target = sign * math.log(row.Close / prev_close)
        df_pred.loc[df_pred["row_id"] == row.row_id, "Target"] = target
    env.predict(df_pred)