# Let's discover our data

To forecast cryptocurrency prices, we should use all the trading features present in the dataset, such as the price, volume, open, high, and low values.

* Close Price — The market close price of the currency for that particular day.
* High Price — The highest price of the currency for that day.
* Low Price — The lowest price of the currency for that day.
* Open Price — The market open price of the currency for that day.
* Volume — The volume of the currency traded during that day.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import gc
import gresearch_crypto

import matplotlib.pyplot as plt
%matplotlib inline

# Locations of the competition input files (both live in the same directory)
_INPUT_DIR = "../input/g-research-crypto-forecasting"
DATA_PATH = f"{_INPUT_DIR}/train.csv"
ASSET_DETAILS = f"{_INPUT_DIR}/asset_details.csv"

# Some EDA and visualizations

In [None]:
# Read the training data from DATA_PATH into a DataFrame
raw_data = pd.read_csv(DATA_PATH)
# Preview the first rows
raw_data.head()

In [None]:
# Read the asset information from ASSET_DETAILS (more details about each
# cryptocurrency; joined with the training data on Asset_ID further down)
asset_data = pd.read_csv(ASSET_DETAILS)
# Preview the first rows
asset_data.head()

In [None]:
# Attach the asset details to every training row, matching on Asset_ID
# (inner join: only rows whose Asset_ID appears in both frames are kept)
data = raw_data.merge(asset_data, on='Asset_ID', how='inner')
data.head()

In [None]:
# For the visualization we only use Bitcoin Cash (Asset_ID == 2),
# restricted to the last 10,000 rows of that asset
bitcoin_cash_data = data[data['Asset_ID'] == 2].tail(10000)

In [None]:
# Sanity-check the slice: print how many rows were kept, then preview the first ones
print(len(bitcoin_cash_data))
bitcoin_cash_data.head()

In [None]:
# Let's see the pattern of each feature individually
def plot_features(df):
    """Plot six trading features of ``df`` against ``timestamp`` on a 3x2 grid.

    Args:
        df: Frame providing the columns ``timestamp``, ``Open``, ``High``,
            ``Low``, ``Close``, ``Volume`` and ``VWAP``.

    Each panel is titled with the name of the column it shows. Panels are
    filled row by row in the order listed below.
    """
    # (column, line colour) for each panel, in row-major grid order
    panels = [
        ("Open", 'tab:blue'),
        ("High", 'tab:orange'),
        ("Low", 'tab:green'),
        ("Close", 'tab:red'),
        ("Volume", 'tab:red'),
        ("VWAP", 'tab:red'),
    ]

    fig, axs = plt.subplots(3, 2)
    fig.set_size_inches(18.5, 10.5)

    # axs.flat walks the grid row by row, matching the order of `panels`
    for ax, (column, color) in zip(axs.flat, panels):
        ax.plot(df["timestamp"], df[column], color)
        ax.set_title(column)

In [None]:
# Draw the feature panels for the Bitcoin Cash slice
plot_features(bitcoin_cash_data)

In [None]:
# Plot the evolution of bitcoin cash
def evolution_split(df, test_size=0.2):
    """Split ``df`` by row position into a leading and a trailing part.

    No shuffling is done: the first rows form the training part and the
    remaining rows form the test part.

    Args:
        df: Frame to split.
        test_size: Fraction of the rows assigned to the test part; the
            resulting row count is truncated to an integer.

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    n_rows = len(df)
    n_test = int(test_size * n_rows)
    cut = n_rows - n_test
    return df.iloc[:cut], df.iloc[cut:]
# Hold out the trailing 20% of the Bitcoin Cash rows as the test segment
train, test = evolution_split(bitcoin_cash_data, test_size=0.2)

def plot_evolution(x1, x2, line1, line2, label1=None, label2=None, title='', lw=2):
    """Draw two labelled curves on one chart whose y-axis is named 'Target'.

    Args:
        x1, line1: x and y values of the first curve.
        x2, line2: x and y values of the second curve.
        label1, label2: Legend labels of the first and second curve.
        title: Chart title.
        lw: Line width used for both curves.
    """
    fig, ax = plt.subplots(1, figsize=(13, 7))

    curves = ((x1, line1, label1), (x2, line2, label2))
    for x_values, y_values, curve_label in curves:
        ax.plot(x_values, y_values, label=curve_label, linewidth=lw)

    ax.set_title(title, fontsize=16)
    ax.set_ylabel('Target', fontsize=14)
    ax.legend(loc='best', fontsize=16)
# Plot Target against timestamp for the training and test segments on one chart
plot_evolution(train["timestamp"], test["timestamp"], train["Target"], test["Target"], 'training', 'test', title='')

# Prepare features

In [None]:
def get_features(df):
    """Build the model inputs from the raw trading columns.

    Copies ``Count``, ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and
    ``VWAP`` from ``df`` (the input is not modified) and appends:

    * ``Upper_Shadow``: ``High`` minus the larger of ``Close`` and ``Open``.
    * ``Lower_Shadow``: the smaller of ``Close`` and ``Open`` minus ``Low``.
    * ``lower_shadow``: same formula as ``Lower_Shadow``.
    * ``high2low``: ``High`` divided by ``Low``.
    * ``volume2count``: ``Volume`` divided by ``Count + 1``.

    NOTE(review): ``lower_shadow`` is an exact duplicate of ``Lower_Shadow``.
    It is kept so the feature set stays unchanged; consider dropping it.
    """
    base_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    df_feat = df[base_columns].copy()

    # Upper and lower edge of the candle body (between open and close)
    body_top = np.maximum(df_feat['Close'], df_feat['Open'])
    body_bottom = np.minimum(df_feat['Close'], df_feat['Open'])

    df_feat['Upper_Shadow'] = df_feat['High'] - body_top
    df_feat['Lower_Shadow'] = body_bottom - df_feat['Low']

    df_feat['lower_shadow'] = body_bottom - df_feat['Low']
    df_feat['high2low'] = df_feat['High'] / df_feat['Low']
    # The +1 in the denominator avoids dividing by zero when Count is 0
    df_feat['volume2count'] = df_feat['Volume'] / (df_feat['Count'] + 1)

    return df_feat

In [None]:
def get_data_for_asset(df_train, asset_id):
    """Return the feature matrix and target of one asset.

    Args:
        df_train: Training frame with an ``Asset_ID`` column, a ``Target``
            column and the columns ``get_features`` reads.
        asset_id: Value of ``Asset_ID`` to select.

    Returns:
        Tuple ``(X, y)``: the features built by ``get_features`` and the
        matching ``Target`` values. Rows with a missing value in any feature
        or in the target are dropped.
    """
    asset_rows = df_train.loc[df_train["Asset_ID"] == asset_id]

    # Keep features and target in one frame so dropna() removes them together
    frame = get_features(asset_rows)
    frame['y'] = asset_rows['Target']
    frame = frame.dropna(how="any")

    return frame.drop("y", axis=1), frame["y"]

# Experiment with some regression models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Candidate regressors, keyed by the short name used to pick one for training
benchmarking_models = {
    # Linear baseline
    "lr": LinearRegression(),
    # Shallow single tree
    "dt": DecisionTreeRegressor(max_depth=2),
    # Bagged trees (seeded)
    "rf": RandomForestRegressor(
        n_estimators=2000,
        random_state=42,
    ),
    # Gradient boosting (XGBoost)
    "xgboost": XGBRegressor(
        n_estimators=2000,
        max_depth=7,
        eta=0.1,
        subsample=0.7,
        colsample_bytree=0.8,
    ),
    # Gradient boosting (LightGBM)
    "lgbm": LGBMRegressor(
        n_estimators=2000,
        num_leaves=500,
        learning_rate=0.1,
    ),
}

In [None]:
def model_training(X, y, model_name):
    """Fit a fresh copy of a benchmark model and return it.

    Args:
        X: Training features.
        y: Training targets.
        model_name: Key of the estimator to use in ``benchmarking_models``.

    Returns:
        A newly fitted estimator, independent of every other call.

    Raises:
        KeyError: If ``model_name`` is not a key of ``benchmarking_models``.
    """
    # Imported here so the function does not depend on another cell's imports
    from sklearn.base import clone

    # Fit a clone rather than the shared template. Fitting the template itself
    # would hand back the same object on every call, so each new fit would
    # overwrite the one before it and all per-asset models would end up
    # identical to the last one trained.
    model = clone(benchmarking_models[model_name])
    model.fit(X, y)

    return model

In [None]:
%%time
# Per-asset features, targets and fitted models, keyed by Asset_ID
Xs = {}
ys = {}
models = {}
model_name = "lgbm" # You can switch to any other model

for asset_id, asset_name in zip(asset_data['Asset_ID'], asset_data['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    X, y = get_data_for_asset(raw_data, asset_id)
    # NOTE(review): train_test_split shuffles rows by default, so the test
    # rows are interleaved in time with the training rows - confirm that this
    # is the intended evaluation for time-ordered data.
    X_train, X_test, y_train, y_test = train_test_split(X, y.tolist(), test_size=0.2)
    model = model_training(X_train, y_train, model_name)
    Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model
    preds = model.predict(X_test).squeeze()
    # scikit-learn's argument order is (y_true, y_pred)
    mae = mean_absolute_error(y_test, preds)
    print(f"MAE of {model_name} on {asset_name} dataset: {mae}")
    # Free memory from the previous asset before loading the next one
    gc.collect()

# Submission

In [None]:
# Prediction and submission
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Each iteration yields a batch of test rows (df_test) and the frame to fill
# with predictions (df_pred); env.predict() submits the filled frame.
for i, (df_test, df_pred) in enumerate(iter_test):
    for j, row in df_test.iterrows():

        try:
            model = models[row['Asset_ID']]
            x_test = get_features(row)
            y_pred = model.predict([x_test])[0]

            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred

        except Exception as exc:
            # Best effort: a row that cannot be predicted gets a neutral 0.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit through, and the cause is printed for debugging.
            print(f'{i}-th iteration of the test dataset, {j}-th row - there was the exception, then set Target = 0', repr(exc))
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0

        else:
            # Print just one sample row to get a feeling of what it looks like.
            # Done only on success, so x_test is guaranteed to be defined.
            if i == 0 and j == 0:
                print('Example of the x_test data')
                display(x_test)
    # Display the first prediction dataframe
    if i == 0:
        print('Example of the prediction for test data')
        display(df_pred)
    # Make sure no missing prediction is submitted
    df_pred['Target'] = df_pred['Target'].fillna(0)

    # Send submissions
    env.predict(df_pred)

We can add cross-validation to fine-tune the selected model based on the hyperparameters available for each one.