# Crypto Buys Prediction using Deep Learning - RNN Model

This project uses data from the [G-Research Crypto forecasting competition](link) to build a deep learning model that predicts when to buy a crypto asset.

The following steps are done in this notebook:
- normalize raw `volume` and `closing price` across different crypto assets so that they can be compared on a common scale
- create a collection of sequential data, each sequence containing the past 60 periods, to predict `buys` in the next period
- build an RNN model to predict BTC `buys` with >80% accuracy

## Importing Packages

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn import preprocessing
from collections import deque
import random
import time
import math

## Reading Data from csv

In [None]:
data_folder = "../input/g-research-crypto-forecasting/"
!ls $data_folder

# Reading `csv`s into pandas dataframes
crypto_df = pd.read_csv(data_folder + 'train.csv')
asset_details = pd.read_csv(data_folder + 'asset_details.csv')

# Adding Crypto symbols to the dataframes for easy reference
asset_symbols_map = {
    0: 'BNB',
    1: 'BTC',
    2: 'BCH',
    3: 'ADA',
    4: 'DOGE',
    5: 'EOS.IO',
    6: 'ETH',
    7: 'ETC',
    8: 'IOTA',
    9: 'LTC',
    10: 'MKR',
    11: 'XMR',
    12: 'XLM',
    13: 'TRON'
}
crypto_df['Symbol'] = crypto_df.Asset_ID.map(asset_symbols_map)
asset_details['Symbol'] = asset_details.Asset_ID.map(asset_symbols_map)

## Data Preprocessing

In [None]:
# How many preceding periods (rows) make up one input sequence used to
# predict future prices. Also the default window length of
# `create_sequential_data`.
SEQ_LEN = 60

# How many periods forward we are predicting: the target compares the
# current close with the close this many rows later.
FUTURE_PERIOD_PREDICT = 3 

# Which crypto to predict (symbol as listed in `asset_symbols_map`)
CRYPTO_TO_PREDICT = "BTC"

def classify(current, future):
    """
    Label a period as a `buy` (1) or `don't buy` (0).

    Args:
        current: crypto price at the current period
        future: crypto price at the future period

    Returns:
        1 when `future` is greater than or equal to `current`,
        otherwise 0 (this includes the case where the comparison
        is False because one of the prices is NaN).
    """
    return 1 if future >= current else 0

def preprocess(df):
    """
    Turn the raw crypto dataframe into model-ready train/test arrays.

    Runs the three preprocessing steps in order: per-column
    normalization, slicing into sequences, and the train/test split.

    Args:
        df: dataframe of crypto data; the feature columns come first
            and the target column is the last one

    Returns:
        x_train, y_train, x_test, y_test: numpy arrays as produced by
            `train_test_split_seq`, ready for ML modeling.
    """
    # Step 1: normalize each column independently
    normalized = df.apply(scale_normalize)

    # Step 2: build the (features, target) sequences
    sequences = create_sequential_data(normalized)

    # Step 3: split the sequences into train and test sets
    x_train, y_train, x_test, y_test = train_test_split_seq(sequences)
    return x_train, y_train, x_test, y_test

def scale_normalize(col):
    """
    Convert a column to period-over-period percentage changes and
    standardize it with `preprocessing.scale`.

    Columns whose name contains `Buy` hold the target and are not
    transformed.

    Args:
        col: named pandas series of float numbers (crypto prices
            or volumes), or the `Buy` target column

    Returns:
        For a `Buy` column: the same series without its first row.
        Otherwise: the output of `preprocessing.scale` applied to the
            non-NaN percentage changes of `col`.
    """
    # Target column: keep the values as they are and only drop the first
    # row so it stays in sync with the preprocessed feature columns.
    if 'Buy' in col.name:
        return col[1:]

    # Feature column: percentage change, discard NaNs, then standardize
    pct_changes = col.pct_change().dropna()
    return preprocessing.scale(pct_changes)
    
def create_sequential_data(df, sequence_length=SEQ_LEN):
    """
    Slide a window of `sequence_length` rows over `df` and pair each
    window with the target that immediately follows it.

    Args:
        df: pandas DataFrame to generate the seq data from; the last
            column is the target, all other columns are features
        sequence_length: the number of rows in each window

    Returns:
        List of `[features, target]` pairs, one per window start:
            `features` is the slice of `sequence_length` rows of every
            column except the last, and `target` is the last column's
            value on the row right after that window.
    """
    # One window per start row that still leaves a following row for
    # the target.
    window_starts = range(len(df) - sequence_length)

    return [
        [
            df.iloc[start:start + sequence_length, :-1],
            df.iloc[start + sequence_length, -1],
        ]
        for start in window_starts
    ]

def train_test_split_seq(seq_data, test_frac=0.1):
    """
    Custom function to split the sequential data to
    train and test sets for ML modeling.

    The order of `seq_data` is preserved: the leading part becomes the
    train set and the trailing `test_frac` of it becomes the test set.
    Statistics of both sets are printed along the way.

    Args:
        seq_data: list of `[features, target]` pairs
        test_frac: proportion of data to assign as test set

    Returns:
        x_train, y_train, x_test, y_test: numpy arrays. Each target is
            wrapped in a one-element list, so the `y_*` arrays have one
            entry per sequence holding a single value.
    """
    n_total = len(seq_data)

    # Clamp the test size to a valid count for `seq_data`.
    test_size = min(max(math.ceil(n_total * test_frac), 0), n_total)

    # Split on an explicit index. Slicing with `-test_size` is wrong when
    # `test_size` is 0: `seq[:-0]` is empty and `seq[-0:]` is everything,
    # which would put all the data in the test set.
    split_idx = n_total - test_size
    train_seq = seq_data[:split_idx]
    test_seq = seq_data[split_idx:]

    print('==TRAIN==')
    print_seq_split_stats(train_seq)
    x_train = np.array([seq[0] for seq in train_seq])
    y_train = np.array([[seq[1]] for seq in train_seq])

    print()

    print('==TEST==')
    print_seq_split_stats(test_seq)
    x_test = np.array([seq[0] for seq in test_seq])
    y_test = np.array([[seq[1]] for seq in test_seq])

    return x_train, y_train, x_test, y_test

def print_seq_split_stats(seq_data):
    """
    Prints the amount of data in the sequence and the distribution
    of the target split.

    Args:
        seq_data: list of `[features, target]` pairs where the target
            is 1 (buy) or 0 (don't buy). May be empty.
    """
    total = len(seq_data)
    buys = sum(seq[1] for seq in seq_data)
    dont_buys = sum(1 for seq in seq_data if seq[1] == 0)

    def share(count):
        # An empty split has no distribution: report 0% instead of
        # raising ZeroDivisionError.
        if not total:
            return 0.0
        return round(count / total * 100, 0)

    print(f'Total: {total}')
    print(f'Buys: {buys} ({share(buys)}%)')
    print(f'Dont buys: {dont_buys} ({share(dont_buys)}%)')

# `crypto_df` consists of rows split by timestamp and crypto asset.
# For this modeling, we want the data to be split by timestamp only so that
#   each row contains data for each crypto asset that can be used as
#   `features` for the model.
df = crypto_df.head(400000).groupby(['timestamp', 'Symbol']).max().unstack()
df = df[['Volume', 'Close']]
df.columns = ['_'.join(col).strip() for col in df.columns.values]
df.sort_index(inplace=True)

# Column names of the asset being predicted
close_col = f'Close_{CRYPTO_TO_PREDICT}'
future_col = f'Future_{CRYPTO_TO_PREDICT}'
buy_col = f'Buy_{CRYPTO_TO_PREDICT}'

# Creating `target` column: the close price `FUTURE_PERIOD_PREDICT` rows ahead
df[future_col] = df[close_col].shift(-FUTURE_PERIOD_PREDICT)

# Rows without a known future price (e.g. the last `FUTURE_PERIOD_PREDICT`
# rows) cannot be labelled; drop them instead of labelling them as 0.
df = df.dropna(subset=[future_col]).copy()

df[buy_col] = [
    classify(current, future)
    for current, future in zip(df[close_col], df[future_col])
]
display(df[[close_col, future_col, buy_col]])

# Preprocessing raw data into ML ready inputs.
# The future price is only a helper for building the target. It must not
# be passed on: `create_sequential_data` treats every column except the
# last as a feature, so keeping it would leak future prices into the
# model inputs.
x_train, y_train, x_test, y_test = preprocess(df.drop(columns=[future_col]))

## Building RNN Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

# Setting parameters
EPOCHS = 10
BATCH_SIZE = 64
# Unique run name (sequence length, prediction horizon, start time) used
# for the tensorboard log directory
NAME = f'{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}'

# Building RNN Model: three stacked LSTM blocks followed by a small dense head.
# `input_shape` is only needed on the first layer of a Sequential model.
model = Sequential()
model.add(LSTM(128, input_shape=(x_train.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Last LSTM returns only its final output, which feeds the dense layers
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# Two output classes: 0 (don't buy) and 1 (buy)
model.add(Dense(2, activation='softmax'))

# Compiling RNN Model
# NOTE(review): newer Keras optimizers may not accept the `decay`
#   argument - confirm against the installed TensorFlow version.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

# Setting up tensorboard
tensorboard = TensorBoard(log_dir=f'logs/{NAME}')

# Checkpoint file name; `epoch` and `accuracy` are filled in by Keras
filepath = 'RNN_Final-{epoch:02d}-{accuracy:.3f}'
# The checkpoint options must be passed to `ModelCheckpoint`, not to
# `str.format()`, which silently ignores unused keyword arguments.
# With `metrics=['accuracy']` the validation metric is logged as
# `val_accuracy`.
checkpoint = ModelCheckpoint(
    'models/{}.model'.format(filepath),
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max'
)

history = model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    callbacks=[tensorboard, checkpoint]
)