In [1]:
import os
# Print the full path of every file found anywhere below /kaggle.
kaggle_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle')
    for name in names
)
for path in kaggle_files:
    print(path)

# **Install Dependencies**

In [2]:
# Install ta-lib
url = 'https://launchpad.net/~mario-mariomedina/+archive/ubuntu/talib/+files'
!wget $url/libta-lib0_0.4.0-oneiric1_amd64.deb -qO libta.deb
!wget $url/ta-lib0-dev_0.4.0-oneiric1_amd64.deb -qO ta.deb
!dpkg -i libta.deb ta.deb
!pip install -q ta-lib
!rm -r /kaggle/working/*.deb

In [3]:
#Install Binance API
!pip install -q python-binance

# **Create Dataset**
I'll create the dataset with the Binance API.

In [4]:
import pandas as pd
import numpy as np
import datetime
from binance.client import Client

In [5]:
# Initialize Binance API public client
client = Client()

# The list of pairs to look into. You may add more if you want.
coins = ['ADA','ATOM','BAT','BNB','BTC','BCH','DOGE','ETC','ETH','FIL','LTC','MKR','NEO','ONT','SOL','UNI','XRP','VET','ZRX']
pairs = [f'{coin}USDT' for coin in coins]

# Create the dataset: one frame of daily candles per pair, concatenated at the end.
price_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
frames = []
for pair in pairs:
    klines = client.get_historical_klines(pair,
                                          client.KLINE_INTERVAL_1DAY,
                                          "1 Jan, 2000")
    # The last candle is dropped below, so at least two rows are needed;
    # this also skips pairs for which nothing was returned.
    if len(klines) < 2:
        continue
    # Keep the first six fields of every candle except the last one.
    # Building the frame from the raw rows (rather than through np.array)
    # keeps the timestamp numeric instead of coercing every field to str.
    candles = pd.DataFrame([row[:6] for row in klines[:-1]],
                           columns=['date'] + price_cols)
    candles['date'] = candles['date'].astype('int64')
    candles[price_cols] = candles[price_cols].astype('float32')
    candles.insert(1, 'pair', pair)
    frames.append(candles)

df = pd.concat(frames, ignore_index=True)
# Sort by time; the stable sort keeps the order of `pairs` within each day,
# so the row order is reproducible between runs.
df = df.sort_values('date', kind='stable', ignore_index=True)
# Convert the millisecond timestamp to an integer in YYYYmmdd format
df['date'] = pd.to_datetime(df['date'], unit='ms').dt.strftime('%Y%m%d').astype(int)

In [6]:
# Display the combined price table (date, pair and OHLCV columns).
df

## Add era column

In [7]:
# In this example, 1 era = 20 days
era_size = 20

# Era n covers the era_size days starting n * era_size days after the earliest
# date in the dataset, so the era number is the whole number of era-lengths
# elapsed since that date.
era_dates = pd.to_datetime(df['date'].astype(str), format='%Y%m%d')
era_labels = ((era_dates - era_dates.min()).dt.days // era_size).astype(str)

if 'era' in df.columns:
    # Re-run of this cell: overwrite instead of failing on a duplicate insert.
    df['era'] = era_labels
else:
    df.insert(2, 'era', era_labels)

In [8]:
# Display the table again, now including the era column.
df

# **Feature Engineering**
I'll use ta-lib to create technical-indicator features.

In [9]:
import talib

In [10]:
%%time

metric = 'Close'
# Moving Avg
'''
    "The 5-, 10-, 20- and 50-day moving averages are often used"
    https://www.investopedia.com/ask/answers/122414/what-are-most-common-periods-used-creating-moving-average-ma-lines.asp
'''
MAs = [5, 10, 20, 50]
for MA in MAs:
    df[f'ma_{str(MA)}'] = df.groupby('pair')[metric].transform(lambda x: talib.MA(x, timeperiod=MA).astype('float32'))

# Exponential Moving Avg
'''
    "Short-term traders typically rely on the 12- or 26-day EMA"
    https://www.investopedia.com/ask/answers/122414/what-are-most-common-periods-used-creating-moving-average-ma-lines.asp
'''
EMAs = [12, 26]
for EMA in EMAs:
    df[f'ema_{str(EMA)}'] = df.groupby('pair')[metric].transform(lambda x: talib.EMA(x, timeperiod=EMA).astype('float32'))

# Stocastic
stoch = df.groupby('pair', sort=False).apply(lambda d: talib.STOCH(d['High'],d['Low'],d[metric]))
stoch = stoch.transform(lambda x: np.vstack(x))
df[f'stoch_slowk'], df[f'stoch_slowd'] = np.hstack(stoch).astype('float32')

# Moving average convergence divergence (MACD)
macd = df.groupby('pair', sort=False)[metric].apply(lambda x: talib.MACD(x))
macd = macd.transform(lambda x: np.vstack(x))
df[f'macd'], df[f'macdsignal'], _ = np.hstack(macd).astype('float32')

# Bollinger bands
bbands = df.groupby('pair', sort=False)[metric].apply(lambda x: talib.BBANDS(x))
bbands = bbands.transform(lambda x: np.vstack(x))
df[f'bbands_upperband'], _, df[f'bbands_lowerband'] = np.hstack(bbands).astype('float32')

# Relative strength index (RSI)
df[f'rsi_14'] = df.groupby('pair', sort=False)[metric].transform(lambda x: talib.RSI(x).astype('float32'))

# Average directional index (ADX)
adx = df.groupby('pair', sort=False).apply(lambda d: talib.ADX(d['High'],d['Low'],d[metric]))
df[f'adx_14'] = np.hstack(adx).astype('float32')

# Average True Range (ATR) 
atr = df.groupby('pair', sort=False).apply(lambda d: talib.ATR(d['High'],d['Low'],d[metric]))
df[f'atr_14'] = np.hstack(atr).astype('float32')

# fill nan
df.fillna(method='ffill', inplace=True) # safe fillna method for a forecasting task
df.fillna(method='bfill', inplace=True) # just in case ... making sure no nan

df

# **Create the target**


In [11]:
from scipy.stats import percentileofscore

In [12]:
def get_targets(close,lag,th):
    """Turn a price series into discretised forward-return targets.

    For every row the raw target is the relative price change `lag` rows ahead
    (``close[t + lag] / close[t] - 1``). It is mapped to one of five labels:

    *  0.0         -- return within ``[-th, th]`` (neutral)
    *  0.5 /  1.0  -- return above ``th``: lower / upper half of the bull returns
    * -1.0 / -0.5  -- return below ``-th``: lower / upper half of the bear returns

    A return is in the upper half of its class when its percentile rank within
    the class (ties share their average rank) is strictly above 50.

    Parameters
    ----------
    close : pd.Series or array-like
        Prices of a single instrument in chronological order.
    lag : int
        Look-ahead horizon in rows; must be at least 1.
    th : float
        Neutral-band half width; expected to be non-negative.

    Returns
    -------
    np.ndarray
        float64 array with the same length as `close`. The last `lag` entries
        are NaN because their future price is unknown, as is any row whose raw
        return is NaN.

    Raises
    ------
    ValueError
        If `lag` is smaller than 1.
    """
    if lag < 1:
        raise ValueError('lag must be at least 1')

    prices = np.asarray(close)
    target = np.full(len(prices), np.nan)
    horizon = len(prices) - lag
    if horizon <= 0:
        # Series too short to look `lag` rows ahead: nothing can be labelled.
        return target

    raw_target = prices[lag:] / prices[:-lag] - 1
    labels = np.full(horizon, np.nan)
    labels[(raw_target >= -th) & (raw_target <= th)] = 0.0

    for in_class, lower_half, upper_half in ((raw_target > th, 0.5, 1.0),
                                             (raw_target < -th, -1.0, -0.5)):
        class_size = int(in_class.sum())
        if class_size == 0:
            continue
        # The average rank divided by the class size is the 'rank' percentile
        # of each value within its own class. Comparing 2 * rank with the
        # class size tests "percentile > 50" exactly, with no rounding step.
        ranks = pd.Series(raw_target[in_class]).rank(method='average').to_numpy()
        labels[in_class] = np.where(2 * ranks > class_size, upper_half, lower_half)

    target[:horizon] = labels
    return target

In [13]:
# Look-ahead horizons (in rows) for which a target column is produced.
lags = [1,4,7,10,20]

# One target Series per horizon, computed independently inside every pair,
# then placed side by side and named after the horizon.
close_by_pair = df.groupby('pair', sort=False)['Close']
targets = pd.concat(
    [close_by_pair.transform(get_targets, horizon, th=0.01) for horizon in lags],
    axis=1,
)
targets.columns = [f'target_{horizon}d' for horizon in lags]

In [14]:
# Summary statistics of every target column.
targets.describe()

In [15]:
# Attach the target columns to the feature table. Target columns left over from
# a previous run of this cell are dropped first, so re-executing it does not
# produce duplicate column names.
df = pd.concat([df.drop(columns=targets.columns, errors='ignore'), targets], axis=1)
df

# **Summary**

In [16]:
# Summary statistics of the numeric columns of the final table.
df.describe()

In [17]:
# Print the traded pairs, the column names and the number of rows, each under
# its own heading and followed by a blank line.
summary_items = (
    ('Pairs:\n', df.pair.unique()),
    ('Columns:\n', list(df.columns)),
    ('Dataset size:\n', len(df)),
)
for heading, value in summary_items:
    print(heading, value, '\n')

In [18]:
# Display the final table (prices, era, indicators and targets).
df

# **Save Dataset**

In [19]:
# Write the final table to the current working directory in two formats.
# to_csv is called without index=False, so the row index is written as the
# first (unnamed) column of the CSV file.
df.to_csv('crypto_dataset.csv')
# NOTE(review): to_parquet needs a parquet engine (pyarrow or fastparquet) --
# confirm one is available in the target environment.
df.to_parquet('crypto_dataset.parquet')