# DataPipeline 
This notebook is used for fetching and cleaning the price data which will be used in the backtest of the model

### Dimensions of data:
1) 15 minute candles
2) 100 most populair trading pairs
3) 18 months of data

### 0) Parameters for Data 

In [1]:
MONTHS_OF_DATA = 18
AMOUNT_OF_PRODUCTS = 100

### 1) Preparations for data download

#### 1.1) Imports and API keys

In [2]:
#from binance import Client
import pandas as pd
import datetime as dt
import numpy as np
import json
from os.path import exists
from tqdm import trange, tqdm

In [8]:
with open('API_keys.json') as f:
    data = json.load(f)
api_key = data['API-Key']
api_secret = data['Secret-Key']

#### 1.2) Connecting to the binance API

In [9]:
client = Client(api_key, api_secret)
candle = client.KLINE_INTERVAL_15MINUTE

### 2) Data download and cleaning

#### 2.1) Create list of products to add

In [42]:
# first fetch all products then filter them for: USDT market then for 
dictionary = client.get_products()['data']
df_products = pd.DataFrame(dictionary)
df_filtered_products = df_products[(df_products['q'] == 'USDT')&(df_products['st'] == 'TRADING')]
df_sorted_products = df_filtered_products.sort_values(by='qv', ascending = False)
#save the list of products to use in a list
list_of_tickers = list(df_sorted_products.s.head(AMOUNT_OF_PRODUCTS).values)

#### 2.2) Data downloader

In [None]:
end = dt.datetime.today()- dt.timedelta(days=1)
start = end - dt.timedelta(days=30*MONTHS_OF_DATA)

END = end.strftime('%-d %b, %Y')
START = start.strftime('%-d %b, %Y')

In [123]:
# purpose of function is to fetch price data for a specific symbol
def get_klines_data(symbol):
    rename_map = {
        0: 'open_time',
        1: 'open', 
        2: 'high', 
        3: 'low', 
        4: 'close', 
        5: 'volume',
        6: 'close_time', 
        7: 'qav', 
        8: 'num_trades',
        9: 'taker_base_vol', 
        10: 'taker_quote_vol', 
        11: 'ignore'
    }
    klines = client.get_historical_klines(symbol, candle, START, END)
    data = pd.DataFrame(klines).rename(rename_map, axis= 1) 
    data.open_time = [dt.datetime.fromtimestamp(x/1000.0) for x in data.open_time]
    data.close_time = [dt.datetime.fromtimestamp(x/1000.0) for x in data.close_time]
    return data

In [126]:
#searches for the historic data for all the specified tickers 
for ticker in tqdm(list_of_tickers):
    path = 'price_data/{}.csv'.format(ticker)
    #checks if it still needs to save data
    if not exists(path):
        #fetch data
        symbol_data = get_klines_data(ticker)
        #save data
        symbol_data.to_csv(path)

100%|██████████| 100/100 [07:17<00:00,  4.38s/it]


In [4]:
import os
list_of_tickers = os.listdir('price_data')

#### 2.3) Merge all data

In [5]:
price_data = False
for idx, ticker in tqdm(enumerate(list_of_tickers)):
    ticker = ticker.split('.')[0]
    path = 'price_data/{}.csv'.format(ticker)
    data = pd.read_csv(path)
    #if this is first loop instantiate the dataframe
    if idx == 0:
        price_data = data[['close_time','close']]
        price_data = price_data.set_index('close_time')
        price_data = price_data.rename({'close':ticker},axis=1)
    else:
        data = data[['close_time','close']]
        data = data.set_index('close_time')
        data = data.rename({'close':ticker},axis=1)
        data = data[~data.index.duplicated(keep='first')]
        #now merge the dataframes
        #price_data = pd.merge(left=price_data,right=data,how='left',on='close_time')
        price_data = price_data.join(data, how='outer')

    price_data = price_data[~price_data.index.duplicated(keep='first')]

100it [00:24,  4.06it/s]


#### 2.4) Clean dataset

In [6]:
#cleaning of the dataset
#price_data = price_data.dropna(thresh=100, axis=1,inplace=False)
#price_data = price_data.dropna(axis=1, thresh=len(price_data)-2)
#price_data = price_data.dropna()
#rounding the index
price_data.index = pd.to_datetime(price_data.index).round('min')
#drop the stablecoins
not_allowed = ['BUSDUSDT','TUSDUSDT','USDCUSDT','PAXUSDT','USDCTUSD','USDCPAX','PAXTUSD','BTCNGN', 'EURUSDT','GBPUSDT', 'AUDUSDT']
price_data = price_data.drop(not_allowed, axis=1, errors = 'ignore')

#### 2.5) Calculate the returns and log-returns for the dataset

In [13]:
price_data = price_data[(price_data.index.hour % 24 == 0)&(price_data.index.minute  == 0)]

In [15]:
price_data

Unnamed: 0_level_0,APEUSDT,RUNEUSDT,JASMYUSDT,FORTHUSDT,NEARUSDT,AAVEUSDT,DOTUSDT,DOGEUSDT,ROSEUSDT,FILUSDT,...,ONEUSDT,ADAUSDT,KNCUSDT,WINGUSDT,XRPUSDT,RADUSDT,XLMUSDT,AXSUSDT,GALAUSDT,MATICUSDT
close_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-11,,6.4438,,,5.4419,416.373,38.4861,0.056654,0.14736,43.0973,...,0.04228,1.15490,2.061,34.383,0.46630,,0.41496,4.04195,,0.29617
2021-03-12,,6.4700,,,5.9956,406.447,37.0995,0.056046,0.15905,42.0052,...,0.04150,1.11418,2.172,33.449,0.45209,,0.40493,4.57993,,0.43100
2021-03-13,,5.9582,,,7.0296,372.478,35.3053,0.055815,0.18209,44.5565,...,0.05991,1.04270,2.161,32.206,0.43791,,0.38751,4.99984,,0.40596
2021-03-14,,6.1207,,,7.3984,413.438,37.4360,0.058377,0.19270,49.2621,...,0.05370,1.09668,2.311,35.535,0.45863,,0.40546,5.36429,,0.42331
2021-03-15,,5.6658,,,6.7528,391.314,37.1438,0.059453,0.23519,56.5610,...,0.08143,1.07630,2.222,41.491,0.44559,,0.39702,6.09708,,0.39763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-28,4.687,1.9260,0.008573,4.81,3.8740,81.200,6.9900,0.062780,0.06345,5.7500,...,0.01993,0.44370,1.783,11.190,0.33390,2.138,0.10480,13.69000,0.05022,0.79580
2022-08-29,4.824,1.9120,0.008721,4.86,3.8780,81.300,7.0400,0.063690,0.06503,5.7600,...,0.01981,0.44050,1.689,10.870,0.33440,2.125,0.10520,13.73000,0.05102,0.81290
2022-08-30,4.908,1.9470,0.008866,4.92,3.9920,84.800,7.2400,0.063430,0.06565,5.9800,...,0.02058,0.44790,1.697,11.070,0.33210,2.121,0.10510,13.69000,0.05195,0.81400
2022-08-31,4.829,1.8920,0.008652,4.84,4.2140,84.900,7.0200,0.061510,0.06322,5.7300,...,0.02050,0.45100,1.750,10.650,0.32760,2.078,0.10240,13.70000,0.05072,0.81460


In [16]:
log_ret = np.log(price_data / price_data.shift(1))
#log_ret.dropna(inplace=True)

In [9]:
pct_ret = price_data.pct_change(1)
#pct_ret.dropna(inplace=True)

#### 2.6) Save the data

In [10]:
log_ret.to_csv('log_returns.csv')
pct_ret.to_csv('pct_returns.csv')
price_data.to_csv('price_data.csv')