# DataPipeline 
This notebook fetches and cleans the price data that will be used to backtest the model.

### Dimensions of data:
1) 15 minute candles
2) 100 most popular trading pairs
3) 18 months of data

### 0) Parameters for Data 

In [1]:
# Length of the price history to download, in months
# (the date-range cell counts each month as 30 days).
MONTHS_OF_DATA = 18
# Number of trading pairs kept from the top of the sorted product list.
AMOUNT_OF_PRODUCTS = 100

### 1) Preparations for data download

#### 1.1) Imports and API keys

In [3]:
#from binance import Client
import datetime as dt
import json
import os
from os.path import exists

import numpy as np
import pandas as pd
from tqdm import trange, tqdm

In [8]:
# Read the Binance credentials from a local JSON file so that they never
# have to be written into the notebook itself.
with open('API_keys.json') as key_file:
    credentials = json.load(key_file)
api_key, api_secret = credentials['API-Key'], credentials['Secret-Key']

#### 1.2) Connecting to the binance API

In [9]:
# The top-level import of `Client` is commented out, so on a fresh kernel this
# cell raised NameError. Importing it here keeps the python-binance dependency
# local to the cells that actually talk to the API.
from binance import Client

# Authenticated API client used by the product listing and the downloader.
client = Client(api_key, api_secret)
# Candle interval requested for every symbol (15-minute candles).
candle = client.KLINE_INTERVAL_15MINUTE

### 2) Data download and cleaning

#### 2.1) Create list of products to add

In [42]:
# Fetch every product, keep the ones quoted in USDT that are currently
# trading, and rank them by the 'qv' column in descending order.
dictionary = client.get_products()['data']
df_products = pd.DataFrame(dictionary)
is_usdt_quoted = df_products['q'] == 'USDT'
is_trading = df_products['st'] == 'TRADING'
df_filtered_products = df_products[is_usdt_quoted & is_trading]
df_sorted_products = df_filtered_products.sort_values(by='qv', ascending=False)
# The symbols of the top AMOUNT_OF_PRODUCTS rows are the tickers to download.
top_symbols = df_sorted_products['s'].head(AMOUNT_OF_PRODUCTS)
list_of_tickers = list(top_symbols.values)

#### 2.2) Data downloader

In [None]:
# Download window: it ends yesterday and reaches back MONTHS_OF_DATA months,
# with a month approximated as 30 days.
end = dt.datetime.today() - dt.timedelta(days=1)
start = end - dt.timedelta(days=30 * MONTHS_OF_DATA)

# Date strings in the form '5 Jan, 2022'. The day is taken from `.day`
# instead of the '%-d' directive, which is a platform-specific strftime
# extension and is not available everywhere (e.g. on Windows).
END = '{} {}'.format(end.day, end.strftime('%b, %Y'))
START = '{} {}'.format(start.day, start.strftime('%b, %Y'))

In [123]:
# purpose of function is to fetch price data for a specific symbol
def get_klines_data(symbol):
    """Fetch the historical candles of one trading pair as a DataFrame.

    The candles are requested from the module-level ``client`` for the
    interval ``candle`` and the date range ``START`` .. ``END``.

    Parameters
    ----------
    symbol : str
        Trading pair to download, e.g. one entry of ``list_of_tickers``.

    Returns
    -------
    pandas.DataFrame
        One row per candle with the columns ``open_time``, ``open``,
        ``high``, ``low``, ``close``, ``volume``, ``close_time``, ``qav``,
        ``num_trades``, ``taker_base_vol``, ``taker_quote_vol`` and
        ``ignore``. ``open_time`` and ``close_time`` are timezone-naive
        UTC timestamps. The frame is empty (but has these columns) when
        the API returns no candles.

    Notes
    -----
    Timestamps used to be converted with ``datetime.fromtimestamp``, i.e.
    into the local timezone of the machine running the notebook. They are
    now always UTC, so CSV files cached by an older run should be deleted
    and downloaded again before they are merged with new ones.
    """
    column_names = [
        'open_time', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'qav', 'num_trades', 'taker_base_vol',
        'taker_quote_vol', 'ignore',
    ]
    klines = client.get_historical_klines(symbol, candle, START, END)
    if not klines:
        # Nothing returned for this range: hand back an empty frame with the
        # expected columns instead of failing on the missing time columns.
        return pd.DataFrame(columns=column_names)

    # Each candle is a positional list; map position -> column name.
    data = pd.DataFrame(klines).rename(columns=dict(enumerate(column_names)))
    # Millisecond epoch values -> timestamps, independent of the local timezone.
    for time_column in ('open_time', 'close_time'):
        data[time_column] = pd.to_datetime(data[time_column], unit='ms')
    return data

In [126]:
# Download the candle history of every selected ticker, one CSV per symbol.
# Symbols whose file already exists are skipped, so an interrupted run can
# simply be started again.
os.makedirs('price_data', exist_ok=True)  # to_csv does not create missing folders
for ticker in tqdm(list_of_tickers):
    path = 'price_data/{}.csv'.format(ticker)
    if exists(path):
        continue
    symbol_data = get_klines_data(ticker)
    symbol_data.to_csv(path)

100%|██████████| 100/100 [07:17<00:00,  4.38s/it]


In [9]:
# Rebuild the ticker list from the CSV files already on disk, so the merge
# below can run without repeating the download.
# Only '*.csv' entries are kept: anything else in the folder (for example
# '.ipynb_checkpoints' or '.DS_Store') would make the merge try to read a file
# that does not exist. The names are sorted because os.listdir returns them in
# arbitrary order, which would make the column order differ between runs.
list_of_tickers = sorted(
    file_name
    for file_name in os.listdir('price_data')
    if file_name.endswith('.csv')
)

#### 2.3) Merge all data

In [43]:
# Build one close-price column per ticker and align them all in a single
# outer join on the candle close time.
close_columns = []
for ticker in tqdm(list_of_tickers):
    # Entries are either bare symbols or CSV file names; keep the symbol part.
    ticker = ticker.split('.')[0]
    path = 'price_data/{}.csv'.format(ticker)
    candles = pd.read_csv(path)
    closes = candles.set_index('close_time')['close'].rename(ticker)
    # A close time may occur only once per symbol; keep its first occurrence.
    closes = closes[~closes.index.duplicated(keep='first')]
    close_columns.append(closes)

if not close_columns:
    # Fail here with a clear message rather than handing a placeholder value
    # to the cleaning cells further down.
    raise ValueError('no price files to merge: list_of_tickers is empty')

# One concat instead of a join per ticker: the growing frame is not copied on
# every iteration. sort=True orders the union of all close times.
price_data = pd.concat(close_columns, axis=1, join='outer', sort=True)
price_data.index.name = 'close_time'

100it [00:26,  3.74it/s]


#### 2.4) Clean dataset

In [44]:
# Cleaning of the merged price table.
# No dropna is applied here, so missing values stay in the frame.

# Parse the close times and snap them to the nearest whole minute.
parsed_index = pd.to_datetime(price_data.index)
price_data.index = parsed_index.round('min')

# Remove stablecoin and fiat pairs; names that are not present are ignored.
not_allowed = [
    'BUSDUSDT', 'TUSDUSDT', 'USDCUSDT', 'PAXUSDT',
    'USDCTUSD', 'USDCPAX', 'PAXTUSD', 'BTCNGN',
    'EURUSDT', 'GBPUSDT', 'AUDUSDT',
]
price_data = price_data.drop(not_allowed, axis=1, errors='ignore')

#### 2.5) Calculate the returns and log-returns for the dataset

In [52]:
# Down-sample to one observation every 4 hours.
# Testing the hour alone keeps every 15-minute candle inside the matching
# hours (00:00, 00:15, 00:30, 00:45, 04:00, ...), so consecutive rows are
# unevenly spaced and the returns computed below mix 15-minute and
# 3h15m horizons. Requiring minute == 0 as well leaves a regular 4-hour grid.
# NOTE(review): assumes a regular 4-hour series is what the backtest expects.
on_four_hour_mark = (price_data.index.hour % 4 == 0) & (price_data.index.minute == 0)
price_data = price_data[on_four_hour_mark]

In [53]:
# Log-return of every column: log of the price divided by the previous row's price.
# Rows with missing values are kept (no dropna).
previous_prices = price_data.shift(periods=1)
log_ret = np.log(price_data / previous_prices)

In [54]:
# Simple (percentage) return of every column relative to the previous row.
# fill_method=None switches off the forward-fill that pct_change applies by
# default, so a gap in the prices yields NaN here exactly as it does in
# log_ret, and the deprecated default is no longer relied upon.
# Rows with missing values are kept (no dropna).
pct_ret = price_data.pct_change(1, fill_method=None)

#### 2.6) Save the data

In [57]:
# Write the three result tables to CSV files in the working directory.
outputs = (
    (log_ret, 'log_returns.csv'),
    (pct_ret, 'pct_returns.csv'),
    (price_data, 'price_data.csv'),
)
for frame, file_name in outputs:
    frame.to_csv(file_name)