In [1]:
import pandas as pd
import numpy as np
import itertools
# Directory holding the raw `{symbol}_{resolution}.csv` inputs; the
# preprocessed copies are written to its `new/` sub-directory.
base_path = "data"

In [2]:
# Alternative symbol set, currently disabled:
# SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'ADAUSDT', 'BNBUSDT', 'XRPUSDT', 'SOLUSDT', 'DOTUSDT', 'DOGEUSDT', 'AVAXUSDT', 'UNIUSDT']

# Symbols to preprocess: every base ticker below with the USDT suffix.
SYMBOLS = [
    f"{coin}USDT"
    for coin in (
        'NEO', 'LTC', 'EOS', 'XLM', 'SHIB',
        'IOTA', 'ETC', 'LINK', 'XMR',
        'MATIC', 'ALGO', 'MANA', 'APE', 'FTM',
        'GMT', 'BOND', 'ATOM', 'NEAR', 'RUNE',
        'SAND', 'WAVES', 'TRX',
    )
]

# Bar resolutions; each value is both part of the CSV file name and the
# frequency string handed to pandas when rebuilding the time grid.
RES_LIST = [
    '1min',
    '5min',
    '1h',
    '1d',
]

In [3]:
def preprocess_crypto(sym, res):
    """Regularise one symbol/resolution CSV onto a gap-free time grid.

    Reads ``{base_path}/{sym}_{res}.csv``, floors every timestamp to the
    minute (to the calendar day when ``res == '1d'``) and inserts a row for
    every (timestamp, ticker) pair that is missing between the first and the
    last observation. Inserted rows get the same ticker's previous close as
    open/high/low/close and a volume of 0. A ``day`` column holding the day of
    the week (Monday=0 ... Sunday=6) is appended, and the result is written to
    ``{base_path}/new/{sym}_{res}.csv``.

    Parameters
    ----------
    sym : str
        Symbol part of the file name.
    res : str
        Resolution part of the file name; also used as the ``pd.date_range``
        frequency of the rebuilt grid.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (string), ``tic``, the remaining input columns and
        ``day``, ordered by date then ticker.

    Raises
    ------
    ValueError
        If the input file contains no rows.
    """
    import os

    filename = f"{sym}_{res}.csv"
    print(f"Handling {filename}")
    df = pd.read_csv(f"{base_path}/{filename}", index_col=0)
    countBefore = df.shape[0]
    if df.empty:
        raise ValueError(f"{filename} contains no rows")

    # Round down to the nearest minute (ignore seconds); daily bars are
    # further reduced to the calendar date.
    stamps = pd.to_datetime(df['date']).dt.floor('min')
    if res == '1d':
        stamps = stamps.dt.normalize()
        date_format = '%Y-%m-%d'
        freq = '1D'  # canonical spelling of the daily frequency alias
    else:
        date_format = '%Y-%m-%d %H:%M:%S'
        freq = res

    # Both sides of the merge are rendered with the same explicit format so
    # the string keys match regardless of the pandas version in use.
    df['date'] = stamps.dt.strftime(date_format)
    df = df.sort_values(['date', 'tic']).reset_index(drop=True)

    # Insert missing rows: full (date, ticker) grid left-joined with the data.
    list_date = pd.date_range(stamps.min(), stamps.max(), freq=freq).strftime(date_format)
    countExpected = len(list_date)
    list_ticker = df['tic'].unique().tolist()
    combination_df = pd.DataFrame(
        list(itertools.product(list_date, list_ticker)),
        columns=['date', 'tic'],
    )
    processed_full = combination_df.merge(df, on=['date', 'tic'], how='left')

    # Forward-fill the close per ticker so a gap never picks up another
    # ticker's price, then use that close for the missing O/H/L and set the
    # missing volume to 0. Results are assigned back explicitly: in-place
    # fillna on an attribute-accessed column is a chained assignment and does
    # not update the frame under pandas Copy-on-Write.
    processed_full['close'] = processed_full.groupby('tic')['close'].ffill()
    processed_full['volume'] = processed_full['volume'].fillna(0)
    for price_col in ('open', 'high', 'low'):
        processed_full[price_col] = processed_full[price_col].fillna(processed_full['close'])

    processed_full['day'] = pd.to_datetime(processed_full['date']).dt.day_of_week
    processed_full = processed_full.reset_index(drop=True)
    countNew = processed_full.shape[0]
    print(f"Start: {countBefore} Expected: {countExpected} New: {countNew} Diff.: {countNew-countExpected}")
    print(f"Added: {countNew-countBefore}")
    print("----------------------------------------------------------")

    # Make sure the output directory exists before writing.
    os.makedirs(f"{base_path}/new", exist_ok=True)
    processed_full.to_csv(f"{base_path}/new/{sym}_{res}.csv")
    return processed_full
    

In [4]:
# res = '1d'
# sym = 'BTCUSDT'
# df = preprocess_crypto(sym, res)

In [4]:
# Preprocess every configured symbol at every resolution (symbols vary
# slowest, resolutions fastest). `df` is overwritten on each pass, so after
# the loop it holds the frame returned by the last call.
for sym, res in itertools.product(SYMBOLS, RES_LIST):
    df = preprocess_crypto(sym, res)

Handling NEOUSDT_1min.csv
Start: 2415388 Expected: 2423520 New: 2423520 Diff.: 0
Added: 8132
----------------------------------------------------------
Handling NEOUSDT_5min.csv
Start: 483088 Expected: 484704 New: 484704 Diff.: 0
Added: 1616
----------------------------------------------------------
Handling NEOUSDT_1h.csv
Start: 40271 Expected: 40392 New: 40392 Diff.: 0
Added: 121
----------------------------------------------------------
Handling NEOUSDT_1d.csv
Start: 1683 Expected: 1683 New: 1683 Diff.: 0
Added: 0
----------------------------------------------------------
Handling LTCUSDT_1min.csv
Start: 2382316 Expected: 2390400 New: 2390400 Diff.: 0
Added: 8084
----------------------------------------------------------
Handling LTCUSDT_5min.csv
Start: 476472 Expected: 478080 New: 478080 Diff.: 0
Added: 1608
----------------------------------------------------------
Handling LTCUSDT_1h.csv
Start: 39719 Expected: 39840 New: 39840 Diff.: 0
Added: 121
---------------------------------

In [6]:
# processed_full[processed_full['date'] > '2021-04-20 04:10:00'].head(10)
# isnan = processed_full[processed_full['date'].isin(df['date']) == False]