In [None]:
import pandas as pd
import numpy as np
import itertools
# Root folder for every CSV this notebook reads or writes (relative path).
base_path = "data"

In [None]:
# Alternative symbol selection, kept for reference:
# SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'ADAUSDT', 'BNBUSDT', 'XRPUSDT', 'SOLUSDT', 'DOTUSDT', 'DOGEUSDT', 'AVAXUSDT', 'UNIUSDT']

# Symbols to process; each one is the first part of an input/output file name.
SYMBOLS = [
    'BTCUSDT',
    'ETHUSDT',
    'LTCUSDT',
    'AVAXUSDT',
    'LINKUSDT',
    'UNIUSDT',
    'TLMUSDT',
    'AXSUSDT',
    'DOGEUSDT',
    'SHIBUSDT',
]

# Bar resolutions to process; each one is the second part of a file name and
# is also used as the pandas frequency string when rebuilding the time grid.
RES_LIST = [
    '1min',
    '5min',
    '30min',
    '1h',
    '6h',
    '12h',
    '1d',
]

In [None]:
def preprocess_crypto(sym, res):
    """Put one symbol/resolution OHLCV file onto a gap-free time grid.

    Reads ``{base_path}/thesis-step2/{sym}_{res}.csv``, truncates every
    timestamp to the minute, builds all (date, tic) pairs between the first
    and last timestamp at frequency ``res`` and left-joins the data onto that
    grid.  Bars that were missing receive the same ticker's previous close as
    open/high/low/close and a volume of 0.  A ``day`` column (pandas day of
    week, Monday = 0) is added and the result is written to
    ``{base_path}/thesis-step3/{sym}_{res}.csv``.

    Parameters
    ----------
    sym : str
        Symbol part of the file name, e.g. ``'BTCUSDT'``.
    res : str
        Resolution part of the file name; also passed to ``pd.date_range`` as
        ``freq``.  ``'1d'`` is special-cased: timestamps are reduced to
        calendar dates and ``date`` is written as ``YYYY-MM-DD``.  Every other
        resolution uses ``YYYY-MM-DD HH:MM:SS``.

    Returns
    -------
    pandas.DataFrame
        One row per (date, tic) grid point, ``date`` stored as a string,
        with the extra ``day`` column.

    Notes
    -----
    * The input must provide the columns ``date``, ``tic``, ``open``,
      ``high``, ``low``, ``close`` and ``volume``.
    * NOTE(review): ``date`` is assumed to hold timezone-naive ISO-style
      timestamp strings -- confirm against the step-2 export.
    * The grid is anchored at the earliest timestamp; input rows that do not
      fall on a grid point are not carried into the result.
    * Forward filling is done per ticker.  Grid points before a ticker's
      first bar therefore keep NaN prices instead of borrowing another
      ticker's close.
    """
    filename = f"{sym}_{res}.csv"
    print(f"Handling {filename}")
    df = pd.read_csv(f"{base_path}/thesis-step2/{filename}", index_col=0)
    countBefore = df.shape[0]

    # Round down to the minute (drop seconds).  Converting through an object
    # array lets NumPy parse the raw values itself, whatever pandas dtype the
    # column was read as.
    df['date'] = df['date'].to_numpy(dtype=object).astype('<M8[m]')
    df = df.sort_values(['date', 'tic']).reset_index(drop=True)

    # Daily bars are keyed by calendar date, everything else by full timestamp.
    if res == '1d':
        df['date'] = df['date'].dt.normalize()
        date_format = '%Y-%m-%d'
    else:
        date_format = '%Y-%m-%d %H:%M:%S'

    # Build the complete grid and render both merge keys with the SAME explicit
    # format, so the join never depends on how a given pandas version
    # stringifies datetimes.
    grid = pd.date_range(df['date'].min(), df['date'].max(), freq=res)
    list_date = list(grid.strftime(date_format))
    df['date'] = df['date'].dt.strftime(date_format)

    list_ticker = df["tic"].unique().tolist()
    countExpected = len(list_date) * len(list_ticker)
    combination = list(itertools.product(list_date, list_ticker))
    combination_df = pd.DataFrame(combination, columns=["date", "tic"])
    processed_full = combination_df.merge(df, on=["date", "tic"], how="left")

    # Carry the last known close forward *within each ticker*; rows of
    # different tickers are interleaved, so a plain ffill would leak prices
    # from one ticker into another.
    processed_full['close'] = processed_full.groupby('tic')['close'].ffill()

    # Inserted bars: no trades (volume 0) and a flat candle at the carried
    # close.  Assign the result back instead of using chained
    # ``fillna(inplace=True)``, which acts on a temporary Series.
    processed_full['volume'] = processed_full['volume'].fillna(0)
    for price_col in ('open', 'high', 'low'):
        processed_full[price_col] = processed_full[price_col].fillna(processed_full['close'])

    processed_full['day'] = pd.to_datetime(processed_full['date']).dt.day_of_week
    processed_full = processed_full.reset_index(drop=True)
    countNew = processed_full.shape[0]

    # A non-zero Diff. means the merge produced extra rows, i.e. the input
    # holds duplicate (date, tic) pairs.
    print(f"Start: {countBefore} Expected: {countExpected} New: {countNew} Diff.: {countNew-countExpected}")
    print(f"Added: {countNew-countBefore}")
    print("----------------------------------------------------------")

    processed_full.to_csv(f"{base_path}/thesis-step3/{filename}")
    return processed_full
    

In [None]:
# Single-file check: uncomment to preprocess just one symbol/resolution pair.
# res = '1d'
# sym = 'BTCUSDT'
# df = preprocess_crypto(sym, res)

In [None]:
# Preprocess every symbol at every resolution.  Symbols vary slowest, so all
# resolutions of one symbol are handled back to back; `df` ends up holding
# the frame of the last pair processed.
for sym, res in itertools.product(SYMBOLS, RES_LIST):
    df = preprocess_crypto(sym, res)