In [1]:
import os
import json
import pandas as pd
from dotenv import load_dotenv

load_dotenv('env')

True

In [47]:
def to_dataframe(base_dir, sub_dir, fname):
    file_path = f'{base_dir}/{sub_dir}/{fname}.json'
    try:
        df = pd.read_json(file_path)
        df.columns = df.columns.map(str)
    except FileNotFoundError as err:
        raise (err)
    return df

def to_json(df:pd.DataFrame, tgt_dir, sub_dir, fname):
    file_path = f'{tgt_dir}/{sub_dir}/{fname}.json'
    os.makedirs(f'{tgt_dir}/{sub_dir}', exist_ok=True)
    try:
        df = pd.read_json(file_path, orient=True, lines=True)
    except FileNotFoundError as err:
        raise (err)
    return df

In [35]:
base_dir = os.getenv('EXTRACTED_DIR')
coinslist = to_dataframe(base_dir, 'coinslist', 'coinslist')
coinslist.columns

Index(['id', 'symbol', 'name', 'image', 'current_price', 'market_cap',
       'market_cap_rank', 'fully_diluted_valuation', 'total_volume',
       'high_24h', 'low_24h', 'price_change_24h',
       'price_change_percentage_24h', 'market_cap_change_24h',
       'market_cap_change_percentage_24h', 'circulating_supply',
       'total_supply', 'max_supply', 'ath', 'ath_change_percentage',
       'ath_date', 'atl', 'atl_change_percentage', 'atl_date', 'roi',
       'last_updated'],
      dtype='object')

In [36]:
def read_columns_map(path:str):
    try:
        with open(path, 'r') as f:
            json_doc = json.load(f)
    except FileNotFoundError as err:
        raise(err)
    return json_doc

transformed_dir = os.getenv('TRANSFORMED_DIR')
col_map = read_columns_map(f'{transformed_dir}/columns_map.json')
col_map

{'coins': {'id': 'coin_id', 'symbol': 'coin_symbol', 'name': 'coin_name'},
 'market_assets': {'id': 'coin_id',
  'current_price': 'current_price',
  'market_cap': 'market_cap',
  'market_cap_rank': 'market_cap_rank',
  'total_supply': 'total_supply',
  'max_supply': 'max_supply',
  'circulating_supply': 'circulating_supply',
  'ath': 'all_time_high',
  'ath_date': 'all_time_high_date',
  'atl': 'all_time_low',
  'atl_date': 'all_time_low_date',
  'last_updated': 'last_updated'},
 'candlesticks': {'0': 'timestamp',
  '1': 'open',
  '2': 'high',
  '3': 'low',
  '4': 'close'}}

In [48]:
def rename_columns(df:pd.DataFrame, column_map:dict):
    original_cols = column_map.keys()
    new_df = df[original_cols]
    new_df.rename(columns=column_map, inplace=True)
    return new_df

rename_columns(coinslist, col_map['coins'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.rename(columns=column_map, inplace=True)


Unnamed: 0,coin_id,coin_symbol,coin_name
0,bitcoin,btc,Bitcoin
1,ethereum,eth,Ethereum
2,tether,usdt,Tether
3,ripple,xrp,XRP
4,binancecoin,bnb,BNB
...,...,...,...
195,kusama,ksm,Kusama
196,trust-wallet-token,twt,Trust Wallet
197,blur,blur,Blur
198,frax-ether,frxeth,Frax Ether


In [60]:
import glob

files = glob.glob(f'data/extracted_data/candlesticks/*.json')
files

['data/extracted_data/candlesticks\\bitcoin.json',
 'data/extracted_data/candlesticks\\ethereum.json']

In [64]:
base_dir, sub_dir = 'data/extracted_data','candlesticks'
tgt_dir = 'data/transformed_data'

fname = 'candlesticks'
os.makedirs(f'{tgt_dir}/{sub_dir}', exist_ok=True)

file_path = f'{tgt_dir}/{sub_dir}/{fname}.json'
with open(file_path, 'a') as f:

    for file in files:
        coin = os.path.basename(file).replace('.json','')
        df = to_dataframe(base_dir, sub_dir, coin)
        df = rename_columns(df, col_map['candlesticks'])
        df.insert(loc=0, column='coin_id', value=coin)
        df.to_json(f, orient='records', lines=True)


In [66]:
candlesticks = pd.read_json(file_path, orient='records', lines=True)
candlesticks

Unnamed: 0,coin_id,timestamp,open,high,low,close
0,bitcoin,2024-12-11 20:00:00,3792.50,3814.86,3762.29,3814.22
1,bitcoin,2024-12-12 00:00:00,3810.97,3846.62,3810.97,3828.11
2,bitcoin,2024-12-12 04:00:00,3832.65,3933.53,3804.42,3933.53
3,bitcoin,2024-12-12 08:00:00,3930.49,3942.65,3904.00,3910.64
4,bitcoin,2024-12-12 12:00:00,3908.85,3932.39,3898.12,3913.78
...,...,...,...,...,...,...
355,ethereum,2025-01-10 00:00:00,3205.57,3234.37,3160.58,3219.09
356,ethereum,2025-01-10 04:00:00,3224.19,3254.13,3220.15,3254.13
357,ethereum,2025-01-10 08:00:00,3254.81,3306.46,3246.48,3296.67
358,ethereum,2025-01-10 12:00:00,3292.70,3316.16,3290.74,3307.42


In [87]:
timestamp = candlesticks['timestamp'].unique()

def transform_to_time(ar):
    df = pd.DataFrame(ar, columns=['timestamp'])
    df['hour'] = ar.hour
    df['day'] = ar.day
    df['month'] = ar.month
    df['year'] = ar.year
    df['weekday'] = ar.weekday
    return df
transform_to_time(timestamp)

Unnamed: 0,timestamp,hour,day,month,year,weekday
0,2024-12-11 20:00:00,20,11,12,2024,2
1,2024-12-12 00:00:00,0,12,12,2024,3
2,2024-12-12 04:00:00,4,12,12,2024,3
3,2024-12-12 08:00:00,8,12,12,2024,3
4,2024-12-12 12:00:00,12,12,12,2024,3
...,...,...,...,...,...,...
175,2025-01-10 00:00:00,0,10,1,2025,4
176,2025-01-10 04:00:00,4,10,1,2025,4
177,2025-01-10 08:00:00,8,10,1,2025,4
178,2025-01-10 12:00:00,12,10,1,2025,4
