In [1]:
import os
import requests
import json
import pandas as pd
from datetime import datetime
import time

In [2]:
API_KEY = os.environ.get("CRYPTO_API_KEY")
CRYPTO_CURRENCY = "BTC"
LOCAL_CURRENCY = "EUR"
BATCH_SIZE = 10
API_ENDPOINT = "histohour"

# 1. API Investigation

In [3]:
request = requests.get(f"https://min-api.cryptocompare.com/data/v2/{API_ENDPOINT}?fsym={CRYPTO_CURRENCY}&tsym={LOCAL_CURRENCY}&limit={BATCH_SIZE}&{API_KEY}")
# request.content --> return type is actually bytes and not a json, but containing a json object
# bytes can be converted back to json using loads from json library
rep = json.loads(request.content)

In [4]:
rep

{'Response': 'Success',
 'Message': '',
 'Type': 100,
 'RateLimit': {},
 'Data': {'Aggregated': False,
  'TimeFrom': 1746507600,
  'TimeTo': 1746543600,
  'Data': [{'time': 1746507600,
    'high': 83548.34,
    'low': 83330.57,
    'open': 83512.31,
    'volumefrom': 19.15,
    'volumeto': 1597467.91,
    'close': 83363.69,
    'conversionType': 'direct',
    'conversionSymbol': ''},
   {'time': 1746511200,
    'high': 83467.87,
    'low': 83360.52,
    'open': 83363.69,
    'volumefrom': 22.32,
    'volumeto': 1861414.42,
    'close': 83432.99,
    'conversionType': 'direct',
    'conversionSymbol': ''},
   {'time': 1746514800,
    'high': 83492.04,
    'low': 83196.8,
    'open': 83432.99,
    'volumefrom': 38.95,
    'volumeto': 3246153.83,
    'close': 83257.66,
    'conversionType': 'direct',
    'conversionSymbol': ''},
   {'time': 1746518400,
    'high': 83313.76,
    'low': 82918.39,
    'open': 83257.66,
    'volumefrom': 91.08,
    'volumeto': 7566983.96,
    'close': 83313.7

In [5]:
data = rep['Data']['Data']
# Convert list of dictionnaries into a DataFrame
df = pd.json_normalize(data)

In [6]:
df

Unnamed: 0,time,high,low,open,volumefrom,volumeto,close,conversionType,conversionSymbol
0,1746507600,83548.34,83330.57,83512.31,19.15,1597467.91,83363.69,direct,
1,1746511200,83467.87,83360.52,83363.69,22.32,1861414.42,83432.99,direct,
2,1746514800,83492.04,83196.8,83432.99,38.95,3246153.83,83257.66,direct,
3,1746518400,83313.76,82918.39,83257.66,91.08,7566983.96,83313.76,direct,
4,1746522000,83357.11,83145.04,83313.76,38.54,3207371.58,83332.0,direct,
5,1746525600,83370.61,82941.25,83332.0,60.62,5041409.93,82975.02,direct,
6,1746529200,83076.83,82756.62,82975.02,52.45,4350319.08,82857.12,direct,
7,1746532800,82959.73,82785.46,82857.12,46.62,3864315.77,82870.26,direct,
8,1746536400,82879.31,82281.16,82870.26,139.24,11503479.44,82490.53,direct,
9,1746540000,83433.43,82481.4,82490.53,119.36,9911093.23,83033.24,direct,


As you can see, we do not have dates and hours but integers instead.
More precisely, they are timestamps which makes compatibility between systems, storage and calculation easier.
However, if we want to expose our database with these data to business users, we have to convert timestamp back to "understandable" dates, so we provide meaningful data.

In [7]:
df["time"] = df["time"].apply(lambda time: datetime.fromtimestamp(time))
df.drop(columns=["conversionType", "conversionSymbol"], inplace=True)
df

Unnamed: 0,time,high,low,open,volumefrom,volumeto,close
0,2025-05-06 07:00:00,83548.34,83330.57,83512.31,19.15,1597467.91,83363.69
1,2025-05-06 08:00:00,83467.87,83360.52,83363.69,22.32,1861414.42,83432.99
2,2025-05-06 09:00:00,83492.04,83196.8,83432.99,38.95,3246153.83,83257.66
3,2025-05-06 10:00:00,83313.76,82918.39,83257.66,91.08,7566983.96,83313.76
4,2025-05-06 11:00:00,83357.11,83145.04,83313.76,38.54,3207371.58,83332.0
5,2025-05-06 12:00:00,83370.61,82941.25,83332.0,60.62,5041409.93,82975.02
6,2025-05-06 13:00:00,83076.83,82756.62,82975.02,52.45,4350319.08,82857.12
7,2025-05-06 14:00:00,82959.73,82785.46,82857.12,46.62,3864315.77,82870.26
8,2025-05-06 15:00:00,82879.31,82281.16,82870.26,139.24,11503479.44,82490.53
9,2025-05-06 16:00:00,83433.43,82481.4,82490.53,119.36,9911093.23,83033.24


As you can see, the amount of data fetched is limited to the batch size defined at the beginning.
We could increase it to 2000 (max authorized by CoinDesk) but we would still miss data.
As advised by CoinDesk, we need then to retrieve data by batch and iterate using the earliest date available in the metadata (i.e TimeFrom)

# 2. ETL Pipeline

How to iterate if we do not know the earliest date available?
Thankfully, another endpoint provides all data (daily); we will use it to get and define the earliest available record to be able to iterate over hourly data then.

In [8]:
BATCH_SIZE = 2000

In [9]:
def get_historical_earliest_date() -> int:
    
    request = requests.get(f"https://min-api.cryptocompare.com/data/v2/histoday?fsym={CRYPTO_CURRENCY}&tsym={LOCAL_CURRENCY}&allData=true&{API_KEY}")
    rep = json.loads(request.content)

    return rep["Data"]["TimeFrom"]

In [10]:
historical_earliest_date = get_historical_earliest_date()
print(historical_earliest_date, datetime.fromtimestamp(historical_earliest_date))

1314403200 2011-08-27 02:00:00


In [11]:
def get_hourly_batch_data(toTs: int) -> int | list:

    request = requests.get(f"https://min-api.cryptocompare.com/data/v2/{API_ENDPOINT}?fsym={CRYPTO_CURRENCY}&tsym={LOCAL_CURRENCY}&limit={str(BATCH_SIZE)}&toTs={toTs}&{API_KEY}")
    
    if request.status_code != 200:
        raise Exception(request.content)

    rep = json.loads(request.content)
    data = rep["Data"]["Data"]
    batch_earliest_date = rep["Data"]["TimeFrom"]

    return (data, batch_earliest_date)

In [12]:
now = time.time()
hourly_data, batch_earliest_date = get_hourly_batch_data(now)
hourly_data, batch_earliest_date

([{'time': 1739343600,
   'high': 92810.04,
   'low': 92513.66,
   'open': 92536.28,
   'volumefrom': 84.7,
   'volumeto': 7842919.38,
   'close': 92615.79,
   'conversionType': 'direct',
   'conversionSymbol': ''},
  {'time': 1739347200,
   'high': 92994.99,
   'low': 92610.11,
   'open': 92615.79,
   'volumefrom': 56.13,
   'volumeto': 5207567.66,
   'close': 92947.71,
   'conversionType': 'direct',
   'conversionSymbol': ''},
  {'time': 1739350800,
   'high': 92959.12,
   'low': 92633.52,
   'open': 92947.71,
   'volumefrom': 69.38,
   'volumeto': 6434326.89,
   'close': 92722.29,
   'conversionType': 'direct',
   'conversionSymbol': ''},
  {'time': 1739354400,
   'high': 92775.4,
   'low': 92501.71,
   'open': 92722.29,
   'volumefrom': 46.74,
   'volumeto': 4330883.15,
   'close': 92533.05,
   'conversionType': 'direct',
   'conversionSymbol': ''},
  {'time': 1739358000,
   'high': 92862.33,
   'low': 92532.97,
   'open': 92533.05,
   'volumefrom': 66.94,
   'volumeto': 6204274.02

In [13]:
while batch_earliest_date > historical_earliest_date:

    prev_hourly_data, batch_earliest_date = get_hourly_batch_data(batch_earliest_date)
    hourly_data += list(prev_hourly_data)

    print(batch_earliest_date)

1732143600
1724943600
1717743600
1710543600
1703343600
1696143600
1688943600
1681743600
1674543600
1667343600
1660143600
1652943600
1645743600
1638543600
1631343600
1624143600
1616943600
1609743600
1602543600
1595343600
1588143600
1580943600
1573743600
1566543600
1559343600
1552143600
1544943600
1537743600
1530543600
1523343600
1516143600
1508943600
1501743600
1494543600
1487343600
1480143600
1472943600
1465743600
1458543600
1451343600
1444143600
1436943600
1429743600
1422543600
1415343600
1408143600
1400943600
1393743600
1386543600
1379343600
1372143600
1364943600
1357743600
1350543600
1343343600
1336143600
1328943600
1321743600
1314543600
1307343600


In [14]:
hourly_df = pd.json_normalize(hourly_data)

In [16]:
hourly_df

Unnamed: 0,time,high,low,open,volumefrom,volumeto,close,conversionType,conversionSymbol
0,1739343600,92810.040,92513.660,92536.280,84.70,7842919.38,92615.790,direct,
1,1739347200,92994.990,92610.110,92615.790,56.13,5207567.66,92947.710,direct,
2,1739350800,92959.120,92633.520,92947.710,69.38,6434326.89,92722.290,direct,
3,1739354400,92775.400,92501.710,92722.290,46.74,4330883.15,92533.050,direct,
4,1739358000,92862.330,92532.970,92533.050,66.94,6204274.02,92647.660,direct,
...,...,...,...,...,...,...,...,...,...
122056,1314529200,6.145,6.145,6.145,0.00,0.00,6.145,direct,
122057,1314532800,6.145,6.145,6.145,0.00,0.00,6.145,direct,
122058,1314536400,6.145,6.145,6.145,0.00,0.00,6.145,direct,
122059,1314540000,6.145,6.145,6.145,0.00,0.00,6.145,direct,


In [None]:
hourly_df["time"] = hourly_df["time"].apply(lambda time: datetime.fromtimestamp(time))
hourly_df.drop(columns=["conversionType", "conversionSymbol"], inplace=True)

In [18]:
hourly_df.sort_values(by=["time"], ascending=True, inplace=True)

In [23]:
hourly_df.reset_index(drop=True, inplace=True)
hourly_df

Unnamed: 0,time,high,low,open,volumefrom,volumeto,close
0,2011-06-06 09:00:00,0.00,0.00,0.00,0.00,0.00,0.00
1,2011-06-06 10:00:00,0.00,0.00,0.00,0.00,0.00,0.00
2,2011-06-06 11:00:00,0.00,0.00,0.00,0.00,0.00,0.00
3,2011-06-06 12:00:00,0.00,0.00,0.00,0.00,0.00,0.00
4,2011-06-06 13:00:00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...
122056,2025-05-06 13:00:00,83076.83,82756.62,82975.02,52.45,4350319.08,82857.12
122057,2025-05-06 14:00:00,82959.73,82785.46,82857.12,46.62,3864315.77,82870.26
122058,2025-05-06 15:00:00,82879.31,82281.16,82870.26,139.24,11503479.44,82490.53
122059,2025-05-06 16:00:00,83433.43,82481.40,82490.53,119.36,9911093.23,83033.24


In [None]:
def process_df(data: list) -> pd.DataFrame:

    # Convert list of json into a single DataFrame
    df = pd.json_normalize(data)

    # Convert date format: from integers (i.e timestamps) to dates
    df["time"] = df["time"].apply(lambda time: datetime.fromtimestamp(time))
    
    df.drop(columns=["conversionType", "conversionSymbol"], inplace=True)

    # Sort DataFrame from earliest to latest date and reset index
    df.sort_values(by=["time"], ascending=True, inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

In [25]:
hourly_df.to_csv("btc_hourly_data.csv")