In [None]:
# # Create a virtual environment
# python3 -m venv venv_crypto_prediction

# # Activate the virtual environment
# source venv_crypto_prediction/bin/activate

# # Install packages from requirements.txt
# pip install -r requirements.txt

# # Add the virtual environment as a Jupyter kernel
# python -m ipykernel install --user --name=venv_crypto_prediction --display-name "Python (venv_crypto_prediction)"

### Historical OHLC (open, high, low, close) data was downloaded as a .csv from here: https://www.coinlore.com/coin/ethereum/historical-data

In [5]:
import csv

def convert_value(value):
    """
    Convert a formatted monetary/quantity string to a float.

    ``$`` signs, thousands separators (``,``) and surrounding whitespace are
    removed first. A trailing magnitude suffix is then expanded
    (case-insensitive): ``k`` = thousand, ``m`` = million, ``b`` or ``bn`` =
    billion. A value without a suffix is parsed as-is.

    Args:
        value: Text such as ``"$1.5bn"``, ``"$2,345.67"`` or ``"12K"``.
            Commas are treated as thousands separators, never as a decimal
            mark.

    Returns:
        float: The numeric value with the suffix multiplier applied.

    Raises:
        ValueError: If nothing is left after stripping, or if the remaining
            text is not a valid number.
    """
    cleaned = value.replace('$', '').replace(',', '').strip()
    if not cleaned:
        # Indexing the last character of an empty string would raise an
        # opaque IndexError; report the real problem instead.
        raise ValueError(f"Cannot convert empty value: {value!r}")

    lowered = cleaned.lower()
    suffix_multipliers = (
        ('bn', 1_000_000_000),
        ('b', 1_000_000_000),
        ('m', 1_000_000),
        ('k', 1_000),
    )
    for suffix, multiplier in suffix_multipliers:
        if lowered.endswith(suffix):
            # Slice the original text so only the suffix itself is dropped.
            return float(cleaned[:-len(suffix)]) * multiplier

    return float(cleaned)


def clean_csv(input_file, output_file):
    """
    Copy an OHLC CSV to ``output_file`` with its numeric columns normalised.

    The header row is written through unchanged. In every data row:

    * columns 1-5 and 7 (Open, High, Low, Close, Volume, Market Cap) are
      passed through ``convert_value``;
    * column 6 (Volume(ETH)) is parsed as a plain float first and only falls
      back to ``convert_value`` when that fails (e.g. it carries a suffix);
    * column 0 and any columns after index 7 are written back untouched.

    Blank lines are skipped, and an empty input file produces an empty
    output file.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the CSV to write (overwritten if it exists).

    Raises:
        IndexError: If a non-blank data row has fewer than 8 columns.
        ValueError: If a cell cannot be parsed as a number.
    """
    # newline='' lets the csv module do its own newline handling.
    with open(input_file, 'r', newline='') as infile, open(output_file, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Default of None avoids StopIteration on an empty input file.
        header = next(reader, None)
        if header is None:
            return
        writer.writerow(header)

        for row in reader:
            # csv.reader yields [] for blank lines; there is nothing to convert.
            if not row:
                continue

            for index in (1, 2, 3, 4, 5):  # Open, High, Low, Close, Volume
                row[index] = convert_value(row[index])

            # Volume(ETH) is usually a bare number; fall back to the suffix
            # parser only if it is not.
            try:
                row[6] = float(row[6])
            except ValueError:
                row[6] = convert_value(row[6])

            row[7] = convert_value(row[7])  # Market Cap

            writer.writerow(row)

# Normalise the downloaded OHLC export (first path) into a cleaned copy (second path).
# NOTE(review): both paths are relative to the current working directory, whereas the
# fetch cell further down writes under '../data/...' — confirm which root is intended.
clean_csv('data/historical_ohlc/ethereum_ohlc.csv', 'data/historical_ohlc/ethereum_ohlc_clean.csv')


### Use the CoinGecko API to get historical hourly data for a coin. Note that the returned data is sometimes inconsistent or malformed

In [5]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
import os

def fetch_and_append_crypto_data(crypto, start_date, end_date, output_file):
    """
    Download CoinGecko market-chart data in 90-day chunks and append it to a CSV.

    The range ``[start_date, end_date)`` is walked in windows of at most 90
    days. Each window is requested from the ``market_chart/range`` endpoint
    via ``safe_request`` and appended to ``output_file`` as rows of
    ``date,price,market_cap,volume_24h``.

    Behaviour worth knowing:

    * The header is written only when ``output_file`` does not exist yet.
      An existing file is appended to, so re-running over the same range
      duplicates rows.
    * A window with no price points is reported and skipped.
    * ``market_caps`` / ``total_volumes`` are matched to prices by position.
      If one of them is shorter than the price series, the missing cells are
      left empty instead of aborting the whole download.
    * The first failed request stops the download; chunks already written
      stay in the file.

    Args:
        crypto: CoinGecko coin id used in the URL (e.g. ``"ethereum"``).
        start_date: ``datetime`` at which to start fetching.
        end_date: ``datetime`` at which to stop fetching.
        output_file: Path of the CSV file to create or append to.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # NOTE(review): 90 days is presumably chosen because CoinGecko serves
    # hourly points only for ranges up to 90 days — confirm against the API docs.
    chunk_span = timedelta(days=90)
    url = f"https://api.coingecko.com/api/v3/coins/{crypto}/market_chart/range"

    # Check if the file exists; if not, initialize it with headers
    if not os.path.exists(output_file):
        with open(output_file, 'w') as f:
            f.write('date,price,market_cap,volume_24h\n')

    current_start_date = start_date
    while current_start_date < end_date:
        current_end_date = min(current_start_date + chunk_span, end_date)

        params = {
            'vs_currency': 'usd',
            'from': int(current_start_date.timestamp()),
            'to': int(current_end_date.timestamp())
        }

        response = safe_request(url, params)

        if not (response and response.status_code == 200):
            print(f"Failed to fetch data for chunk starting {current_start_date}. Status code: {response.status_code}" if response else "Failed to fetch data; no response.")
            break  # Stop on failure rather than retrying the same chunk forever

        data = response.json()
        prices = data.get('prices') or []

        if prices:
            df = pd.DataFrame(prices, columns=['date', 'price'])
            df['date'] = pd.to_datetime(df['date'], unit='ms')

            # Each series is a list of [timestamp_ms, value] pairs. Align them
            # to the price rows by position, padding short series with None.
            row_count = len(df)
            for column, key in (('market_cap', 'market_caps'), ('volume_24h', 'total_volumes')):
                values = [point[1] for point in (data.get(key) or [])[:row_count]]
                values.extend([None] * (row_count - len(values)))
                df[column] = values

            # Append the current chunk of data to the CSV file
            df.to_csv(output_file, mode='a', header=False, index=False)
            print(f"Appended data for chunk starting {current_start_date.strftime('%Y-%m-%d')} to the CSV file.")
        else:
            print(f"No data returned for chunk starting {current_start_date.strftime('%Y-%m-%d')}; skipping.")

        # Start the next window one second later so the boundary point is not fetched twice
        current_start_date = current_end_date + timedelta(seconds=1)

        # Sleep to respect the rate limit of 5 calls per minute, but only if
        # another request is actually going to be made.
        if current_start_date < end_date:
            time.sleep(12)

def safe_request(url, params, retries=5, backoff_factor=0.5, timeout=30, max_backoff=60):
    """
    GET ``url`` with retries and exponential backoff.

    A 200 response is returned immediately. HTTP 429 (rate limited) and
    ``requests`` exceptions are retried after sleeping
    ``backoff_factor * 2**attempt`` seconds, capped at ``max_backoff``. Any
    other status code is treated as permanent and ends the attempt at once.

    NOTE: a later cell in this notebook defines ``safe_request`` again with
    ``retries=20``; once that cell has run, it replaces this definition.

    Args:
        url: URL to request.
        params: Query parameters passed to ``requests.get``.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds for the exponential backoff.
        timeout: Per-request timeout in seconds. Without one, ``requests``
            waits indefinitely on a stalled connection.
        max_backoff: Upper bound in seconds for a single backoff sleep.

    Returns:
        The ``requests.Response`` for a 200 reply, otherwise ``None``.
    """
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        sleep_time = min(backoff_factor * (2 ** attempt), max_backoff)

        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                print(f"Request exception: {e}. Giving up after {retries} attempts.")
                break
            print(f"Request exception: {e}. Retrying...")
            time.sleep(sleep_time)
            continue

        if response.status_code == 200:
            return response

        if response.status_code != 429:
            # Other errors will not be cured by retrying; give up immediately
            print(f"Request failed with status code {response.status_code}.")
            return None

        # We're being rate-limited; back off and retry unless out of attempts
        if is_last_attempt:
            print(f"Rate limit hit. Giving up after {retries} attempts.")
            break
        print(f"Rate limit hit. Waiting {sleep_time:.2f} seconds before retrying...")
        time.sleep(sleep_time)

    return None


In [None]:
def fetch_recent_data(crypto):
    """
    Fetch the last 24 hours of CoinGecko data and keep only the earliest day.

    The window requested is ``[now - 1 day, now]``. Of the returned points,
    only those falling on the earliest calendar date present are kept. The
    parsed timestamps are naive and epoch-based, so that date is a UTC date.

    Args:
        crypto: CoinGecko coin id used in the URL (e.g. ``"ethereum"``).

    Returns:
        pandas.DataFrame with columns ``date``, ``price``, ``market_cap`` and
        ``volume_24h``. If the request fails or returns no prices, an empty
        frame with those columns is returned after the failure is printed.
    """
    columns = ['date', 'price', 'market_cap', 'volume_24h']

    # The window ends "now", so the run schedule decides which day is captured.
    # Author's note: on GitHub Actions, schedule the run for 20:00 US Eastern.
    # That is 00:00 UTC during daylight-saving time (EDT, UTC-4) and 01:00 UTC
    # in winter (EST, UTC-5). When running locally the end was pinned with
    #   datetime.now().replace(hour=20, minute=0, second=0, microsecond=0)
    # but that retrieves different data on GitHub Actions.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=1)

    url = f"https://api.coingecko.com/api/v3/coins/{crypto}/market_chart/range"
    params = {
        'vs_currency': 'usd',
        'from': int(start_date.timestamp()),
        'to': int(end_date.timestamp())
    }

    response = safe_request(url, params)

    if not (response and response.status_code == 200):
        print(f"Failed to fetch data. Status code: {response.status_code}" if response else "Failed to fetch data; no response.")
        # Without this early return the code below would hit an unbound `df`.
        return pd.DataFrame(columns=columns)

    data = response.json()
    prices = data.get('prices') or []
    if not prices:
        print("Failed to fetch data; response contained no prices.")
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(prices, columns=['date', 'price'])
    df['date'] = pd.to_datetime(df['date'], unit='ms')

    # Each series is a list of [timestamp_ms, value] pairs. Align them to the
    # price rows by position, padding short series with None.
    row_count = len(df)
    for column, key in (('market_cap', 'market_caps'), ('volume_24h', 'total_volumes')):
        values = [point[1] for point in (data.get(key) or [])[:row_count]]
        values.extend([None] * (row_count - len(values)))
        df[column] = values

    # Include only the earliest day
    calendar_day = df['date'].dt.date
    return df[calendar_day == calendar_day.min()].copy()

def safe_request(url, params, retries=20, backoff_factor=0.5, timeout=30, max_backoff=60):
    """
    GET ``url`` with retries and capped exponential backoff.

    NOTE: this redefines the ``safe_request`` from the earlier fetch cell
    (which defaults to ``retries=5``); once this cell has run, this version
    is the one every caller uses.

    A 200 response is returned immediately. HTTP 429 (rate limited) and
    ``requests`` exceptions are retried after sleeping
    ``backoff_factor * 2**attempt`` seconds, capped at ``max_backoff``.
    Without the cap, the 20th attempt would sleep ``0.5 * 2**19`` seconds,
    which is about three days. Any other status code is treated as
    permanent and ends the attempt at once.

    Args:
        url: URL to request.
        params: Query parameters passed to ``requests.get``.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds for the exponential backoff.
        timeout: Per-request timeout in seconds. Without one, ``requests``
            waits indefinitely on a stalled connection.
        max_backoff: Upper bound in seconds for a single backoff sleep.

    Returns:
        The ``requests.Response`` for a 200 reply, otherwise ``None``.
    """
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        sleep_time = min(backoff_factor * (2 ** attempt), max_backoff)

        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                print(f"Request exception: {e}. Giving up after {retries} attempts.")
                break
            print(f"Request exception: {e}. Retrying...")
            time.sleep(sleep_time)
            continue

        if response.status_code == 200:
            return response

        if response.status_code != 429:
            # Other errors will not be cured by retrying; give up immediately
            print(f"Request failed with status code {response.status_code}.")
            return None

        # We're being rate-limited; back off and retry unless out of attempts
        if is_last_attempt:
            print(f"Rate limit hit. Giving up after {retries} attempts.")
            break
        print(f"Rate limit hit. Waiting {sleep_time:.2f} seconds before retrying...")
        time.sleep(sleep_time)

    return None

In [7]:
# Ensure the output directory exists
output_dir = '../data/raw/minute5'
os.makedirs(output_dir, exist_ok=True)

# Name the file after the coin that is actually fetched. Previously the
# requested coin ("dogecoin") was appended to a file named for another
# coin ('ethereum.csv'), so the CSV name did not match its contents.
crypto_id = "dogecoin"
output_file = f'{output_dir}/{crypto_id}.csv'

start_date = datetime(2020, 1, 1)
end_date = datetime.now()

# NOTE(review): the recorded run of this cell ended with HTTP 401, so the API
# presumably requires an API key now — confirm against CoinGecko's docs.
# The function returns nothing, so the message below is printed even when
# the fetch stopped early on a failure.
fetch_and_append_crypto_data(crypto_id, start_date, end_date, output_file)
print("Data fetching complete.")

Request failed with status code 401.
Failed to fetch data; no response.
Data fetching complete.
