# Data Collection

## 0. Setup

In [1]:
# Automatic reloading: with mode 2, IPython re-imports edited modules before each
# cell executes, so changes to the project's `src` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
####################
# Required Modules #
####################

# Generic/Built-in
import sys
import os

# Libs
import pandas as pd


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/traitlets/config/application.py", line 1053, in launch_instance
    app.start()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/ipyk

AttributeError: _ARRAY_API not found

In [3]:
# Resolve the project root in a way that is safe to re-run.
# The notebook starts in the `notebooks` directory, so the root is normally one level up.
# After the first run the working directory already IS the root, and climbing again
# would walk out of the repository. The `src` package (imported by later cells) is
# therefore used as the marker that we are already at the root.
current_dir = os.path.abspath('')  # Directory the kernel is currently in
if os.path.isdir(os.path.join(current_dir, 'src')):
    project_dir = current_dir  # Already at the project root (cell was re-run)
else:
    project_dir = os.path.abspath(os.path.join(current_dir, '..'))  # Move up one level to project root directory

# Add the project directory to sys.path so `src.*` is importable;
# skip it when already registered so re-runs do not pile up duplicate entries
if project_dir not in sys.path:
    sys.path.append(project_dir)

# Move to the project directory so relative paths such as `data/raw/...` resolve
os.chdir(project_dir)
os.getcwd()

'/Users/aishwaryaiyer/Documents/GitHub/Digital-Asset-Prediction'

In [4]:
from dotenv import load_dotenv

# Load in environment variables from `.env` file.
# NOTE(review): the recorded result of this cell is `False`, which presumably means no
# `.env` file was found (or it set nothing) -- in that case `FRED_API_KEY` is not
# available to the FRED cells that follow. TODO confirm before re-running.
load_dotenv()

False

## 1. FRED Data
The **Federal Reserve Economic Data (FRED)** is an online database maintained by the research department at the Federal Reserve Bank of St. Louis. It provides a wide range of economic time series data.

- The [FRED API](https://fred.stlouisfed.org/docs/api/fred/) will be used to retrieve the necessary datasets. An API key can be requested for free. Ensure that your API key is set by specifying it in the `FRED_API_KEY` environment variable.
- Alternatively, the datasets can be downloaded directly from the website itself without making an account.

In [6]:
from src.data_collection.data_scraper import fetch_data_from_fred

### 1.1. 10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity
The **10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity** spread measures the difference between long-term (10-year) and short-term (2-year) U.S. Treasury bond yields. A positive spread indicates a normal yield curve, suggesting confidence in economic growth, while a negative spread (inverted curve) may signal market concerns about an economic slowdown or impending recession.

As a macro-economic indicator, this spread can be used in crypto price prediction by reflecting investor sentiment and economic expectations. A widening spread may indicate optimism, which could drive higher demand for risk assets like cryptocurrencies, while an inverted spread could signal economic uncertainty, potentially leading to market volatility and lower crypto prices.

**Citation**:

Federal Reserve Bank of St. Louis, 10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity [T10Y2Y], retrieved from FRED, Federal Reserve Bank of St. Louis; https://fred.stlouisfed.org/series/T10Y2Y, March 25, 2025. 


In [7]:
# Destination CSV for the raw T10Y2Y series (relative path; relies on the working
# directory having been switched to the project root during setup)
output_file = "data/raw/treasury_constant_maturity_spread.csv"

# Request the T10Y2Y observations for the study window.
# `output_filename` presumably tells the helper where to write the CSV -- verify in
# `src.data_collection.data_scraper`.
# NOTE(review): the recorded run failed with HTTP 400 and the logged request URL has
# no `api_key` parameter -- presumably `FRED_API_KEY` was not set. TODO confirm.
df_t10y2y = fetch_data_from_fred(
    series_id="T10Y2Y",
    start_date="2019-09-08",
    end_date="2025-04-04",
    output_filename=output_file
)

# Preview the first rows of whatever the helper returned
df_t10y2y.head()

Error fetching data from FRED API: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=T10Y2Y&observation_start=2019-09-08&observation_end=2025-04-04&file_type=json


### 1.2. S&P 500
The observations for the S&P 500 represent the daily index value at market close. The market typically closes at 4 PM ET, except for holidays when it sometimes closes early.

The Federal Reserve Bank of St. Louis and S&P Dow Jones Indices LLC have reached a new agreement on the use of Standard & Poors and Dow Jones Averages series in FRED. FRED and its associated services will include 10 years of daily history for Standard & Poors and Dow Jones Averages series.

The S&P 500 is regarded as a gauge of the large cap U.S. equities market. The index includes 500 leading companies in leading industries of the U.S. economy, which are publicly held on either the NYSE or NASDAQ, and covers 75% of U.S. equities. Since this is a price index and not a total return index, the S&P 500 index here does not contain dividends. 

**Citation**:

S&P Dow Jones Indices LLC, S&P 500 [SP500], retrieved from FRED, Federal Reserve Bank of St. Louis; https://fred.stlouisfed.org/series/SP500, March 27, 2025. 

In [8]:
# Destination CSV for the raw S&P 500 series (relative path; relies on the working
# directory having been switched to the project root during setup).
# Note this rebinds the notebook-level `output_file` name used by the previous FRED cell.
output_file = "data/raw/sp500.csv"

# Request the SP500 observations for the same window as the treasury spread.
# `output_filename` presumably tells the helper where to write the CSV -- verify in
# `src.data_collection.data_scraper`.
# NOTE(review): the recorded run failed with HTTP 400 and the logged request URL has
# no `api_key` parameter -- presumably `FRED_API_KEY` was not set. TODO confirm.
df_sp500 = fetch_data_from_fred(
    series_id="SP500",
    start_date="2019-09-08",
    end_date="2025-04-04",
    output_filename=output_file
)

# Preview the first rows of whatever the helper returned
df_sp500.head()

Error fetching data from FRED API: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=SP500&observation_start=2019-09-08&observation_end=2025-04-04&file_type=json


## 2. CoinGecko Data
CoinGecko is a leading independent cryptocurrency data aggregator that provides comprehensive information on over 17,000 crypto assets and 1,200+ exchanges.
- For the data we are scraping, we do **not** need any API key.

In [8]:
from src.data_collection.data_scraper import fetch_top_crypto_data_from_coingecko

# Collect daily data for the top 2 assets by market cap over the last 365 days,
# quoted in USD; per the recorded log the combined table is saved to the CSV below.
df_top_crypto_data = fetch_top_crypto_data_from_coingecko(
    limit=2,
    vs_currency="usd",
    days=365,
    output_filename="data/raw/top_crypto_daily_data.csv"
)

Fetching top 2 cryptocurrency assets by market cap...
Found 2 assets
[1/2] Fetching daily data for bitcoin...
✅ Successfully fetched 366 days of data for bitcoin
Waiting 6 seconds to avoid API rate limits...
[2/2] Fetching daily data for ethereum...
✅ Successfully fetched 366 days of data for ethereum
Waiting 3 seconds to avoid API rate limits...

Combining data from all assets...

✅ Daily data collection complete! Data saved as 'data/raw/top_crypto_daily_data.csv'.

Summary:
- Successfully collected data for 2 cryptocurrencies
- Total records: 732
- Date range: 2024-04-21 to 2025-04-20


## 3. Binance Market Data
Using Binance data, we will collect Close, Volume, Market Cap and Daily_return for each day from 2022-03-24 to 2025-03-24. Note that Market Cap is only a rough estimate, computed as close price × volume.
- API key is needed

In [28]:
!pip install ccxt

Collecting ccxt
  Downloading ccxt-4.4.75-py2.py3-none-any.whl.metadata (131 kB)
Collecting aiodns>=1.1.1 (from ccxt)
  Downloading aiodns-3.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting pycares>=4.0.0 (from aiodns>=1.1.1->ccxt)
  Downloading pycares-4.6.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.3 kB)
Downloading ccxt-4.4.75-py2.py3-none-any.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading aiodns-3.2.0-py3-none-any.whl (5.7 kB)
Downloading pycares-4.6.0-cp311-cp311-macosx_11_0_arm64.whl (72 kB)
Installing collected packages: pycares, aiodns, ccxt
Successfully installed aiodns-3.2.0 ccxt-4.4.75 pycares-4.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [29]:
import ccxt
from datetime import datetime, timedelta
import pandas as pd
import time
import os
from dotenv import load_dotenv


In [None]:
# Load `.env` so the `KEY_1` / `KEY_2` values read through os.getenv by the scraper are set
load_dotenv()

class BinanceDataScraper:
    """
    Thin wrapper around a ccxt Binance spot client that downloads daily candles
    and derives per-day price, volume, estimated value and return columns.
    """

    def __init__(self, api_key=None, api_secret=None):
        """
        Initialize Binance exchange connection

        Args:
            api_key (str, optional): Binance API key; falls back to the `KEY_1`
                environment variable when not given
            api_secret (str, optional): Binance API secret; falls back to the `KEY_2`
                environment variable when not given
        """
        # Fetch keys from environment variables if not provided
        api_key = api_key or os.getenv('KEY_1')
        api_secret = api_secret or os.getenv('KEY_2')

        # Initialize exchange
        self.exchange = ccxt.binance({
            'apiKey': api_key,
            'secret': api_secret,
            'enableRateLimit': True,
            'options': {
                'defaultType': 'spot'  # Use spot market by default
            }
        })

    @staticmethod
    def _utc_midnight_ms(date_str):
        """
        Convert a YYYY-MM-DD string to epoch milliseconds at 00:00 UTC.

        The date is deliberately not interpreted in the machine's local time zone:
        candle timestamps are decoded as UTC epoch milliseconds further down
        (`pd.to_datetime(..., unit='ms')`), so a local-time conversion would shift
        the requested window by the local UTC offset.

        Raises:
            ValueError: If `date_str` is not in YYYY-MM-DD format
        """
        parsed = datetime.strptime(date_str, '%Y-%m-%d')
        return int((parsed - datetime(1970, 1, 1)).total_seconds() * 1000)

    def fetch_comprehensive_data(self,
                                  symbol,
                                  start_date='2022-03-24',
                                  end_date='2025-03-24'):
        """
        Fetch daily price, volume, estimated market value and daily return for a symbol

        Args:
            symbol (str): Trading pair symbol (e.g., 'BTC/USDT')
            start_date (str): Start date in YYYY-MM-DD format (interpreted as UTC)
            end_date (str): End date in YYYY-MM-DD format (interpreted as UTC)

        Returns:
            pandas.DataFrame: Indexed by `date`, with columns `price` (daily close),
            `volume`, `market_cap`, `daily_return` and `asset`. Empty (with the same
            columns) when nothing could be fetched. If a request fails part-way, the
            error is printed and the candles collected so far are returned.

        Raises:
            ValueError: If a date string is not in YYYY-MM-DD format
        """
        # Convert dates to UTC epoch-millisecond timestamps
        start_timestamp = self._utc_midnight_ms(start_date)
        end_timestamp = self._utc_midnight_ms(end_date)

        # Initialize empty list to store all OHLCV data
        ohlcv_data = []

        # Fetch data in chunks of at most 500 candles per request
        current_start = start_timestamp
        while current_start < end_timestamp:
            try:
                candles = self.exchange.fetch_ohlcv(
                    symbol,
                    timeframe='1d',
                    since=current_start,
                    limit=500
                )

                # Break if no more data
                if not candles:
                    break

                # Add to data list
                ohlcv_data.extend(candles)

                # Resume one millisecond after the last candle so it is not fetched twice
                current_start = candles[-1][0] + 1

                # Respect rate limits (rateLimit is expressed in milliseconds)
                time.sleep(self.exchange.rateLimit / 1000)

            except Exception as e:
                # Best effort: report the failure and keep whatever was collected so far
                print(f"Error fetching data for {symbol}: {e}")
                break

        # Nothing fetched: return an empty frame with the documented schema instead of
        # running arithmetic on untyped empty columns
        if not ohlcv_data:
            return pd.DataFrame(
                columns=['price', 'volume', 'market_cap', 'daily_return', 'asset'],
                index=pd.DatetimeIndex([], name='date')
            )

        # Convert to DataFrame
        df = pd.DataFrame(ohlcv_data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])

        # Convert the millisecond timestamp to a (UTC-based, naive) datetime
        df['date'] = pd.to_datetime(df['timestamp'], unit='ms')

        # "Market cap" here is close * volume. NOTE: that is the day's approximate traded
        # value, not a true capitalisation (which would need circulating supply).
        # The column name is kept so downstream consumers keep working.
        df['market_cap'] = df['close'] * df['volume']

        # Compute daily returns over the full fetched series, before the date filter,
        # so only the very first fetched candle has no return
        df['daily_return'] = df['close'].pct_change()

        # Select and rename columns
        result_df = df[['date', 'close', 'volume', 'market_cap', 'daily_return']].rename(
            columns={'close': 'price'}
        )

        # Add asset column (the full trading pair symbol, e.g. 'BTC/USDT')
        result_df['asset'] = symbol

        # Set date as index
        result_df = result_df.set_index('date')

        # Filter date range (both bounds inclusive)
        return result_df.loc[start_date:end_date]

    def fetch_top_symbols(self, limit=100):
        """
        Fetch the top active USDT trading symbols, ranked by quote volume

        Symbols are ordered by the `quoteVolume` reported in the exchange tickers,
        highest first. If tickers cannot be fetched, the symbols are returned in
        market-listing order instead.

        Args:
            limit (int): Number of top symbols to return

        Returns:
            list: Top trading symbols; empty list if the markets cannot be loaded
        """
        try:
            # Load markets
            self.exchange.load_markets()

            # Keep only active USDT-quoted pairs
            usdt_pairs = [
                market['symbol'] for market in self.exchange.markets.values()
                if market.get('quote') == 'USDT' and market.get('active')
            ]

            # Ranking needs volume figures, which market metadata does not carry
            try:
                tickers = self.exchange.fetch_tickers() or {}
            except Exception as e:
                print(f"Could not fetch tickers to rank symbols by volume: {e}")
                tickers = {}

            def quote_volume(pair):
                # Missing ticker or missing/None volume ranks last
                return (tickers.get(pair) or {}).get('quoteVolume') or 0.0

            # Stable sort: ties (and the no-ticker fallback) keep listing order
            usdt_pairs.sort(key=quote_volume, reverse=True)

            return usdt_pairs[:limit]

        except Exception as e:
            print(f"Error fetching top symbols: {e}")
            return []

def main():
    """
    Download daily data for up to 100 USDT symbols and write one combined CSV.

    Symbols whose result is empty are skipped, and an error on one symbol is printed
    without stopping the run. The CSV is written to the current working directory
    with today's date in its name; if nothing was collected, no file is written.
    """
    # Credentials are picked up from the environment by the scraper itself
    collector = BinanceDataScraper()

    # Candidate symbols to download
    top_symbols = collector.fetch_top_symbols(limit=100)
    print(f"Fetching data for {len(top_symbols)} top symbols")

    # symbol -> DataFrame, only for symbols that returned rows
    all_data = dict()

    for symbol in top_symbols:
        try:
            print(f"Fetching data for {symbol}")
            df = collector.fetch_comprehensive_data(symbol)

            has_rows = not df.empty
            if has_rows:
                all_data[symbol] = df
                print(f"✓ Collected {len(df)} days of data for {symbol}")

            # Short pause between symbols
            time.sleep(1)

        except Exception as e:
            print(f"Error processing {symbol}: {e}")

    # Nothing usable came back: report and stop before touching the filesystem
    if not all_data:
        print("No data collected.")
        return

    # Stack the per-symbol frames on top of each other
    final_df = pd.concat(all_data.values())

    # Date-stamped file name, written relative to the working directory
    output_filename = f"binance_crypto_data_{datetime.now().strftime('%Y%m%d')}.csv"
    final_df.to_csv(output_filename)

    print(f"\n✅ Data collection complete. Saved to {output_filename}")
    print(f"Symbols collected: {len(all_data)}")
    print(f"Total records: {len(final_df)}")

# Run the collection when this cell/script is executed directly
if __name__ == "__main__":
    main()

## 4. Binance Coin Data
Using Binance data, we will collect Open, High, Low, Close (and Volume) data for each selected coin.
- API key is needed

In [31]:
import ccxt
import pandas as pd
from datetime import datetime, timedelta
import time
import os
from dotenv import load_dotenv

In [20]:
# Load `.env` so the `KEY_1` / `KEY_2` values read through os.getenv by the scraper are set
load_dotenv()

class BinanceOHLCScraper:
    """
    Thin wrapper around a ccxt Binance spot client that downloads OHLCV candles
    for a symbol and date range.
    """

    def __init__(self, api_key=None, api_secret=None):
        """
        Initialize Binance exchange connection

        Args:
            api_key (str, optional): Binance API key; falls back to the `KEY_1`
                environment variable when not given
            api_secret (str, optional): Binance API secret; falls back to the `KEY_2`
                environment variable when not given
        """
        # Fetch keys from environment variables if not provided
        api_key = api_key or os.getenv('KEY_1')
        api_secret = api_secret or os.getenv('KEY_2')

        # Initialize exchange
        self.exchange = ccxt.binance({
            'apiKey': api_key,
            'secret': api_secret,
            'enableRateLimit': True,
            'options': {
                'defaultType': 'spot'  # Use spot market by default
            }
        })

    @staticmethod
    def _utc_midnight_ms(date_str):
        """
        Convert a YYYY-MM-DD string to epoch milliseconds at 00:00 UTC.

        The date is deliberately not interpreted in the machine's local time zone:
        candle timestamps are decoded as UTC epoch milliseconds further down
        (`pd.to_datetime(..., unit='ms')`), so a local-time conversion would shift
        the requested window by the local UTC offset.

        Raises:
            ValueError: If `date_str` is not in YYYY-MM-DD format
        """
        parsed = datetime.strptime(date_str, '%Y-%m-%d')
        return int((parsed - datetime(1970, 1, 1)).total_seconds() * 1000)

    def fetch_ohlcv_data(self,
                          symbol,
                          start_date='2022-03-24',
                          end_date='2025-03-24',
                          timeframe='1d'):
        """
        Fetch OHLCV data for a given symbol and date range

        Args:
            symbol (str): Trading pair symbol (e.g., 'BTC/USDT')
            start_date (str): Start date in YYYY-MM-DD format (interpreted as UTC)
            end_date (str): End date in YYYY-MM-DD format (interpreted as UTC)
            timeframe (str): Candle timeframe (default: daily)

        Returns:
            pandas.DataFrame: Indexed by `timestamp`, with columns `open`, `high`,
            `low`, `close` and `volume`. Empty (with the same columns) when nothing
            could be fetched. If a request fails part-way, the error is printed and
            the candles collected so far are returned.

        Raises:
            ValueError: If a date string is not in YYYY-MM-DD format
        """
        # Convert dates to UTC epoch-millisecond timestamps
        start_timestamp = self._utc_midnight_ms(start_date)
        end_timestamp = self._utc_midnight_ms(end_date)

        # Initialize empty list to store all OHLCV data
        ohlcv_data = []

        # Fetch data in chunks of at most 500 candles per request
        current_start = start_timestamp
        while current_start < end_timestamp:
            try:
                candles = self.exchange.fetch_ohlcv(
                    symbol,
                    timeframe=timeframe,
                    since=current_start,
                    limit=500
                )

                # Break if no more data
                if not candles:
                    break

                # Add to data list
                ohlcv_data.extend(candles)

                # Resume one millisecond after the last candle so it is not fetched twice
                current_start = candles[-1][0] + 1

                # Respect rate limits (rateLimit is expressed in milliseconds)
                time.sleep(self.exchange.rateLimit / 1000)

            except Exception as e:
                # Best effort: report the failure and keep whatever was collected so far
                print(f"Error fetching data for {symbol}: {e}")
                break

        # Nothing fetched: return an empty frame with the documented schema
        if not ohlcv_data:
            return pd.DataFrame(
                columns=['open', 'high', 'low', 'close', 'volume'],
                index=pd.DatetimeIndex([], name='timestamp')
            )

        # Convert to DataFrame
        df = pd.DataFrame(ohlcv_data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])

        # Convert the millisecond timestamp to a (UTC-based, naive) datetime index
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
        df = df.set_index('timestamp')

        # Filter date range (both bounds inclusive)
        return df.loc[start_date:end_date]

    def fetch_top_symbols(self, limit=100):
        """
        Fetch the top active USDT trading symbols, ranked by quote volume

        Symbols are ordered by the `quoteVolume` reported in the exchange tickers,
        highest first. If tickers cannot be fetched, the symbols are returned in
        market-listing order instead.

        Args:
            limit (int): Number of top symbols to return

        Returns:
            list: Top trading symbols; empty list if the markets cannot be loaded
        """
        try:
            # Load markets
            self.exchange.load_markets()

            # Keep only active USDT-quoted pairs
            usdt_pairs = [
                market['symbol'] for market in self.exchange.markets.values()
                if market.get('quote') == 'USDT' and market.get('active')
            ]

            # Ranking needs volume figures, which market metadata does not carry
            try:
                tickers = self.exchange.fetch_tickers() or {}
            except Exception as e:
                print(f"Could not fetch tickers to rank symbols by volume: {e}")
                tickers = {}

            def quote_volume(pair):
                # Missing ticker or missing/None volume ranks last
                return (tickers.get(pair) or {}).get('quoteVolume') or 0.0

            # Stable sort: ties (and the no-ticker fallback) keep listing order
            usdt_pairs.sort(key=quote_volume, reverse=True)

            return usdt_pairs[:limit]

        except Exception as e:
            print(f"Error fetching top symbols: {e}")
            return []

def main():
    """
    Download OHLCV candles for up to 100 USDT symbols and write one combined CSV.

    Symbols whose result is empty are skipped, and an error on one symbol is printed
    without stopping the run. The per-symbol frames are stacked under an outer
    `symbol` index level and saved to the current working directory with today's date
    in the file name; if nothing was collected, no file is written.
    """
    # Credentials are picked up from the environment by the scraper itself
    collector = BinanceOHLCScraper()

    # Candidate symbols to download
    top_symbols = collector.fetch_top_symbols(limit=100)
    print(f"Fetching data for {len(top_symbols)} top symbols")

    # symbol -> DataFrame, only for symbols that returned rows
    all_data = dict()

    for symbol in top_symbols:
        try:
            print(f"Fetching data for {symbol}")
            df = collector.fetch_ohlcv_data(symbol)

            has_rows = not df.empty
            if has_rows:
                all_data[symbol] = df
                print(f"✓ Collected {len(df)} days of data for {symbol}")

            # Short pause between symbols
            time.sleep(1)

        except Exception as e:
            print(f"Error processing {symbol}: {e}")

    # Nothing usable came back: report and stop before touching the filesystem
    if not all_data:
        print("No data collected.")
        return

    # Build a MultiIndex frame: outer level is the symbol, inner level the timestamp
    final_df = pd.concat(all_data.values(), keys=all_data.keys(), names=['symbol'])

    # Date-stamped file name, written relative to the working directory
    output_filename = f"binance_ohlc_data_{datetime.now().strftime('%Y%m%d')}.csv"
    final_df.to_csv(output_filename)

    print(f"\n✅ Data collection complete. Saved to {output_filename}")
    print(f"Symbols collected: {len(all_data)}")

# Run the collection when this cell/script is executed directly
if __name__ == "__main__":
    main()

NameError: name 'ccxt' is not defined