# Step 1: Data Collection

1. **get_all_coin_ids() function** - Fetch and process the coin data
- API Request: It uses the requests library to make a GET request to the CoinGecko API endpoint.
- Response Handling:
If the response is successful (status code 200), it converts the JSON data into a pandas DataFrame with columns 'Coin ID', 'Symbol', 'Name'.
- The DataFrame is printed to the console.
- The DataFrame is saved as a CSV file named all_coingecko_coin_ids.csv.

In [158]:
import requests
import pandas as pd

def get_all_coin_ids():
    """
    Fetch all coin IDs from CoinGecko and display them in a DataFrame.
    """
    url = "https://api.coingecko.com/api/v3/coins/list"
    response = requests.get(url)

    if response.status_code == 200:
        coins = response.json()

        # Convert the list of coins to a DataFrame for better readability
        df = pd.DataFrame(coins)
        df.columns = ['Coin ID', 'Symbol', 'Name']  # Rename columns for clarity

        # Display the DataFrame
        print("List of all Coin IDs from CoinGecko:")
        print(df)

        # Optionally, save the DataFrame to a CSV file
        df.to_csv("all_coingecko_coin_ids.csv", index=False)
        print("Coin IDs saved to all_coingecko_coin_ids.csv")

        return df
    else:
        print("Failed to retrieve data from CoinGecko API")
        return None

# Fetch and display all coin IDs
coin_ids_df = get_all_coin_ids()

List of all Coin IDs from CoinGecko:
                        Coin ID Symbol                      Name
0                        01coin    zoc                    01coin
1                        0chain    zcn                       Zus
2                          0dog   0dog              Bitcoin Dogs
3           0-knowledge-network    0kn       0 Knowledge Network
4                         0-mee    ome                     O-MEE
...                         ...    ...                       ...
15002                 zyncoin-2    zyn                   ZynCoin
15003                  zynecoin    zyn                  Zynecoin
15004                     zyrri    zyr                     Zyrri
15005                       zzz    zzz               GoSleep ZZZ
15006  z-z-z-z-z-fehu-z-z-z-z-z      ᚠ  Z•Z•Z•Z•Z•FEHU•Z•Z•Z•Z•Z

[15007 rows x 3 columns]
Coin IDs saved to all_coingecko_coin_ids.csv


2. **get_top_coins_by_price_change(top_n=100)** - Fetch and save top 100 coins by 24-hour price change

3. **get_top_trending_coins(top_n=100)** - Fetch and save top 100 trending coins by trading volume

4. **get_limited_trending_coins()** - Fetch and save limited trending coins from CoinGecko's trending endpoint
- All the above data are saved into a CSV file.

In [159]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError, Timeout, RequestException
from urllib3.util.retry import Retry

# Set up a requests session with retry logic
session = requests.Session()
retry_strategy = Retry(
    total=3,  # Total number of retries
    backoff_factor=1,  # Delay between retries (exponential)
    status_forcelist=[429, 500, 502, 503, 504],  # Retry for these status codes
    allowed_methods=["GET"]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)

def get_top_coins_by_price_change(currency='usd', order='price_change_percentage_24h_desc', top_n=100):
    """
    Fetch the top coins sorted by price change over the last 24 hours.
    """
    url = "https://api.coingecko.com/api/v3/coins/markets"
    params = {
        'vs_currency': currency,
        'order': order,
        'per_page': 100,
        'sparkline': False
    }

    try:
        all_data = []
        for page in range(1, (top_n // 100) + 2):  # Fetch enough pages to cover top_n
            params['page'] = page
            response = session.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            all_data.extend(data)
            if len(all_data) >= top_n:
                break

        df = pd.DataFrame(all_data[:top_n])
        df = df[['id', 'symbol', 'name', 'current_price', 'market_cap', 'total_volume', 'price_change_percentage_24h']]

        # Save the data to CSV
        df.to_csv("top_100_coins_by_price_change.csv", index=False)
        print("Top 100 coins by 24-hour price change saved to top_100_coins_by_price_change.csv")

        return df

    except (ConnectionError, Timeout) as e:
        print(f"Network error: {e}")
    except RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return None

def get_top_trending_coins(currency='usd', top_n=100):
    """
    Fetch the top coins sorted by trading volume, which can be used as a proxy for trending coins.
    """
    url = "https://api.coingecko.com/api/v3/coins/markets"
    params = {
        'vs_currency': currency,
        'order': 'volume_desc',
        'per_page': 100,
        'sparkline': False
    }

    try:
        all_data = []
        for page in range(1, (top_n // 100) + 2):
            params['page'] = page
            response = session.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            all_data.extend(data)
            if len(all_data) >= top_n:
                break

        df = pd.DataFrame(all_data[:top_n])
        df = df[['id', 'symbol', 'name', 'current_price', 'market_cap', 'total_volume']]

        # Save the data to CSV
        df.to_csv("top_100_trending_coins.csv", index=False)
        print("Top 100 trending coins by trading volume saved to top_100_trending_coins.csv")

        return df

    except (ConnectionError, Timeout) as e:
        print(f"Network error: {e}")
    except RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return None

def get_limited_trending_coins():
    """
    Fetch trending coins from CoinGecko's limited /search/trending endpoint.
    """
    url = "https://api.coingecko.com/api/v3/search/trending"

    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()

        data = response.json()['coins']
        trending_data = [
            {
                'id': coin['item']['id'],
                'symbol': coin['item']['symbol'],
                'name': coin['item']['name'],
                'market_cap_rank': coin['item']['market_cap_rank'],
                'price_btc': coin['item']['price_btc']
            }
            for coin in data
        ]

        df = pd.DataFrame(trending_data)

        # Save the data to CSV
        df.to_csv("trending_coins_limited.csv", index=False)
        print("Trending coins (limited from CoinGecko) saved to trending_coins_limited.csv")

        return df

    except (ConnectionError, Timeout) as e:
        print(f"Network error: {e}")
    except RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return None

# Fetch and save top 100 coins by 24-hour price change
get_top_coins_by_price_change(top_n=100)

# Fetch and save top 100 trending coins by trading volume
get_top_trending_coins(top_n=100)

# Fetch and save limited trending coins from CoinGecko's trending endpoint
get_limited_trending_coins()

Top 100 coins by 24-hour price change saved to top_100_coins_by_price_change.csv
Top 100 trending coins by trading volume saved to top_100_trending_coins.csv
Trending coins (limited from CoinGecko) saved to trending_coins_limited.csv


Unnamed: 0,id,symbol,name,market_cap_rank,price_btc
0,koala-ai,KOKO,KOALA AI,718,7.099297e-11
1,tars-protocol,TAI,TARS AI,622,1.961829e-06
2,moo-deng,MOODENG,Moo Deng,268,3.344119e-06
3,moonwell-artemis,WELL,Moonwell,239,1.308472e-06
4,virtual-protocol,VIRTUAL,Virtuals Protocol,160,6.868521e-06
5,aerodrome-finance,AERO,Aerodrome Finance,97,1.758054e-05
6,goatseus-maximus,GOAT,Goatseus Maximus,122,9.738508e-06
7,raydium,RAY,Raydium,92,4.7599e-05
8,solana,SOL,Solana,5,0.002568905
9,gigachad-2,GIGA,Gigachad,150,8.256003e-07


4. **get_historical_data()** - fetches historical OHLCV (Open, High, Low, Close, Volume) data for cryptocurrencies from the CoinGecko API.
- API Key Handling:
The script allows for an optional API key to be used in requests, enhancing security for API access.
- Flexible Time Range:
It correctly sets parameters based on the number of days requested, adjusting the retrieval method appropriately..
- Data Extraction:
The script processes the API response to extract price and volume data, constructing a DataFrame with the relevant OHLCV data.
- CSV Output:
The resulting DataFrame is saved to a CSV file for easy access and analysis.

In [160]:
import requests
import pandas as pd

def get_historical_data(coin_id='bitcoin', vs_currency='usd', days=1, api_key=None):
    """
    Fetch OHLCV (Open, High, Low, Close, Volume) historical data for a coin from CoinGecko.
    """
    # Define headers with the API key if provided
    headers = {}
    if api_key:
        headers['Authorization'] = f'Bearer {api_key}'

    # Define parameters based on CoinGecko's limitations for free users
    if days == 1:
        interval = 'daily'
        params = {'vs_currency': vs_currency, 'days': days, 'interval': interval}
    elif 2 <= days <= 90:
        params = {'vs_currency': vs_currency, 'days': days}  # Defaults to hourly without specifying interval
    else:
        interval = 'daily'
        params = {'vs_currency': vs_currency, 'days': days, 'interval': interval}

    url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart"
    response = requests.get(url, headers=headers, params=params)
    data = response.json()

    # Check if 'prices' and 'total_volumes' are in the response
    if 'prices' not in data or 'total_volumes' not in data:
        print("Error: 'prices' or 'total_volumes' key not found in response.")
        print("Response data:", data)
        return pd.DataFrame()

    # Extract prices and volumes
    prices = data['prices']
    volumes = data['total_volumes']

    # Constructing a DataFrame
    ohlcv_data = []
    for i in range(len(prices)):  # Process each entry
        start_time = prices[i][0]  # Current timestamp in milliseconds
        end_time = prices[i + 1][0] if i + 1 < len(prices) else start_time  # Next timestamp if available

        open_price = prices[i][1]
        close_price = prices[i + 1][1] if i + 1 < len(prices) else prices[i][1]  # Next price if available
        high_price = max(prices[i][1], prices[i + 1][1]) if i + 1 < len(prices) else prices[i][1]
        low_price = min(prices[i][1], prices[i + 1][1]) if i + 1 < len(prices) else prices[i][1]
        volume = volumes[i][1]

        ohlcv_data.append({
            'startTime': start_time,  # Keep startTime as a raw timestamp
            'endTime': end_time,      # Keep endTime as a raw timestamp
            'open': open_price,
            'high': high_price,
            'low': low_price,
            'close': close_price,
            'volume': volume
        })

    df = pd.DataFrame(ohlcv_data)

    # Save the DataFrame to a CSV file
    filename = f"{coin_id}_ohlcv_data.csv"
    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")

    return df

# Example usage with API key
api_key = 'YOUR_API_KEY_HERE'  # Replace with your actual API key
ohlcv_df = get_historical_data(coin_id='bitcoin', vs_currency='usd', days=2, api_key=api_key)
print(ohlcv_df)

Data saved to bitcoin_ohlcv_data.csv
        startTime        endTime          open          high           low  \
0   1729840228300  1729843696308  67662.074279  67662.074279  67403.438143   
1   1729843696308  1729847151608  67403.438143  67601.469084  67403.438143   
2   1729847151608  1729851085402  67601.469084  67735.931002  67601.469084   
3   1729851085402  1729854695156  67735.931002  67813.470117  67735.931002   
4   1729854695156  1729858140460  67813.470117  68222.929874  67813.470117   
5   1729858140460  1729861419940  68222.929874  68222.929874  68012.581893   
6   1729861419940  1729865881588  68012.581893  68430.207436  68012.581893   
7   1729865881588  1729869278526  68430.207436  68430.207436  68397.234309   
8   1729869278526  1729872225437  68397.234309  68397.234309  67682.766645   
9   1729872225437  1729876575025  67682.766645  67704.817484  67682.766645   
10  1729876575025  1729880345708  67704.817484  67704.817484  66673.983900   
11  1729880345708  17298834

# Step 2: Clean data

In [161]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

1. large_files - Filter out empty data files

In [162]:
import os
# Define the directory containing the CSV files
directory = "/content/Trade_data"

# Define the size threshold (800 bytes)
size_threshold = 800 # 800 bytes

# Collect paths of files larger than 800 bytes
large_files = [os.path.join(directory, file) for file in os.listdir(directory)
 if file.endswith('.csv') and os.path.getsize(os.path.join(directory, file)) >= size_threshold]

print("Files larger than 800 bytes:", large_files)

print(len(large_files))

Files larger than 800 bytes: ['/content/Trade_data/GNOUSDT_per_minute_data.csv', '/content/Trade_data/USDCTRY_per_minute_data.csv', '/content/Trade_data/DOGEFDUSD_per_minute_data.csv', '/content/Trade_data/MKRTRY_per_minute_data.csv', '/content/Trade_data/USDTTRY_per_minute_data.csv', '/content/Trade_data/RADBTC_per_minute_data.csv', '/content/Trade_data/LQTYUSDT_per_minute_data.csv', '/content/Trade_data/MEMEFDUSD_per_minute_data.csv', '/content/Trade_data/USDTCZK_per_minute_data.csv', '/content/Trade_data/FDUSDUSDT_per_minute_data.csv', '/content/Trade_data/MASKBNB_per_minute_data.csv', '/content/Trade_data/SOLBTC_per_minute_data.csv', '/content/Trade_data/BNBETH_per_minute_data.csv', '/content/Trade_data/DOGEJPY_per_minute_data.csv', '/content/Trade_data/SOLUSDT_per_minute_data.csv', '/content/Trade_data/SOLETH_per_minute_data.csv', '/content/Trade_data/FDUSDTRY_per_minute_data.csv', '/content/Trade_data/MKRBTC_per_minute_data.csv', '/content/Trade_data/TRXXRP_per_minute_data.csv', 

2. **clean_data()** - Clean the data
- replace any non-numeric values in 'Close' and 'Volume' with NaN.
- removes rows that contain NaN values, ensuring that the DataFrame only contains complete records.
- sorts the DataFrame by the 'Open Time' column, which is useful for time series analysis.

In [163]:
def clean_data(df):
    df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
    df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')
    df.dropna(inplace=True)
    df.sort_values('Open Time', inplace=True)  # Sort by 'Open Time'
    return df

# Step 3: Feature processing

1. **compute_rsi()** - RSI Calculation
- computes RSI based on the 'Close' prices in the provided DataFrame.
2. **calculate_dynamic_thresholds()** - calculate dynamic thresholds based on RSI
- computes dynamic buy and sell thresholds based on the mean and standard deviation of RSI over a specified rolling window to establish buy and sell signals.


In [164]:
# Calculate RSI
def compute_rsi(data, period=14):
    delta = data['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss.replace(0, np.nan)  # Avoid division by zero
    rsi = 100 - (100 / (1 + rs))
    return rsi

# Function to calculate dynamic thresholds based on RSI
def calculate_dynamic_thresholds(data, window=3):
    rsi_mean = data['RSI'].rolling(window=window).mean().iloc[-1]
    rsi_std = data['RSI'].rolling(window=window).std().iloc[-1]
    threshold_buy = rsi_mean - rsi_std  # Dynamic lower threshold
    threshold_sell = rsi_mean + rsi_std  # Dynamic upper threshold
    return threshold_buy, threshold_sell

# Generate signals based on dynamic thresholds and moving averages
def generate_signals(data, threshold_buy, threshold_sell):
    signals = np.where(
        (data['RSI'] < threshold_buy) & (data['Fast_MA'] > data['Slow_MA']), 'Buy',
        np.where(
            (data['RSI'] > threshold_sell) & (data['Fast_MA'] < data['Slow_MA']), 'Sell',
            'Hold'
        )
    )
    return signals

# Step 4: Combine Indicators into a Strategy

1. **Moving Averages**:
- Fast Moving Average (5-period): Used to identify short-term trends.
- Slow Moving Average (20-period): Used to assess longer-term trends.

2. **Signal Generation**:
- Signals (Buy, Sell, Hold) were generated based on the calculated indicators, allowing for trading decisions to be made.
3. **Daily Scores Calculation**:
- computed for each file based on the counts of buy and sell signals
- Daily Score = Count of 'Buy' signals - Count of 'Sell' signals.
- A positive score indicates a stronger inclination toward buying.


In [165]:
# Dictionary to hold daily scores for each file
daily_scores = {}

# Loop over each large CSV file
for file_path in large_files:
    # Load the dataset
    single_day_data_before = pd.read_csv(file_path)

    # Clean the data
    single_day_data = clean_data(single_day_data_before)

    # Optionally slice to the last 10,080 entries
    last_data = single_day_data.iloc[-10080:].copy()

    # Calculate Fast and Slow Moving Averages
    last_data['Fast_MA'] = last_data['Close'].rolling(window=5).mean()  # 5-period Fast MA
    last_data['Slow_MA'] = last_data['Close'].rolling(window=20).mean()  # 20-period Slow MA

    # Calculate RSI
    last_data['RSI'] = compute_rsi(last_data)

    # Create DataFrame for analysis
    crypto_data = pd.DataFrame(last_data)

    # Calculate dynamic thresholds
    threshold_buy, threshold_sell = calculate_dynamic_thresholds(crypto_data)

    # Generate buy/sell/hold signals
    crypto_data['Signal'] = generate_signals(crypto_data, threshold_buy, threshold_sell)

    # Calculate daily scores
    buy_count = (crypto_data['Signal'] == 'Buy').sum()
    sell_count = (crypto_data['Signal'] == 'Sell').sum()
    daily_score = buy_count - sell_count  # Positive score indicates more buy signals

    # Store the daily score in the dictionary
    daily_scores[file_path] = daily_score

In [166]:
# Keep an array containing all the daily scores to sort with file names
score_list = [(score, file) for file, score in daily_scores.items()]

def quicksort(arr):
    # Base case: If the array has 0 or 1 element, it is already sorted
    if len(arr) <= 1:
        return arr

    # Step 1: Choose the pivot (last element in this case)
    pivot = arr[-1]

    # Step 2: Partition the array into two lists
    left = [x for x in arr[:-1] if x[0] > pivot[0]]   # Elements greater than the pivot
    right = [x for x in arr[:-1] if x[0] <= pivot[0]] # Elements less than or equal to the pivot

    # Step 3: Recursively sort the left and right lists, then concatenate
    return quicksort(left) + [pivot] + quicksort(right)

# Sort the daily scores
sorted_scores = quicksort(score_list)

print("Yay, it's sorted!")

# Display the top 5 and last 5 scores with their respective file names
top_five = sorted_scores[:5]
last_five = sorted_scores[-5:]

print("\nTop 5 Daily Scores:")
for score, file in top_five:
    print(f"File: {file}, Score: {score}")

print("\nLast 5 Daily Scores:")
for score, file in last_five:
    print(f"File: {file}, Score: {score}")

Yay, it's sorted!

Top 5 Daily Scores:
File: /content/Trade_data/MASKUSDT_per_minute_data.csv, Score: 4781
File: /content/Trade_data/ERNUSDT_per_minute_data.csv, Score: 4746
File: /content/Trade_data/TNSRUSDT_per_minute_data.csv, Score: 4707
File: /content/Trade_data/CVCUSDT_per_minute_data.csv, Score: 4245
File: /content/Trade_data/TNSRFDUSD_per_minute_data.csv, Score: 4028

Last 5 Daily Scores:
File: /content/Trade_data/BNBETH_per_minute_data.csv, Score: -1938
File: /content/Trade_data/PAXGTRY_per_minute_data.csv, Score: -2025
File: /content/Trade_data/PAXGUSDT_per_minute_data.csv, Score: -2051
File: /content/Trade_data/PAXGBTC_per_minute_data.csv, Score: -2306
File: /content/Trade_data/OMTRY_per_minute_data.csv, Score: -2853
