In [None]:
import os
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio
from tqdm import tqdm  # Import tqdm for progress bar

# Apply nest_asyncio to allow for running asyncio in environments like Jupyter
nest_asyncio.apply()

# Folder to save the downloaded files and screenshots
download_path = "/allah/data/trades"
screenshot_path = "/allah/data/screenshots"
if not os.path.exists(download_path):
    os.makedirs(download_path)
if not os.path.exists(screenshot_path):
    os.makedirs(screenshot_path)

url = 'https://data.binance.vision/?prefix=data/futures/um/daily/trades/ETHUSDT/'

async def download_files():
    async with async_playwright() as p:
        # Launch the browser in headless mode (without a visible UI)
        browser = await p.chromium.launch(headless=True)  
        page = await browser.new_page()

        # Set up download handling
        page.on("download", lambda download: asyncio.create_task(handle_download(download)))

        # Visit the Binance data page
        await page.goto(url)
        
        # Wait for the page to fully load by waiting for the network to be idle
        await page.wait_for_load_state('networkidle')

        # Take a screenshot for debugging
        screenshot_file = os.path.join(screenshot_path, "page_screenshot.png")
        await page.screenshot(path=screenshot_file)
        print(f"Screenshot saved at: {screenshot_file}")

        # Select all the links that point to zip files
        links = await page.query_selector_all('a[href$=".zip"]')

        # Initialize tqdm progress bar
        progress_bar = tqdm(total=len(links), desc="Downloading files")

        for link in links:
            href = await link.get_attribute('href')
            # Construct the full URL correctly
            full_url = f"https://data.binance.vision{href}" if href.startswith('/') else href
            # Trigger the download by clicking the link
            await page.click(f'a[href="{href}"]')
            # Update the progress bar after each download
            progress_bar.update(1)
            # Wait for a few seconds after triggering the download
            await page.wait_for_timeout(3000)

        # Close the progress bar when done
        progress_bar.close()

        await browser.close()

    print(f"Downloads are saved in {download_path}")

async def handle_download(download):
    # Save the downloaded file to the specified folder
    file_path = os.path.join(download_path, download.suggested_filename)  # Fix: Access suggested_filename as an attribute, not a method
    await download.save_as(file_path)

# Run the async function
asyncio.run(download_files())


In [None]:
import os

# Directory containing the files
directory = '/allah/data/trades'

# Loop through all files in the directory
for filename in os.listdir(directory):
    # Check if the file contains "2024"
    if '2024' not in filename:
        # Get the full file path
        file_path = os.path.join(directory, filename)
        # Remove the file
        os.remove(file_path)
        print(f"Deleted: {filename}")


In [None]:
import os
import zipfile
from tqdm import tqdm  # Import tqdm for progress bar

# Define paths
zip_folder = "/allah/data/trades"
unzip_folder = "/allah/data/unzip_trades"

# Create the unzip folder if it doesn't exist
if not os.path.exists(unzip_folder):
    os.makedirs(unzip_folder)

# Get the list of all zip files in the folder
zip_files = [file for file in os.listdir(zip_folder) if file.endswith(".zip")]

# Initialize tqdm progress bar
with tqdm(total=len(zip_files), desc="Unzipping files", unit="file") as pbar:
    for file_name in zip_files:
        file_path = os.path.join(zip_folder, file_name)
        
        # Open the zip file
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            # Extract all contents to the unzip folder
            zip_ref.extractall(unzip_folder)
        
        # Delete the zip file after extraction
        os.remove(file_path)
        
        # Update the progress bar after each file is unzipped and deleted
        pbar.update(1)

print(f"All files are unzipped to {unzip_folder} and original zip files are deleted.")


In [6]:
import pandas as pd
import os
from datetime import datetime

def generate_dataframe(start_date, end_date, folder_path):
    # Convert the input dates to datetime objects for easier comparison
    start_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.strptime(end_date, '%Y-%m-%d')

    # Initialize an empty list to store DataFrames
    df_list = []

    # Loop through all files in the specified folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv') and filename.startswith('ETHUSDT-trades'):
            # Ensure the filename has the expected format
            try:
                # Extract the date part from the filename
                
                file_date_str = filename.split('-')[2] + '-' + filename.split('-')[3] + '-' + filename.split('-')[4].replace('.csv', '')

                # Convert the extracted date to a datetime object
                file_date = datetime.strptime(file_date_str, '%Y-%m-%d')
            except (IndexError, ValueError):
                print(f"Skipping file {filename} due to incorrect date format.")
                continue

            # If the file's date is within the specified range, read it into a DataFrame
            if start_date <= file_date <= end_date:
                file_path = os.path.join(folder_path, filename)
                df = pd.read_csv(file_path)
                df_list.append(df)

    # Concatenate all the DataFrames into one
    if df_list:
        final_df = pd.concat(df_list, ignore_index=True)
        return final_df
    else:
        return None

# Example usage:
folder_path = '/allah/data/unzip_trades'  # Path where your CSV files are stored
start_date = '2024-07-01'  # Specify the start date
end_date = '2024-07-15'    # Specify the end date

# Generate the DataFrame for the specified date range
df = generate_dataframe(start_date, end_date, folder_path)

if df is not None:
    print(df.head())  # Display the first few rows of the DataFrame
else:
    print('No files found in the specified date range.')


           id    price    qty   quote_qty           time  is_buyer_maker
0  4160624153  3440.47  0.004    13.76188  1719878400035           False
1  4160624154  3440.47  0.166   571.11802  1719878400035           False
2  4160624155  3440.46  0.007    24.08322  1719878400061            True
3  4160624156  3440.46  1.000  3440.46000  1719878400062            True
4  4160624157  3440.46  0.053   182.34438  1719878400091            True
