In [91]:
import requests
import os
import datetime
import zipfile
from tqdm import tqdm
import pandas as pd

class DataProcessor:
    """Download, extract and resample daily ETHUSDT aggTrades archives.

    Daily ZIP archives named ``ETHUSDT-aggTrades-YYYY-MM-DD.zip`` are fetched
    from ``base_url``, stored in ``zip_download_dir`` and unpacked into
    ``csv_extract_dir``. The extracted CSV files can then be combined and
    resampled to one row per second.

    Args:
        base_url: URL prefix (including trailing slash) of the daily archives.
        zip_download_dir: Directory where downloaded ZIP files are kept.
        csv_extract_dir: Directory where the CSV files are extracted.
        feather_stored_dir: Optional directory for Feather output. It is only
            created by ``download_and_extract_data`` when given.
    """

    # Seconds to wait for the server before giving up on one archive.
    REQUEST_TIMEOUT = 60

    def __init__(self, base_url, zip_download_dir, csv_extract_dir, feather_stored_dir=None):
        self.base_url = base_url
        self.zip_download_dir = zip_download_dir
        self.csv_extract_dir = csv_extract_dir
        # Previously read by download_and_extract_data without ever being set.
        self.feather_stored_dir = feather_stored_dir

    @staticmethod
    def _date_from_filename(filename):
        """Return the trailing ``YYYY-MM-DD`` date of *filename*, or ``None``.

        ``None`` is returned when the name does not end in a parsable date so
        that unrelated files in the data directory can simply be skipped.
        """
        stem = os.path.splitext(filename)[0]
        date_str = '-'.join(stem.split('-')[-3:])
        try:
            return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        except ValueError:
            return None

    def _download_zip_file(self, link, filename):
        """Download *link* to *filename*.

        Returns:
            ``True`` if the file is available locally afterwards (freshly
            downloaded or already present), ``False`` otherwise.
        """
        if os.path.exists(filename):
            print(f"File {filename} already exists. Skipping download.")
            return True

        try:
            response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Error: {exc} - Unable to download the ZIP file for {filename}")
            return False

        if response.status_code != 200:
            print(f"Error: {response.status_code} - Unable to download the ZIP file for {filename}")
            return False

        # Write to a temporary name first: an interrupted write must not leave
        # a truncated archive that later runs would treat as "already exists".
        partial_path = f"{filename}.part"
        with open(partial_path, 'wb') as file:
            file.write(response.content)
        os.replace(partial_path, filename)
        return True

    def _extract_zip_file(self, zip_path):
        """Extract every member of *zip_path* that is not already extracted."""
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                for file_info in zip_ref.infolist():
                    extracted_file_path = os.path.join(self.csv_extract_dir, file_info.filename)

                    if os.path.exists(extracted_file_path):
                        print(f"File {extracted_file_path} already exists. Skipping extraction.")
                        continue

                    zip_ref.extract(file_info, path=self.csv_extract_dir)
        except zipfile.BadZipFile as exc:
            # One corrupt archive should not abort a multi-day batch.
            print(f"Error: {exc} - Unable to extract {zip_path}")

    def download_and_extract_data(self, start_date, end_date):
        """Download and extract one archive per day in ``[start_date, end_date]``.

        Args:
            start_date: First day (``datetime.date``), inclusive.
            end_date: Last day (``datetime.date``), inclusive.
        """
        os.makedirs(self.zip_download_dir, exist_ok=True)
        os.makedirs(self.csv_extract_dir, exist_ok=True)
        if self.feather_stored_dir is not None:
            os.makedirs(self.feather_stored_dir, exist_ok=True)

        day_count = max((end_date - start_date).days + 1, 0)
        days = (start_date + datetime.timedelta(days=offset) for offset in range(day_count))

        for current_date in tqdm(days, total=day_count, desc="Downloading and Extracting"):
            date_str = current_date.strftime('%Y-%m-%d')
            archive_name = f'ETHUSDT-aggTrades-{date_str}.zip'
            link = f'{self.base_url}{archive_name}'
            filename = os.path.join(self.zip_download_dir, archive_name)

            if self._download_zip_file(link, filename):
                self._extract_zip_file(filename)

    def process_csv_files(self):
        """Concatenate every extracted CSV file, in file-name order.

        Returns:
            The combined ``DataFrame``, or ``None`` when there are no CSV files.
        """
        csv_files = sorted(
            name for name in os.listdir(self.csv_extract_dir) if name.endswith('.csv')
        )
        frames = [
            pd.read_csv(os.path.join(self.csv_extract_dir, name))
            for name in tqdm(csv_files, desc="Combining CSV files")
        ]
        # A single concat avoids re-copying the accumulated frame per file.
        return pd.concat(frames) if frames else None

    def _convert_to_seconds(self, df):
        """Resample raw aggTrades rows to one row per second.

        ``df`` must contain ``transact_time`` (epoch milliseconds) plus the
        columns aggregated below. The input frame is left unmodified.

        Returns:
            A new ``DataFrame`` with columns ``''`` (the second's timestamp;
            its label is lost when the column level is dropped),
            ``agg_trade_id``, ``open``, ``high``, ``low``, ``close``,
            ``quantity``, ``first_trade_id``, ``last_trade_id`` and
            ``is_buyer_maker``. Seconds without trades are forward-filled,
            then back-filled.
        """
        timestamps = pd.to_datetime(df['transact_time'], unit='ms').rename('utc_time')
        indexed = df.set_index(timestamps)

        resampled_df = indexed.resample('1s').agg({
            'agg_trade_id': 'first',
            'price': 'ohlc',
            'quantity': 'sum',
            'first_trade_id': 'first',
            'last_trade_id': 'last',
            'is_buyer_maker': 'last',
        })
        resampled_df = resampled_df.reset_index()
        # 'ohlc' makes the columns two-level; keep only the inner names.
        resampled_df.columns = resampled_df.columns.droplevel()

        return resampled_df.ffill().bfill()

    def run_data_processing_workflow(self, start_date, end_date):
        """Download, extract, combine and resample the given date range.

        Returns:
            The per-second ``DataFrame``, or ``None`` when no CSV data exists.
        """
        self.download_and_extract_data(start_date, end_date)
        df = self.process_csv_files()
        if df is None:
            return None
        return self._convert_to_seconds(df)

    def combine_select_csv_to_df(self, start_date, end_date):
        """Resample and combine the extracted CSV files dated within a range.

        Each file is resampled on its own before concatenation. Files whose
        name does not end in ``YYYY-MM-DD`` are skipped.

        Args:
            start_date: First file date (``datetime.date``), inclusive.
            end_date: Last file date (``datetime.date``), inclusive.

        Returns:
            The combined ``DataFrame``, or ``None`` when no file matches.
        """
        csv_files = sorted(
            name for name in os.listdir(self.csv_extract_dir) if name.endswith('.csv')
        )
        frames = []

        for csv_file in tqdm(csv_files, desc="Combining CSV files"):
            file_date = self._date_from_filename(csv_file)
            if file_date is None or not start_date <= file_date <= end_date:
                continue
            raw = pd.read_csv(os.path.join(self.csv_extract_dir, csv_file))
            frames.append(self._convert_to_seconds(raw))

        return pd.concat(frames) if frames else None

    def delete_all_data(self):
        """Recursively delete the ZIP and CSV directories (missing ones are ignored)."""
        import shutil

        # rmtree instead of a shell 'rm -rf' string: paths containing spaces
        # or shell metacharacters are handled literally.
        for directory in (self.zip_download_dir, self.csv_extract_dir):
            shutil.rmtree(directory, ignore_errors=True)

# Example usage:
# The archive URL points at the daily ETHUSDT aggTrades dumps; the two local
# directories hold the downloaded ZIP files and the extracted CSV files.
base_url = 'https://data.binance.vision/data/futures/um/daily/aggTrades/ETHUSDT/'
zip_download_dir = '/allah/freqtrade/json_dict/binance_aggTrades'
csv_extract_dir = '/allah/freqtrade/json_dict/decompressed_csv'

data_processor = DataProcessor(base_url, zip_download_dir, csv_extract_dir)
# NOTE: start_date / end_date are only used by the commented-out workflow call
# below; the active call passes its own, wider date range.
start_date = datetime.date(2023, 9, 15)
end_date = datetime.date(2023, 10, 1)

# df = data_processor.run_data_processing_workflow(start_date, end_date)
# df = data_processor.download_and_extract_data(datetime.date(2023, 7, 1), datetime.date(2023, 10, 20))
# Combine only CSV files that are already extracted on disk (nothing is
# downloaded here); `df` is read again by the formatting cell further down.
df = data_processor.combine_select_csv_to_df(datetime.date(2023, 4, 13), datetime.date(2023, 10, 20))



Combining CSV files:   0%|          | 0/479 [00:00<?, ?it/s]

Combining CSV files: 100%|██████████| 479/479 [00:54<00:00,  8.76it/s] 


In [93]:
import requests
import os
import datetime
import zipfile
from tqdm import tqdm
import pandas as pd
import feather

class DataProcessor:
    """Download ETHUSDT aggTrades archives and cache them as per-second Feather files.

    Daily ZIP archives named ``ETHUSDT-aggTrades-YYYY-MM-DD.zip`` are fetched
    from ``base_url`` into ``zip_download_dir`` and unpacked into
    ``csv_extract_dir``. Each CSV is then resampled to one row per second and
    written to ``feather_stored_dir``, from where date ranges can be combined.

    Args:
        base_url: URL prefix (including trailing slash) of the daily archives.
        zip_download_dir: Directory where downloaded ZIP files are kept.
        csv_extract_dir: Directory where the CSV files are extracted.
        feather_stored_dir: Directory where the resampled Feather files go.
    """

    # Seconds to wait for the server before giving up on one archive.
    REQUEST_TIMEOUT = 60

    def __init__(self, base_url, zip_download_dir, csv_extract_dir, feather_stored_dir):
        self.base_url = base_url
        self.zip_download_dir = zip_download_dir
        self.csv_extract_dir = csv_extract_dir
        self.feather_stored_dir = feather_stored_dir

    @staticmethod
    def _date_from_filename(filename):
        """Return the trailing ``YYYY-MM-DD`` date of *filename*, or ``None``.

        ``None`` is returned when the name does not end in a parsable date so
        that unrelated files in the data directory can simply be skipped.
        """
        stem = os.path.splitext(filename)[0]
        date_str = '-'.join(stem.split('-')[-3:])
        try:
            return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        except ValueError:
            return None

    def download_file(self, url, destination):
        """Download *url* to *destination* unless it already exists.

        Returns:
            ``True`` if *destination* exists afterwards, ``False`` when the
            download failed (the reason is printed).
        """
        if os.path.exists(destination):
            print(f"File {destination} already exists. Skipping download.")
            return True

        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Error: {exc} - Unable to download {destination}")
            return False

        if response.status_code != 200:
            print(f"Error: {response.status_code} - Unable to download {destination}")
            return False

        # Write to a temporary name first: an interrupted write must not leave
        # a truncated archive that later runs would treat as "already exists".
        partial_path = f"{destination}.part"
        with open(partial_path, 'wb') as file:
            file.write(response.content)
        os.replace(partial_path, destination)
        print(f"Downloaded {destination}")
        return True

    def _extract_archive(self, zip_filename):
        """Extract the members of *zip_filename* that are not on disk yet."""
        try:
            with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
                for file_info in zip_ref.infolist():
                    extracted_file_path = os.path.join(self.csv_extract_dir, file_info.filename)

                    if os.path.exists(extracted_file_path):
                        print(f"File {extracted_file_path} already exists. Skipping extraction.")
                        continue

                    zip_ref.extract(file_info, path=self.csv_extract_dir)
                    print(f"Extracted {extracted_file_path}")
        except zipfile.BadZipFile as exc:
            # One corrupt archive should not abort a multi-day batch.
            print(f"Error: {exc} - Unable to extract {zip_filename}")

    def download_and_extract_data(self, start_date, end_date):
        """Download and extract one archive per day in ``[start_date, end_date]``.

        Days whose archive cannot be downloaded are reported and skipped.

        Args:
            start_date: First day (``datetime.date``), inclusive.
            end_date: Last day (``datetime.date``), inclusive.
        """
        os.makedirs(self.zip_download_dir, exist_ok=True)
        os.makedirs(self.csv_extract_dir, exist_ok=True)
        os.makedirs(self.feather_stored_dir, exist_ok=True)

        day_count = max((end_date - start_date).days + 1, 0)
        days = (start_date + datetime.timedelta(days=offset) for offset in range(day_count))

        for current_date in tqdm(days, total=day_count, desc="Downloading and Extracting"):
            date_str = current_date.strftime('%Y-%m-%d')
            archive_name = f'ETHUSDT-aggTrades-{date_str}.zip'
            zip_url = f'{self.base_url}{archive_name}'
            zip_filename = os.path.join(self.zip_download_dir, archive_name)

            # Opening a ZIP that was never downloaded would raise
            # FileNotFoundError and stop the whole range.
            if self.download_file(zip_url, zip_filename):
                self._extract_archive(zip_filename)

    def transfer_to_feather(self, csv_file_path, feather_file_path):
        """Resample one CSV file to seconds and store it as a Feather file.

        Nothing is done when *feather_file_path* already exists.
        """
        if os.path.exists(feather_file_path):
            print(f"Feather file for {csv_file_path} already exists. Skipping transfer.")
            return

        df = self._convert_to_seconds(pd.read_csv(csv_file_path))
        # Same temp-then-rename scheme as downloads, so a failed write is
        # retried on the next run instead of being skipped as "exists".
        partial_path = f"{feather_file_path}.part"
        df.to_feather(partial_path)
        os.replace(partial_path, feather_file_path)
        print(f"Transferred {csv_file_path} to Feather format.")

    def transfer_extracted_csv_to_feather(self):
        """Convert every extracted CSV file to a Feather file of the same stem."""
        os.makedirs(self.feather_stored_dir, exist_ok=True)
        csv_files = sorted(
            name for name in os.listdir(self.csv_extract_dir) if name.endswith('.csv')
        )

        for csv_file in tqdm(csv_files, desc="Transferring CSV to Feather"):
            csv_file_path = os.path.join(self.csv_extract_dir, csv_file)
            feather_file_path = os.path.join(
                self.feather_stored_dir, f"{csv_file.replace('.csv', '.feather')}"
            )
            self.transfer_to_feather(csv_file_path, feather_file_path)

    def process_csv_files(self):
        """Concatenate every extracted CSV file, in file-name order.

        Returns:
            The combined ``DataFrame``, or ``None`` when there are no CSV files.
        """
        csv_files = sorted(
            name for name in os.listdir(self.csv_extract_dir) if name.endswith('.csv')
        )
        frames = [
            pd.read_csv(os.path.join(self.csv_extract_dir, name))
            for name in tqdm(csv_files, desc="Combining CSV files")
        ]
        # A single concat avoids re-copying the accumulated frame per file.
        return pd.concat(frames) if frames else None

    def _convert_to_seconds(self, df):
        """Resample raw aggTrades rows to one row per second.

        ``df`` must contain ``transact_time`` (epoch milliseconds) plus the
        columns aggregated below. The input frame is left unmodified.

        Returns:
            A new ``DataFrame`` with columns ``''`` (the second's timestamp;
            its label is lost when the column level is dropped),
            ``agg_trade_id``, ``open``, ``high``, ``low``, ``close``,
            ``quantity``, ``first_trade_id``, ``last_trade_id`` and
            ``is_buyer_maker``. Seconds without trades are forward-filled,
            then back-filled.
        """
        timestamps = pd.to_datetime(df['transact_time'], unit='ms').rename('utc_time')
        indexed = df.set_index(timestamps)

        resampled_df = indexed.resample('1s').agg({
            'agg_trade_id': 'first',
            'price': 'ohlc',
            'quantity': 'sum',
            'first_trade_id': 'first',
            'last_trade_id': 'last',
            'is_buyer_maker': 'last',
        })
        resampled_df = resampled_df.reset_index()
        # 'ohlc' makes the columns two-level; keep only the inner names.
        resampled_df.columns = resampled_df.columns.droplevel()

        return resampled_df.ffill().bfill()

    def combine_to_df(self, data_dir, start_date, end_date):
        """Combine the Feather files in *data_dir* dated within a range.

        Files whose name does not end in ``YYYY-MM-DD`` are skipped.

        Args:
            data_dir: Directory containing ``*.feather`` files.
            start_date: First file date (``datetime.date``), inclusive.
            end_date: Last file date (``datetime.date``), inclusive.

        Returns:
            The combined ``DataFrame``, or ``None`` when no file matches.
        """
        data_files = sorted(
            name for name in os.listdir(data_dir) if name.endswith('.feather')
        )
        frames = []

        for data_file in tqdm(data_files, desc="Combining Feather Files"):
            file_date = self._date_from_filename(data_file)
            if file_date is None or not start_date <= file_date <= end_date:
                continue

            df = pd.read_feather(os.path.join(data_dir, data_file))
            # Files written by transfer_to_feather are already resampled and
            # no longer carry 'transact_time'; resampling them again raised
            # KeyError. Only raw trade data still needs the conversion.
            if 'transact_time' in df.columns:
                df = self._convert_to_seconds(df)
            frames.append(df)

        return pd.concat(frames) if frames else None

    def delete_all_data(self):
        """Recursively delete the ZIP, CSV and Feather directories (missing ones are ignored)."""
        import shutil

        # rmtree instead of a shell 'rm -rf' string: paths containing spaces
        # or shell metacharacters are handled literally.
        for directory in (self.zip_download_dir, self.csv_extract_dir, self.feather_stored_dir):
            shutil.rmtree(directory, ignore_errors=True)

# Example usage:
# The archive URL points at the daily ETHUSDT aggTrades dumps; the three local
# directories hold the ZIP files, the extracted CSV files and the Feather files.
base_url = 'https://data.binance.vision/data/futures/um/daily/aggTrades/ETHUSDT/'
zip_download_dir = '/allah/freqtrade/json_dict/aggTrades/binance_aggTrades'
csv_extract_dir = '/allah/freqtrade/json_dict/aggTrades/decompressed_csv'
feather_stored_dir = '/allah/freqtrade/json_dict/aggTrades/feather_data'

# NOTE(review): the recorded output of this cell is a TypeError from
# DataProcessor.__init__ about the argument count, although the class defined
# in this cell accepts four arguments - presumably a stale class definition
# was still active in the kernel; confirm by re-running the cell.
data_processor = DataProcessor(base_url, zip_download_dir, csv_extract_dir, feather_stored_dir)
start_date = datetime.date(2023, 9, 15)
end_date = datetime.date(2023, 10, 1)

# Download and extract data
data_processor.download_and_extract_data(start_date, end_date)

# Transfer the extracted CSV files to Feather format
data_processor.transfer_extracted_csv_to_feather()

# Combine selected Feather files to a DataFrame
# The date range here is wider than the one downloaded above, so days outside
# start_date..end_date are only included if their Feather files already exist.
combined_df = data_processor.combine_to_df(feather_stored_dir, datetime.date(2023, 4, 13), datetime.date(2023, 10, 20))

# Perform operations with combined_df as needed.


TypeError: DataProcessor.__init__() takes 4 positional arguments but 5 were given

In [90]:
import pandas as pd
from datetime import datetime
import feather

def format_and_save_dataframe(input_df, output_path, end_time='2023-10-16 23:00:00'):
    """Re-label a frame as 1-minute candles and save it as a Feather file.

    The rows of ``input_df`` are kept in order, but their timestamps are
    replaced by a synthetic 1-minute grid that ends at ``end_time``; any
    existing index or time column of ``input_df`` is discarded.

    Args:
        input_df: Frame with ``open``, ``high``, ``low``, ``close`` and
            ``quantity`` columns. It is not modified.
        output_path: Destination Feather file (lz4, compression level 9).
            Pass ``None`` to skip writing and only get the formatted frame.
        end_time: Timestamp assigned to the last row; anything accepted by
            ``pd.Timestamp``.

    Returns:
        The formatted ``DataFrame`` with columns ``date``, ``open``, ``high``,
        ``low``, ``close`` and ``volume``.
    """
    # One synthetic minute per input row, counted backwards from end_time.
    # 'min' replaces the deprecated 'T' alias.
    new_index = pd.date_range(
        end=pd.Timestamp(end_time), periods=len(input_df), freq='1min'
    )

    # to_numpy() drops the source index so values align by position.
    df_formatted = pd.DataFrame({
        'date': new_index,
        'open': input_df['open'].to_numpy(),
        'high': input_df['high'].to_numpy(),
        'low': input_df['low'].to_numpy(),
        'close': input_df['close'].to_numpy(),
        'volume': input_df['quantity'].to_numpy(),
    })

    if output_path is not None:
        df_formatted.to_feather(
            output_path, compression_level=9, compression='lz4')

    return df_formatted

# Usage
# `df` is the global produced by the CSV-combining cell; it is copied so the
# formatting step works on its own frame.
input_df = df.copy()  # Replace with your actual DataFrame
# NOTE(review): the target file is named BTC_USDT although the processor cells
# in this notebook download ETHUSDT data - confirm this is intentional.
output_path = '/allah/freqtrade/user_data/data/binance/futures/BTC_USDT_USDT-1m-futures.feather'
format_and_save_dataframe(input_df, output_path)
