In [14]:
import datetime
import os
import shutil
import zipfile

import pandas as pd
import requests
from tqdm import tqdm

class DataProcessor:
    """Download daily aggTrades ZIP archives, extract their CSVs and build 1-second bars.

    Daily archives are named ``{file_prefix}-YYYY-MM-DD.zip`` and are fetched from
    ``base_url`` + archive name. ZIPs are stored in ``zip_download_dir`` and their
    members are extracted into ``csv_extract_dir``.
    """

    def __init__(self, base_url, zip_download_dir, csv_extract_dir, file_prefix='ETHUSDT-aggTrades'):
        """Store the remote URL prefix, the two working directories and the file prefix.

        Args:
            base_url: URL prefix that the archive file name is appended to.
            zip_download_dir: Directory where downloaded ZIP files are written.
            csv_extract_dir: Directory where ZIP members are extracted.
            file_prefix: Leading part of every daily archive name. Defaults to the
                previously hard-coded ``'ETHUSDT-aggTrades'``.
        """
        self.base_url = base_url
        self.zip_download_dir = zip_download_dir
        self.csv_extract_dir = csv_extract_dir
        self.file_prefix = file_prefix

    def _download_zip_file(self, link, filename, timeout=60):
        """Download ``link`` to ``filename`` unless the file already exists.

        Args:
            link: URL of the ZIP archive.
            filename: Destination path on disk.
            timeout: Seconds passed to ``requests.get`` so a stalled connection
                cannot block forever.

        Returns:
            True if the file exists after the call (already present or freshly
            downloaded), False if the request failed or returned a non-200 status.
        """
        # Check if the file already exists
        if os.path.exists(filename):
            print(f"File {filename} already exists. Skipping download.")
            return True

        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error: {exc} - Unable to download the ZIP file for {filename}")
            return False

        if response.status_code != 200:
            print(f"Error: {response.status_code} - Unable to download the ZIP file for {filename}")
            return False

        # Write to a side file and rename: an interrupted write must not leave a
        # truncated archive at `filename`, because the existence check above would
        # then skip the download on every later run.
        partial_path = f"{filename}.part"
        with open(partial_path, 'wb') as file:
            file.write(response.content)
        os.replace(partial_path, filename)
        return True

    def _extract_zip_file(self, zip_path):
        """Extract every member of ``zip_path`` into ``csv_extract_dir``.

        Members whose target path already exists are skipped (with a message).
        """
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                # Construct the full path for the extracted file
                extracted_file_path = os.path.join(self.csv_extract_dir, file_info.filename)

                # Check if the file already exists, and skip extraction if it does
                if os.path.exists(extracted_file_path):
                    print(f"File {extracted_file_path} already exists. Skipping extraction.")
                    continue

                # Extract the file
                zip_ref.extract(file_info, path=self.csv_extract_dir)

    def download_and_extract_data(self, start_date, end_date):
        """Download and extract one archive per day in ``[start_date, end_date]``.

        Args:
            start_date: First day (``datetime.date``), inclusive.
            end_date: Last day (``datetime.date``), inclusive.

        Returns:
            None. Both working directories are created if missing.
        """
        os.makedirs(self.zip_download_dir, exist_ok=True)
        os.makedirs(self.csv_extract_dir, exist_ok=True)

        # An inverted range yields zero days instead of a negative progress total.
        total_days = max((end_date - start_date).days + 1, 0)

        # The context manager closes the bar even if a download or extraction raises.
        with tqdm(total=total_days, desc="Downloading and Extracting") as download_bar:
            for offset in range(total_days):
                current_date = start_date + datetime.timedelta(days=offset)
                date_str = current_date.strftime('%Y-%m-%d')
                archive_name = f'{self.file_prefix}-{date_str}.zip'
                link = f'{self.base_url}{archive_name}'
                filename = os.path.join(self.zip_download_dir, archive_name)

                if self._download_zip_file(link, filename):
                    self._extract_zip_file(filename)
                download_bar.update(1)

    def process_csv_files(self):
        """Read every ``.csv`` in ``csv_extract_dir`` (sorted by name) into one frame.

        Returns:
            The row-wise concatenation of all CSVs, or None when the directory
            contains no CSV files.
        """
        csv_files = sorted(
            name for name in os.listdir(self.csv_extract_dir) if name.endswith('.csv')
        )

        # Collect first and concatenate once; concatenating inside the loop
        # re-copies the accumulated frame on every iteration.
        frames = [
            pd.read_csv(os.path.join(self.csv_extract_dir, csv_file))
            for csv_file in tqdm(csv_files, desc="Combining CSV files")
        ]
        return pd.concat(frames) if frames else None

    def _convert_to_seconds(self, df):
        """Resample a trade frame to 1-second rows without modifying ``df``.

        ``df`` must provide the columns ``transact_time`` (epoch milliseconds, as
        implied by ``unit='ms'``), ``agg_trade_id``, ``price``, ``quantity``,
        ``first_trade_id``, ``last_trade_id`` and ``is_buyer_maker``.

        Returns:
            A new frame with one row per second and flat columns
            ``''`` (the bucket timestamp), ``agg_trade_id``, ``open``, ``high``,
            ``low``, ``close``, ``quantity``, ``first_trade_id``,
            ``last_trade_id``, ``is_buyer_maker``. Missing values are forward-
            then backward-filled column by column.
        """
        # Build the time index on a new frame so the caller's `df` keeps its
        # original columns and index.
        utc_index = pd.to_datetime(df['transact_time'], unit='ms').rename('utc_time')
        indexed = df.set_index(utc_index)

        # '1s' is the non-deprecated spelling of the former 'S' alias.
        resampled_df = indexed.resample('1s').agg({
            'agg_trade_id': 'first',
            'price': 'ohlc',
            'quantity': 'sum',
            'first_trade_id': 'first',
            'last_trade_id': 'last',
            'is_buyer_maker': 'last',
            # 'transact_time': 'last',
        })
        resampled_df = resampled_df.reset_index()
        # Dropping the outer column level leaves the timestamp column named ''
        # (reset_index pads the missing inner level with an empty string).
        # Callers rename that '' column, so the name is kept as is.
        resampled_df.columns = resampled_df.columns.droplevel()

        # .ffill()/.bfill() replace the deprecated fillna(method=...) calls.
        return resampled_df.ffill().bfill()

    def run_data_processing_workflow(self, start_date, end_date):
        """Download, extract, combine and resample the data for a date range.

        Returns:
            The 1-second frame built from every CSV in ``csv_extract_dir``, or
            None when no CSV files are available.
        """
        self.download_and_extract_data(start_date, end_date)
        df = self.process_csv_files()
        if df is None:
            # Nothing was downloaded or extracted; there is nothing to resample.
            return None
        df = self._convert_to_seconds(df)
        # define start and end date of df
        # df = df[(df['utc_time'] >= start_date) & (df['utc_time'] <= end_date)]
        return df

    @staticmethod
    def _date_from_filename(csv_file):
        """Return the date encoded in the last three '-' separated fields, or None.

        Example: ``'ETHUSDT-aggTrades-2023-10-13.csv'`` -> ``date(2023, 10, 13)``.
        """
        date_str = '-'.join(csv_file.split('-')[-3:]).replace('.csv', '')
        try:
            return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        except ValueError:
            return None

    def combine_select_csv_to_df(self, start_date, end_date):
        """Resample and combine the CSVs whose file-name date is within the range.

        Args:
            start_date: First day (``datetime.date``), inclusive.
            end_date: Last day (``datetime.date``), inclusive.

        Returns:
            The concatenated 1-second frames in file-name order, or None when no
            file matches. CSV files whose name does not end in ``YYYY-MM-DD`` are
            ignored instead of aborting the run.
        """
        csv_files = sorted(
            name for name in os.listdir(self.csv_extract_dir) if name.endswith('.csv')
        )

        frames = []
        for csv_file in tqdm(csv_files, desc="Combining CSV files"):
            file_date = self._date_from_filename(csv_file)

            # Check if the file date is within the specified date range
            if file_date is None or not (start_date <= file_date <= end_date):
                continue

            file_path = os.path.join(self.csv_extract_dir, csv_file)
            frames.append(self._convert_to_seconds(pd.read_csv(file_path)))

        return pd.concat(frames) if frames else None

    def delete_all_data(self):
        """Recursively delete both working directories; missing ones are ignored."""
        # shutil.rmtree avoids interpolating paths into a shell command, which
        # broke on paths containing spaces or shell metacharacters.
        shutil.rmtree(self.zip_download_dir, ignore_errors=True)
        shutil.rmtree(self.csv_extract_dir, ignore_errors=True)

# Example usage:
# URL prefix that DataProcessor appends each daily archive file name to.
base_url = 'https://data.binance.vision/data/futures/um/daily/aggTrades/ETHUSDT/'
# Local directories for the downloaded ZIPs and for the CSVs extracted from them.
# NOTE(review): these are absolute, machine-specific paths — adjust before running elsewhere.
zip_download_dir = '/allah/freqtrade/json_dict/binance_aggTrades'
csv_extract_dir = '/allah/freqtrade/json_dict/decompressed_csv'

data_processor = DataProcessor(base_url, zip_download_dir, csv_extract_dir)
# NOTE(review): within this cell, start_date / end_date are only used by the
# commented-out workflow call below; the active call passes its own literal dates.
start_date = datetime.date(2023, 9, 15)
end_date = datetime.date(2023, 10, 1)

# Alternative entry points kept for reference (full workflow / download only):
# df = data_processor.run_data_processing_workflow(start_date, end_date)
# df = data_processor.download_and_extract_data(datetime.date(2023, 7, 1), datetime.date(2023, 10, 20))
# Build `df` from CSVs already present in csv_extract_dir whose file-name date
# falls in the given range; no download happens in this call.
df = data_processor.combine_select_csv_to_df(datetime.date(2023, 10, 13), datetime.date(2023, 10, 20))



Combining CSV files: 100%|██████████| 479/479 [00:01<00:00, 347.02it/s] 


In [44]:
import pandas as pd
# Import the module, not `from datetime import datetime`: that form rebinds the
# notebook-global name `datetime` to the class, after which DataProcessor methods
# that call `datetime.timedelta` / `datetime.datetime.strptime` fail when re-run.
import datetime

# Work on a copy so the 1-second frame `df` stays untouched.
df_1m_fake = df.copy()

# Timestamp that the last row of the relabelled series will carry.
end_time = datetime.datetime.strptime('2023-10-16 23:00:00', '%Y-%m-%d %H:%M:%S')

# Synthetic index: one label per existing row, spaced one minute apart and ending
# at end_time, so every 1-second row is presented as if it were a 1-minute bar.
# 'min' is the non-deprecated spelling of the former 'T' alias.
new_index = pd.date_range(end=end_time, periods=len(df_1m_fake), freq='1min')

# Attach the synthetic index; the original timestamp column is named '' and is
# renamed to 'real 1s' so the true time of each row stays visible.
df_1m_fake.index = new_index
df_1m_fake = df_1m_fake.rename(columns={'': 'real 1s'})

# Move the synthetic index into a regular column (it is named 'index').
df_1m_fake = df_1m_fake.reset_index()

print(df_1m_fake)


                     index             real 1s  agg_trade_id     open  \
0      2023-02-18 23:17:00 2023-10-13 00:00:04  1.393602e+09  1538.82   
1      2023-02-18 23:18:00 2023-10-13 00:00:05  1.393602e+09  1538.99   
2      2023-02-18 23:19:00 2023-10-13 00:00:06  1.393602e+09  1538.91   
3      2023-02-18 23:20:00 2023-10-13 00:00:07  1.393602e+09  1538.99   
4      2023-02-18 23:21:00 2023-10-13 00:00:08  1.393602e+09  1538.89   
...                    ...                 ...           ...      ...   
345579 2023-10-16 22:56:00 2023-10-16 23:59:55  1.395611e+09  1598.53   
345580 2023-10-16 22:57:00 2023-10-16 23:59:56  1.395611e+09  1598.53   
345581 2023-10-16 22:58:00 2023-10-16 23:59:57  1.395611e+09  1598.54   
345582 2023-10-16 22:59:00 2023-10-16 23:59:58  1.395611e+09  1598.62   
345583 2023-10-16 23:00:00 2023-10-16 23:59:59  1.395611e+09  1598.61   

           high      low    close  quantity  first_trade_id  last_trade_id  \
0       1538.99  1538.81  1538.98    44.168  

In [45]:
# Show the relabelled frame with the notebook's rich table rendering.
df_1m_fake

Unnamed: 0,index,real 1s,agg_trade_id,open,high,low,close,quantity,first_trade_id,last_trade_id,is_buyer_maker
0,2023-02-18 23:17:00,2023-10-13 00:00:04,1.393602e+09,1538.82,1538.99,1538.81,1538.98,44.168,3.276229e+09,3.276229e+09,1.0
1,2023-02-18 23:18:00,2023-10-13 00:00:05,1.393602e+09,1538.99,1538.99,1538.90,1538.91,24.199,3.276229e+09,3.276229e+09,0.0
2,2023-02-18 23:19:00,2023-10-13 00:00:06,1.393602e+09,1538.91,1538.99,1538.91,1538.98,114.245,3.276229e+09,3.276229e+09,1.0
3,2023-02-18 23:20:00,2023-10-13 00:00:07,1.393602e+09,1538.99,1538.99,1538.90,1538.90,127.492,3.276229e+09,3.276230e+09,1.0
4,2023-02-18 23:21:00,2023-10-13 00:00:08,1.393602e+09,1538.89,1538.89,1538.88,1538.89,3.531,3.276230e+09,3.276230e+09,0.0
...,...,...,...,...,...,...,...,...,...,...,...
345579,2023-10-16 22:56:00,2023-10-16 23:59:55,1.395611e+09,1598.53,1598.54,1598.53,1598.54,30.008,3.282371e+09,3.282371e+09,0.0
345580,2023-10-16 22:57:00,2023-10-16 23:59:56,1.395611e+09,1598.53,1598.54,1598.53,1598.54,0.031,3.282371e+09,3.282371e+09,0.0
345581,2023-10-16 22:58:00,2023-10-16 23:59:57,1.395611e+09,1598.54,1598.61,1598.54,1598.61,3.971,3.282371e+09,3.282371e+09,0.0
345582,2023-10-16 22:59:00,2023-10-16 23:59:58,1.395611e+09,1598.62,1598.62,1598.62,1598.62,0.075,3.282371e+09,3.282371e+09,0.0


In [42]:
# NOTE(review): `transformed_data` is not assigned in any cell visible here, and this
# cell's execution count (42) is lower than the cells above it (44, 45) — it may rely
# on a cell that ran earlier or was removed; confirm it survives Restart & Run All.
transformed_data

Unnamed: 0_level_0,index,real 1s,agg_trade_id,open,high,low,close,volume,first_trade_id,last_trade_id,is_buyer_maker
1m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-02-18 23:17:00,2023-02-18 23:17:00,2023-10-13 00:00:04,1.393602e+09,1538.82,1538.99,1538.81,1538.98,44.168,3.276229e+09,3.276229e+09,1.0
2023-02-18 23:18:00,2023-02-18 23:18:00,2023-10-13 00:00:05,1.393602e+09,1538.99,1538.99,1538.90,1538.91,24.199,3.276229e+09,3.276229e+09,0.0
2023-02-18 23:19:00,2023-02-18 23:19:00,2023-10-13 00:00:06,1.393602e+09,1538.91,1538.99,1538.91,1538.98,114.245,3.276229e+09,3.276229e+09,1.0
2023-02-18 23:20:00,2023-02-18 23:20:00,2023-10-13 00:00:07,1.393602e+09,1538.99,1538.99,1538.90,1538.90,127.492,3.276229e+09,3.276230e+09,1.0
2023-02-18 23:21:00,2023-02-18 23:21:00,2023-10-13 00:00:08,1.393602e+09,1538.89,1538.89,1538.88,1538.89,3.531,3.276230e+09,3.276230e+09,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2023-10-16 22:56:00,2023-10-16 22:56:00,2023-10-16 23:59:55,1.395611e+09,1598.53,1598.54,1598.53,1598.54,30.008,3.282371e+09,3.282371e+09,0.0
2023-10-16 22:57:00,2023-10-16 22:57:00,2023-10-16 23:59:56,1.395611e+09,1598.53,1598.54,1598.53,1598.54,0.031,3.282371e+09,3.282371e+09,0.0
2023-10-16 22:58:00,2023-10-16 22:58:00,2023-10-16 23:59:57,1.395611e+09,1598.54,1598.61,1598.54,1598.61,3.971,3.282371e+09,3.282371e+09,0.0
2023-10-16 22:59:00,2023-10-16 22:59:00,2023-10-16 23:59:58,1.395611e+09,1598.62,1598.62,1598.62,1598.62,0.075,3.282371e+09,3.282371e+09,0.0
