In [None]:
import data_processor_multi_processing as processor
import plotter
from data_service import DataService

In [None]:
def get_ranges(symbol, list_of_dataframes):
    """Return the overall (min low, max high) for a symbol across dataframes.

    Only the part of ``symbol`` before the first "-" is used, so "ETH-USDT"
    looks up the columns "high (ETH)" and "low (ETH)". Dataframes missing
    either column are ignored.

    Parameters:
    symbol (str): Trading pair or base symbol, e.g. "ETH-USDT" or "ETH".
    list_of_dataframes (iterable): Dataframes to scan.

    Returns:
    tuple: (range_min, range_max). When no dataframe contains both columns
    the result is (inf, -inf).
    """
    base = symbol.split("-")[0]
    # The column names depend only on the symbol, so build them once.
    high_name = f"high ({base})"
    low_name = f"low ({base})"

    lowest = float('inf')
    highest = float('-inf')

    for frame in list_of_dataframes:
        # Skip frames that do not carry both price columns.
        if high_name not in frame or low_name not in frame:
            continue
        highest = max(highest, frame[high_name].max())
        lowest = min(lowest, frame[low_name].min())

    return lowest, highest


def replace_cols_with_scaled_cols(samples, selected_scaler='StandardScaler'):
    """Return copies of the samples whose dataframes have been scaled.

    Each sample's 'dataframe' is passed through
    ``processor.scale_and_cleanup_dataframe``; 'timestamp' and 'label' are
    carried over unchanged.

    The previous implementation assigned the rebuilt sample to the loop
    variable only, so the scaled dataframes were thrown away and the input
    list was returned as-is. A new list holding the rebuilt samples is now
    returned instead.

    Parameters:
    samples (list): Dictionaries with 'timestamp', 'label' and 'dataframe' keys.
    selected_scaler (str): Currently unused; kept so existing callers that
        pass it keep working.
        NOTE(review): confirm whether the processor should receive it.

    Returns:
    list: New list of new sample dictionaries; the input list and its
    dictionaries are not rebound or replaced by this function.
    """
    return [
        {
            'timestamp': sample['timestamp'],
            'label': sample['label'],
            'dataframe': processor.scale_and_cleanup_dataframe(sample['dataframe']),
        }
        for sample in samples
    ]

In [None]:
# Open the data service and list the available collection names together
# with their position, so that a collection can be selected by index.
ds = DataService()
# Left disabled on purpose: enabling this call would delete all collections.
# ds.delete_all_collections()
collections = ds.get_all_collection_names()

# Print "<index> <name>" for every collection.
for i, collection in enumerate(collections):
    print(i, collection)

In [None]:
# Hard-coded position of the collection to load within `collections`.
# NOTE: this rebinds `collection` to an int index (it is also the name of the
# loop variable used when listing the collections).
collection = 6
# Samples are later indexed with the keys 'timestamp' and 'dataframe', so
# `data` is presumably a list of dicts with those keys — verify against
# DataService.
data = ds.get_dataframes_as_dicts(collection_name=collections[collection])

In [None]:
import pandas as pd

def check_timestamps(timestamps, max_gap_seconds=180):
    """Check that timestamps are ascending and free of large gaps.

    Parameters:
    timestamps (pd.Series or iterable): Values convertible by
        ``pd.to_datetime``. Anything that is not already a Series is
        wrapped in one first.
    max_gap_seconds (float): Largest allowed distance, in seconds, between
        two consecutive timestamps. A difference strictly greater than
        this counts as a gap. Defaults to 180 (3 minutes).

    Returns:
    dict: {
        'ascending': bool, True if the timestamps never decrease,
        'has_gaps': bool, True if any consecutive difference exceeds
            ``max_gap_seconds``,
        'gaps_indices': list of index labels of the timestamps that follow
            a gap,
    }
    """
    # Convert to pandas datetime if not already
    if not isinstance(timestamps, pd.Series):
        timestamps = pd.Series(timestamps)
    timestamps = pd.to_datetime(timestamps)

    # Plain bools keep the result easy to compare and serialise.
    ascending = bool(timestamps.is_monotonic_increasing)

    # Seconds between each timestamp and its predecessor; the first entry
    # has no predecessor and is NaN, so it is skipped positionally.
    time_diffs = timestamps.diff().dt.total_seconds()
    gaps = time_diffs.iloc[1:] > max_gap_seconds

    return {
        'ascending': ascending,
        'has_gaps': bool(gaps.any()),
        'gaps_indices': gaps[gaps].index.tolist(),  # Indices of gaps if any
    }

In [None]:
# Validate the loaded samples before splitting: their timestamps must be in
# ascending order and must not contain gaps.
timestamps = pd.Series([s['timestamp'] for s in data])
checks = check_timestamps(timestamps)
if not checks['ascending'] or checks['has_gaps']:
    # A bare `raise` has no active exception to re-raise here and would only
    # produce an unrelated RuntimeError; report the actual problem instead.
    raise ValueError(
        "Timestamp validation failed: "
        f"ascending={checks['ascending']}, "
        f"has_gaps={checks['has_gaps']}, "
        f"gaps_indices={checks['gaps_indices']}"
    )

In [None]:
def split_list(data, fractions):
    """
    Splits a list of dictionaries into 3 contiguous segments based on given fractions.

    The segments keep the original order: the first segment is taken from
    the start of ``data``, the third from the end.

    Parameters:
    data (list): List of dictionaries to be split.
    fractions (list): List of 3 non-negative fractions representing the
        relative sizes of the segments. They must sum to 1 (within floating
        point tolerance).

    Returns:
    tuple: Three lists of dictionaries, each representing a segment.

    Raises:
    ValueError: If ``fractions`` does not contain exactly 3 elements,
        contains a negative value, or does not sum to 1.
    """
    if len(fractions) != 3:
        raise ValueError("Fractions list must contain exactly 3 elements.")

    if any(fraction < 0 for fraction in fractions):
        raise ValueError("Fractions must be non-negative.")

    # Compare with a tolerance: an exact float comparison can reject valid
    # inputs such as [0.7, 0.2, 0.1] because of rounding error in the sum.
    if abs(sum(fractions) - 1) > 1e-9:
        raise ValueError("Fractions must sum to 1.")

    # Calculate the number of items in each segment (rounded down)
    total_length = len(data)
    lengths = [int(total_length * fraction) for fraction in fractions]

    # Rounding down can leave items unassigned; hand them out one at a time
    # to whichever segment is currently the smallest.
    while sum(lengths) < total_length:
        lengths[lengths.index(min(lengths))] += 1

    # Create the segments
    first_end = lengths[0]
    second_end = first_end + lengths[1]
    segment1 = data[:first_end]
    segment2 = data[first_end:second_end]
    segment3 = data[second_end:]

    return segment1, segment2, segment3


In [None]:
# Split the samples, in their existing order, into contiguous
# train / validation / test segments of 80% / 10% / 10%.
train, val, test = split_list(data, fractions=
                              [
                                  0.8, 0.1, 0.1
                              ])

In [None]:
def check_temporal_order(train, val, test):
    """
    Verifies that the train, val and test sets follow each other in time without overlapping.

    Two conditions must hold: the timestamps inside every set are in
    ascending order, and each set ends strictly before the next one begins
    (train before val, val before test).

    Parameters:
    train (list): List of dictionaries for the training set.
    val (list): List of dictionaries for the validation set.
    test (list): List of dictionaries for the test set.

    Returns:
    bool: True if both conditions hold, False otherwise.
    """
    # One datetime index per split, kept in chronological split order
    split_indexes = [
        pd.to_datetime([entry['timestamp'] for entry in split])
        for split in (train, val, test)
    ]

    # Every split has to be sorted internally
    for index in split_indexes:
        if not index.is_monotonic_increasing:
            return False

    # Each split has to end strictly before the following one starts
    for earlier, later in zip(split_indexes, split_indexes[1:]):
        if earlier.max() >= later.min():
            return False

    return True

In [None]:
# Abort if the splits overlap in time or are out of order.
check = check_temporal_order(train, val, test)
if not check:
    # A bare `raise` has no active exception to re-raise here and would only
    # produce an unrelated RuntimeError; report the actual problem instead.
    raise ValueError(
        "train/val/test splits are not temporally separated and in ascending order."
    )

In [None]:
# Dump the training split to the cell output for manual inspection.
print(train)

In [None]:
# Compute the overall low/high price range per symbol from the training
# split only (validation and test dataframes are not included).
dataframes = [s['dataframe'] for s in train]
# get_ranges keeps only the part before "-", i.e. it reads the
# "high (ETH)" / "low (ETH)" and "high (BTC)" / "low (BTC)" columns.
range_min, range_max = get_ranges("ETH-USDT", dataframes)
range_min_2, range_max_2 = get_ranges("BTC-USDT", dataframes)

In [None]:
for dataset in [train, val, test]:
    for sample in dataset:
        plot = plotter.create_minimal_candlestick_plot(sample['dataframe'], 'ETH', 'BTC', 
        plotter.save_plot(plot, collection, sample['timestamp'])