In [None]:
# Import necessary libraries
import pyarrow.feather as feather
import pandas as pd
import talib
import numpy as np
from pathlib import Path

# Define the Block class
class Block:
    """Container for one trend segment of the price series.

    Stores the boundary timestamps and prices handed in by the caller, the
    segment's duration, the underlying rows, a derived ``direction`` label and
    an initially empty ``features`` dict.
    """

    def __init__(self, start_date, end_date, start_price, end_price, duration, data_segment):
        # Boundary timestamps and prices of the segment
        self.start_date, self.end_date = start_date, end_date
        self.start_price, self.end_price = start_price, end_price
        self.duration = duration
        # All the data points that belong to this block
        self.data_segment = data_segment
        # Only a strict rise is 'UP'; a flat or falling segment is labelled 'DOWN'
        rising = start_price < end_price
        self.direction = 'UP' if rising else 'DOWN'
        # Starts empty; populated after construction
        self.features = {}

# Function to load cryptocurrency data
def load_crypto_data(data_file_path, start_date, end_date, tema_period=50):
    """
    Load cryptocurrency data, filter by date range, and calculate TEMA and trend information.

    Parameters:
    data_file_path (str): Path to the Feather file containing the dataset.
    start_date (str): The start date for filtering the data in 'YYYY-MM-DD' format.
        Naive values are interpreted as UTC; timezone-aware values are converted to UTC.
    end_date (str): The end date for filtering the data in 'YYYY-MM-DD' format (inclusive).
        A bare date parses to midnight, so later candles of that day are excluded.
    tema_period (int): Period passed to talib.TEMA. Defaults to 50.

    Returns:
    pd.DataFrame: Copy of the filtered rows with the added columns 'tema', 'trend'
        ('UP' / 'DOWN' / 'STABLE'), 'is_trend_change', 'is_significant_trend_change'
        and 'group_id'.
    """
    def _as_utc(value):
        # tz_localize raises on timestamps that already carry a timezone,
        # so aware inputs are converted instead of localized.
        stamp = pd.to_datetime(value)
        if stamp.tzinfo is None:
            return stamp.tz_localize('UTC')
        return stamp.tz_convert('UTC')

    # Load data from Feather file
    crypto_df = feather.read_feather(data_file_path)

    # Convert input dates to UTC timestamps
    # NOTE(review): assumes the 'date' column is timezone-aware UTC - confirm against the data file
    start_date = _as_utc(start_date)
    end_date = _as_utc(end_date)

    # Filter data between the start and end dates (both inclusive).
    # .copy() makes the result an independent frame, so the column assignments
    # below do not write into a slice of the frame that was read from disk.
    in_range = (crypto_df['date'] >= start_date) & (crypto_df['date'] <= end_date)
    crypto_df = crypto_df.loc[in_range].copy()

    # Calculate the Triple Exponential Moving Average (TEMA)
    crypto_df['tema'] = talib.TEMA(crypto_df['close'], timeperiod=tema_period)

    # Determine the trend direction (UP, DOWN, STABLE) from the one-row change in TEMA.
    # Rows where either TEMA value is NaN fail both comparisons and end up 'STABLE'.
    previous_tema = crypto_df['tema'].shift(1)
    crypto_df['trend'] = np.where(crypto_df['tema'] > previous_tema, 'UP',
                                  np.where(crypto_df['tema'] < previous_tema, 'DOWN', 'STABLE'))

    # Identify significant trend changes (ignoring transitions into 'STABLE')
    crypto_df['is_trend_change'] = crypto_df['trend'] != crypto_df['trend'].shift(1)
    crypto_df['is_significant_trend_change'] = crypto_df['is_trend_change'] & (crypto_df['trend'] != 'STABLE')

    # Assign a group ID to each continuous trend segment: the running count of
    # significant changes stays constant until the next significant change.
    crypto_df['group_id'] = crypto_df['is_significant_trend_change'].cumsum()

    return crypto_df

# Function to create blocks from the data
def create_blocks(crypto_df):
    """
    Build one Block per distinct 'group_id' value in the DataFrame.

    Each Block takes its start/end date and start/end price from the first and
    last row of its group ('date' and 'close' columns), its duration from the
    number of rows in the group, and keeps the group's rows as data_segment.
    Blocks are returned in the order groupby yields the groups.
    """
    def _segment_to_block(segment):
        # First and last row of the segment define the block's boundaries
        dates = segment['date']
        closes = segment['close']
        return Block(
            start_date=dates.iloc[0],
            end_date=dates.iloc[-1],
            start_price=closes.iloc[0],
            end_price=closes.iloc[-1],
            duration=len(segment),
            data_segment=segment,  # Store the entire segment of data
        )

    # The group key itself is not needed, only the rows of each group
    return [_segment_to_block(segment) for _, segment in crypto_df.groupby('group_id')]

# Example usage (for debugging purposes):
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_file_path = '/allah/freqtrade/user_data/data/binance/futures/ETH_USDT_USDT-3m-futures.feather'
start_date = '2023-01-01'
end_date = '2024-10-22'

# Load the data for the date range; this also adds the 'tema', 'trend' and 'group_id' columns
crypto_df = load_crypto_data(data_file_path, start_date, end_date)
# Compute the 14-period RSI of the close price on the whole frame, before the frame is
# split into blocks, so that every block's data_segment carries an 'rsi' column
crypto_df['rsi'] = talib.RSI(crypto_df['close'], timeperiod=14)
# Create blocks (one per 'group_id'); later cells read and modify this list
blocks = create_blocks(crypto_df)


In [None]:
import pandas as pd
import numpy as np

# Ordered (name, reducer) pairs for the summary statistics.
# The order matters: feature dicts are later flattened with dict.values(), so it
# defines the column order of the feature vectors.
_STAT_REDUCERS = (
    ('mean', np.mean),
    ('median', np.median),
    ('std', np.std),        # NumPy default ddof=0
    ('variance', np.var),   # NumPy default ddof=0
    ('skewness', lambda s: s.skew()),
    ('kurtosis', lambda s: s.kurtosis()),
    ('min', np.min),
    ('max', np.max),
)


def calculate_statistical_features(series):
    """
    Compute summary statistics of a pandas Series.

    Parameters:
    series (pd.Series): Values to summarise; must provide .skew() and .kurtosis().

    Returns:
    dict: Keys 'mean', 'median', 'std', 'variance', 'skewness', 'kurtosis',
        'min', 'max' (in that order) mapped to the corresponding statistic.
    """
    return {name: reducer(series) for name, reducer in _STAT_REDUCERS}


def extract_ts_features(block):
    """
    Build the feature dict of a block from its length and its RSI values.

    Parameters:
    block: Object with a `duration` attribute and a `data_segment` DataFrame
        that has an 'rsi' column.

    Returns:
    dict: 'length', then 'rsi_<stat>' for every statistic of
        calculate_statistical_features, then 'rsi_above_70' and 'rsi_below_30'.
        The key set is the same for every block: when the block has no non-NaN
        RSI value, the statistics are NaN and both counts are 0, so all blocks
        yield feature vectors of equal length.
    """
    features = {'length': block.duration}
    rsi_series = block.data_segment['rsi'].dropna()  # Directly use RSI from data segment

    if rsi_series.empty:
        # No usable RSI values: keep the schema and mark the statistics as missing
        # instead of omitting the keys (which would make feature vectors ragged).
        rsi_stat_features = {name: float('nan') for name, _ in _STAT_REDUCERS}
    else:
        rsi_stat_features = calculate_statistical_features(rsi_series)

    # Update features with RSI statistical metrics
    features.update({f'rsi_{k}': v for k, v in rsi_stat_features.items()})

    # Add RSI level counts (both are 0 for an empty series)
    features['rsi_above_70'] = np.sum(rsi_series > 70)
    features['rsi_below_30'] = np.sum(rsi_series < 30)

    return features

# Iterate over blocks and extract RSI features.
# tqdm wraps the list only to show a progress bar labelled with `desc`.
from tqdm import tqdm

for block in tqdm(blocks, desc="Extracting RSI Features"):
    # Overwrites the block's `features` attribute with the freshly computed dict
    block.features = extract_ts_features(block)


In [None]:
# Spot-check: display the feature dict of the 24th block from the end
blocks[-24].features

In [None]:
# import pandas as pd
# import mplfinance as mpf

# # Prepare the data in the correct format
# data_segment = blocks[-1].data_segment  # Assume this is the OHLCV data

# data_segment['date'] = pd.to_datetime(data_segment['date'])

# # Set the 'date' as the index
# data_segment.set_index('date', inplace=True)

# # Rename the columns to match mplfinance requirements
# data_segment.rename(columns={
#     'open': 'Open', 
#     'high': 'High', 
#     'low': 'Low', 
#     'close': 'Close', 
#     'volume': 'Volume'
# }, inplace=True)

# # Plot the OHLCV data as candlesticks
# mpf.plot(
#     data_segment, 
#     type='candle', 
#     volume=True,  # Include volume subplot
#     style='charles',  # Choose a style (optional)
#     title=f'Candlestick Chart from {data_segment.index.min().date()} to {data_segment.index.max().date()}',
#     ylabel='Price (USD)',
#     ylabel_lower='Volume',
#     figsize=(14, 8)
# )


In [None]:
import math

# Function to replace NaN with the last valid non-NaN value from previous blocks' features
def replace_nan_in_features_with_last_valid(blocks):
    """
    Forward-fill NaN feature values across consecutive blocks.

    Blocks are visited in order. A float NaN feature is replaced by the most
    recent non-NaN value seen for the same key in an earlier block, or by 0
    when no such value exists yet. Each block's `features` attribute is
    replaced with the filled dict; only keys already present in a block are
    touched.

    Returns the same `blocks` object that was passed in.
    """
    def _is_nan(candidate):
        # Only floats can be NaN here; other types are always treated as valid
        return isinstance(candidate, float) and math.isnan(candidate)

    carried = {}  # key -> latest non-NaN value seen so far

    for blk in blocks:
        patched = {}
        for name, raw in blk.features.items():
            if _is_nan(raw):
                # Fall back to the carried value (0 if the key was never valid before)
                patched[name] = carried.get(name, 0)
                continue
            # Valid value: remember it for later blocks and keep it as is
            carried[name] = raw
            patched[name] = raw

        # Swap in the filled dict for this block
        blk.features = patched

    return blocks

# Apply the function to replace NaNs in block features.
# The function updates the blocks in place and returns the same list.
blocks = replace_nan_in_features_with_last_valid(blocks)

# Display the result to verify NaNs are replaced in features only
for i, block in enumerate(blocks[:5]):  # Displaying only the first 5 blocks as an example
    print(f"Block {i} features:", block.features)


In [None]:
import pandas as pd

# Label every block by comparing the close at the block's last candle with the
# close 10 candles (30 minutes on the 3-minute data) later:
#   1    -> the later close is strictly higher
#   0    -> the later close is equal or lower
#   None -> one of the two timestamps is not present in crypto_df
LOOKAHEAD = pd.Timedelta(minutes=30)

# Index the close prices by timestamp once, so each block needs two lookups
# instead of two scans over the whole frame. If a timestamp occurs more than
# once, the first row wins.
close_by_date = crypto_df.drop_duplicates(subset='date').set_index('date')['close']

Y = []

for block in blocks:
    # Timestamp of the last candle in the current block
    timestamp = block.data_segment['date'].iloc[-1]

    # Series.get returns None when the timestamp is not in the index
    end_price = close_by_date.get(timestamp)
    close_price_after_10_value = close_by_date.get(timestamp + LOOKAHEAD)

    if end_price is None or close_price_after_10_value is None:
        # Keep a placeholder so Y stays index-aligned with blocks
        Y.append(None)
        continue

    # Store the looked-up close as the block's end price
    block.end_price = end_price
    Y.append(1 if close_price_after_10_value > end_price else 0)

# Now Y contains 1 or 0 based on price movement, or None if data is missing.

# Now Y contains 1 or 0 based on price movement, or None if data is missing.


In [None]:
# Display the label list (one entry per block: 1, 0 or None)
Y

In [None]:
# Define the length of each sequence
k = 10

# Generate X: X[i] holds the feature dicts of blocks i .. i+k-1, oldest first
X = [[blk.features for blk in blocks[i:i + k]] for i in range(len(blocks) - k + 1)]

# Align Y with X: Y_aligned[i] is the label of the last block of sequence X[i]
Y_aligned = Y[k - 1:]

# Print lengths to verify alignment
print("Length of X:", len(X))
print("Length of Y_aligned:", len(Y_aligned))

# Keep only the samples that actually have a label. Dropping just the final
# sample is not enough: any block whose look-ahead timestamp is missing is
# labelled None as well, and a single None turns the label array into an
# object array that cannot be used for training.
labelled = [(sequence, label) for sequence, label in zip(X, Y_aligned) if label is not None]
X_test = [sequence for sequence, _ in labelled]
Y_test = [label for _, label in labelled]

print("Length of X_test:", len(X_test))
print("Length of Y_test:", len(Y_test))


In [None]:
import numpy as np
import math

# Function to convert a dictionary to a feature vector
def dict_to_feature_vector(dictionary):
    """
    Flatten a feature dict into a list of its values, in the dict's own order.

    Float NaN values are replaced by 0; every other value is passed through
    unchanged.
    """
    vector = []
    for entry in dictionary.values():
        is_missing = isinstance(entry, float) and math.isnan(entry)
        vector.append(0 if is_missing else entry)
    return vector

# Convert each sequence (list of feature dicts) in X_test into a list of feature vectors
X_numeric = [
    [dict_to_feature_vector(block) for block in sequence] 
    for sequence in X_test
]

# Convert X_numeric to a numpy array for LSTM input
X_numeric = np.array(X_numeric)

# No reshape is performed here: the nested lists are expected to already form a
# (samples, time steps, features) array, which the printed shape lets you verify.
# NOTE(review): this requires every feature dict to have the same keys - confirm
print("Shape of X_numeric:", X_numeric.shape)  # Expected shape: (number of samples, time steps, number of features)

# Labels as an array, one entry per sequence in X_test
Y_numeric = np.array(Y_test)

# Print the shape of Y_numeric
print("Shape of Y_numeric:", Y_numeric.shape)

In [None]:
# Count how often each distinct label occurs (class balance check)
uniq = np.unique(Y_numeric, return_counts=True)
# zip(*uniq) pairs each unique value with its count
print("Unique values in Y_numeric:", dict(zip(*uniq)))

In [None]:
# Display the shape of the feature array
X_numeric.shape

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import TensorBoard
import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Assuming X_numeric and Y_numeric are already defined
# Example shape of X_numeric: (samples, time steps, features) -> (1321, 10, 11)

# Fraction of samples held out for validation. Keras takes them from the END
# of the arrays passed to fit(), before any shuffling.
VALIDATION_SPLIT = 0.2

n_samples, n_steps, n_features = X_numeric.shape

# Reshape X_numeric to 2D (one row per time step) for scaling
X_reshaped = X_numeric.reshape(-1, n_features)

# Standardize the data. The scaler is fitted on the training portion only
# (the leading samples), so the mean/variance of the validation samples do not
# leak into the preprocessing; the same transform is then applied to all rows.
n_train = int(n_samples * (1.0 - VALIDATION_SPLIT))
scaler = StandardScaler()
scaler.fit(X_reshaped[:n_train * n_steps])
X_scaled = scaler.transform(X_reshaped)

# Reshape back to original shape for LSTM input
X_scaled = X_scaled.reshape(n_samples, n_steps, n_features)

print("Shape of X:", X_scaled.shape)  # Expected shape: (samples, time steps, features)
print("Shape of Y:", Y_numeric.shape)  # Shape of output labels

# Build the LSTM model: the input shape is declared with an explicit Input
# layer instead of an input_shape argument on the first layer.
model = models.Sequential([
    layers.Input(shape=(n_steps, n_features)),
    # Single LSTM layer returning only its final output
    layers.LSTM(units=50, return_sequences=False),
    # Dropout layer
    layers.Dropout(0.5),
    # Output layer for binary classification
    layers.Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# TensorBoard callback for logging training metrics (one timestamped directory per run)
log_dir = "/allah/data/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train the model
history = model.fit(X_scaled, Y_numeric, epochs=200, batch_size=32,
                    validation_split=VALIDATION_SPLIT, callbacks=[tensorboard_callback])

# Plot training and validation curves, one figure per metric
for metric, label in (('loss', 'Loss'), ('accuracy', 'Accuracy')):
    plt.figure(figsize=(10, 5))
    plt.plot(history.history[metric], label=f'Training {label}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
    plt.xlabel('Epochs')
    plt.ylabel(label)
    plt.title(f'Training and Validation {label} Over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# Check for NaN and Inf values in X_scaled
print("NaN values in X_scaled:", np.isnan(X_scaled).sum())  # Should be 0
print("Inf values in X_scaled:", np.isinf(X_scaled).sum())  # Should be 0

# Check the labels that were fed to the model. np.isnan / np.isinf raise a
# TypeError on a list or object array that contains None, so missing labels
# are first mapped to NaN in a float array; they then show up in the NaN count.
Y_checked = np.array([np.nan if label is None else label for label in Y_numeric], dtype=float)
print("NaN values in Y:", np.isnan(Y_checked).sum())  # Should be 0
print("Inf values in Y:", np.isinf(Y_checked).sum())  # Should be 0

# Ensure Y is strictly binary (0 or 1)
print("Unique values in Y:", np.unique(Y_checked))  # Should only be [0, 1]


In [1]:
# Print the installed PyTorch version
import torch
print(torch.__version__)


2.5.0+cu124
