## Imports

In [1]:
# imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timezone

## Data Preprocessing

In [2]:
data = pd.read_csv('Data/df_all.csv')
data = data.drop(['Ignore','up_cross','down_cross','minutes','log_minutes','side'], axis=1)
data

Unnamed: 0,Open time,Open,High,Low,Close,Volume,Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume,plus_6,minus_6,zero_6
0,1598918400000,11649.51,11668.50,11649.50,11667.12,165.713325,1598918579999,1.932464e+06,2146,83.409576,9.725705e+05,0,1,0
1,1598918580000,11667.12,11667.72,11634.06,11645.19,248.783097,1598918759999,2.897763e+06,3230,92.608664,1.078562e+06,0,1,0
2,1598918760000,11645.51,11649.88,11629.00,11639.12,217.735241,1598918939999,2.534315e+06,5186,93.190672,1.084609e+06,0,1,0
3,1598918940000,11639.12,11642.21,11630.21,11639.01,169.768550,1598919119999,1.975641e+06,3282,56.131767,6.532171e+05,0,1,0
4,1598919120000,11639.00,11640.01,11627.22,11634.16,130.973394,1598919299999,1.523519e+06,2300,45.500733,5.292799e+05,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631196,1712620080000,71636.00,71683.82,71635.99,71676.45,22.694870,1712620259999,1.626456e+06,2027,10.817960,7.752474e+05,0,0,1
631197,1712620260000,71676.45,71676.46,71633.41,71636.83,15.908180,1712620439999,1.139815e+06,1209,5.646140,4.045180e+05,0,0,1
631198,1712620440000,71636.84,71636.84,71617.17,71621.38,24.054500,1712620619999,1.722863e+06,1326,7.466530,5.347731e+05,0,0,1
631199,1712620620000,71621.38,71662.94,71620.00,71620.00,42.037670,1712620799999,3.011572e+06,1624,17.243720,1.235209e+06,0,0,1


In [3]:
# Convert ISO 8601 date strings to Unix timestamp (milliseconds)
def iso_to_unix(iso_str):
    dt = datetime.strptime(iso_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    return int(dt.timestamp() * 1000)  # Convert to milliseconds

def unix_to_iso(unix_timestamp_ms):
    # Convert milliseconds to seconds
    unix_timestamp_s = unix_timestamp_ms / 1000
    # Create a datetime object from the Unix timestamp
    dt = datetime.utcfromtimestamp(unix_timestamp_s)
    # Format the datetime object as an ISO 8601 date string
    iso_str = dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    return iso_str

In [4]:
# looking for not continuous points
non_continuous_index =[]

for i in range(len(data)-1):
    if data['Open time'][i+1] - data['Open time'][i] != 3*60*1000:
        print('wrong!', i)
        print(data['Open time'][i])
        non_continuous_index.append(i)

wrong! 43319
1606715820000
wrong! 53535
1608558300000
wrong! 55135
1608861420000
wrong! 78189
1613014740000
wrong! 89169
1614995820000
wrong! 110739
1618883820000
wrong! 113130
1619323200000
wrong! 165795
1628819820000
wrong! 188365
1632898620000
wrong! 448119
1679661540000


In [5]:
print(non_continuous_index)
print(len(non_continuous_index))

[43319, 53535, 55135, 78189, 89169, 110739, 113130, 165795, 188365, 448119]
10


In [6]:
# Initialize an empty list to hold your chunks of data
data_chunks = []

# Set the start index for the first chunk
start_idx = 0

# Iterate through the non-continuous indices and split the data into chunks
for idx in non_continuous_index:
    # Create a chunk from start_idx to the non-continuous index
    chunk = data.iloc[start_idx:idx+1]
    # Append the chunk to your list of data chunks
    data_chunks.append(chunk.reset_index(drop=True))
    # Update start_idx for the next chunk
    start_idx = idx + 1

# Don't forget to grab the last chunk of data after the last non-continuous index
final_chunk = data.iloc[start_idx:]
data_chunks.append(final_chunk.reset_index(drop=True))

# Now data_chunks is a list of DataFrames, each representing a continuous chunk of time
# You can access individual chunks with data_chunks[0], data_chunks[1], etc.

In [7]:
# Define the window sizes for the moving averages
windows = [5, 10, 20, 30, 60, 120, 240]
# Define the window size and standard deviation multiplier for the Bollinger Bands
BB_window_size = 90
BB_std_multiplier = 1

# Iterate through each chunk in data_chunks
for i, chunk in enumerate(data_chunks):
    # ADDING MA
    for window in windows:
        # Calculate the moving average
        moving_avg = chunk['Close'].rolling(window).mean()
        # Add the moving average as a new column to the chunk
        chunk[f'MA{window}'] = moving_avg

    # Calculate the moving average and standard deviation
    BB_moving_avg = chunk['Close'].rolling(BB_window_size).mean()
    BB_std_dev = chunk['Close'].rolling(BB_window_size).std()

    # ADDING BB
    # Calculate the Bollinger Bands
    BB_upper_band = BB_moving_avg + (BB_std_multiplier * BB_std_dev)
    BB_lower_band = BB_moving_avg - (BB_std_multiplier * BB_std_dev)

    # Add the Bollinger Bands and moving average as new columns to the chunk
    chunk[f'MA{BB_window_size}'] = BB_moving_avg
    chunk[f'Upper_Band{BB_window_size}'] = BB_upper_band
    chunk[f'Lower_Band{BB_window_size}'] = BB_lower_band

    # drop NaN values
    chunk = chunk.dropna().reset_index(drop=True)
    chunk = chunk.drop(columns = ['Open time', 'Close time',], axis=1)
    # Optionally, update the chunk in data_chunks (if you want to keep the changes)
    data_chunks[i] = chunk

# Now each chunk in data_chunks has new columns for the moving averages


# # If you prefer to have separate variables for each chunk, you could do something like:
# for i, chunk in enumerate(data_chunks):
#     globals()[f'data_chunk_{i}'] = chunk

In [8]:
len(data_chunks)

11

In [None]:
import numpy as np

# Initialize empty lists to hold the data
matrix_list = []
answer_list = []

for chunk in data_chunks:
    if len(chunk)>=20:
        for i in range(len(chunk) - 19):  # Ensure there are 20 rows available
            # Create a matrix of 20 rows
            matrix = chunk.drop(columns= ['plus_6','minus_6','zero_6'], axis=1).iloc[i:i+20].values
            matrix_list.append(matrix)

            # Get the up, down, zero values and convert them to a list
            answer = chunk.iloc[i+19][['plus_6', 'minus_6', 'zero_6']].tolist()
            answer_list.append(answer)

# Convert the lists to numpy arrays
matrix_array_20 = np.array(matrix_list)
answer_array_20 = np.array(answer_list)

## to 03-1.py

## back to maxtrix_array_20

In [3]:
# matrix_array_20 = np.load('Data/matrix_array_20.npy')
# answer_array_20 = np.load('Data/answer_array_20.npy')

# List of file names to load
file_names_matrix = [f"Data/matrix_array_20_{i}.npy" for i in range(11)]  # Adjust range as needed
# file_names_answer = [f"Data/answer_array_20_{i}.npy" for i in range(11)]

# Load each file and store in a list
loaded_arrays_matrix = [np.load(file_name) for file_name in file_names_matrix]
# loaded_arrays_answer = [np.load(file_name) for file_name in file_names_answer]

# Concatenate all arrays into a single array
matrix_array_20 = np.concatenate(loaded_arrays_matrix, axis=0)
# answer_array_20 = np.concatenate(loaded_arrays_answer, axis=0)

In [5]:
# matrix_array_20.shape, answer_array_20.shape
matrix_array_20.shape

(587563, 20, 19)

In [6]:
# np.save('Data/matrix_array_20.npy', matrix_array_20)

In [9]:
from sklearn.preprocessing import StandardScaler

# Assuming data is your 600k matrices concatenated into a single 3D numpy array of shape (600000, 20, 19)
matrix_array_20_reshaped = matrix_array_20.reshape(-1, 19)  # Reshape to 2D for standardization
scaler = StandardScaler()
matrix_array_20_normalized = scaler.fit_transform(matrix_array_20_reshaped)

# Reshape back to 3D
matrix_array_20_normalized = matrix_array_20_normalized.reshape(-1, 20, 19)

# save that scaler
import joblib

joblib.dump(scaler, 'Scalers/StandardScaler_20.pkl')

['Scalers/StandardScaler_20.pkl']

In [10]:
np.save('Data/matrix_array_20_normalized.npy', matrix_array_20_normalized)

## back to maxtrix_array_40

In [2]:
matrix_array_40 = np.load('Data/matrix_array_40.npy')
answer_array_40 = np.load('Data/answer_array_40.npy')

# List of file names to load
# file_names_matrix = [f"Data/matrix_array_40_{i}.npy" for i in range(11)]  # Adjust range as needed
# file_names_answer = [f"Data/answer_array_40_{i}.npy" for i in range(11)]

# # Load each file and store in a list
# loaded_arrays_matrix = [np.load(file_name) for file_name in file_names_matrix]
# loaded_arrays_answer = [np.load(file_name) for file_name in file_names_answer]

# # Concatenate all arrays into a single array
# matrix_array_40 = np.concatenate(loaded_arrays_matrix, axis=0)
# answer_array_40 = np.concatenate(loaded_arrays_answer, axis=0)

# np.save('Data/matrix_array_40.npy', matrix_array_40)
# np.save('Data/answer_array_40.npy', answer_array_40)

In [3]:
matrix_array_40.shape, answer_array_40.shape

((587343, 40, 19), (587343, 3))

In [4]:
from sklearn.preprocessing import StandardScaler

# Assuming data is your 600k matrices concatenated into a single 3D numpy array of shape (600000, 20, 19)
matrix_array_40_reshaped = matrix_array_40.reshape(-1, 19)  # Reshape to 2D for standardization
scaler = StandardScaler()
matrix_array_40_normalized = scaler.fit_transform(matrix_array_40_reshaped)

# Reshape back to 3D
matrix_array_40_normalized = matrix_array_40_normalized.reshape(-1, 40, 19)


In [5]:
# save that scaler
import joblib

joblib.dump(scaler, 'Scalers/StandardScaler_40.pkl')

np.save('Data/matrix_array_40_normalized.npy', matrix_array_40_normalized)

## back to maxtrix_array_60

In [3]:
matrix_array_60 = np.load('Data/matrix_array_60.npy')
answer_array_60 = np.load('Data/answer_array_60.npy')

In [4]:
matrix_array_60.shape, answer_array_60.shape

((587123, 60, 19), (587123, 3))

In [5]:
from sklearn.preprocessing import StandardScaler

# Assuming data is your 600k matrices concatenated into a single 3D numpy array of shape (600000, 20, 19)
matrix_array_60_reshaped = matrix_array_60.reshape(-1, 19)  # Reshape to 2D for standardization
scaler = StandardScaler()
matrix_array_60_normalized = scaler.fit_transform(matrix_array_60_reshaped)

# Reshape back to 3D
matrix_array_60_normalized = matrix_array_60_normalized.reshape(-1, 60, 19)


In [6]:
# save that scaler
import joblib

joblib.dump(scaler, 'Scalers/StandardScaler_60.pkl')

np.save('Data/matrix_array_60_normalized.npy', matrix_array_60_normalized)

## back to maxtrix_array_80

In [9]:
matrix_array_80 = np.load('Data/matrix_array_80.npy')
answer_array_80 = np.load('Data/answer_array_80.npy')

In [10]:
matrix_array_80.shape, answer_array_80.shape

((586903, 80, 19), (586903, 3))

In [12]:
from sklearn.preprocessing import StandardScaler

# Assuming data is your 600k matrices concatenated into a single 3D numpy array of shape (600000, 20, 19)
matrix_array_80_reshaped = matrix_array_80.reshape(-1, 19)  # Reshape to 2D for standardization
scaler = StandardScaler()
matrix_array_80_normalized = scaler.fit_transform(matrix_array_80_reshaped)

# Reshape back to 3D
matrix_array_80_normalized = matrix_array_80_normalized.reshape(-1, 80, 19)


In [13]:
# save that scaler
import joblib

joblib.dump(scaler, 'Scalers/StandardScaler_80.pkl')

np.save('Data/matrix_array_80_normalized.npy', matrix_array_80_normalized)