## Imports

In [2]:
# imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timezone

## Data Preprocessing

In [3]:
# Read the candle table and drop six columns before any further processing.
# NOTE(review): the CSV's saved row index survives as the 'Unnamed: 0' column
# (visible in the output below) and is not removed here; confirm whether it is
# meant to travel with the price/volume columns.
data = pd.read_csv('Data/df_all.csv').drop(
    columns=['Ignore', 'up_cross', 'down_cross', 'minutes', 'log_minutes', 'side']
)
data

Unnamed: 0,Open time,Open,High,Low,Close,Volume,Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume,plus_6,minus_6,zero_6
0,1598918400000,0.28115,0.28139,0.28071,0.28072,380328.3,1598918579999,106892.301589,273,218211.7,61330.310857,0,1,0
1,1598918580000,0.28072,0.28090,0.28003,0.28026,624736.7,1598918759999,175160.051227,390,304653.2,85409.151785,0,1,0
2,1598918760000,0.28026,0.28027,0.27900,0.27955,1356014.6,1598918939999,379265.408111,604,423652.9,118467.894514,0,1,0
3,1598918940000,0.27953,0.28009,0.27906,0.28000,537899.0,1598919119999,150457.534717,334,397486.4,111204.348733,0,1,0
4,1598919120000,0.28002,0.28010,0.27915,0.27917,660136.4,1598919299999,184575.997666,368,151908.0,42457.190472,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583676,1704066480000,0.61410,0.61460,0.61360,0.61400,615136.0,1704066659999,377857.088800,353,365727.0,224648.533900,0,0,1
583677,1704066660000,0.61400,0.61430,0.61380,0.61410,585937.0,1704066839999,359771.483300,246,279143.0,171394.086100,0,0,1
583678,1704066840000,0.61410,0.61490,0.61370,0.61480,801337.0,1704067019999,492268.260700,395,353674.0,217266.831900,0,0,1
583679,1704067020000,0.61480,0.61570,0.61470,0.61560,908084.0,1704067199999,558697.709000,527,577042.0,355006.723400,0,0,1


In [4]:
# Convert ISO 8601 date strings to Unix timestamp (milliseconds)
def iso_to_unix(iso_str):
    """Turn a ``YYYY-MM-DDTHH:MM:SSZ`` string into epoch milliseconds.

    Parameters
    ----------
    iso_str : str
        Timestamp text in exactly the ``%Y-%m-%dT%H:%M:%SZ`` layout.
        The value is interpreted as UTC.

    Returns
    -------
    int
        Milliseconds elapsed since 1970-01-01T00:00:00Z.

    Raises
    ------
    ValueError
        If ``iso_str`` does not match the expected layout.
    """
    parsed = datetime.strptime(iso_str, "%Y-%m-%dT%H:%M:%SZ")
    # strptime yields a naive datetime; pin it to UTC so timestamp() does not
    # fall back to the machine's local zone.
    epoch_seconds = parsed.replace(tzinfo=timezone.utc).timestamp()
    return int(epoch_seconds * 1000)

def unix_to_iso(unix_timestamp_ms):
    """Convert a Unix timestamp in milliseconds to an ISO 8601 UTC string.

    Parameters
    ----------
    unix_timestamp_ms : int or float
        Milliseconds since 1970-01-01T00:00:00Z.

    Returns
    -------
    str
        The instant formatted as ``YYYY-MM-DDTHH:MM:SSZ``; the sub-second
        part is not included in the output.
    """
    # Convert milliseconds to seconds
    unix_timestamp_s = unix_timestamp_ms / 1000
    # Build a timezone-aware UTC datetime. datetime.utcfromtimestamp() is
    # deprecated since Python 3.12 and returns a naive object; passing
    # tz=timezone.utc gives the same wall-clock fields without the warning.
    dt = datetime.fromtimestamp(unix_timestamp_s, tz=timezone.utc)
    # Format the datetime object as an ISO 8601 date string
    return dt.strftime("%Y-%m-%dT%H:%M:%SZ")

In [5]:
# Find the rows after which the 3-minute candle series is interrupted.
# An entry i in non_continuous_index means row i+1 does not open exactly
# 3 minutes after row i, i.e. row i is the last row before a gap.
expected_step_ms = 3 * 60 * 1000

# Work on a plain array so the comparison is positional; the chunk-splitting
# cell slices with .iloc, which is positional as well.
open_time_ms = data['Open time'].to_numpy()

# np.diff(a)[i] == a[i + 1] - a[i], so the flagged positions line up with the
# "row i vs. row i+1" comparison this cell is meant to perform.
non_continuous_index = np.flatnonzero(np.diff(open_time_ms) != expected_step_ms).tolist()

for i in non_continuous_index:
    print('wrong!', i)
    print(open_time_ms[i])

wrong! 43319
1606715820000
wrong! 53535
1608558300000
wrong! 55135
1608861420000
wrong! 78189
1613014740000
wrong! 89169
1614995820000
wrong! 110739
1618883820000
wrong! 113130
1619323200000
wrong! 165795
1628819820000
wrong! 188365
1632898620000
wrong! 448119
1679661540000


In [6]:
# Show the gap positions on one line and their count on the next.
print(non_continuous_index, len(non_continuous_index), sep='\n')

[43319, 53535, 55135, 78189, 89169, 110739, 113130, 165795, 188365, 448119]
10


In [7]:
# Split `data` into runs of consecutive rows, cutting right after every index
# stored in non_continuous_index.
# Each listed index is the last row of its chunk, so the following chunk
# begins one row later; the final chunk runs to the end of the frame.
chunk_starts = [0] + [gap + 1 for gap in non_continuous_index]
chunk_stops = [gap + 1 for gap in non_continuous_index] + [len(data)]

# Every chunk gets a fresh 0-based index so later positional logic can treat
# the chunks independently.
data_chunks = [
    data.iloc[first:stop].reset_index(drop=True)
    for first, stop in zip(chunk_starts, chunk_stops)
]

# Now data_chunks is a list of DataFrames, each representing a continuous chunk of time
# You can access individual chunks with data_chunks[0], data_chunks[1], etc.

In [8]:
# Define the window sizes for the moving averages
windows = [5, 10, 20, 30, 60, 120, 240]
# Define the window size and standard deviation multiplier for the Bollinger Bands
BB_window_size = 90
BB_std_multiplier = 1


def add_indicators(chunk, ma_windows, bb_window, bb_multiplier):
    """Return a new frame with moving-average and Bollinger-band columns.

    The input frame is left untouched; all new columns are attached to a copy.

    Parameters
    ----------
    chunk : pandas.DataFrame
        One continuous chunk; must contain 'Close', 'Open time' and
        'Close time' columns.
    ma_windows : iterable of int
        Window lengths; each adds a rolling mean of 'Close' named ``MA<w>``.
    bb_window : int
        Window length for the band centre (``MA<bb_window>``) and the
        rolling standard deviation.
    bb_multiplier : float
        Number of rolling standard deviations between the centre and each band.

    Returns
    -------
    pandas.DataFrame
        The chunk plus the indicator columns, with rows containing any NaN
        removed, the index reset, and the two timestamp columns dropped.

    Raises
    ------
    KeyError
        If 'Close', 'Open time' or 'Close time' is missing, which is what
        happens when an already-processed chunk is passed in again.
    """
    close = chunk['Close']

    # ADDING MA
    new_columns = {f'MA{w}': close.rolling(w).mean() for w in ma_windows}

    # ADDING BB -- one rolling object serves both the mean and the std
    bb_roll = close.rolling(bb_window)
    bb_mean = bb_roll.mean()
    bb_offset = bb_multiplier * bb_roll.std()
    new_columns[f'MA{bb_window}'] = bb_mean
    new_columns[f'Upper_Band{bb_window}'] = bb_mean + bb_offset
    new_columns[f'Lower_Band{bb_window}'] = bb_mean - bb_offset

    return (
        chunk.assign(**new_columns)
        # rolling() leaves NaN in the first (window - 1) rows; drop them
        .dropna()
        .reset_index(drop=True)
        .drop(columns=['Open time', 'Close time'])
    )


# Rebind data_chunks only after every chunk has been processed. If this cell is
# re-run on its own output, the KeyError is raised before the assignment, so the
# existing data_chunks list stays exactly as it was (re-run the chunk-splitting
# cell first to start over).
data_chunks = [
    add_indicators(chunk, windows, BB_window_size, BB_std_multiplier)
    for chunk in data_chunks
]

# Now each chunk in data_chunks has new columns for the moving averages


# # If you prefer to have separate variables for each chunk, you could do something like:
# for i, chunk in enumerate(data_chunks):
#     globals()[f'data_chunk_{i}'] = chunk

In [9]:
# Number of continuous chunks; splitting after each detected gap yields one more
# chunk than there are gaps.
len(data_chunks)

11

In [None]:
def build_windows(chunks, window_size, label_columns=('plus_6', 'minus_6', 'zero_6')):
    """Build sliding-window feature tensors and their labels from chunks.

    For every chunk with at least ``window_size`` rows, each run of
    ``window_size`` consecutive rows becomes one sample. Its features are all
    non-label columns of those rows; its label is the label columns of the
    LAST row in the window. Windows never span two chunks.

    Parameters
    ----------
    chunks : iterable of pandas.DataFrame
        Frames that all contain ``label_columns``.
    window_size : int
        Number of consecutive rows per sample.
    label_columns : sequence of str, optional
        Columns that form the label and are excluded from the features.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Features of shape ``(n_samples, window_size, n_features)`` and labels
        of shape ``(n_samples, len(label_columns))``. Both are empty 1-D
        arrays when no chunk is long enough.
    """
    label_columns = list(label_columns)
    matrix_parts = []
    answer_parts = []

    for chunk in chunks:
        n_windows = len(chunk) - window_size + 1
        if n_windows <= 0:
            continue  # chunk too short to hold a single window

        # Convert each chunk to arrays once. Dropping columns or building a row
        # Series per window would copy the whole chunk on every iteration.
        features = chunk.drop(columns=label_columns).values
        # Labels are emitted as floats, which is what reading them out of a
        # mixed int/float row yields.
        labels = chunk[label_columns].to_numpy(dtype=float)

        # row_idx[i] == [i, i + 1, ..., i + window_size - 1]
        row_idx = np.arange(n_windows)[:, None] + np.arange(window_size)
        matrix_parts.append(features[row_idx])
        # Window i ends at row i + window_size - 1.
        answer_parts.append(labels[window_size - 1:])

    if not matrix_parts:
        return np.array([]), np.array([])
    return np.concatenate(matrix_parts, axis=0), np.concatenate(answer_parts, axis=0)


# 20-row windows; other window lengths can be built by changing the second
# argument.
matrix_array_20, answer_array_20 = build_windows(data_chunks, 20)

## to 03-1.py

## back to matrix_array_20

In [17]:
# Load the saved 20-row window arrays from disk.
# NOTE(review): presumably these files were written by the external 03-1.py step
# named in the heading above -- confirm their provenance.
matrix_array_20 = np.load('Data/matrix_array_20.npy')
answer_array_20 = np.load('Data/answer_array_20.npy')

# List of file names to load
# file_names_matrix = [f"Data/matrix_array_20_{i}.npy" for i in range(11)]  # Adjust range as needed
# # file_names_answer = [f"Data/answer_array_20_{i}.npy" for i in range(11)]

# # Load each file and store in a list
# loaded_arrays_matrix = [np.load(file_name) for file_name in file_names_matrix]
# # loaded_arrays_answer = [np.load(file_name) for file_name in file_names_answer]

# # Concatenate all arrays into a single array
# matrix_array_20 = np.concatenate(loaded_arrays_matrix, axis=0)
# # answer_array_20 = np.concatenate(loaded_arrays_answer, axis=0)

In [18]:
# Display the shape of the loaded 20-row feature array.
# NOTE(review): the last axis is 19 in the recorded output, while the in-notebook
# pipeline appears to leave 20 non-label columns ('Unnamed: 0' included) --
# verify the saved file was built from the same column set.
# matrix_array_20.shape, answer_array_20.shape
matrix_array_20.shape

(580843, 20, 19)

## back to matrix_array_40

In [15]:
# Load the saved 40-row window arrays (features and labels) from disk.
matrix_array_40 = np.load('Data/matrix_array_40.npy')
answer_array_40 = np.load('Data/answer_array_40.npy')

# List of file names to load
# file_names_matrix = [f"Data/matrix_array_40_{i}.npy" for i in range(11)]  # Adjust range as needed
# file_names_answer = [f"Data/answer_array_40_{i}.npy" for i in range(11)]

# # Load each file and store in a list
# loaded_arrays_matrix = [np.load(file_name) for file_name in file_names_matrix]
# loaded_arrays_answer = [np.load(file_name) for file_name in file_names_answer]

# # Concatenate all arrays into a single array
# matrix_array_40 = np.concatenate(loaded_arrays_matrix, axis=0)
# answer_array_40 = np.concatenate(loaded_arrays_answer, axis=0)

# np.save('Data/matrix_array_40.npy', matrix_array_40)
# np.save('Data/answer_array_40.npy', answer_array_40)

In [16]:
# Display both shapes; the leading (sample) dimension should match between
# features and labels.
matrix_array_40.shape, answer_array_40.shape

((580623, 40, 19), (580623, 3))

## back to matrix_array_60

In [13]:
# Load the saved 60-row window arrays (features and labels) from disk.
matrix_array_60 = np.load('Data/matrix_array_60.npy')
answer_array_60 = np.load('Data/answer_array_60.npy')

In [14]:
# Display both shapes; the leading (sample) dimension should match between
# features and labels.
matrix_array_60.shape, answer_array_60.shape

((580403, 60, 19), (580403, 3))

## back to matrix_array_80

In [11]:
# Load the saved 80-row window arrays (features and labels) from disk.
matrix_array_80 = np.load('Data/matrix_array_80.npy')
answer_array_80 = np.load('Data/answer_array_80.npy')

In [12]:
# Display both shapes; the leading (sample) dimension should match between
# features and labels.
matrix_array_80.shape, answer_array_80.shape

((580183, 80, 19), (580183, 3))