In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, accuracy_score
from sklearn.model_selection import train_test_split

In [20]:
# Load the raw Binance BTC/USD table; its data columns are unnamed and
# therefore appear under the string labels "0" .. "11".
df = pd.read_csv("binance_btcusd.csv")

In [21]:
# Preview the first rows to inspect the column layout.
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,1596229260000,11322.05,11323.02,11294.0,11297.15,75.471279,1596229319999,853448.2,1200,24.872009,281269.304392,0
1,1,1596229320000,11298.56,11332.02,11297.07,11330.86,57.373999,1596229379999,649222.4,1486,36.822923,416647.500196,0
2,2,1596229380000,11331.09,11350.06,11330.86,11347.44,76.438368,1596229439999,867046.2,1114,59.610735,676161.703455,0
3,3,1596229440000,11347.44,11365.28,11347.44,11363.27,90.757347,1596229499999,1030781.0,1130,52.535249,596693.051054,0
4,4,1596229500000,11363.46,11365.52,11363.27,11364.08,21.991909,1596229559999,249915.3,332,4.848286,55096.55646,0


In [22]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269176 entries, 0 to 269175
Data columns (total 13 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  269176 non-null  int64  
 1   0           269176 non-null  int64  
 2   1           269176 non-null  float64
 3   2           269176 non-null  float64
 4   3           269176 non-null  float64
 5   4           269176 non-null  float64
 6   5           269176 non-null  float64
 7   6           269176 non-null  int64  
 8   7           269176 non-null  float64
 9   8           269176 non-null  int64  
 10  9           269176 non-null  float64
 11  10          269176 non-null  float64
 12  11          269176 non-null  int64  
dtypes: float64(8), int64(5)
memory usage: 26.7 MB


In [23]:
# Add month, day, hour and minute columns derived from the millisecond epoch
# timestamp stored in column "0". The column is parsed once and reused,
# rather than being re-parsed for each of the four features.
timestamps = pd.to_datetime(df["0"], unit="ms")
df["month"] = timestamps.dt.month
df["day"] = timestamps.dt.day
df["hour"] = timestamps.dt.hour
df["minute"] = timestamps.dt.minute

In [24]:
# Remove the columns that are not used as model inputs: "Unnamed: 0"
# (presumably a saved row index), the raw millisecond timestamp "0", and
# columns "6" and "11".
unused_columns = ["Unnamed: 0", "0", "6", "11"]
df = df.drop(unused_columns, axis="columns")

In [25]:
# Add the boolean "target" column: True when column "2" exceeds column "1"
# by at least 0.5 % within the same row, i.e. the price rose by 0.5 % within
# the minute (presumably high vs. open price - confirm against the data source).
relative_rise = df["2"].sub(df["1"]).div(df["1"])
df["target"] = relative_rise.ge(0.005)

In [26]:
# Separate the label column from the feature table.
y = df["target"]
X = df.drop("target", axis="columns")

In [27]:
# Save the unscaled features so the final script can fit its scaler on them.
X.to_csv("df_to_fit_scaler.csv", index = False)

In [10]:
# Rescale every feature into the (0, 1) range.
# NOTE(review): the scaler is fitted on the full table, before the
# train/validation/test split, so held-out rows influence the scaling.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values)

In [11]:
# Helper that turns the flat feature table into fixed-length windows
# suitable for training sequence models.
def create_dataset(dataframe_X, dataframe_y, look_back):
    """Build sliding-window samples and the label that follows each window.

    Sample ``k`` contains rows ``k .. k + look_back - 1`` of ``dataframe_X``
    and is labelled with element ``k + look_back`` of ``dataframe_y``, i.e.
    the row immediately after the window.

    Both inputs are addressed strictly by position, so they do not need to
    carry a default ``RangeIndex``.

    Parameters
    ----------
    dataframe_X : pandas.DataFrame or 2-D array-like
        Feature table of shape ``(n_rows, n_features)``.
    dataframe_y : pandas.Series or 1-D array-like
        Labels, one per row of ``dataframe_X``.
    look_back : int
        Number of consecutive rows in each window; must be non-negative.

    Returns
    -------
    data : numpy.ndarray
        Shape ``(max(n_rows - look_back, 0), look_back, n_features)``.
    labels : numpy.ndarray
        One label per window.

    Raises
    ------
    ValueError
        If ``look_back`` is negative or the inputs differ in length.
    """
    if look_back < 0:
        raise ValueError("look_back must be non-negative")

    # Convert once to NumPy: positional slicing on arrays is much cheaper
    # than a per-window ``DataFrame.iloc`` call and ignores index labels.
    features = np.asarray(dataframe_X)
    targets = np.asarray(dataframe_y)
    n_rows = features.shape[0]
    if targets.shape[0] != n_rows:
        raise ValueError("dataframe_X and dataframe_y must have the same length")

    n_windows = max(n_rows - look_back, 0)
    if n_windows == 0:
        # Keep the window and feature axes even when no window fits.
        empty_shape = (0, look_back) + features.shape[1:]
        return np.empty(empty_shape, dtype=features.dtype), targets[:0].copy()

    data = np.stack(
        [features[start:start + look_back] for start in range(n_windows)]
    )
    # Copy so the returned labels do not alias the caller's data.
    labels = targets[look_back:n_rows].copy()
    return data, labels

In [12]:
# Build 15-step windows; X and y are rebound from tabular data to NumPy arrays.
X, y = create_dataset(X, y, look_back=15)

In [13]:
# Sanity check: shape of the window array and number of labels.
print(X.shape, y.shape)

(269161, 15, 13) (269161,)


In [14]:
# Split into training, validation and test sets. The split is stratified
# because the classes are imbalanced. 70 % of the samples go to training; the
# remaining 30 % is divided 70/30 into validation and test.
# random_state is pinned so the split, and every metric computed from it, is
# reproducible across kernel restarts.
# NOTE(review): consecutive windows overlap, so a shuffled split places
# near-identical samples in different subsets; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test, y_test, train_size=0.7, stratify=y_test, random_state=42
)

In [15]:
# Check the sizes: for each subset print the feature shape, the label shape
# and the number of positive labels.
for subset_X, subset_y in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test)):
    print(subset_X.shape, subset_y.shape, subset_y.sum())

(188412, 15, 13) (188412,) 1213
(56524, 15, 13) (56524,) 364
(24225, 15, 13) (24225,) 156


In [16]:
# Cast the feature arrays to the required dtype (float32).
X_train, X_valid, X_test = (
    subset.astype("float32") for subset in (X_train, X_valid, X_test)
)

In [17]:
# Define the model and train it.
model = keras.Sequential([
    # Declare the input explicitly (15 time steps x 13 features). An
    # `input_shape` argument on a layer that is not first in the stack has no
    # effect on how the Sequential model is built.
    keras.Input(shape=(15, 13)),
    layers.BatchNormalization(),
    layers.LSTM(64, activation="relu"),
    layers.Dropout(rate=0.3),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.3),
    layers.Dense(1, activation="sigmoid")
])

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    # Name the metric explicitly so the logged key is always "val_precision";
    # auto-generated names can become "precision_1", "precision_2", ... when
    # the cell is re-run in the same kernel, which would break the checkpoint
    # monitor below.
    metrics=[keras.metrics.Precision(name="precision"), "binary_accuracy"]
)
filepath = "best_model_v2.h5"
# Precision is a higher-is-better metric, so the direction must be given
# explicitly: with the default mode="auto" the callback can treat the
# monitored value as something to minimise and keep the worst epoch.
callback = ModelCheckpoint(
    filepath, monitor="val_precision", mode="max", save_best_only=True
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=64, epochs=3,
    callbacks=[callback],
    shuffle=True
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [18]:
# Check precision (and accuracy) on the test set. Sigmoid outputs of at least
# 0.5 are treated as the positive class.
# NOTE(review): this scores the in-memory `model` as left by the last epoch,
# not the checkpoint written to "best_model_v2.h5".
pred = model.predict(X_test)
res = (pred[:, 0] >= 0.5).tolist()
print(precision_score(y_test, res))
print(accuracy_score(y_test, res))

0.3888888888888889
0.9928998968008256


In [19]:
#сохраняем лучшую модель
#model.save("57_test_precision.h5")

In [None]:
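#So, out of 100 trade entries, 57 would be profitable. NOTE: the 57 % figure presumably comes from an earlier run; the test precision printed above is about 0.39, i.e. roughly 39 profitable entries out of 100.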