## Исследование влияния усечения временного ряда с конца на качество прогнозирования моделей RNN и LSTM

В частности, нужно проверить, что ошибка на валидационных данных будет меньше, чем на тестовых, и при другом разбиении данных на части

### Подключение библиотек

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import Adam

import sys
import os
sys.path.append(os.path.abspath('..'))
from helpful_functions import train_val_test_split, \
                              create_sequences,\
                              denormalize, \
                              train_and_evaluate_model          

### Подготовка данных

In [12]:
# Load the preprocessed dataset and order it by its timestamp index.
file_path = '../dataset/preprocessed_data.csv'
df = pd.read_csv(file_path)

df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)
df.sort_index(inplace=True)

target_feature = 'Global_active_power'
test_ratio = 0.10
val_ratio = 0.10

# Number of rows removed from the end of the (time-sorted) series before
# splitting. This is the truncation amount studied in this notebook.
rows_dropped_from_end = 400_000

# Slice by an explicit end position instead of `iloc[:-n]`: with n == 0 the
# negative-stop form would return an empty frame rather than the full one.
# The clamp keeps the "n >= len(df) -> empty frame" behaviour of `iloc[:-n]`.
df = df.iloc[:max(len(df) - rows_dropped_from_end, 0)]

train_data, val_data, test_data = train_val_test_split(df, val_ratio, test_ratio)

# Fit the scaler on the training split only and reuse the fitted scaler for
# the validation and test splits.
scaler = MinMaxScaler(feature_range=(0, 1))
train_scaled = scaler.fit_transform(train_data)
val_scaled = scaler.transform(val_data)
test_scaled = scaler.transform(test_data)

# Positional index of the target column among the frame's columns.
target_idx = df.columns.get_loc(target_feature)

# Number of columns in the training split; used as the model's input width.
features_count = train_data.shape[1]

### Обучение модели

In [None]:
# Hyperparameters for this run.
lr = 0.01
window_size = 30

# Build (X, y) windows for the train, validation and test splits, in that order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    create_sequences(split, target_idx, window_size)
    for split in (train_scaled, val_scaled, test_scaled)
]

# Single LSTM layer followed by a one-unit dense output.
lstm_layers = [
    Input(shape=(window_size, features_count)),
    LSTM(units=50, activation='tanh'),
    Dense(units=1),
]
lstm_model = Sequential(lstm_layers)

# Train the model and collect its metrics via the shared project helper.
datasets = (X_train, y_train, X_val, y_val, X_test, y_test)
result_lstm = train_and_evaluate_model(
    lstm_model,
    'LSTM',
    *datasets,
    lr,
    window_size,
    scaler,
    target_feature,
    df.columns,
)

LSTM | val_loss: 0.0003 | MSE_val: 0.0422


### Вывод ошибок

In [15]:
# Collect the run results into a one-row-per-run table.
results_df = pd.DataFrame([result_lstm])

# Keep only LSTM runs and pick the one with the smallest 'mse_val_dn'.
is_lstm = results_df['model_type'] == 'LSTM'
lstm_runs = results_df[is_lstm]
best_lstm = lstm_runs.nsmallest(1, 'mse_val_dn').iloc[0]

# Fields shown in the summary printout.
report_columns = [
    'model_type',
    'learning_rate',
    'window_size',
    'mse_val_dn',
    'mse_test_dn',
]

print("\nЛучшая LSTM (по валидации):")
print(best_lstm[report_columns])


Лучшая LSTM (по валидации):
model_type           LSTM
learning_rate        0.01
window_size            30
mse_val_dn       0.042153
mse_test_dn      0.052729
Name: 0, dtype: object
