In [107]:
import sqlite3
import pandas as pd
import time
from tensorflow.keras.models import load_model
import numpy as np

# Connect to the database
db_path = "../prisma/dev.db"
conn = sqlite3.connect(db_path)

library_name = "Bibliotheks­bereich A3"
last_chunk = None

# Load pre-trained RNN model
# model = load_model("path_to_your_model.h5")

In [108]:
def fetch_latest_data():
    global last_chunk
    query = f"""
        SELECT year, month, day, chunk, percentage
        FROM BibData
        WHERE name = '{library_name}' 
    """
    df = pd.read_sql(query, conn)
    if not df.empty:
        last_chunk = df['chunk'].iloc[-1]  # Update to latest chunk
    return df

def preprocess_data(df):
    return np.array(df['percentage']).reshape(-1, 1)  # Shape for RNN input

# IDEA: TAKE DIRECTLY ONLY THE DATA FROM THE LAST 14 DAYS OR SO
### storage optimization, time efficiency purposes???
### model could (theoretically) still be good with a dataset of 14 days

In [109]:
df = fetch_latest_data()
df

Unnamed: 0,year,month,day,chunk,percentage
0,2024,6,2,109,100
1,2024,6,2,109,100
2,2024,6,2,110,100
3,2024,6,2,110,100
4,2024,6,2,111,100
...,...,...,...,...,...
27532,2024,12,11,96,100
27533,2024,12,11,97,100
27534,2024,12,11,98,100
27535,2024,12,11,99,98


In [110]:
import pandas as pd
from datetime import timedelta

# Sort the DataFrame chronologically
df = df.sort_values(by=['year', 'month', 'day', 'chunk']).reset_index(drop=True)

# Combine year, month, day into a datetime column
df['DateTime'] = pd.to_datetime(df[['year', 'month', 'day']])

# Add the chunk offset (10 minutes per chunk)
df['DateTime'] += pd.to_timedelta(df['chunk'] * 10, unit='m')

# Drop redundant columns if needed
df = df.drop(columns=['year', 'month', 'day', 'chunk'])

# Display the updated DataFrame
df


Unnamed: 0,percentage,DateTime
0,28,2024-06-02 00:00:00
1,25,2024-06-02 00:10:00
2,25,2024-06-02 00:20:00
3,24,2024-06-02 00:30:00
4,23,2024-06-02 00:40:00
...,...,...
27532,100,2024-12-11 16:00:00
27533,100,2024-12-11 16:10:00
27534,100,2024-12-11 16:20:00
27535,98,2024-12-11 16:30:00


In [111]:
# Normalize percentage values with min max scaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df['Percentage'] = scaler.fit_transform(df['percentage'].values.reshape(-1, 1))
df.drop(columns=['percentage'], inplace=True)
df

Unnamed: 0,DateTime,Percentage
0,2024-06-02 00:00:00,0.28
1,2024-06-02 00:10:00,0.25
2,2024-06-02 00:20:00,0.25
3,2024-06-02 00:30:00,0.24
4,2024-06-02 00:40:00,0.23
...,...,...
27532,2024-12-11 16:00:00,1.00
27533,2024-12-11 16:10:00,1.00
27534,2024-12-11 16:20:00,1.00
27535,2024-12-11 16:30:00,0.98


In [112]:
# We look at 5 days in the past to predict the next value (5 days x 24 hours x 6 chunks = 720 chunks)
past = 720

# We want to predict one hour ahead
future = 6

# We sample data every hour - look at it every 6 chunks within the (past, future) timeframe
# We do this to reduce the amount of data to process to a manageable size
sampling_rate = 6

# Define the sequence length:
# We actually look at 720 / 6 = 120 timesteps in the past (120 points of past data)
sequence_length = int(past / sampling_rate)

# 80% train, 20% validation
# Note that there is no test data, since we do not actually know the future values to test against
split_fraction = 0.8
train_split = int(split_fraction * len(df)) # marker for the end of the training set

### Prepare data for training

In [113]:
import numpy as np
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

# in the end, we only need the percentage data for the model
# because the order of the data, sorted chronologically, already encodes the time dependency
data = df['Percentage'].values

x_train = data[: train_split]
y_train = data[past : train_split + future] 

x_val = data[train_split : len(data) - future] # don't go to the end, let the future data be the target
y_val = data[train_split + past :] # offset train_split by future

# Use timeseries_dataset_from_array
train_ds = timeseries_dataset_from_array(
    x_train,
    y_train,
    sequence_length = sequence_length,
    sampling_rate = sampling_rate,
    batch_size=128,
)

val_ds = timeseries_dataset_from_array(
    x_val,
    y_val,
    sequence_length=sequence_length,
    sampling_rate = sampling_rate,
    batch_size=32,
)

In [114]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout

# Define the GRU model
def build_gru_model(input_shape, output_steps):
    model = Sequential([
        GRU(128, activation='tanh', return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        GRU(64, activation='tanh', return_sequences=False),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Example parameters
input_shape = (120, 1)  # Sequence length = 120, Features = 1
model = build_gru_model(input_shape, output_steps)


  super().__init__(**kwargs)


In [115]:
# define callbacks
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

early_stopping = EarlyStopping(monitor='val_loss', patience = 4)

learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001
)

In [116]:
# train model
model.fit(
    train_ds,
    epochs = 5,
    validation_data = val_ds,
    callbacks = [early_stopping, learning_rate_reduction]
)

Epoch 1/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 239ms/step - loss: 0.0664 - mae: 0.1592 - val_loss: 0.0139 - val_mae: 0.0778 - learning_rate: 0.0010
Epoch 2/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 271ms/step - loss: 0.0092 - mae: 0.0596 - val_loss: 0.0129 - val_mae: 0.0705 - learning_rate: 0.0010
Epoch 3/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 271ms/step - loss: 0.0090 - mae: 0.0566 - val_loss: 0.0105 - val_mae: 0.0629 - learning_rate: 0.0010
Epoch 4/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 298ms/step - loss: 0.0078 - mae: 0.0514 - val_loss: 0.0095 - val_mae: 0.0586 - learning_rate: 0.0010
Epoch 5/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 272ms/step - loss: 0.0074 - mae: 0.0505 - val_loss: 0.0088 - val_mae: 0.0568 - learning_rate: 0.0010


<keras.src.callbacks.history.History at 0x1a7d01d9490>