# Model training

In [None]:
from google.colab import drive
import numpy as np
import os
import pickle
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from tensorflow.keras.layers import Bidirectional, Dense, LSTM, TimeDistributed
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [None]:
# Mount Google Drive at /content/gdrive; the training data and checkpoint
# paths used later in this notebook live under /content/gdrive/MyDrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
def data_generator(data_path, batch_size, n_tags):
    """Endlessly yield (features, one-hot labels) batches from pickled chunks.

    Every ``.pkl`` file directly inside ``data_path`` is unpickled into an
    ``(X, y)`` pair. Files are visited in alphabetical order and the whole
    list is replayed forever, so the consumer decides when to stop (for
    example through ``steps_per_epoch`` in ``model.fit``). Batches never span
    two files, so the last batch of a file may hold fewer than ``batch_size``
    samples.

    Because this is a generator, the validation below runs on the first
    ``next()`` call, not when ``data_generator(...)`` is called.

    Args:
        data_path: Directory containing the ``.pkl`` chunk files.
        batch_size: Maximum number of samples per batch; must be >= 1.
        n_tags: Number of classes used for the one-hot encoding of ``y``.

    Yields:
        ``(X[i:i+batch_size], y_one_hot[i:i+batch_size])`` slices of each
        chunk, where ``y_one_hot = to_categorical(y, num_classes=n_tags)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if a complete pass
            over the files produced no batch at all (every chunk is empty).
        FileNotFoundError: If ``data_path`` contains no ``.pkl`` file.
    """
    # A zero or negative step would make the slicing loop below either fail
    # or silently produce nothing, which turns `while True` into a busy hang.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    # Sorted so that the chunk order is deterministic across runs.
    files = [os.path.join(data_path, f) for f in sorted(os.listdir(data_path)) if f.endswith('.pkl')]
    if not files:
        # Without this guard the infinite loop would spin forever yielding nothing.
        raise FileNotFoundError(f"no .pkl files found in {data_path!r}")

    while True:
        yielded_any = False
        for filepath in files:
            # SECURITY: pickle.load can execute arbitrary code; only point this
            # generator at chunk files produced by a trusted preprocessing step.
            with open(filepath, 'rb') as file:
                X, y = pickle.load(file)
            # The file is closed before yielding so that no handle stays open
            # while the generator is suspended between batches.
            y = to_categorical(y, num_classes=n_tags)
            for i in range(0, len(X), batch_size):
                yielded_any = True
                yield X[i:i+batch_size], y[i:i+batch_size]
        if not yielded_any:
            # Every chunk was empty: fail loudly instead of looping forever.
            raise ValueError(f"all .pkl files in {data_path!r} are empty")

## Model

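We create a neural network model for sequence labeling: a bidirectional LSTM followed by a time-distributed softmax layer that outputs a tag distribution for every time step.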

In [None]:
def create_model(input_shape, n_tags, lstm_units=64, dropout=0.1, learning_rate=0.001):
    """Build and compile a bidirectional-LSTM sequence-labeling model.

    The network is a ``Sequential`` stack of a ``Bidirectional(LSTM)`` layer
    that returns the full sequence, followed by a ``TimeDistributed(Dense)``
    softmax layer with ``n_tags`` outputs. It is compiled with the Adam
    optimizer, categorical cross-entropy loss and the accuracy metric.

    Args:
        input_shape: Shape of one input sample, passed as ``input_shape`` to
            the first layer (the training cell uses
            ``(max_sequence_length, embedding_dim)``).
        n_tags: Number of output classes of the softmax layer.
        lstm_units: Units of each LSTM direction. Defaults to 64.
        dropout: Input dropout rate of the LSTM. Defaults to 0.1.
        learning_rate: Learning rate of the Adam optimizer. Defaults to 0.001.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # return_sequences=True keeps one output per time step, which the
        # TimeDistributed head below needs to emit a prediction per step.
        Bidirectional(
            LSTM(units=lstm_units, return_sequences=True, dropout=dropout),
            input_shape=input_shape,
        ),
        # The same Dense softmax classifier is applied at every time step.
        TimeDistributed(Dense(n_tags, activation="softmax"))
    ])
    # categorical_crossentropy expects one-hot encoded targets.
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

Then we train it on our preprocessed data. The model is also saved after each epoch.

In [None]:
max_sequence_length = 30
embedding_dim = 300
n_tags = 13

# Create the model with specified input shape and number of tags
model = create_model((max_sequence_length, embedding_dim), n_tags)
# Print the model summary to show its architecture
model.summary()

# NOTE(review): newer Keras releases appear to require a `.keras` or `.h5`
# suffix for full-model checkpoints — confirm before upgrading TensorFlow.
checkpoint_path = "/content/gdrive/MyDrive/opj/data/checkpoints/model_epoch_{epoch:02d}.hdf5"

# Save the full model (architecture + weights + optimizer state) at the end of each epoch
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=False,
    save_freq='epoch'
)

train_data_path = '/content/gdrive/MyDrive/opj/data/train'
batch_size = 32
# Use n_tags rather than a repeated literal so the generator's one-hot width
# can never drift from the model's output width.
train_generator = data_generator(train_data_path, batch_size, n_tags)

# The generator batches each chunk file separately, so a file with k samples
# contributes ceil(k / batch_size) batches (the last one may be short).
# Summing that per file makes one epoch exactly one pass over the data.
steps_per_epoch = 0
for filename in os.listdir(train_data_path):
    if not filename.endswith('.pkl'):
        continue
    # `with` closes each chunk as soon as its sample count is known.
    with open(os.path.join(train_data_path, filename), 'rb') as chunk_file:
        chunk_features = pickle.load(chunk_file)[0]
    steps_per_epoch += (len(chunk_features) + batch_size - 1) // batch_size

if steps_per_epoch == 0:
    raise ValueError(f"no training samples found in {train_data_path!r}")

# Training
model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    verbose=1,
    callbacks=[model_checkpoint_callback]
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 30, 128)           186880    
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 30, 13)            1677      
 ributed)                                                        
                                                                 
Total params: 188557 (736.55 KB)
Trainable params: 188557 (736.55 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
   2/2810 [..............................] - ETA: 2:44 - loss: 0.1626 - accuracy: 0.9495

  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78803c307490>

We see the model took ~37 minutes to train across 10 epochs. Training on the full data would take roughly 672 times longer (14 original chunks × 48 later divisions), i.e. about 414 hours.