In [1]:
# ! pip install transformers datasets
# ! pip install rouge-score
# ! pip install huggingface_hub
# ! pip install keras_nlp

In [53]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [54]:
# Enable mixed precision for better performance on GPUs with Tensor Cores.
# NOTE(review): the policy is set unconditionally, so it also applies when no
# GPU is visible; confirm that is intended for CPU-only runs.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Choose a distribution strategy from the number of visible GPUs:
#   > 1 GPU  -> MirroredStrategy (synchronous data-parallel training)
#   1 GPU    -> the default strategy
#   0 GPUs   -> the default strategy, reported as a CPU run
gpus = tf.config.list_physical_devices('GPU')
num_gpus = len(gpus)
if num_gpus > 1:
    # Enable mirrored strategy for distributed training
    strategy = tf.distribute.MirroredStrategy()
    print('Running distributed training on', num_gpus, 'GPUs.')
elif num_gpus == 1:
    strategy = tf.distribute.get_strategy()
    print('Running training on a single GPU.')
else:
    # Previously this case was also reported as "a single GPU", which hid the
    # fact that training had silently fallen back to the CPU.
    strategy = tf.distribute.get_strategy()
    print('No GPU detected; running training on CPU.')

Running training on a single GPU.


In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data from the Excel file
file_path = "raw_8_om_training_set.xlsx"
data = pd.read_excel(file_path)

# Keep only rows where both columns are filled in. An empty Excel cell is read
# as NaN (a float), and "[start] " + NaN below would raise a TypeError that
# does not point at the offending row; a NaN source would slip into the pairs.
complete_rows = data.dropna(subset=["OM_Regular", "OM_Prediction"])
num_dropped = len(data) - len(complete_rows)
if num_dropped:
    print(f"Dropped {num_dropped} rows with a missing source or target.")

# Extract the source and target sequences
source_sequences = complete_rows["OM_Regular"].tolist()
target_sequences = complete_rows["OM_Prediction"].tolist()

# Wrap each target sequence in the "[start]" / "[end]" marker tokens
target_sequences = ["[start] " + text + " [end]" for text in target_sequences]

# Split the dataset 70 / 15 / 15 into train, validation, and test sets:
# first hold out 30 %, then halve the hold-out. The fixed random_state keeps
# the partition reproducible across runs.
train_src, val_test_src, train_tgt, val_test_tgt = train_test_split(
    source_sequences, target_sequences, test_size=0.3, random_state=42
)
val_src, test_src, val_tgt, test_tgt = train_test_split(
    val_test_src, val_test_tgt, test_size=0.5, random_state=42
)

# Combine the source and target sequences into (source, target) pairs
train_pairs = list(zip(train_src, train_tgt))
val_pairs = list(zip(val_src, val_tgt))
test_pairs = list(zip(test_src, test_tgt))

print(f"Total pairs: {len(train_pairs) + len(val_pairs) + len(test_pairs)}")
print(f"Training pairs: {len(train_pairs)}")
print(f"Validation pairs: {len(val_pairs)}")
print(f"Test pairs: {len(test_pairs)}")


Total pairs: 13336
Training pairs: 9335
Validation pairs: 2000
Test pairs: 2001


In [56]:
import os
import pandas as pd
from transformers import AutoTokenizer

# Define the maximum lengths (in tokens) for input and target sequences
MAX_INPUT_LENGTH = 500
MAX_TARGET_LENGTH = 500

# Re-read the raw columns from the Excel file so the targets tokenized here
# are the unmodified "OM_Prediction" strings (no "[start]" / "[end]" markers).
file_path = "raw_8_om_training_set.xlsx"
data = pd.read_excel(file_path)

# Extract the source and target sequences
source_sequences = data["OM_Regular"].tolist()
target_sequences = data["OM_Prediction"].tolist()

# Load the pretrained T5 tokenizer. Its vocabulary is fixed by the checkpoint:
# passing vocab_size=15000 to from_pretrained did not shrink it (this data
# yields ids well above 15000), so the argument has been removed. Size any
# embedding or output layer from len(tokenizer), not from a hand-picked number.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Tokenize every (source, target) pair. Each call pads/truncates to the fixed
# maximum length and returns PyTorch tensors with a leading batch axis of 1.
tokenized_sequences = []
for source, target in zip(source_sequences, target_sequences):
    tokenized_input = tokenizer(
        source,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    tokenized_target = tokenizer(
        target,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # NOTE(review): "decoder_input_ids" holds the target ids exactly as
    # tokenized (not shifted right). If these feed a T5 model they would
    # normally be passed as labels instead; confirm against the training code.
    tokenized_sequence_pair = {
        "input_ids": tokenized_input.input_ids,
        "attention_mask": tokenized_input.attention_mask,
        "decoder_input_ids": tokenized_target.input_ids,
        "decoder_attention_mask": tokenized_target.attention_mask,
    }

    tokenized_sequences.append(tokenized_sequence_pair)

# Save the tokenizer files into the current directory; the returned tuple of
# written paths is displayed as the cell output.
tokenizer.save_pretrained(".")


('./tokenizer_config.json',
 './special_tokens_map.json',
 './spiece.model',
 './added_tokens.json',
 './tokenizer.json')

In [57]:

# Show the token ids of the first tokenized pair: encoder input first, then
# the target. sep="\n" puts each heading on its own line above its tensor.
print("Tokenized Input:", tokenized_sequences[0]["input_ids"], sep="\n")
print("Tokenized Target:", tokenized_sequences[0]["decoder_input_ids"], sep="\n")





Tokenized Input:
tensor([[ 6008,  2204,  9505,    35, 28449,   782,     3, 13658,  5570, 24722,
          4285,     7,  4501,     2,   144,    17,    52, 17175,  3274,   564,
          6633,  4309,  1220,  4350,  6633, 23954,    23,    26,  2423,  4350,
          6633,  4309,   159,  8952,     7,  6471,    17,  3274,  5388,    32,
          4208,     2,   782,     3, 13658,   564,  6633,  4309,  4285,     7,
            86,    17,    15,  1304,     2,   782,     3, 13658,   564,  6633,
         23954,  4285,     7,  6108,     2,   782,     3, 13658, 12928,   179,
          4285,     7,  4501,     2,   144,    17,    52, 17175,  3274,  7660,
          4309,  1220,  9504,   179, 23954,    23,    26,  2423,  9504,   179,
          4309,   159,  8952,     7,  6471,    17,  3274,  5388,    32,  4208,
             2,   782,     3, 13658,  7660,  4309,  4285,     7,    86,    17,
            15,  1304,     2,   782,     3, 13658,  7660, 23954,  4285,     7,
          6108,     2,   782,     3

In [58]:
# Pull the id tensors of the first pair out as nested Python lists; each has a
# single row because the tokenizer was called on one string at a time.
input_ids = tokenized_sequences[0]["input_ids"].tolist()
decoder_input_ids = tokenized_sequences[0]["decoder_input_ids"].tolist()

# Map the ids in that single row back to their token strings.
input_tokens, target_tokens = (
    tokenizer.convert_ids_to_tokens(id_rows[0])
    for id_rows in (input_ids, decoder_input_ids)
)

# Display the token strings, each under its own heading line.
print("Tokenized Input Tokens:", input_tokens, sep="\n")
print("Tokenized Target Tokens:", target_tokens, sep="\n")


Tokenized Input Tokens:
['▁module', '▁decide', 'rop', 'en', '▁Declaration', 'one', '▁', 'sig', '▁Name', 'Space', '▁extend', 's', '▁Class', '<unk>', 'at', 't', 'r', 'Set', '▁=', '▁name', 'space', 'ID', '+', 'name', 'space', 'Name', 'i', 'd', '=', 'name', 'space', 'ID', 'is', 'Ab', 's', 'trac', 't', '▁=', '▁Non', 'o', '▁parent', '<unk>', 'one', '▁', 'sig', '▁name', 'space', 'ID', '▁extend', 's', '▁In', 't', 'e', 'ger', '<unk>', 'one', '▁', 'sig', '▁name', 'space', 'Name', '▁extend', 's', '▁string', '<unk>', 'one', '▁', 'sig', '▁Vari', 'able', '▁extend', 's', '▁Class', '<unk>', 'at', 't', 'r', 'Set', '▁=', '▁variable', 'ID', '+', 'vari', 'able', 'Name', 'i', 'd', '=', 'vari', 'able', 'ID', 'is', 'Ab', 's', 'trac', 't', '▁=', '▁Non', 'o', '▁parent', '<unk>', 'one', '▁', 'sig', '▁variable', 'ID', '▁extend', 's', '▁In', 't', 'e', 'ger', '<unk>', 'one', '▁', 'sig', '▁variable', 'Name', '▁extend', 's', '▁string', '<unk>', 'one', '▁', 'sig', '▁Relation', 'ship', '▁extend', 's', '▁Class', '<unk>