In [12]:
# pip install transformers[torch]

In [11]:
# pip install accelerate -U

In [3]:
#Import Libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch

In [4]:
# Load the dataset
data = pd.read_csv("8_OM_dynamic_40p_60np.csv")

In [5]:
# Selecting the relevant columns
data = data[['OM_Regular', 'OM_Prediction']]

In [6]:
# Splitting data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
# Load the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenizer.model_max_length = 512  # Set maximum sequence length directly in the tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:

   def preprocess_data(data, tokenizer, max_seq_length=512):
    input_texts = list(data['OM_Regular'])
    target_texts = list(data['OM_Prediction'])

    # Tokenize input and target texts
    input_encodings = tokenizer(input_texts, truncation=True, padding='max_length', max_length=max_seq_length)
    target_encodings = tokenizer(target_texts, truncation=True, padding='max_length', max_length=max_seq_length)
        # Convert tokenized sequences to PyTorch tensors
    input_ids = torch.tensor(input_encodings['input_ids'])
    target_ids = torch.tensor(target_encodings['input_ids'])

    return input_ids, target_ids

train_inputs, train_targets = preprocess_data(train_data, tokenizer)
test_inputs, test_targets = preprocess_data(test_data, tokenizer)

In [9]:
# Define a custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx],
            'labels': self.targets[idx]
        }


In [10]:
# Create instances of custom dataset
train_dataset = CustomDataset(train_inputs, train_targets)
eval_dataset = CustomDataset(test_inputs, test_targets)


In [11]:

# Define training arguments with smaller batch size
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,  # Reduce batch size
    per_device_eval_batch_size=8,   # Reduce batch size for evaluation
    warmup_steps=500,
    weight_decay=0.02,
    logging_dir='./logs',
    logging_steps=500,
    save_steps=1000,
    evaluation_strategy="epoch",
)

In [12]:
# Define a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0014,0.001027
2,0.0009,0.000829
3,0.0009,0.000777
4,0.0008,0.000778
5,0.0008,0.000758


TrainOutput(global_step=6630, training_loss=0.27931009854553873, metrics={'train_runtime': 4776.6836, 'train_samples_per_second': 11.104, 'train_steps_per_second': 1.388, 'total_flos': 7178529150074880.0, 'train_loss': 0.27931009854553873, 'epoch': 5.0})

In [13]:
# Evaluate the model
eval_results = trainer.evaluate(eval_dataset)

In [14]:
# # Calculate additional metrics
# labels = test_targets.flatten().tolist()
# preds = model.generate(test_inputs)
# preds = torch.tensor(preds).flatten().tolist()
# accuracy = accuracy_score(labels, preds)
# precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

# # Display evaluation results
# print("Evaluation Results:")
# print(f"Accuracy: {accuracy}")
# print(f"Precision: {precision}")
# print(f"Recall: {recall}")
# print(f"F1 Score: {f1}")

In [15]:
# Test the model with new input data
def generate_predictions(input_text, model, tokenizer):
    # Move input text to the same device as the model
    device = next(model.parameters()).device

    # Encode input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    # Generate predictions
    with torch.no_grad():
        outputs = model.generate(input_ids.to('cuda:0'))  # Move input_ids to CPU before generation

    # Move predictions to CPU and decode
    predicted_text = tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)

    return predicted_text

# Test the model with new input data
input_text = "module OM_name: 0, open Declaration one sig class1_name extends Class attrSet = c1_at1+c1_at2 id=c1_at1 no parent isAbstract = No } one sig c1_at1 extends c1_at1_type one sig c1_at2 extends c1_at2_type, one sig class2_name extends Class attrSet = c2_at1+c2_at2 id = c2_at1 no parent isAbstract = No } one sig c2_at1 extends c2_at1_type one sig c2_at2 extends c2_at2_type, one sig class3_name extends Class attrSet = c3_at1+c3_at2+c3_at3+c3_at4+c3_at5+c3_at6 id=c3_at1 no parent isAbstract = No } one sig c3_at1 extends c3_at1_type one sig c3_at2 extends c3_at2_type one sig c3_at3 extends c3_at3_type one sig c3_at4 extends c3_at4_type one sig c3_at5 extends c3_at5_type one sig c3_at6 extends c3_at6_type, one sig class4_name extends Class attrSet = c4_at1+c4_at2+c4_at3+c4_at4 id=c4_at3 no parent isAbstract = No } one sig c4_at1 extends c4_at1_type one sig c4_at2 extends c4_at2_type one sig c4_at3 extends c4_at3_type one sig c4_at4 extends c4_at4_type, one sig class5_name extends Class attrSet = c5_at1 one parent parent in class2_name id=c2_at1 isAbstract = No } one sig c5_at1 extends c5_at1_type, one sig class6_name extends Class attrSet = c6_at1_0+c6_at2+c6_at3+c6_at4 id=c6_at2 no parent isAbstract = No } one sig c6_at1_0 extends c6_at1_type one sig c6_at2 extends c6_at2_type one sig c6_at3 extends c6_at3_type one sig c6_at4 extends c6_at4_type, one sig class7_name extends Class attrSet = c7_at1 one parent parent in class2_name id=c2_at1 isAbstract = No } one sig c7_at1 extends c7_at1_type, one sig class8_name extends Class attrSet = c8_at1+c8_at2+c2_at2+c6_at3 id=c8_at1 no parent isAbstract = No } one sig c8_at1 extends c8_at1_type one sig c8_at1 extends c8_at1_type, one sig assoc1 extends Association src = class8_name dst = class2_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc, } one sig assoc2 extends Association src = class2_name dst = class4_name, src_multiplicity = src_mlpc2 dst_multiplicity = dst_mlpc, } one sig assoc3 extends Association src = class8_name dst = class3_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc, } one sig assoc4 extends Association src = class8_name dst = class4_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc, } one sig assoc5 extends Association src = class8_name dst = class6_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc2, } one sig assoc6 extends Association src = class6_name dst = class1_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc2, } one sig assoc7 extends Association src = class6_name dst = class4_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc, } one sig assoc8 extends Association src = class1_name dst = class3_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc, } one sig assoc9 extends Association src = class1_name dst = class4_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc, } one sig assoc10 extends Association src = class2_name dst = class3_name, src_multiplicity = src_mlpc dst_multiplicity = dst_mlpc2,},Mapping Strategy for class1_name : map_str2Mapping Strategy for class2_name : map_str2Mapping Strategy for class5_name : map_str2Mapping Strategy for class6_name : map_str2Mapping Strategy for class7_name : map_str2Association Strategy for assoc3 : assoc_str1Association Strategy for assoc4 : assoc_str1Association Strategy for assoc7 : assoc_str1Association Strategy for assoc9 : assoc_str1Association Strategy for assoc1 : assoc_str2Association Strategy for assoc10 : assoc_str2Association Strategy for assoc5 : assoc_str2Association Strategy for assoc6 : assoc_str2Association Strategy for assoc8 : assoc_str2,USE OM_name:0;CREATE TABLE `class3_name` (`c3_at6` c3_at6_type,`c3_at5` c3_at5_type,`c3_at4` c3_at4_type,`c3_at3` c3_at3_type,`c3_at2` c3_at2_type,`c3_at1` c3_at1_type NOT NULL,PRIMARY KEY (`c3_at1`)CREATE TABLE `class1_name` (`c1_at2` c1_at2_type(64),`c1_at1` c1_at1_type NOT NULL,PRIMARY KEY (`c1_at1`)CREATE TABLE `class2_name` (`c2_at2` c2_at2_type(64),`c8_at1` c8_at1_type,`c2_at1` c2_at1_type NOT NULL,KEY `FK_class2_name_c8_at1_idx` (`c8_at1`),PRIMARY KEY (`c2_at1`)CREATE TABLE `class7_name` (`c7_at1` c7_at1_type(64),`c2_at1` c2_at1_type NOT NULL,KEY `FK_class7_name_c2_at1_idx` (`c2_at1`),PRIMARY KEY (`c2_at1`)CREATE TABLE `assoc10` (`c3_at1` c3_at1_type NOT NULL,`c2_at1` c2_at1_type NOT NULL,KEY `FK_assoc10_c3_at1_idx` (`c3_at1`),KEY `FK_assoc10_c2_at1_idx` (`c2_at1`),PRIMARY KEY (`c3_at1`,`c2_at1`)CREATE TABLE `class8_name` (`c8_at2` c8_at2_type(64),`c6_at3` c6_at3_type(64),`c2_at2` c2_at2_type(64),`c8_at1` c8_at1_type NOT NULL,PRIMARY KEY (`c8_at1`)CREATE TABLE `class5_name` (`c5_at1` c5_at1_type,`c2_at1` c2_at1_type NOT NULL,KEY `FK_class5_name_c2_at1_idx` (`c2_at1`),PRIMARY KEY (`c2_at1`)Association Strategy for assoc5 : assoc_str2`c8_at1` c8_at1_type NOT NULL,`c6_at2` c6_at2_type NOT NULL,KEY `FK_assoc5_c8_at1_idx` (`c8_at1`),KEY `FK_assoc5_c6_at2_idx` (`c6_at2`),PRIMARY KEY (`c8_at1`,`c6_at2`)CREATE TABLE `assoc3` (`c8_at1` c8_at1_type NOT NULL,`c3_at1` c3_at1_type NOT NULL,KEY `FK_assoc3_c8_at1_idx` (`c8_at1`),KEY `FK_assoc3_c3_at1_idx` (`c3_at1`),PRIMARY KEY (`c8_at1`,`c3_at1`)CREATE TABLE `assoc8` (`c3_at1` c3_at1_type NOT NULL,`c1_at1` c1_at1_type NOT NULL,KEY `FK_assoc8_c3_at1_idx` (`c3_at1`),KEY `FK_assoc8_c1_at1_idx` (`c1_at1`),PRIMARY KEY (`c3_at1`,`c1_at1`)CREATE TABLE `class4_name` (`c4_at4` c4_at4_type(64),`c4_at2` c4_at2_type(64),`c4_at1` c4_at1_type(64),`c8_at1` c8_at1_type,`c6_at2` c6_at2_type,`c4_at3` c4_at3_type NOT NULL,KEY `FK_class4_name_c8_at1_idx` (`c8_at1`),KEY `FK_class4_name_c6_at2_idx` (`c6_at2`),PRIMARY KEY (`c4_at3`)CREATE TABLE `assoc6` (`c6_at2` c6_at2_type NOT NULL,`c1_at1` c1_at1_type NOT NULL,KEY `FK_assoc6_c6_at2_idx` (`c6_at2`),KEY `FK_assoc6_c1_at1_idx` (`c1_at1`),PRIMARY KEY (`c6_at2`,`c1_at1`)CREATE TABLE `class6_name` (`c6_at4` c6_at4_type(64),`c6_at3` c6_at3_type(64),`c6_at1_0` c6_at1_type(64),`c6_at2` c6_at2_type NOT NULL,PRIMARY KEY (`c6_at2`)CREATE TABLE `assoc9` (`c4_at3` c4_at3_type NOT NULL,`c1_at1` c1_at1_type NOT NULL,KEY `FK_assoc9_c4_at3_idx` (`c4_at3`),KEY `FK_assoc9_c1_at1_idx` (`c1_at1`),PRIMARY KEY (`c4_at3`,`c1_at1`)CREATE TABLE `assoc2` (`c4_at3` c4_at3_type NOT NULL,`c2_at1` c2_at1_type NOT NULL,KEY `FK_assoc2_c4_at3_idx` (`c4_at3`),KEY `FK_assoc2_c2_at1_idx` (`c2_at1`),PRIMARY KEY (`c4_at3`,`c2_at1`)ALTER TABLE `class2_name`ADD CONSTRAINT `FK_class2_name_c8_at1` FOREIGN KEY (`c8_at1`) REFERENCES `class8_name` (`c8_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ALTER TABLE `class7_name`ADD CONSTRAINT `FK_class7_name_c2_at1` FOREIGN KEY (`c2_at1`) REFERENCES `class2_name` (`c2_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ALTER TABLE `assoc10`ADD CONSTRAINT `FK_assoc10_c3_at1` FOREIGN KEY (`c3_at1`) REFERENCES `class3_name` (`c3_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ADD CONSTRAINT `FK_assoc10_c2_at1` FOREIGN KEY (`c2_at1`) REFERENCES `class2_name` (`c2_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ALTER TABLE `class5_name`ADD CONSTRAINT `FK_class5_name_c2_at1` FOREIGN KEY (`c2_at1`) REFERENCES `class2_name` (`c2_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ALTER TABLE `assoc5`ADD CONSTRAINT `FK_assoc5_c8_at1` FOREIGN KEY (`c8_at1`) REFERENCES `class8_name` (`c8_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ADD CONSTRAINT `FK_assoc5_c6_at2` FOREIGN KEY (`c6_at2`) REFERENCES `class6_name` (`c6_at2`) ON DELETE CASCADE ON UPDATE CASCADE;ALTER TABLE `assoc3`ADD CONSTRAINT `FK_assoc3_c8_at1` FOREIGN KEY (`c8_at1`) REFERENCES `class8_name` (`c8_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ADD CONSTRAINT `FK_assoc3_c3_at1` FOREIGN KEY (`c3_at1`) REFERENCES `class3_name` (`c3_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ALTER TABLE `assoc8`ADD CONSTRAINT `FK_assoc8_c3_at1` FOREIGN KEY (`c3_at1`) REFERENCES `class3_name` (`c3_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ADD CONSTRAINT `FK_assoc8_c1_at1` FOREIGN KEY (`c1_at1`) REFERENCES `class1_name` (`c1_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ALTER TABLE `class4_name`ADD CONSTRAINT `FK_class4_name_c8_at1` FOREIGN KEY (`c8_at1`) REFERENCES `class8_name` (`c8_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ADD CONSTRAINT `FK_class4_name_c6_at2` FOREIGN KEY (`c6_at2`) REFERENCES `class6_name` (`c6_at2`) ON DELETE CASCADE ON UPDATE CASCADE;ALTER TABLE `assoc6`ALTER TABLE `class7_name`ADD CONSTRAINT `FK_assoc6_c1_at1` FOREIGN KEY (`c1_at1`) REFERENCES `class1_name` (`c1_at1`) ON DELETE CASCADE ON UPDATE CASCADE,ALTER TABLE `assoc9`ADD CONSTRAINT `FK_assoc9_c4_at3` FOREIGN KEY (`c4_at3`) REFERENCES `class4_name` (`c4_at3`) ON DELETE CASCADE ON UPDATE CASCADE,ADD CONSTRAINT `FK_assoc9_c1_at1` FOREIGN KEY (`c1_at1`) REFERENCES `class1_name` (`c1_at1`) ON DELETE CASCADE ON UPDATE CASCADE;ALTER TABLE `assoc2`ADD CONSTRAINT `FK_assoc2_c4_at3` FOREIGN KEY (`c4_at3`) REFERENCES `class4_name` (`c4_at3`) ON DELETE CASCADE ON UPDATE CASCADEADD CONSTRAINT `FK_assoc2_c2_at1` FOREIGN KEY (`c2_at1`) REFERENCES `class2_name` (`c2_at1`) ON DELETE CASCADE ON UPDATE CASCADE,"
predicted_text = generate_predictions(input_text, model, tokenizer)
print("Predicted Output:", predicted_text)



Predicted Output: P


In [16]:
# Test the model with new input data
import pandas as pd

# Load CSV file
df = pd.read_excel("Customer_Order_testset.xlsx")  # Update with your file path

# Assuming 'OM_Regular' is the column name
texts = df['OM_Regular']

# Iterate through each text in the column
for test_text in texts:

    predicted_text = generate_predictions(test_text, model, tokenizer)
    print("")
    print(f"status: {predicted_text}")




status: NP

status: NP

status: NP

status: NP

status: NP

status: NP

status: NP

status: NP


Calculating Metrics on External Dataset

In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, classification_report

In [2]:
dc = pd.read_excel('Customer_Order_testset.xlsx')

In [3]:
X_test2 = dc['OM_Regular'].values
y_test2 = dc['OM_Prediction'].values

In [4]:
print(X_test2.shape)
print(y_test2.shape)

print("X data type: ", X_test2.dtype)
print("y data type: ", y_test2.dtype)

(8,)
(8,)
X data type:  object
y data type:  int64


In [5]:
print(y_test2)

[0 0 0 0 0 1 1 0]


In [6]:
dd = pd.read_excel('Customer_Order_pred_testset_2.xlsx')

In [7]:
X_test_pred2 = dd['OM_Regular'].values
y_test_pred2 = dd['OM_Prediction'].values

In [8]:
print (y_test_pred2 )

[0 0 0 0 0 0 0 0]


In [9]:
precision = precision_score(y_test2, y_test_pred2)
print("Testing: Precision = %f" % precision)


recall = recall_score(y_test2, y_test_pred2)
print("Testing: Recall = %f" % recall)


f1 = f1_score(y_test2, y_test_pred2)
print("Testing: F1 Score = %f" % f1)

print("\nConfusion Matrix (Test Data):\n", confusion_matrix(y_test2, y_test_pred2))

Testing: Precision = 0.000000
Testing: Recall = 0.000000
Testing: F1 Score = 0.000000

Confusion Matrix (Test Data):
 [[6 0]
 [2 0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
print(classification_report(y_test2,y_test_pred2))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       0.00      0.00      0.00         2

    accuracy                           0.75         8
   macro avg       0.38      0.50      0.43         8
weighted avg       0.56      0.75      0.64         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
