In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import sklearn
def find(str, ch):
    """Lazily yield, in ascending order, each index where `ch` occurs in `str`.

    `str` is any iterable (a text line in this notebook); an element matches
    when it compares equal to `ch`. Nothing is yielded when there is no match.
    """
    # NOTE: the first parameter shadows the built-in `str`; the name is kept
    # because callers may pass it by keyword.
    yield from (
        position
        for position, symbol in enumerate(str)
        if symbol == ch
    )

def read_custom_csv(file_path, encoding='utf-8', errors='ignore'):
    """Parse the requirements file into a DataFrame with `text` and `label` columns.

    Every non-blank line is expected to look like ``<requirement>,<label>,``.
    The requirement itself may contain commas, so the label is taken to be the
    field between the last two commas of the line and everything before it is
    the requirement text.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file.
        errors: Decoding-error policy handed to `open()` (for example
            'ignore', 'replace' or 'strict').

    Returns:
        pandas.DataFrame with two columns:
        - `text`: the requirement with every double quote removed;
        - `label`: 0 when the label field contains "non", otherwise 1.
        Rows whose label field is exactly "unknown" are dropped.

    Raises:
        IndexError: If a non-blank line contains fewer than two commas.
    """
    reqs = []
    label = []
    with open(file_path, "r", encoding=encoding, errors=errors) as f:
        for line_no, raw_line in enumerate(f, start=1):
            # Remove only the line terminator, so the final line parses the
            # same way whether or not the file ends with a newline.
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue  # blank lines carry no record

            last_comma = line.rfind(",")
            label_start = line.rfind(",", 0, last_comma) if last_comma > 0 else -1
            if label_start < 0:
                raise IndexError(
                    f"{file_path}, line {line_no}: expected at least two commas "
                    f"('<text>,<label>,'), got {line!r}"
                )

            label_field = line[label_start + 1:last_comma]
            if label_field == 'unknown':
                continue

            reqs.append(line[:label_start].replace('"', ''))
            label.append(0 if 'non' in label_field else 1)

    return pd.DataFrame({'text': reqs, 'label': label})

# Parse the requirements file with the custom reader (UTF-8 by default) and
# discard any row that still contains a missing value.
data = read_custom_csv("requirements.csv").dropna()


2023-05-17 10:12:18.618114: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import torch
import numpy as np
import random

def set_seed(seed_value=0):
    """Seed every random number generator used in this notebook.

    Args:
        seed_value: Integer seed applied to Python's `random`, NumPy's global
            generator and PyTorch (CPU and all CUDA devices).

    cuDNN is additionally switched to deterministic mode with autotuning
    disabled. This trades some speed for repeatability; results may still not
    be reproducible bit for bit.
    """
    seeders = (
        random.seed,                 # Python built-in generator
        np.random.seed,              # NumPy global generator
        torch.manual_seed,           # PyTorch
        torch.cuda.manual_seed,      # current CUDA device
        torch.cuda.manual_seed_all,  # every CUDA device (multi-GPU)
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

    # Prefer deterministic cuDNN kernels and stop cuDNN from benchmarking
    # alternative algorithms, which could otherwise vary between runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [3]:
import gc
import torch

def free_memory(model, trainer):
    """Drop the given model/trainer references and release unused GPU memory.

    Args:
        model: Model object that is no longer needed.
        trainer: Trainer object that is no longer needed.

    Note:
        `del` below only removes this function's own local names. Objects the
        caller still references stay alive, together with any GPU tensors they
        own, until the caller drops its references too (e.g. `del model`).
    """
    del model
    del trainer

    # Collect garbage first: tensors kept alive only by reference cycles must
    # be destroyed before their blocks count as unused by the CUDA allocator.
    gc.collect()

    # Hand cached, now-unoccupied blocks back to the GPU driver.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Seed the RNGs, then hold out a stratified 20% of the rows for validation.
set_seed()
X_train, X_val, y_train, y_val = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Baseline 1: raw token counts fed to multinomial Naive Bayes.
bow_nb_pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", MultinomialNB()),
])
bow_nb_pred = bow_nb_pipeline.fit(X_train, y_train).predict(X_val)
print("Bag of Words with Naive Bayes:")
print(classification_report(y_val, bow_nb_pred, target_names=["nonsec", "sec"]))

# Baseline 2: same classifier, TF-IDF weighted features.
tfidf_nb_pipeline = Pipeline(steps=[
    ("vectorizer", TfidfVectorizer()),
    ("classifier", MultinomialNB()),
])
tfidf_nb_pred = tfidf_nb_pipeline.fit(X_train, y_train).predict(X_val)
print("TF-IDF with Naive Bayes:")
print(classification_report(y_val, tfidf_nb_pred, target_names=["nonsec", "sec"]))


Bag of Words with Naive Bayes:
              precision    recall  f1-score   support

      nonsec       0.81      0.81      0.81        16
         sec       0.86      0.86      0.86        21

    accuracy                           0.84        37
   macro avg       0.83      0.83      0.83        37
weighted avg       0.84      0.84      0.84        37

TF-IDF with Naive Bayes:
              precision    recall  f1-score   support

      nonsec       1.00      0.69      0.81        16
         sec       0.81      1.00      0.89        21

    accuracy                           0.86        37
   macro avg       0.90      0.84      0.85        37
weighted avg       0.89      0.86      0.86        37



In [5]:
# Re-seed the RNGs before building this experiment.
set_seed()

# Stratified hold-out: 20% of the rows become the validation set.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Checkpoint to fine-tune; the tokenizer is loaded from the same name.
model = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model)

# Tokenize each split separately, truncating over-long texts and padding.
train_encodings, val_encodings = (
    tokenizer(split["text"].tolist(), truncation=True, padding=True)
    for split in (train_data, val_data)
)

# Create PyTorch datasets
import torch

class RequirementDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # `encodings` must provide .items() yielding (field name, per-example
        # sequence); `labels` is indexable and aligned with the encodings.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, plus the label
        # under the "labels" key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so Trainer can index them.
train_dataset = RequirementDataset(train_encodings, train_data["label"].tolist())
val_dataset = RequirementDataset(val_encodings, val_data["label"].tolist())

# Train the model
# `model` holds the checkpoint name on entry and is rebound to the loaded network.
model = DistilBertForSequenceClassification.from_pretrained(model, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    metric_for_best_model="f1",
    weight_decay=0.01,
    save_total_limit=1,
    seed=42,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Weighted F1 over the argmax of the logits; reported as "f1".
    compute_metrics=lambda eval_pred: {
        "f1": sklearn.metrics.f1_score(eval_pred.label_ids, eval_pred.predictions.argmax(-1), average="weighted"),
    },
)

trainer.train()

# Evaluate the model
# NOTE: val_dataset is also the split used to pick the best checkpoint, so the
# report below is an optimistic estimate rather than a held-out test score.
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(val_data["label"].tolist(), pred_labels, target_names=["nonsec", "sec"]))

# free_memory() can only discard its own local references. The notebook-level
# names must be unbound too, otherwise this model stays on the GPU while the
# next checkpoint is loaded. `gc` is imported in the cell defining free_memory.
free_memory(model, trainer)
del model, trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Epoch,Training Loss,Validation Loss,F1
1,No log,0.549558,0.890745
2,0.554600,0.40907,0.811642
3,0.554600,0.362175,0.85954
4,0.182000,0.276568,0.917428
5,0.182000,0.336619,0.917428
6,0.044500,0.43319,0.85954
7,0.044500,0.693577,0.829262
8,0.008900,0.602287,0.86238
9,0.008900,0.523206,0.890745
10,0.002800,0.591138,0.85954


              precision    recall  f1-score   support

      nonsec       1.00      0.81      0.90        16
         sec       0.88      1.00      0.93        21

    accuracy                           0.92        37
   macro avg       0.94      0.91      0.91        37
weighted avg       0.93      0.92      0.92        37



In [6]:
from transformers import BertForSequenceClassification, BertTokenizer

# Re-seed the RNGs before building this experiment.
set_seed()

# Stratified hold-out: 20% of the rows become the validation set.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Checkpoint to fine-tune; the tokenizer is loaded from the same name.
model = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model)

# Tokenize each split separately, truncating over-long texts and padding.
train_encodings, val_encodings = (
    tokenizer(split["text"].tolist(), truncation=True, padding=True)
    for split in (train_data, val_data)
)

# Create PyTorch datasets
import torch

class RequirementDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # `encodings` must provide .items() yielding (field name, per-example
        # sequence); `labels` is indexable and aligned with the encodings.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, plus the label
        # under the "labels" key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so Trainer can index them.
train_dataset = RequirementDataset(train_encodings, train_data["label"].tolist())
val_dataset = RequirementDataset(val_encodings, val_data["label"].tolist())

# Train the model
# `model` holds the checkpoint name on entry and is rebound to the loaded network.
model = BertForSequenceClassification.from_pretrained(model, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    metric_for_best_model="f1",
    weight_decay=0.01,
    save_total_limit=1,
    seed=42,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Weighted F1 over the argmax of the logits; reported as "f1".
    compute_metrics=lambda eval_pred: {
        "f1": sklearn.metrics.f1_score(eval_pred.label_ids, eval_pred.predictions.argmax(-1), average="weighted"),
    },
)

trainer.train()

# Evaluate the model
# NOTE: val_dataset is also the split used to pick the best checkpoint, so the
# report below is an optimistic estimate rather than a held-out test score.
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(val_data["label"].tolist(), pred_labels, target_names=["nonsec", "sec"]))

# free_memory() can only discard its own local references. The notebook-level
# names must be unbound too, otherwise this model stays on the GPU while the
# next checkpoint is loaded. `gc` is imported in the cell defining free_memory.
free_memory(model, trainer)
del model, trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss,F1
1,No log,0.598407,0.670768
2,0.629400,0.483342,0.81137
3,0.629400,0.406046,0.837838
4,0.259400,0.412431,0.864257
5,0.259400,0.448297,0.890745
6,0.021800,0.659281,0.86238
7,0.021800,0.778239,0.88887
8,0.001000,0.789189,0.86238
9,0.001000,0.787865,0.86238
10,0.000400,0.788239,0.86238


              precision    recall  f1-score   support

      nonsec       0.93      0.81      0.87        16
         sec       0.87      0.95      0.91        21

    accuracy                           0.89        37
   macro avg       0.90      0.88      0.89        37
weighted avg       0.90      0.89      0.89        37



In [7]:
from transformers import BertForSequenceClassification, BertTokenizer

# Re-seed the RNGs before building this experiment.
set_seed()

# Stratified hold-out: 20% of the rows become the validation set.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Checkpoint to fine-tune; the tokenizer is loaded from the same name.
model = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model)

# Tokenize each split separately, truncating over-long texts and padding.
train_encodings, val_encodings = (
    tokenizer(split["text"].tolist(), truncation=True, padding=True)
    for split in (train_data, val_data)
)

# Create PyTorch datasets
import torch

class RequirementDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # `encodings` must provide .items() yielding (field name, per-example
        # sequence); `labels` is indexable and aligned with the encodings.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, plus the label
        # under the "labels" key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so Trainer can index them.
train_dataset = RequirementDataset(train_encodings, train_data["label"].tolist())
val_dataset = RequirementDataset(val_encodings, val_data["label"].tolist())

# Train the model
# `model` holds the checkpoint name on entry and is rebound to the loaded network.
model = BertForSequenceClassification.from_pretrained(model, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    metric_for_best_model="f1",
    weight_decay=0.01,
    save_total_limit=1,
    seed=42,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Weighted F1 over the argmax of the logits; reported as "f1".
    compute_metrics=lambda eval_pred: {
        "f1": sklearn.metrics.f1_score(eval_pred.label_ids, eval_pred.predictions.argmax(-1), average="weighted"),
    },
)

trainer.train()

# Evaluate the model
# NOTE: val_dataset is also the split used to pick the best checkpoint, so the
# report below is an optimistic estimate rather than a held-out test score.
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(val_data["label"].tolist(), pred_labels, target_names=["nonsec", "sec"]))

# free_memory() can only discard its own local references. The notebook-level
# names must be unbound too, otherwise this model stays on the GPU while the
# next checkpoint is loaded. `gc` is imported in the cell defining free_memory.
free_memory(model, trainer)
del model, trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Epoch,Training Loss,Validation Loss,F1
1,No log,0.6799,0.410997
2,0.743500,0.66886,0.410997
3,0.743500,0.624287,0.410997
4,0.631800,0.632675,0.614414
5,0.631800,0.390757,0.865264
6,0.366800,0.521803,0.687983
7,0.366800,0.861792,0.730628
8,0.291800,0.203225,0.945946
9,0.291800,0.455679,0.917428
10,0.119900,0.377766,0.891892


              precision    recall  f1-score   support

      nonsec       0.94      0.94      0.94        16
         sec       0.95      0.95      0.95        21

    accuracy                           0.95        37
   macro avg       0.94      0.94      0.94        37
weighted avg       0.95      0.95      0.95        37



In [8]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Re-seed the RNGs before building this experiment.
set_seed()

# Stratified hold-out: 20% of the rows become the validation set.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Checkpoint to fine-tune; the tokenizer is loaded from the same name.
model = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model)

# Tokenize each split separately, truncating over-long texts and padding.
train_encodings, val_encodings = (
    tokenizer(split["text"].tolist(), truncation=True, padding=True)
    for split in (train_data, val_data)
)

# Create PyTorch datasets
import torch

class RequirementDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # `encodings` must provide .items() yielding (field name, per-example
        # sequence); `labels` is indexable and aligned with the encodings.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, plus the label
        # under the "labels" key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so Trainer can index them.
train_dataset = RequirementDataset(train_encodings, train_data["label"].tolist())
val_dataset = RequirementDataset(val_encodings, val_data["label"].tolist())

# Train the model
# `model` holds the checkpoint name on entry and is rebound to the loaded network.
model = RobertaForSequenceClassification.from_pretrained(model, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    metric_for_best_model="f1",
    weight_decay=0.01,
    save_total_limit=1,
    seed=42,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Weighted F1 over the argmax of the logits; reported as "f1".
    compute_metrics=lambda eval_pred: {
        "f1": sklearn.metrics.f1_score(eval_pred.label_ids, eval_pred.predictions.argmax(-1), average="weighted"),
    },
)

trainer.train()

# Evaluate the model
# NOTE: val_dataset is also the split used to pick the best checkpoint, so the
# report below is an optimistic estimate rather than a held-out test score.
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(val_data["label"].tolist(), pred_labels, target_names=["nonsec", "sec"]))

# free_memory() can only discard its own local references. The notebook-level
# names must be unbound too, otherwise this model stays on the GPU while the
# next checkpoint is loaded. `gc` is imported in the cell defining free_memory.
free_memory(model, trainer)
del model, trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Epoch,Training Loss,Validation Loss,F1
1,No log,0.636572,0.410997
2,0.602900,0.498232,0.750285
3,0.602900,0.126965,0.973053
4,0.326200,0.197869,0.918554
5,0.326200,0.382469,0.88887
6,0.059200,0.35134,0.945373
7,0.059200,0.42239,0.945373
8,0.000800,0.487749,0.918554
9,0.000800,0.581452,0.918554
10,0.000300,0.606441,0.918554


              precision    recall  f1-score   support

      nonsec       0.94      1.00      0.97        16
         sec       1.00      0.95      0.98        21

    accuracy                           0.97        37
   macro avg       0.97      0.98      0.97        37
weighted avg       0.97      0.97      0.97        37



In [9]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Re-seed the RNGs before building this experiment.
set_seed()

# Stratified hold-out: 20% of the rows become the validation set.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Checkpoint to fine-tune; the tokenizer is loaded from the same name.
model = "roberta-large"
tokenizer = RobertaTokenizer.from_pretrained(model)

# Tokenize each split separately, truncating over-long texts and padding.
train_encodings, val_encodings = (
    tokenizer(split["text"].tolist(), truncation=True, padding=True)
    for split in (train_data, val_data)
)

# Create PyTorch datasets
import torch

class RequirementDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # `encodings` must provide .items() yielding (field name, per-example
        # sequence); `labels` is indexable and aligned with the encodings.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, plus the label
        # under the "labels" key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so Trainer can index them.
train_dataset = RequirementDataset(train_encodings, train_data["label"].tolist())
val_dataset = RequirementDataset(val_encodings, val_data["label"].tolist())

# Train the model
# `model` holds the checkpoint name on entry and is rebound to the loaded network.
model = RobertaForSequenceClassification.from_pretrained(model, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    metric_for_best_model="f1",
    weight_decay=0.01,
    save_total_limit=1,
    seed=42,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Weighted F1 over the argmax of the logits; reported as "f1".
    compute_metrics=lambda eval_pred: {
        "f1": sklearn.metrics.f1_score(eval_pred.label_ids, eval_pred.predictions.argmax(-1), average="weighted"),
    },
)

trainer.train()

# Evaluate the model
# NOTE: val_dataset is also the split used to pick the best checkpoint, so the
# report below is an optimistic estimate rather than a held-out test score.
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(val_data["label"].tolist(), pred_labels, target_names=["nonsec", "sec"]))

# free_memory() can only discard its own local references. The notebook-level
# names must be unbound too, otherwise this model stays on the GPU while the
# next checkpoint is loaded. `gc` is imported in the cell defining free_memory.
free_memory(model, trainer)
del model, trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

Epoch,Training Loss,Validation Loss,F1
1,No log,0.283544,0.864257
2,0.570900,0.724058,0.687983
3,0.570900,0.208107,0.890745
4,0.394500,0.386013,0.864257
5,0.394500,0.614521,0.864865
6,0.195700,0.498275,0.945373
7,0.195700,0.753084,0.917428
8,0.116900,0.857536,0.891892
9,0.116900,1.120653,0.891892
10,0.006800,0.87106,0.918554


              precision    recall  f1-score   support

      nonsec       1.00      0.88      0.93        16
         sec       0.91      1.00      0.95        21

    accuracy                           0.95        37
   macro avg       0.96      0.94      0.94        37
weighted avg       0.95      0.95      0.95        37



In [15]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer

# Re-seed the RNGs before building this experiment.
set_seed()

# Stratified hold-out: 20% of the rows become the validation set.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Checkpoint to fine-tune; the tokenizer is loaded from the same name.
model = "google/electra-base-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model)

# Tokenize each split separately, truncating over-long texts and padding.
train_encodings, val_encodings = (
    tokenizer(split["text"].tolist(), truncation=True, padding=True)
    for split in (train_data, val_data)
)

# Create PyTorch datasets
import torch

class RequirementDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # `encodings` must provide .items() yielding (field name, per-example
        # sequence); `labels` is indexable and aligned with the encodings.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, plus the label
        # under the "labels" key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so Trainer can index them.
train_dataset = RequirementDataset(train_encodings, train_data["label"].tolist())
val_dataset = RequirementDataset(val_encodings, val_data["label"].tolist())

# Train the model
# `model` holds the checkpoint name on entry and is rebound to the loaded network.
model = ElectraForSequenceClassification.from_pretrained(model, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    metric_for_best_model="f1",
    weight_decay=0.01,
    save_total_limit=1,
    seed=42,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Weighted F1 over the argmax of the logits; reported as "f1".
    compute_metrics=lambda eval_pred: {
        "f1": sklearn.metrics.f1_score(eval_pred.label_ids, eval_pred.predictions.argmax(-1), average="weighted"),
    },
)

trainer.train()

# Evaluate the model
# NOTE: val_dataset is also the split used to pick the best checkpoint, so the
# report below is an optimistic estimate rather than a held-out test score.
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(val_data["label"].tolist(), pred_labels, target_names=["nonsec", "sec"]))

# free_memory() can only discard its own local references. The notebook-level
# names must be unbound too, otherwise this model stays on the GPU while the
# next checkpoint is loaded. `gc` is imported in the cell defining free_memory.
free_memory(model, trainer)
del model, trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()


Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.d

Epoch,Training Loss,Validation Loss,F1
1,No log,0.627876,0.410997
2,0.630400,0.542893,0.756757
3,0.630400,0.33989,0.865264
4,0.302100,0.229714,0.945373
5,0.302100,0.295843,0.917428
6,0.050400,0.307547,0.918554
7,0.050400,0.489398,0.88887
8,0.007400,0.351014,0.918554
9,0.007400,0.455593,0.918554
10,0.013100,0.554561,0.88887


              precision    recall  f1-score   support

      nonsec       1.00      0.88      0.93        16
         sec       0.91      1.00      0.95        21

    accuracy                           0.95        37
   macro avg       0.96      0.94      0.94        37
weighted avg       0.95      0.95      0.95        37



In [17]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer

# Re-seed the RNGs before building this experiment.
set_seed()

# Stratified hold-out: 20% of the rows become the validation set.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Checkpoint to fine-tune; the tokenizer is loaded from the same name.
model = "google/electra-large-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model)

# Tokenize each split separately, truncating over-long texts and padding.
train_encodings, val_encodings = (
    tokenizer(split["text"].tolist(), truncation=True, padding=True)
    for split in (train_data, val_data)
)

# Create PyTorch datasets
import torch

class RequirementDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # `encodings` must provide .items() yielding (field name, per-example
        # sequence); `labels` is indexable and aligned with the encodings.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, plus the label
        # under the "labels" key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so Trainer can index them.
train_dataset = RequirementDataset(train_encodings, train_data["label"].tolist())
val_dataset = RequirementDataset(val_encodings, val_data["label"].tolist())

# Train the model
# `model` holds the checkpoint name on entry and is rebound to the loaded network.
model = ElectraForSequenceClassification.from_pretrained(model, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    metric_for_best_model="f1",
    weight_decay=0.01,
    save_total_limit=1,
    seed=42,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Weighted F1 over the argmax of the logits; reported as "f1".
    compute_metrics=lambda eval_pred: {
        "f1": sklearn.metrics.f1_score(eval_pred.label_ids, eval_pred.predictions.argmax(-1), average="weighted"),
    },
)

trainer.train()

# Evaluate the model
# NOTE: val_dataset is also the split used to pick the best checkpoint, so the
# report below is an optimistic estimate rather than a held-out test score.
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(val_data["label"].tolist(), pred_labels, target_names=["nonsec", "sec"]))

# free_memory() can only discard its own local references. The notebook-level
# names must be unbound too, otherwise this model stays on the GPU after the
# cell finishes. `gc` is imported in the cell defining free_memory.
free_memory(model, trainer)
del model, trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

Epoch,Training Loss,Validation Loss,F1
1,No log,0.683933,0.410997
2,0.706600,0.680414,0.410997
3,0.706600,0.681583,0.410997
4,0.679800,0.546906,0.77774
5,0.679800,0.504432,0.838553
6,0.618500,0.579999,0.715437
7,0.618500,0.424839,0.838553
8,0.475900,0.703253,0.614414
9,0.475900,0.5701,0.781491
10,0.398000,0.386012,0.829262


              precision    recall  f1-score   support

      nonsec       0.87      0.81      0.84        16
         sec       0.86      0.90      0.88        21

    accuracy                           0.86        37
   macro avg       0.87      0.86      0.86        37
weighted avg       0.86      0.86      0.86        37

