In [1]:
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, EvalPrediction
from transformers import pipeline

In [2]:
model_name = 'bart'

Code inspired from  https://medium.com/@igafni21/smartshot-fine-tuning-zero-shot-classification-models-with-nli-a990f5478b4f

In [3]:
bart = "facebook/bart-large-mnli"
deberta_base = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
dir = "/model"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
classifier_before = pipeline('zero-shot-classification', device=device, model=bart)



In [4]:
model = classifier_before.model
tokenizer = classifier_before.tokenizer

In [5]:
user_stories_df = pd.read_csv('../DB_GroundTruth/userStories.csv', delimiter=';')
user_stories = user_stories_df['user_stories'].tolist()
df = pd.read_csv('D:/Thesis/DB/datasets/skills.csv', header=None, encoding='ISO-8859-1')
labels = df[0].tolist()

In [6]:
def get_new_tokens(sentences, vocabulary):
    vocab_set = set(vocabulary)
    cleaned_words = (re.sub(r"[.'\s\n]+|('\s)", "", word).lower().strip() for sentence in sentences for word in
                     sentence.split())
    return [word for word in cleaned_words if word not in vocab_set and word]


def word_count(word_list):
    return Counter(word_list)


def tokenize(df):
    sentences = df['hypothesis'].to_list() + df['premise'].to_list()
    vocabulary = tokenizer.get_vocab().keys()
    tokens_to_add = get_new_tokens(sentences, vocabulary)
    words = word_count(tokens_to_add)
    # Initialize an empty list to store new tokens + Loop through the words and their counts
    new_tokens = []
    for key, value in words.items():
        if value > 10 and len(key) > 2:
            new_tokens.append(key)
    tokenizer.add_tokens(new_tokens)
    model.resize_token_embeddings(len(tokenizer))

In [7]:
def synth_to_nli(df):
    df.rename(columns={'user_stories': 'hypothesis', 'skills': 'premise'}, inplace=True)
    df['class'] = 0
    df['hypothesis'] = df['hypothesis'].astype("str")
    df['premise'] = df['premise'].astype("str")
    #df['premise'] = df['premise'].str.replace("'", '')
    tokenize(df)
    return df

df = synth_to_nli(user_stories_df)

In [8]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
    np.random.seed(42)
    for i in range(cycles):
        new_df = old_df.sample(frac=1).reset_index(drop=True)
        return new_df

In [9]:
def create_input_sequence(sample):
    text = sample["premise"]
    hypothesis = sample['hypothesis']

    label = sample['class']

    # Encoding the sequence using the tokenizer
    encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding='max_length')
    # Assign label to the encoded sequence
    encoded_sequence['labels'] = label
    # Decode the input_ids
    encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence

In [10]:

train_data, test_data = train_test_split(df, test_size=0.8, random_state=42)
train_shuffle_df = shuffle_df(train_data)
test_shuffle_df = shuffle_df(test_data)

# Create a Dataset object from the shuffled train DataFrame
train = Dataset.from_pandas(train_shuffle_df)
test = Dataset.from_pandas(test_shuffle_df)

# Map the create_input_sequence function to the train and test datasets
# This function encodes the data, adds labels, and generates input sentences
train_dataset = train.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"])
test_dataset = test.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"])

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [11]:
def compute_metrics(p: EvalPrediction):
    # Extracting predictions from EvalPrediction object
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Obtaining the predicted classes
    preds = np.argmax(preds, axis=1)

    # Calculating the ratio of predictions equal to 2 (assumed label)
    ratio = np.mean(preds == 2)

    # Dictionary to store computed metrics
    result = {}

    # Loading evaluation metrics
    metric_f1 = load_metric("f1")
    metric_precision = load_metric("precision")
    metric_recall = load_metric("recall")
    metric_acc = load_metric("accuracy")

    # Computing various metrics
    result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')[
        'precision']
    result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
    result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
    result["ratio"] = ratio

    return result

In [12]:
training_args = TrainingArguments(
    output_dir=dir,  # Output directory
    logging_dir=dir + "/logs",  # Output directory for logging
    num_train_epochs=1,  # Total number of training epochs
    per_device_train_batch_size=32,  # Batch size per device during training
    per_device_eval_batch_size=2,  # Batch size for evaluation
    warmup_steps=4,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    gradient_accumulation_steps=2,  # The number of steps whose gradients are accumulated
    learning_rate=2e-05,  # Controls the magnitude of updates to the model weights
    warmup_ratio=0.06,  # Represents the proportion of training steps
    label_smoothing_factor=0.1,  # Regularization technique to prevent the model from becoming overconfident
    eval_strategy='steps',  # Frequency or timing of evaluating
    logging_strategy='steps',  # Frequency or timing of logging
    logging_steps=10,  # Frequency or timing of logging
    eval_steps=10,  # Frequency or timing of evaluating
    logging_first_step=True,
    do_eval=True
)

In [13]:
model.gradient_checkpointing_enable()
model.config.use_cache = False
trainer = Trainer(
    model=model,  # The instantiated model to be trained
    args=training_args,  # Training arguments, defined above
    compute_metrics=compute_metrics,  # A function to compute the metrics
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset  # Evaluation dataset
)

In [14]:
trainer.evaluate()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  metric_f1 = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 7.271732330322266,
 'eval_accuracy': 0.0,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_ratio': 0.0,
 'eval_runtime': 27.8657,
 'eval_samples_per_second': 2.871,
 'eval_steps_per_second': 1.435}

In [15]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=1, training_loss=3.1530885696411133, metrics={'train_runtime': 4.7089, 'train_samples_per_second': 4.247, 'train_steps_per_second': 0.212, 'total_flos': 43471444746240.0, 'train_loss': 3.1530885696411133, 'epoch': 1.0})

In [16]:
model.eval()

BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50282, 1024)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50282, 1024)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, e

In [17]:
trainer.save_model("model/bart_08")

Non-default generation parameters: {'forced_eos_token_id': 2}


In [18]:
"""
def plot_metrics(data):
    # Extract parameters
    parameters = data[0].keys()

    # Plot each parameter in a separate graph
    num_plots = len(parameters) - 1  # Exclude 'Step'
    fig, axes = plt.subplots(num_plots, 1, figsize=(8, num_plots*2))

    for i, param in enumerate(parameters):
        if param == 'Step':
            continue

        ax = axes[i - 1] if num_plots > 1 else axes
        ax.plot([d['Step'] for d in data], [d[param] for d in data], marker='o', label=param)
        ax.set_xlabel('Step')
        ax.set_ylabel(param)
        ax.set_title(f'Plot of {param}')
        ax.legend()

    plt.tight_layout()
    plt.show()

plot_metrics(data)"""

"\ndef plot_metrics(data):\n    # Extract parameters\n    parameters = data[0].keys()\n\n    # Plot each parameter in a separate graph\n    num_plots = len(parameters) - 1  # Exclude 'Step'\n    fig, axes = plt.subplots(num_plots, 1, figsize=(8, num_plots*2))\n\n    for i, param in enumerate(parameters):\n        if param == 'Step':\n            continue\n\n        ax = axes[i - 1] if num_plots > 1 else axes\n        ax.plot([d['Step'] for d in data], [d[param] for d in data], marker='o', label=param)\n        ax.set_xlabel('Step')\n        ax.set_ylabel(param)\n        ax.set_title(f'Plot of {param}')\n        ax.legend()\n\n    plt.tight_layout()\n    plt.show()\n\nplot_metrics(data)"

In [19]:
# Create new pipeline object with our finetuned model and tokenizer
model.config.use_cache = True
classifier_after = pipeline('zero-shot-classification', model=model, tokenizer=tokenizer, device=device)

In [20]:
after_results = classifier_after(user_stories, labels, multi_label=True)
#to_txt(results, "after")
with open(f"{model_name}_08_result_after.txt", 'w') as f:
    for story, result in zip(user_stories, after_results):
        f.write(f"Story: {story}\n")
        for label, score in zip(result['labels'], result['scores']):
            f.write(f"- {label}: {score:.2f}\n")

In [21]:
import CsvConverter as conv
import os
print(os.getcwd())
dir = os.getcwd()
#dir = os.path.abspath("")
csv = conv.CsvConverter(os.path.join(dir, f'{model_name}_08_result_after.txt'),
                        os.path.join(dir, f'{model_name}_08_result_after.csv'),
                        'Story')
csv.convert()

D:\Thesis\FineTuning
