In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import re
from collections import Counter
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import CsvConverter as Conv
import os

Configure Parameters and Name

In [2]:
model_name = "deberta"

train_batch = 32
eval_batch = 32
lr = 2e-05
eps = 6
wd = 0.06
warm_ratio = 0.1

freeze = False
hypothesis_template_available = False

test_size = 0.8

In [3]:
# generate output name based on configurations
if hypothesis_template_available:
    config = f"{str(test_size).replace('.', '')}_HP_{eps}_{lr}_{train_batch}{eval_batch}_{warm_ratio}_{wd}"
elif freeze:
    config = f"{str(test_size).replace('.', '')}_freeze_{eps}_{lr}_{train_batch}{eval_batch}_{warm_ratio}_{wd}"
else:
    config = f"{str(test_size).replace('.', '')}_{eps}_{lr}_{train_batch}{eval_batch}_{warm_ratio}_{wd}"

# Final model configuration name
model_config_name = f"{model_name}_{config}"

In [4]:
print(model_config_name)

deberta_208_6_2e-05_3232_0.1_0.06


In [5]:
# select the gpu if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Extract the entailments and contradictions

In [6]:
# get synthetic user stories and matching skills
user_stories_df = pd.read_csv('../Classification_Synth/userStories.csv', delimiter=';')
user_stories = user_stories_df['user_stories'].tolist()

# cleans up the skills representation in the user stories dataframe
user_stories_df['skills'] = user_stories_df['skills'].apply(lambda x: [i.strip().replace("'", "") for i in x.split(",")])

# get skills which were used in the employee database
df = pd.read_csv('../DB/datasets/skills.csv', header=None, encoding='ISO-8859-1')
labels = df[0].tolist()

In [7]:
user_stories_df

Unnamed: 0,user_stories,skills
0,"As a software developer at our company, I want...","[MQTT, IoT, Sensor Integration, Smart contra..."
1,"As a software developer at our company, I want...","[Pandas, scikit-learn, Natural Language Proces..."
2,"As a DevOps engineer at our company, I want to...","[GCP, Azure]"
3,As a software engineer at our cryptocurrency d...,"[Cryptocurrency development, IoT, MQTT, Sensor..."
4,As a UI/UX designer and developer at our softw...,"[UI Design, Responsive Design, React Native]"
...,...,...
95,"As a DevOps engineer, I want to automate the b...","[GCP, Build Automation, Kubernetes]"
96,"As a web developer, I want to implement a user...","[HTML, React, CSS, UI Design, Web Accessibility]"
97,"As a QA engineer, I want to integrate Selenium...","[Agile Methodologies, Selenium]"
98,"As an IT administrator, I want to implement se...","[Troubleshooting, Active Directory, Security B..."


In [8]:
# load the ground truth, which is later used to extract the contradictions
contras_df = pd.read_csv('truth.csv', index_col=0)

# Extract NaN skills for each column
skills_nan = {col: contras_df.loc[contras_df[col].isna()].index.tolist() for col in contras_df.columns}

In [9]:
# Create a new DataFrame with the desired structure
new_data = []
for col, nan_indices in skills_nan.items():
    new_data.append([col, nan_indices])

In [10]:
contras_df = pd.DataFrame(new_data, columns=['user_stories', 'skills'])

Prepeare the base models

In [11]:
# select the model path based on the configurated model name
if model_name == "bart":
    model_dir = "facebook/bart-large-mnli"
else:
    model_dir = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

In [12]:
def get_new_tokens(sentences, vocabulary):
    """Identifies new tokens from sentences that are not present in the current tokenizer vocabulary."""
    vocab_set = set(vocabulary)
    cleaned_words = (re.sub(r"[.'\s\n]+|('\s)", "", word).lower().strip() for sentence in sentences for word in sentence)
    return [word for word in cleaned_words if word not in vocab_set and word]

In [13]:
def word_count(word_list):
    """Counts the frequency of words in a list."""
    return Counter(word_list)

In [14]:
def tokenize(data):
    """Tokenizes the data by adding new tokens to the tokenizer if they appear more than 10 times and have more than 2 characters.
     Resizes the model's token embedding size to match the updated tokenizer."""
    data['premise'] = data['premise'].astype("str")
    hypothesis = []
    for x in data['hypothesis'].to_list():
        hypothesis.extend([item.strip() for item in x])
    sentences = data['premise'].to_list() + hypothesis
    
    vocabulary = tokenizer.get_vocab().keys()
    tokens_to_add = get_new_tokens(sentences, vocabulary)
    words = word_count(tokens_to_add)
    # Initialize an empty list to store new tokens + Loop through the words and their counts
    new_tokens = []
    for key, value in words.items():
        if value > 10 and len(key) > 2:
            new_tokens.append(key)
    tokenizer.add_tokens(new_tokens)
    model.resize_token_embeddings(len(tokenizer))

In [15]:
def synth_to_nli(data, value):
    """ Converts the dataset format to fit the NLI format with 'hypothesis' and 'premise'.
     Calls the tokenize function on the dataset."""
    data = data.copy()
    data.rename(columns={'user_stories': 'premise', 'skills': 'hypothesis'}, inplace=True)
    data['class'] = value
    tokenize(data)
    return data

In [16]:
# based on the model the scores for the entailment and contradiction
if model_name == "bart":
    df = synth_to_nli(user_stories_df, 2) # entailment
    contras_df = synth_to_nli(contras_df, 0) # contradiction
else:
    df = synth_to_nli(user_stories_df, 0) # entailment
    contras_df = synth_to_nli(contras_df, 2) # contradiction

In [17]:
# split up the hypothesis labels, and select random contradictions (equals entailment count)
df = df.explode('hypothesis')
contras_df = contras_df.explode('hypothesis')
contras_df = contras_df.sample(359)

In [18]:
df = pd.concat([df, contras_df], ignore_index=True)

In [19]:
df.reset_index(drop=True, inplace=True)

In [20]:
df

Unnamed: 0,premise1,hypothesis1,class
0,"As a software developer at our company, I want...",MQTT,0
1,"As a software developer at our company, I want...",IoT,0
2,"As a software developer at our company, I want...",Sensor Integration,0
3,"As a software developer at our company, I want...",Smart contracts,0
4,"As a software developer at our company, I want...",Pandas,0
...,...,...,...
713,As an IT Project Manager overseeing the develo...,NoSQL,2
714,As an IT Project Manager overseeing the develo...,MQTT,2
715,As a systems administrator at our software com...,Hardware/Software Integration,2
716,"As a quality assurance engineer, I want to str...",Data Modeling,2


In [21]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
    np.random.seed(42)
    for i in range(cycles):
        new_df = old_df.sample(frac=1).reset_index(drop=True)
        return new_df

In [22]:
def encode_examples(examples):
    """
    Encodes examples for NLI training by tokenizing the 'hypothesis' and 'premise' columns.
    :returns 
        A dictionary with tokenized input data and labels.
    """
    encoding = tokenizer(examples['premise'], examples['hypothesis'], truncation=True)
    encoding['labels'] = examples['class']
    encoding["input_sentence"] = tokenizer.batch_decode(encoding.input_ids)
    return encoding

In [23]:
# selects a random data for the training
train_data, test_data = train_test_split(df, test_size=test_size, random_state=42)
train_shuffle_df = shuffle_df(train_data)
test_shuffle_df = shuffle_df(test_data)

# Create a Dataset object from the shuffled train DataFrame
train = Dataset.from_pandas(train_shuffle_df)
test = Dataset.from_pandas(test_shuffle_df)

In [24]:
# Map the encode_examples function to the train and test datasets - This function encodes the data, adds labels, and generates input sentences
train_dataset = train.map(encode_examples, batched=True, batch_size=1, remove_columns=["class", "hypothesis"])

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [25]:
test_dataset = test.map(encode_examples, batched=True, batch_size=1, remove_columns=["class", "hypothesis"])

Map:   0%|          | 0/575 [00:00<?, ? examples/s]

In [26]:
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction):
    # Extracting predictions from EvalPrediction object
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Obtaining the predicted classes
    preds = np.argmax(preds, axis=1)

    # Calculating the ratio of predictions equals to 2 (assumed label)
    ratio = np.mean(preds == 2)

    # Dictionary to store computed metrics
    metric_result = {}

    # Loading evaluation metrics
    metric_f1 = load_metric("f1", trust_remote_code=True)
    metric_precision = load_metric("precision", trust_remote_code=True)
    metric_recall = load_metric("recall", trust_remote_code=True)
    metric_acc = load_metric("accuracy", trust_remote_code=True)

    # Computing various metrics
    metric_result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    metric_result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')['precision']
    metric_result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
    metric_result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
    metric_result["ratio"] = ratio

    return metric_result

In [27]:
# Freeze all base parameters of transformer 
if freeze:
    for param in model.base_model.parameters():
        param.requires_grad = False
    
    # check which paramteres are trained
    for name, param in model.named_parameters():
        print(f"Parameter {name}: {'trainierbar' if param.requires_grad else 'eingefroren'}")

In [28]:
#model.gradient_checkpointing_enable()
#model.config.use_cache = True

Fine-Tuning the models

In [29]:
training_args = TrainingArguments(
    output_dir=f"FinalRuns",
    num_train_epochs=eps,              # total number of training epochs
    learning_rate=lr,
    per_device_train_batch_size=train_batch,   # batch size per device during training
    per_device_eval_batch_size=eval_batch,    # batch size for evaluation
    warmup_ratio=warm_ratio,                # number of warmup steps for learning rate scheduler
    weight_decay=wd,               # strength of weight decay
    fp16=True                        # mixed precision training
)

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [31]:
trainer.train()

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/sailerco/huggingface/66a13a42322c4e96a5a1d82401dbaac3



Step,Training Loss


[1;38;5;39mCOMET INFO:[0m The process of logging environment details (conda environment, git patch) is underway. Please be patient as this may take some time.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : nursing_radius_9397
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/sailerco/huggingface/66a13a42322c4e96a5a1d82401dbaac3
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     epoch                          : 6.0
[1;38;5;39mCOMET INFO:[0m     total_flos                     : 30093109036380.0
[1;38;5;39mCOMET INFO:[0m     train/epoch                

TrainOutput(global_step=30, training_loss=0.5531073888142903, metrics={'train_runtime': 6.3812, 'train_samples_per_second': 134.458, 'train_steps_per_second': 4.701, 'total_flos': 30093109036380.0, 'train_loss': 0.5531073888142903, 'epoch': 6.0})

In [32]:
trainer.get_num_trainable_parameters() #6 Epochen: 1052675 vs. 407344131

184348419

In [33]:
trainer.evaluate()

  metric_f1 = load_metric("f1", trust_remote_code=True)


{'eval_loss': 0.3496900200843811,
 'eval_accuracy': 0.8817391304347826,
 'eval_precision': 0.8868595709247649,
 'eval_recall': 0.8822444041137326,
 'eval_f1': 0.881437547000461,
 'eval_ratio': 0.5547826086956522,
 'eval_runtime': 3.2278,
 'eval_samples_per_second': 178.142,
 'eval_steps_per_second': 5.577,
 'epoch': 6.0}

In [34]:
model.eval()

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128001, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        

In [35]:
#other ways to save the model, will only save the config.json and model safetensor
#trainer.save_model(f"FinalRuns_PreTrained/{model_config_name}")

In [36]:
model.save_pretrained(f"FinalRuns_PreTrained_{model_name}/{model_config_name}",from_pt=True)

Zero-Shot Classification with the new model

In [37]:
# Create new pipeline object with our fine-tuned model and tokenizer
model.config.use_cache = True
classifier_after = pipeline('zero-shot-classification', model=model, tokenizer=tokenizer, device=device)

In [38]:
# zero shot classification
hypothesis_template = "To resolve this issue the skill {} is needed."
if hypothesis_template_available:
    after_results = classifier_after(user_stories, labels, multi_label=True, hypothesis_template=hypothesis_template)
else:
    after_results = classifier_after(user_stories, labels, multi_label=True)

In [39]:
# save to text file
with open(f"output_txt/{model_config_name}.txt", 'w') as f:
    for story, result in zip(user_stories, after_results):
        f.write(f"Story: {story}\n")
        for label, score in zip(result['labels'], result['scores']):
            f.write(f"- {label}: {score:.2f}\n")

In [40]:
# convert to csv
file_dir = os.getcwd()
csv = Conv.CsvConverter(os.path.join(file_dir, 'output_txt',f'{model_config_name}.txt'),
                        os.path.join(file_dir, 'output_csv', f'{model_config_name}.csv'),
                        'Story')
csv.convert()

In [41]:
# generate metrics
import MetricsGenerator as Metrics
dir = os.getcwd()
end_dir = os.path.join(dir, "output_csv")
if model_name == "bart":
    metrics = Metrics.MetricsGenerator(f"_{config}", dir, end_dir, False, True, False).main()
else:
    metrics = Metrics.MetricsGenerator(f"_{config}", dir, end_dir, False, False, True).main()

---_208_6_2e-05_3232_0.1_0.06---

---DEBERTA---
  Threshold    Label Density    Subset Accuracy    Recall    F1 Score    F-Beta Score    Hamming Loss    Jaccard Index
-----------  ---------------  -----------------  --------  ----------  --------------  --------------  ---------------
       1                1.1                0.02    0.1897      0.2404          0.2057          0.026            0.1738
       0.95             3.87               0.13    0.7397      0.6721          0.7041          0.0186           0.5592
       0.9              4.33               0.11    0.768       0.6704          0.7172          0.0203           0.5516
       0.8              4.88               0.13    0.7917      0.6573          0.7224          0.0232           0.536
       0.5              6.22               0.02    0.8275      0.6057          0.7103          0.0315           0.4655
Differences:
  Threshold    Label Density    Subset Accuracy    Recall    F1 Score    F-Beta Score    Hamming Loss    Ja