In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import re
from collections import Counter
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import CsvConverter as Conv
import os

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
user_stories_df = pd.read_csv('../DB_GroundTruth/userStories.csv', delimiter=';')
user_stories = user_stories_df['user_stories'].tolist()
df = pd.read_csv('D:/Thesis/DB/datasets/skills.csv', header=None, encoding='ISO-8859-1')
labels = df[0].tolist()

In [4]:
user_stories_df['skills'] = user_stories_df['skills'].apply(lambda x: [i.strip().replace("'", "") for i in x.split(",")])

In [5]:
user_stories_df

Unnamed: 0,user_stories,skills
0,"As a software developer at our company, I want...","[MQTT, IoT, Sensor Integration, Smart contra..."
1,"As a software developer at our company, I want...","[Pandas, scikit-learn, Natural Language Proces..."
2,"As a DevOps engineer at our company, I want to...","[GCP, Azure]"
3,As a software engineer at our cryptocurrency d...,"[Cryptocurrency development, IoT, MQTT, Sensor..."
4,As a UI/UX designer and developer at our softw...,"[UI Design, Responsive Design, React Native]"
...,...,...
95,"As a DevOps engineer, I want to automate the b...","[GCP, Build Automation, Kubernetes]"
96,"As a web developer, I want to implement a user...","[HTML, React, CSS, UI Design, Web Accessibility]"
97,"As a QA engineer, I want to integrate Selenium...","[Agile Methodologies, Selenium]"
98,"As an IT administrator, I want to implement se...","[Troubleshooting, Active Directory, Security B..."


In [6]:
contras_df = pd.read_csv('truth.csv', index_col=0)

In [7]:
# Extract NaN skills for each column
skills_nan = {col: contras_df.loc[contras_df[col].isna()].index.tolist() for col in contras_df.columns}

In [8]:
# Create a new DataFrame with the desired structure
new_data = []
for col, nan_indices in skills_nan.items():
    #nan_indices = ', '.join(nan_indices)
    new_data.append([col, nan_indices])

In [9]:
contras_df = pd.DataFrame(new_data, columns=['user_stories', 'skills'])

In [10]:
model_name = "deberta"
if model_name == "bart":
    model_dir = "facebook/bart-large-mnli"
else:
    model_dir = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

In [11]:
def get_new_tokens(sentences, vocabulary):
    vocab_set = set(vocabulary)
    cleaned_words = (re.sub(r"[.'\s\n]+|('\s)", "", word).lower().strip() for sentence in sentences for word in
                     sentence)
    return [word for word in cleaned_words if word not in vocab_set and word]

In [12]:
def word_count(word_list):
    return Counter(word_list)

In [13]:
def tokenize(data):
    data['hypothesis'] = data['hypothesis'].astype("str")
    premises = []
    for x in data['premise'].to_list():
        premises.extend([item.strip() for item in x])
    #data['premise'] = data['premise'].astype("str")
    sentences = data['hypothesis'].to_list() + premises
    #sentences = [sentence.replace(",", "") for sentence in sentences]
    
    vocabulary = tokenizer.get_vocab().keys()
    tokens_to_add = get_new_tokens(sentences, vocabulary)
    words = word_count(tokens_to_add)
    # Initialize an empty list to store new tokens + Loop through the words and their counts
    new_tokens = []
    for key, value in words.items():
        if value > 10 and len(key) > 2:
            new_tokens.append(key)
    tokenizer.add_tokens(new_tokens)
    model.resize_token_embeddings(len(tokenizer))

In [14]:
def synth_to_nli(data, value):
    data = data.copy()
    data.rename(columns={'user_stories': 'hypothesis', 'skills': 'premise'}, inplace=True)
    data['class'] = value
    tokenize(data)
    return data

In [15]:
if model_name == "bart":
    df = synth_to_nli(user_stories_df, 2) # entailement
    contras_df = synth_to_nli(contras_df, 0) # contradiction
else:
    df = synth_to_nli(user_stories_df, 0) # entailement
    contras_df = synth_to_nli(contras_df, 2) # contradiction

In [16]:
df = df.explode('premise')
contras_df = contras_df.explode('premise')
contras_df = contras_df.sample(359)

In [17]:
df = pd.concat([df, contras_df], ignore_index=True)

In [18]:
df.reset_index(drop=True, inplace=True)

In [19]:
df

Unnamed: 0,hypothesis,premise,class
0,"As a software developer at our company, I want...",MQTT,0
1,"As a software developer at our company, I want...",IoT,0
2,"As a software developer at our company, I want...",Sensor Integration,0
3,"As a software developer at our company, I want...",Smart contracts,0
4,"As a software developer at our company, I want...",Pandas,0
...,...,...,...
713,"As a software developer at our company, I want...",Wireless Networking,2
714,"As a web developer, I want to ensure cross-bro...",Data Manipulation,2
715,"As a software developer at our company, I want...",PyTorch,2
716,As an IT Project Manager overseeing the develo...,Data Mining,2


In [20]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
    np.random.seed(42)
    for i in range(cycles):
        new_df = old_df.sample(frac=1).reset_index(drop=True)
        return new_df

In [21]:
# Function to encode the dataset
def encode_examples(examples):
    encoding = tokenizer(examples['hypothesis'], examples['premise'], truncation=True)
    encoding['labels'] = examples['class']
    encoding["input_sentence"] = tokenizer.batch_decode(encoding.input_ids)
    return encoding

In [22]:
test_size = 0.8

In [23]:
train_data, test_data = train_test_split(df, test_size=test_size, random_state=42)
train_shuffle_df = shuffle_df(train_data)
test_shuffle_df = shuffle_df(test_data)

# Create a Dataset object from the shuffled train DataFrame
train = Dataset.from_pandas(train_shuffle_df)
test = Dataset.from_pandas(test_shuffle_df)

In [24]:
# Map the create_input_sequence function to the train and test datasets - This function encodes the data, adds labels, and generates input sentences
train_dataset = train.map(encode_examples, batched=True, batch_size=1, remove_columns=["class", "premise"])

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [25]:
test_dataset = test.map(encode_examples, batched=True, batch_size=1, remove_columns=["class", "premise"])

Map:   0%|          | 0/575 [00:00<?, ? examples/s]

In [26]:
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    # Extracting predictions from EvalPrediction object
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Obtaining the predicted classes
    preds = np.argmax(preds, axis=1)

    # Calculating the ratio of predictions equals to 2 (assumed label)
    ratio = np.mean(preds == 2)

    # Dictionary to store computed metrics
    metric_result = {}

    # Loading evaluation metrics
    metric_f1 = load_metric("f1", trust_remote_code=True)
    metric_precision = load_metric("precision", trust_remote_code=True)
    metric_recall = load_metric("recall", trust_remote_code=True)
    metric_acc = load_metric("accuracy", trust_remote_code=True)

    # Computing various metrics
    metric_result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    metric_result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')['precision']
    metric_result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
    metric_result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
    metric_result["ratio"] = ratio

    return metric_result

In [27]:
# Einfrieren aller Parameter des Basismodells (Transformermodell)
for param in model.base_model.parameters():
    param.requires_grad = False

# Optional: Überprüfen Sie, welche Parameter trainiert werden
for name, param in model.named_parameters():
    print(f"Parameter {name}: {'trainierbar' if param.requires_grad else 'eingefroren'}")

Parameter deberta.embeddings.word_embeddings.weight: eingefroren
Parameter deberta.embeddings.LayerNorm.weight: eingefroren
Parameter deberta.embeddings.LayerNorm.bias: eingefroren
Parameter deberta.encoder.layer.0.attention.self.query_proj.weight: eingefroren
Parameter deberta.encoder.layer.0.attention.self.query_proj.bias: eingefroren
Parameter deberta.encoder.layer.0.attention.self.key_proj.weight: eingefroren
Parameter deberta.encoder.layer.0.attention.self.key_proj.bias: eingefroren
Parameter deberta.encoder.layer.0.attention.self.value_proj.weight: eingefroren
Parameter deberta.encoder.layer.0.attention.self.value_proj.bias: eingefroren
Parameter deberta.encoder.layer.0.attention.output.dense.weight: eingefroren
Parameter deberta.encoder.layer.0.attention.output.dense.bias: eingefroren
Parameter deberta.encoder.layer.0.attention.output.LayerNorm.weight: eingefroren
Parameter deberta.encoder.layer.0.attention.output.LayerNorm.bias: eingefroren
Parameter deberta.encoder.layer.0.int

In [28]:
model.gradient_checkpointing_enable()
model.config.use_cache = True

In [29]:
train_batch = 32
eval_batch = 32
lr = 2e-05
eps = 3
wd = 0.06
warm_ratio = 0.1

In [30]:
training_args = TrainingArguments(
    output_dir=f"FinalRuns",
    num_train_epochs=eps,              # total number of training epochs
    learning_rate=lr,
    per_device_train_batch_size=train_batch,   # batch size per device during training
    per_device_eval_batch_size=eval_batch,    # batch size for evaluation
    warmup_ratio=warm_ratio,                # number of warmup steps for learning rate scheduler
    weight_decay=wd,               # strength of weight decay
    fp16=True                        # mixed precision training
)

In [31]:
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
evaluation_strategy=None,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [33]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=15, training_loss=1.9950833638509116, metrics={'train_runtime': 2.628, 'train_samples_per_second': 163.241, 'train_steps_per_second': 5.708, 'total_flos': 14990025231090.0, 'train_loss': 1.9950833638509116, 'epoch': 3.0})

In [34]:
trainer.get_num_trainable_parameters() #6 Epochen: 1052675 vs. 407344131

592899

In [35]:
trainer.evaluate()

  metric_f1 = load_metric("f1", trust_remote_code=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.2735488414764404,
 'eval_accuracy': 0.40869565217391307,
 'eval_precision': 0.5517069772071265,
 'eval_recall': 0.2710828796128251,
 'eval_f1': 0.34664060667050584,
 'eval_ratio': 0.11826086956521739,
 'eval_runtime': 3.0353,
 'eval_samples_per_second': 189.44,
 'eval_steps_per_second': 5.93,
 'epoch': 3.0}

In [36]:
model.eval()

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128001, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        

In [37]:
trainer.save_model(f"FinalRuns_PreTrained/{model_name}_{eps}_{lr}_{train_batch}{eval_batch}_{warm_ratio}_{wd}")

In [38]:
from datetime import datetime
#model.save_pretrained(f"test_trainer_Trainer_{model_name}/{datetime.now().strftime('%Y%m%d-%H%M%S')}",from_pt=True)
#model.save_pretrained(f"FinalRuns_PreTrained_{model_name}/{eps}_{lr}_{train_batch}{eval_batch}_{warm_ratio}_{wd}",from_pt=True)


Create Pipeline with the new model

In [39]:
# Create new pipeline object with our fine-tuned model and tokenizer
model.config.use_cache = True
classifier_after = pipeline('zero-shot-classification', model=model, tokenizer=tokenizer, device=device)

In [40]:
hypothesis_template = "To resolve this issue the skill {} is needed."

after_results = classifier_after(user_stories, labels, multi_label=True)

In [41]:

split = str(test_size).replace(".","") + f"_freeze1_{eps}_{lr}_{train_batch}{eval_batch}_{warm_ratio}_{wd}"

#split = str(test_size).replace(".","") + f"_default"

In [42]:
with open(f"output_txt/{model_name}_{split}.txt", 'w') as f:
    for story, result in zip(user_stories, after_results):
        f.write(f"Story: {story}\n")
        for label, score in zip(result['labels'], result['scores']):
            f.write(f"- {label}: {score:.2f}\n")

In [43]:
print(os.getcwd())
file_dir = os.getcwd()
#dir = os.path.abspath("")
csv = Conv.CsvConverter(os.path.join(file_dir, 'output_txt',f'{model_name}_{split}.txt'),
                        os.path.join(file_dir, 'output_csv', f'{model_name}_{split}.csv'),
                        'Story')
csv.convert()

D:\Thesis\FineTuning


In [44]:
import MetricsGenerator as Metrics
dir = os.getcwd()
end_dir = os.path.join(dir, "output_csv")
if model_name == "bart":
    metrics = Metrics.MetricsGenerator(f"_{split}", dir, end_dir, False, True, False).main()
else:
    metrics = Metrics.MetricsGenerator(f"_{split}", dir, end_dir, False, False, True).main()

---_08_freeze1_3_2e-05_3232_0.1_0.06---

---DEBERTA---
  Threshold    Label Density    Subset Accuracy    Recall    F1 Score    F-Beta Score    Hamming Loss    ROC AUC    Jaccard Loss
-----------  ---------------  -----------------  --------  ----------  --------------  --------------  ---------  --------------
       1                0.13               0       0.0143      0.0228          0.0168          0.0281     0.5068          0.0143
       0.95             2.66               0.12    0.4698      0.4825          0.4697          0.0235     0.7307          0.3883
       0.9              3.62               0.16    0.6501      0.5968          0.6182          0.0221     0.8193          0.4934
       0.8              4.59               0.12    0.7249      0.605           0.6606          0.0252     0.854           0.4915
       0.5              8.9                0       0.8072      0.4687          0.607           0.054      0.879           0.3271
Differences:
  Threshold    Label Density 