In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import re
from collections import Counter
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import CsvConverter as Conv
import os

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
user_stories_df = pd.read_csv('../DB_GroundTruth/userStories.csv', delimiter=';')
user_stories = user_stories_df['user_stories'].tolist()
df = pd.read_csv('D:/Thesis/DB/datasets/skills.csv', header=None, encoding='ISO-8859-1')
labels = df[0].tolist()

In [4]:
user_stories_df['skills'] = user_stories_df['skills'].apply(lambda x: [i.strip().replace("'", "") for i in x.split(",")])

In [5]:
user_stories_df

Unnamed: 0,user_stories,skills
0,"As a software developer at our company, I want...","[MQTT, IoT, Sensor Integration, Smart contra..."
1,"As a software developer at our company, I want...","[Pandas, scikit-learn, Natural Language Proces..."
2,"As a DevOps engineer at our company, I want to...","[GCP, Azure]"
3,As a software engineer at our cryptocurrency d...,"[Cryptocurrency development, IoT, MQTT, Sensor..."
4,As a UI/UX designer and developer at our softw...,"[UI Design, Responsive Design, React Native]"
...,...,...
95,"As a DevOps engineer, I want to automate the b...","[GCP, Build Automation, Kubernetes]"
96,"As a web developer, I want to implement a user...","[HTML, React, CSS, UI Design, Web Accessibility]"
97,"As a QA engineer, I want to integrate Selenium...","[Agile Methodologies, Selenium]"
98,"As an IT administrator, I want to implement se...","[Troubleshooting, Active Directory, Security B..."


In [6]:
contras_df = pd.read_csv('truth.csv', index_col=0)


In [7]:
# Extract NaN skills for each column
skills_nan = {col: contras_df.loc[contras_df[col].isna()].index.tolist() for col in contras_df.columns}

In [8]:
# Create a new DataFrame with the desired structure
new_data = []
for col, nan_indices in skills_nan.items():
    #nan_indices = ', '.join(nan_indices)
    new_data.append([col, nan_indices])

In [9]:
contras_df = pd.DataFrame(new_data, columns=['user_stories', 'skills'])

In [10]:
model_name = "deberta"
if model_name == "bart":
    model_dir = "facebook/bart-large-mnli"
else:
    model_dir = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

In [11]:
def get_new_tokens(sentences, vocabulary):
    vocab_set = set(vocabulary)
    cleaned_words = (re.sub(r"[.'\s\n]+|('\s)", "", word).lower().strip() for sentence in sentences for word in
                     sentence)
    return [word for word in cleaned_words if word not in vocab_set and word]

In [12]:
def word_count(word_list):
    return Counter(word_list)

In [13]:
def tokenize(data):
    data['hypothesis'] = data['hypothesis'].astype("str")
    premises = []
    for x in data['premise'].to_list():
        premises.extend([item.strip() for item in x])
    #data['premise'] = data['premise'].astype("str")
    sentences = data['hypothesis'].to_list() + premises
    #sentences = [sentence.replace(",", "") for sentence in sentences]
    
    vocabulary = tokenizer.get_vocab().keys()
    tokens_to_add = get_new_tokens(sentences, vocabulary)
    words = word_count(tokens_to_add)
    # Initialize an empty list to store new tokens + Loop through the words and their counts
    new_tokens = []
    for key, value in words.items():
        if value > 10 and len(key) > 2:
            new_tokens.append(key)
    tokenizer.add_tokens(new_tokens)
    model.resize_token_embeddings(len(tokenizer))

In [14]:
def synth_to_nli(data, value):
    data = data.copy()
    data.rename(columns={'user_stories': 'hypothesis', 'skills': 'premise'}, inplace=True)
    data['class'] = value
    tokenize(data)
    return data

In [15]:
df = synth_to_nli(user_stories_df, 0)

In [16]:
contras_df = synth_to_nli(contras_df, 2)


In [17]:
df

Unnamed: 0,hypothesis,premise,class
0,"As a software developer at our company, I want...","[MQTT, IoT, Sensor Integration, Smart contra...",0
1,"As a software developer at our company, I want...","[Pandas, scikit-learn, Natural Language Proces...",0
2,"As a DevOps engineer at our company, I want to...","[GCP, Azure]",0
3,As a software engineer at our cryptocurrency d...,"[Cryptocurrency development, IoT, MQTT, Sensor...",0
4,As a UI/UX designer and developer at our softw...,"[UI Design, Responsive Design, React Native]",0
...,...,...,...
95,"As a DevOps engineer, I want to automate the b...","[GCP, Build Automation, Kubernetes]",0
96,"As a web developer, I want to implement a user...","[HTML, React, CSS, UI Design, Web Accessibility]",0
97,"As a QA engineer, I want to integrate Selenium...","[Agile Methodologies, Selenium]",0
98,"As an IT administrator, I want to implement se...","[Troubleshooting, Active Directory, Security B...",0


In [18]:
df = df.explode('premise')
contras_df = contras_df.explode('premise')
contras_df = contras_df.sample(359)

In [19]:
df

Unnamed: 0,hypothesis,premise,class
0,"As a software developer at our company, I want...",MQTT,0
0,"As a software developer at our company, I want...",IoT,0
0,"As a software developer at our company, I want...",Sensor Integration,0
0,"As a software developer at our company, I want...",Smart contracts,0
1,"As a software developer at our company, I want...",Pandas,0
...,...,...,...
98,"As an IT administrator, I want to implement se...",Backup and Recovery,0
99,"As a database developer, I want to optimize ou...",Microsoft SQL Server,0
99,"As a database developer, I want to optimize ou...",RDBMS,0
99,"As a database developer, I want to optimize ou...",MySQL,0


In [20]:
df = pd.concat([df, contras_df], ignore_index=True)

In [21]:
df

Unnamed: 0,hypothesis,premise,class
0,"As a software developer at our company, I want...",MQTT,0
1,"As a software developer at our company, I want...",IoT,0
2,"As a software developer at our company, I want...",Sensor Integration,0
3,"As a software developer at our company, I want...",Smart contracts,0
4,"As a software developer at our company, I want...",Pandas,0
...,...,...,...
713,"As a data analyst, I want to develop a text pr...",Firewall Configuration,2
714,"As a member of the development team, I want to...",UX/UI Design and Prototyping,2
715,"As a software developer, I want to containeriz...",Usability Testing,2
716,"As a web developer, I want to enhance the user...",Big Data Technologies,2


In [22]:
#df['premise'] = df['premise'].apply(lambda x: ','.join(str(element) for element in x))

In [23]:
#df

In [24]:
df.reset_index(drop=True, inplace=True)

In [25]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
    np.random.seed(42)
    for i in range(cycles):
        new_df = old_df.sample(frac=1).reset_index(drop=True)
        return new_df

In [26]:
def create_input_sequence(sample):
    text = sample["premise"]
    hypothesis = sample['hypothesis']
    nli_label = sample['class']

    # Encoding the sequence using the tokenizer
    encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding='max_length')
    # Assign label to the encoded sequence
    encoded_sequence['labels'] = nli_label
    # Decode the input_ids
    encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence

In [27]:
# Function to encode the dataset
def encode_examples(examples):
    encoding = tokenizer( examples['hypothesis'], examples['premise'], truncation=True)
    encoding['labels'] = examples['class']
    encoding["input_sentence"] = tokenizer.batch_decode(encoding.input_ids)
    return encoding

In [28]:
test_size = 0.8

In [29]:
train_data, test_data = train_test_split(df, test_size=test_size, random_state=42)
train_shuffle_df = shuffle_df(train_data)
test_shuffle_df = shuffle_df(test_data)

# Create a Dataset object from the shuffled train DataFrame
train = Dataset.from_pandas(train_shuffle_df)
test = Dataset.from_pandas(test_shuffle_df)

In [30]:
# Map the create_input_sequence function to the train and test datasets - This function encodes the data, adds labels, and generates input sentences
train_dataset = train.map(encode_examples, batched=True, batch_size=1, remove_columns=["class", "premise"])

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

In [31]:
test_dataset = test.map(encode_examples, batched=True, batch_size=1, remove_columns=["class", "premise"])

Map:   0%|          | 0/575 [00:00<?, ? examples/s]

In [32]:
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    # Extracting predictions from EvalPrediction object
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Obtaining the predicted classes
    preds = np.argmax(preds, axis=1)

    # Calculating the ratio of predictions equals to 2 (assumed label)
    ratio = np.mean(preds == 2)

    # Dictionary to store computed metrics
    metric_result = {}

    # Loading evaluation metrics
    metric_f1 = load_metric("f1", trust_remote_code=True)
    metric_precision = load_metric("precision", trust_remote_code=True)
    metric_recall = load_metric("recall", trust_remote_code=True)
    metric_acc = load_metric("accuracy", trust_remote_code=True)

    # Computing various metrics
    metric_result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    metric_result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')['precision']
    metric_result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
    metric_result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
    metric_result["ratio"] = ratio

    return metric_result

In [33]:
model.gradient_checkpointing_enable()
model.config.use_cache = False

In [34]:
"""do = True
train_batch = 16
eval_batch = 2
lr = 1e-05
weight = 0.1
eps = 1
warm = 50
wd = 0.01
warm_ratio = 0.1"""

'do = True\ntrain_batch = 16\neval_batch = 2\nlr = 1e-05\nweight = 0.1\neps = 1\nwarm = 50\nwd = 0.01\nwarm_ratio = 0.1'

In [35]:
"""training_args = TrainingArguments(
    output_dir="test_trainer_Trainer",  # Output directory
    num_train_epochs=eps,
    do_train=do,
    per_device_train_batch_size=train_batch,
    per_device_eval_batch_size=eval_batch,
    learning_rate=lr,
    warmup_steps=warm,  # Number of warmup steps for learning rate scheduler
    weight_decay=wd,  # Strength of weight decay
    gradient_accumulation_steps=2,  # The number of steps whose gradients are accumulated
    warmup_ratio=warm_ratio,  # Represents the proportion of training steps
)"""

'training_args = TrainingArguments(\n    output_dir="test_trainer_Trainer",  # Output directory\n    num_train_epochs=eps,\n    do_train=do,\n    per_device_train_batch_size=train_batch,\n    per_device_eval_batch_size=eval_batch,\n    learning_rate=lr,\n    warmup_steps=warm,  # Number of warmup steps for learning rate scheduler\n    weight_decay=wd,  # Strength of weight decay\n    gradient_accumulation_steps=2,  # The number of steps whose gradients are accumulated\n    warmup_ratio=warm_ratio,  # Represents the proportion of training steps\n)'

In [36]:
train_batch = 32
eval_batch = 32
lr = 2e-05
eps = 3
wd = 0.06
warm_ratio = 0.1
training_args = TrainingArguments(
    output_dir=f"test_trainer_Trainer_{model_name}",
    num_train_epochs=3,              # total number of training epochs
    learning_rate=2e-05,
    per_device_train_batch_size=32,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    warmup_ratio=0.1,                # number of warmup steps for learning rate scheduler
    weight_decay=0.06,               # strength of weight decay
    fp16=True                        # mixed precision training
)
"""
training_args = TrainingArguments(
    output_dir="test_trainer_Trainer_deberta",
    num_train_epochs=3,              # total number of training epochs
    learning_rate=2e-05,
    per_device_train_batch_size=32,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    warmup_ratio=0.1,                # number of warmup steps for learning rate scheduler
    weight_decay=0.06,               # strength of weight decay
    fp16=True                        # mixed precision training
)"""

'\ntraining_args = TrainingArguments(\n    output_dir="test_trainer_Trainer_deberta",\n    num_train_epochs=3,              # total number of training epochs\n    learning_rate=2e-05,\n    per_device_train_batch_size=32,   # batch size per device during training\n    per_device_eval_batch_size=32,    # batch size for evaluation\n    warmup_ratio=0.1,                # number of warmup steps for learning rate scheduler\n    weight_decay=0.06,               # strength of weight decay\n    fp16=True                        # mixed precision training\n)'

In [37]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [39]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss


TrainOutput(global_step=15, training_loss=1.0966145833333334, metrics={'train_runtime': 4.7477, 'train_samples_per_second': 90.359, 'train_steps_per_second': 3.159, 'total_flos': 64103399967600.0, 'train_loss': 1.0966145833333334, 'epoch': 3.0})

In [40]:
trainer.evaluate()

  metric_f1 = load_metric("f1", trust_remote_code=True)


{'eval_loss': 0.373416006565094,
 'eval_accuracy': 0.8556521739130435,
 'eval_precision': 0.8755286893599674,
 'eval_recall': 0.8566545674531156,
 'eval_f1': 0.8539541772624479,
 'eval_ratio': 0.6121739130434782,
 'eval_runtime': 3.3101,
 'eval_samples_per_second': 173.709,
 'eval_steps_per_second': 5.438,
 'epoch': 3.0}

In [41]:
model.eval()

BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): L

In [43]:
from datetime import datetime

model.save_pretrained(f"test_trainer_Trainer_{model_name}/{datetime.now().strftime('%Y%m%d-%H%M%S')}",from_pt=True)

Non-default generation parameters: {'forced_eos_token_id': 2}


Create Pipeline with the new model

In [44]:
# Create new pipeline object with our fine-tuned model and tokenizer
model.config.use_cache = True
classifier_after = pipeline('zero-shot-classification', model=model, tokenizer=tokenizer, device=device)
after_results = classifier_after(user_stories, labels, multi_label=True)

In [45]:
#model_name = "deberta"
split = str(test_size).replace(".","") + f"_{eps}_{lr}_{train_batch}{eval_batch}_{warm_ratio}_{wd}_exploded"

In [46]:
with open(f"output_txt/{model_name}_{split}.txt", 'w') as f:
    for story, result in zip(user_stories, after_results):
        f.write(f"Story: {story}\n")
        for label, score in zip(result['labels'], result['scores']):
            f.write(f"- {label}: {score:.2f}\n")

In [47]:
print(os.getcwd())
file_dir = os.getcwd()
#dir = os.path.abspath("")
csv = Conv.CsvConverter(os.path.join(file_dir, 'output_txt',f'{model_name}_{split}.txt'),
                        os.path.join(file_dir, 'output_csv', f'{model_name}_{split}.csv'),
                        'Story')
csv.convert()

D:\Thesis\FineTuning


In [48]:
import MetricsGenerator as Metrics
dir = os.getcwd()
end_dir = os.path.join(dir, "output_csv")
if model_name == "bart":
    metrics = Metrics.MetricsGenerator(f"_{split}", dir, end_dir, False, True, False).main()
else:
    metrics = Metrics.MetricsGenerator(f"_{split}", dir, end_dir, False, False, True).main()

---_08_3_2e-05_3232_0.1_0.06_exploded---
---BART---
  Threshold    Label Density    Subset Accuracy    Recall    F1 Score    F-Beta Score    Hamming Loss    ROC AUC    Jaccard Loss
-----------  ---------------  -----------------  --------  ----------  --------------  --------------  ---------  --------------
       1                0                     0    0           0               0               0.028      0.5             0
       0.95            23.82                  0    0.005       0.0027          0.0032          0.2138     0.4069          0.0015
       0.9             72.97                  0    0.0455      0.0047          0.0072          0.5952     0.2302          0.0024
       0.8            112.52                  0    0.1265      0.0086          0.0134          0.899      0.1131          0.0044
       0.5            124.53                  0    0.383       0.0232          0.0362          0.9773     0.1971          0.0119
Differences:
  Threshold    Label Density    Subse

In [None]:
"""training_args = TrainingArguments(
    output_dir="test_trainer",  # Output directory
    logging_dir="test_trainer/logs",
    learning_rate=1e-05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    seed=42,
    optim="adamw_torch",
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    lr_scheduler_type="linear",
    num_train_epochs=1,
)"""
"""training_args = TrainingArguments(
    output_dir="test_trainer",  # Output directory
    logging_dir="test_trainer/logs",  # Output directory for logging
    num_train_epochs=32,  # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=64,  # Batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    gradient_accumulation_steps=2,  # The number of steps whose gradients are accumulated
    learning_rate=5e-05,  # Controls the magnitude of updates to the model weights
    warmup_ratio=0.06,  # Represents the proportion of training steps
    label_smoothing_factor=0.1,  # Regularization technique to prevent the model from becoming overconfident
    eval_strategy='steps',  # Frequency or timing of evaluating
    logging_strategy='steps',  # Frequency or timing of logging
    logging_steps=10,  # Frequency or timing of logging
    eval_steps=10,  # Frequency or timing of evaluating
    logging_first_step=True,
    do_eval=True,
)"""