In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import re
from collections import Counter
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import CsvConverter as Conv
import os

In [2]:
user_stories_df = pd.read_csv('../DB_GroundTruth/userStories.csv', delimiter=';')
user_stories = user_stories_df['user_stories'].tolist()
df = pd.read_csv('D:/Thesis/DB/datasets/skills.csv', header=None, encoding='ISO-8859-1')
labels = df[0].tolist()

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
model_name = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [5]:
def get_new_tokens(sentences, vocabulary):
    vocab_set = set(vocabulary)
    cleaned_words = (re.sub(r"[.'\s\n]+|('\s)", "", word).lower().strip() for sentence in sentences for word in
                     sentence.split())
    return [word for word in cleaned_words if word not in vocab_set and word]

In [6]:
def word_count(word_list):
    return Counter(word_list)

In [7]:
def tokenize(data):
    data['hypothesis'] = data['hypothesis'].astype("str")
    data['premise'] = data['premise'].astype("str")
    data['premise'] = data['premise'].str.replace("'", '')
    sentences = data['hypothesis'].to_list() + data['premise'].to_list()
    sentences = [sentence.replace(",", "") for sentence in sentences]
    
    vocabulary = tokenizer.get_vocab().keys()
    tokens_to_add = get_new_tokens(sentences, vocabulary)
    words = word_count(tokens_to_add)
    # Initialize an empty list to store new tokens + Loop through the words and their counts
    new_tokens = []
    for key, value in words.items():
        if value > 10 and len(key) > 2:
            new_tokens.append(key)
    tokenizer.add_tokens(new_tokens)
    model.resize_token_embeddings(len(tokenizer))

In [8]:
def synth_to_nli(data):
    data.rename(columns={'user_stories': 'hypothesis', 'skills': 'premise'}, inplace=True)
    data['class'] = 0

    tokenize(data)
    return data

df = synth_to_nli(user_stories_df)

In [9]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
    np.random.seed(42)
    for i in range(cycles):
        new_df = old_df.sample(frac=1).reset_index(drop=True)
        return new_df

In [10]:
def create_input_sequence(sample):
    text = sample["premise"]
    hypothesis = sample['hypothesis']
    nli_label = sample['class']

    # Encoding the sequence using the tokenizer
    encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding='max_length')
    # Assign label to the encoded sequence
    encoded_sequence['labels'] = nli_label
    # Decode the input_ids
    encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence

In [11]:
test_size = 0.6
train_data, test_data = train_test_split(df, test_size=test_size, random_state=42)
train_shuffle_df = shuffle_df(train_data)
test_shuffle_df = shuffle_df(test_data)

# Create a Dataset object from the shuffled train DataFrame
train = Dataset.from_pandas(train_shuffle_df)
test = Dataset.from_pandas(test_shuffle_df)

# Map the create_input_sequence function to the train and test datasets
# This function encodes the data, adds labels, and generates input sentences
train_dataset = train.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"])
test_dataset = test.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"])

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [12]:
#def compute_metrics(eval_pred):
#    logits, labels = eval_pred
#    predictions = np.argmax(logits, axis=-1)
#    return metric.compute(predictions=predictions, references=labels)

In [13]:
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    # Extracting predictions from EvalPrediction object
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Obtaining the predicted classes
    preds = np.argmax(preds, axis=1)

    # Calculating the ratio of predictions equals to 2 (assumed label)
    ratio = np.mean(preds == 2)

    # Dictionary to store computed metrics
    metric_result = {}

    # Loading evaluation metrics
    metric_f1 = load_metric("f1", trust_remote_code=True)
    metric_precision = load_metric("precision", trust_remote_code=True)
    metric_recall = load_metric("recall", trust_remote_code=True)
    metric_acc = load_metric("accuracy", trust_remote_code=True)

    # Computing various metrics
    metric_result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    metric_result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')['precision']
    metric_result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
    metric_result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
    metric_result["ratio"] = ratio

    return metric_result

In [14]:
model.gradient_checkpointing_enable()
model.config.use_cache = False

In [15]:
"""training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
)"""

'training_args = TrainingArguments(\n    output_dir="test_trainer",\n    eval_strategy="epoch",\n    per_device_train_batch_size=16,\n    per_device_eval_batch_size=16,\n    num_train_epochs=5,\n    learning_rate=2e-5,\n    weight_decay=0.01,\n)'

In [16]:
training_args = TrainingArguments(
    output_dir="test_trainer",  # Output directory
    logging_dir="test_trainer/logs",  # Output directory for logging
    num_train_epochs=1,  # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=2,  # Batch size for evaluation
    warmup_steps=4,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    gradient_accumulation_steps=2,  # The number of steps whose gradients are accumulated
    learning_rate=2e-05,  # Controls the magnitude of updates to the model weights
    warmup_ratio=0.06,  # Represents the proportion of training steps
    label_smoothing_factor=0.1,  # Regularization technique to prevent the model from becoming overconfident
    eval_strategy='steps',  # Frequency or timing of evaluating
    logging_strategy='steps',  # Frequency or timing of logging
    logging_steps=10,  # Frequency or timing of logging
    eval_steps=10,  # Frequency or timing of evaluating
    logging_first_step=True,
    do_eval=True
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [18]:
trainer.evaluate()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  metric_f1 = load_metric("f1", trust_remote_code=True)


ValueError: Mismatch in the number of predictions (3) and references (60)

In [None]:
trainer.train()

In [None]:
model.eval()

Create Pipeline with the new model

In [None]:
# Create new pipeline object with our fine-tuned model and tokenizer
model.config.use_cache = True
classifier_after = pipeline('zero-shot-classification', model=model, tokenizer=tokenizer, device=device)
after_results = classifier_after(user_stories, labels, multi_label=True)

In [None]:
model_name = "bart"
split = str(test_size).replace(".","")

In [None]:
with open(f"output_txt/{model_name}_{split}_result_after.txt", 'w') as f:
    for story, result in zip(user_stories, after_results):
        f.write(f"Story: {story}\n")
        for label, score in zip(result['labels'], result['scores']):
            f.write(f"- {label}: {score:.2f}\n")

In [None]:
print(os.getcwd())
file_dir = os.getcwd()
#dir = os.path.abspath("")
csv = Conv.CsvConverter(os.path.join(file_dir, 'output_txt',f'{model_name}_{split}_result_after.txt'),
                        os.path.join(file_dir, 'output_csv', f'{model_name}_{split}_result_after.csv'),
                        'Story')
csv.convert()