In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv('data.csv')
df['theme'] = df['theme'].apply(lambda x: x.split('_'))

# Step 3: Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Step 4: One-hot encode the themes
encoded_labels = mlb.fit_transform(df['theme'])

# Step 5: Create a DataFrame for the encoded labels
df_encoded = pd.DataFrame(encoded_labels, columns=mlb.classes_)
df_encoded = df_encoded.astype(float)

# Step 6: Convert the one-hot encoded columns back to a list of labels
df['labels'] = df_encoded.values.tolist()

# Step 6: Combine the original dataframe with the encoded labels
df = pd.concat([df, df_encoded], axis=1)

# Step 7: Print the processed DataFrame
print(df.columns)

df.to_csv('data_encoded.csv', index=False)

Index(['sentence', 'aspect', 'theme', 'description', 'labels', 'ANA', 'BIB',
       'DAT', 'EXP', 'INT', 'MET', 'OAL', 'PDI', 'RES', 'RWK', 'TNF'],
      dtype='object')


In [2]:
# Calculate the frequency of each label
label_counts = df_encoded.sum(axis=0)

# Print the counts for each label
print(label_counts)

ANA     45.0
BIB     17.0
DAT    124.0
EXP    261.0
INT     13.0
MET    937.0
OAL    121.0
PDI     38.0
RES    165.0
RWK    227.0
TNF    108.0
dtype: float64


In [3]:
df_train = df[['sentence', 'labels']]
from datasets import Dataset
# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df_train)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import BCEWithLogitsLoss
import torch
from sklearn.metrics import f1_score, accuracy_score
from transformers import TrainingArguments


num_themes = 11
# Load the pretrained MLM model
#"/home/nana/EMNLP2023_jiu_jitsu_argumentation_for_rebuttals/JitsuPEER_data_and_models_v1/models/bert-base-uncased_neg"
#"/home/nana/DASP_report_template/model_training/nlp/review_to_theme/results/final_model"
model = BertForSequenceClassification.from_pretrained("/home/nana/EMNLP2023_jiu_jitsu_argumentation_for_rebuttals/JitsuPEER_data_and_models_v1/models/bert-base-uncased_neg", num_labels=num_themes, problem_type="multi_label_classification")
# Initialize the tokenizer
# /home/nana/DASP_report_template/model_training/nlp/review_to_theme/results/final_model
# bert-base-uncased
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize your dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=128)

# Assuming dataset has columns 'text' and 'labels' (labels should be a binary vector per example)
dataset = dataset.map(tokenize_function, batched=True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/nana/EMNLP2023_jiu_jitsu_argumentation_for_rebuttals/JitsuPEER_data_and_models_v1/models/bert-base-uncased_neg and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1811/1811 [00:00<00:00, 2086.23 examples/s]


In [5]:
# split_datasets = dataset.train_test_split(test_size=0.4, seed=42)
# train_dataset = split_datasets['train']
# temp_test_dataset = split_datasets['test']
# # Second split: further split temp_test into eval and test
# split_temp_test = temp_test_dataset.train_test_split(test_size=0.5, seed=42)  # 50% for eval, 50% for test
# eval_dataset = split_temp_test['train']
# test_dataset = split_temp_test['test']

# # Ensure the dataset is in the correct format (PyTorch or TensorFlow format)
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
dataset = dataset.train_test_split(test_size=0.2, seed=42)  # 80% train, 20% test
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Ensure the dataset is in the correct format (PyTorch or TensorFlow format)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [6]:
# Evaluate using metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Convert logits to predictions for multi-class classification
    probs = torch.sigmoid(torch.tensor(predictions))
    # Convert probabilities to binary predictions using a threshold of 0.5
    threshold = 0.5
    binary_preds = (probs > threshold).int()
    # Calculate metrics
    accuracy = accuracy_score(labels, binary_preds)
    precision = precision_score(labels, binary_preds, average="macro")
    recall = recall_score(labels, binary_preds, average="macro")
    f1 = f1_score(labels, binary_preds, average="macro")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [7]:
from transformers import TrainingArguments
from transformers import Trainer

base_output_dir = "./results"
epoch = 10
learning_rate = 5e-5
training_args = TrainingArguments(
    output_dir=f"{base_output_dir}/pretrained_model_normal_tokenizer/epoch_{epoch}/{learning_rate}",           # Directory to save results
    evaluation_strategy="epoch",      # Evaluate after each epoch
    learning_rate=learning_rate,               # Learning rate
    per_device_train_batch_size=8,    # Batch size per device (GPU or CPU)
    per_device_eval_batch_size=8,     # Batch size for evaluation
    num_train_epochs=epoch,               # Number of epochs
    logging_dir="./logs",             # Logging directory
    save_strategy="epoch"             # Save model after each epoch
)

trainer = Trainer(
    model=model,                      # The model we defined above
    args=training_args,               # Training arguments
    train_dataset=train_dataset,      # Training dataset
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)



In [8]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.142441,0.655647,0.533432,0.40201,0.436354
2,No log,0.105106,0.763085,0.66741,0.657074,0.6583
3,0.143000,0.088906,0.763085,0.713142,0.670302,0.688247
4,0.143000,0.085962,0.752066,0.741509,0.728985,0.705833
5,0.143000,0.082211,0.796143,0.822112,0.779199,0.774066
6,0.031200,0.082519,0.804408,0.835256,0.796993,0.802949
7,0.031200,0.085222,0.801653,0.82257,0.802749,0.798308
8,0.031200,0.085971,0.790634,0.825471,0.797795,0.797602
9,0.012700,0.085812,0.807163,0.827012,0.829175,0.819857
10,0.012700,0.085349,0.804408,0.826045,0.800152,0.798976


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=1810, training_loss=0.053118013545294496, metrics={'train_runtime': 633.9204, 'train_samples_per_second': 22.842, 'train_steps_per_second': 2.855, 'total_flos': 952538986352640.0, 'train_loss': 0.053118013545294496, 'epoch': 10.0})

In [9]:
# Evaluate the model
eval_results = trainer.evaluate(metric_key_prefix="eval")
print(eval_results)
trainer.save_metrics("eval", eval_results)

{'eval_loss': 0.08534885197877884, 'eval_accuracy': 0.8044077134986226, 'eval_precision': 0.8260446029099545, 'eval_recall': 0.8001515385063018, 'eval_f1': 0.7989762963808297, 'eval_runtime': 4.4109, 'eval_samples_per_second': 82.296, 'eval_steps_per_second': 10.429, 'epoch': 10.0}


In [59]:
eval_dataset

Dataset({
    features: ['sentence', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 363
})

In [60]:
test_dataset

Dataset({
    features: ['sentence', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 363
})

In [56]:
# Predict and compute metrics on the test dataset
predict_results = trainer.predict(test_dataset, metric_key_prefix="predict")
metrics_predict = predict_results.metrics

# Save the test metrics without logging
# metrics_file_path_predict = f"{trainer.args.output_dir}/predict_results.json"
trainer.save_metrics("predict", metrics_predict)

In [61]:
# Make predictions
predictions = trainer.predict(eval_dataset)
logits = predictions.predictions

# Apply sigmoid activation
probs = torch.sigmoid(torch.tensor(logits))

# Convert probabilities to binary predictions
threshold = 0.5
binary_preds = (probs > threshold).int()

print(binary_preds)


tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 1,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)


In [62]:
# Evaluate using metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

true_labels = predictions.label_ids

# Calculate metrics
f1 = f1_score(true_labels, binary_preds, average="macro")
precision = precision_score(true_labels, binary_preds, average="macro")
recall = recall_score(true_labels, binary_preds, average="macro")

print(f"F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

F1 Score: 0.7993, Precision: 0.8217, Recall: 0.8032


In [63]:
from transformers import Trainer

# Assume `trainer` is your trained Trainer object
output_dir = "./results/final_model"  # Specify the directory where you want to save the model

# Save the model, tokenizer, and config
trainer.save_model(output_dir)

# Save the tokenizer separately
tokenizer.save_pretrained(output_dir)

('./results/final_model/tokenizer_config.json',
 './results/final_model/special_tokens_map.json',
 './results/final_model/vocab.txt',
 './results/final_model/added_tokens.json')

In [64]:
# Decode input_ids back to text using the tokenizer
def decode_text(tokenizer, input_ids):
    return tokenizer.decode(input_ids, skip_special_tokens=True)

# Visualize predictions
num_samples = 20  # Number of samples to visualize
for i in range(num_samples):
    # Decode the text from input_ids
    input_ids = test_dataset[i]['input_ids']
    text = decode_text(tokenizer, input_ids)

    # True labels and predicted labels
    true_label = true_labels[i]  # Original label
    predicted_label = binary_preds[i].numpy()  # Predicted binary label

    # Print results
    print(f"Sample {i + 1}")
    print(f"Text: {text}")
    print(f"True Labels: {true_label}")
    print(f"Predicted Labels: {predicted_label}")
    print("-" * 50)


Sample 1
Text: - it wasn't clear how the sparsity percentage on page 3 was defined?
True Labels: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
Predicted Labels: [0 0 0 0 0 0 0 0 1 0 0]
--------------------------------------------------
Sample 2
Text: it is unclear whether the data augmentation techniques is applied only at training time or also at test time.
True Labels: [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0.]
Predicted Labels: [0 0 1 0 0 0 0 0 1 0 0]
--------------------------------------------------
Sample 3
Text: without doing so, it leaves the reader wondering why not simply a standard rbm trained using a standard method ( e. g. contrastive divergence ).
True Labels: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
Predicted Labels: [0 0 0 0 0 1 0 0 0 0 0]
--------------------------------------------------
Sample 4
Text: the inference algorithm builds on standard techniques of deep generative models and, also, on previously proposed methods ( wand and blei, 2003 ) for dealing with the complex hierarchical prior