In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv('data.csv')
df['theme'] = df['theme'].apply(lambda x: x.split('_'))

# Step 3: Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Step 4: One-hot encode the themes
encoded_labels = mlb.fit_transform(df['theme'])

# Step 5: Create a DataFrame for the encoded labels
df_encoded = pd.DataFrame(encoded_labels, columns=mlb.classes_)
df_encoded = df_encoded.astype(float)

# Step 6: Convert the one-hot encoded columns back to a list of labels
df['labels'] = df_encoded.values.tolist()

# Step 6: Combine the original dataframe with the encoded labels
df = pd.concat([df, df_encoded], axis=1)

# Step 7: Print the processed DataFrame
print(df.columns)

df.to_csv('data_encoded.csv', index=False)

Index(['sentence', 'aspect', 'theme', 'description', 'labels', 'ANA', 'BIB',
       'DAT', 'EXP', 'INT', 'MET', 'OAL', 'PDI', 'RES', 'RWK', 'TNF'],
      dtype='object')


In [2]:
# Calculate the frequency of each label
label_counts = df_encoded.sum(axis=0)

# Print the counts for each label
print(label_counts)

ANA     45.0
BIB     17.0
DAT    124.0
EXP    261.0
INT     13.0
MET    937.0
OAL    121.0
PDI     38.0
RES    165.0
RWK    227.0
TNF    108.0
dtype: float64


In [3]:
df_train = df[['sentence', 'labels']]
from datasets import Dataset
# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df_train)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import BCEWithLogitsLoss
import torch
from sklearn.metrics import f1_score, accuracy_score
from transformers import TrainingArguments


num_themes = 11
# Load the pretrained MLM model
#"../../JitsuPEER_data_and_models_v1/models/bert-base-uncased_neg"
model = BertForSequenceClassification.from_pretrained("/home/nana/EMNLP2023_jiu_jitsu_argumentation_for_rebuttals/JitsuPEER_data_and_models_v1/models/bert-base-uncased_neg", num_labels=num_themes, problem_type="multi_label_classification")
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize your dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=128)

# Assuming dataset has columns 'text' and 'labels' (labels should be a binary vector per example)
dataset = dataset.map(tokenize_function, batched=True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/nana/EMNLP2023_jiu_jitsu_argumentation_for_rebuttals/JitsuPEER_data_and_models_v1/models/bert-base-uncased_neg and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1811/1811 [00:01<00:00, 1797.45 examples/s]


In [5]:
dataset = dataset.train_test_split(test_size=0.2)  # 80% train, 20% test
train_dataset = dataset['train']
test_dataset = dataset['test']

# Ensure the dataset is in the correct format (PyTorch or TensorFlow format)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [6]:
from transformers import TrainingArguments
from transformers import Trainer

base_output_dir = "./results"
epoch = 10
training_args = TrainingArguments(
    output_dir=f"{base_output_dir}/pretrained/epoch_{epoch}",           # Directory to save results
    evaluation_strategy="epoch",      # Evaluate after each epoch
    learning_rate=5e-5,               # Learning rate
    per_device_train_batch_size=8,    # Batch size per device (GPU or CPU)
    per_device_eval_batch_size=8,     # Batch size for evaluation
    num_train_epochs=epoch,               # Number of epochs
    logging_dir="./logs",             # Logging directory
    save_strategy="epoch"             # Save model after each epoch
)

trainer = Trainer(
    model=model,                      # The model we defined above
    args=training_args,               # Training arguments
    train_dataset=train_dataset,      # Training dataset
    eval_dataset=test_dataset         # Validation dataset
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.153121
2,No log,0.116766
3,0.138600,0.102843
4,0.138600,0.098311
5,0.138600,0.09383
6,0.032100,0.094455
7,0.032100,0.09672
8,0.032100,0.095794
9,0.013000,0.098763
10,0.013000,0.099393


TrainOutput(global_step=1810, training_loss=0.05230014982803092, metrics={'train_runtime': 629.7128, 'train_samples_per_second': 22.995, 'train_steps_per_second': 2.874, 'total_flos': 952538986352640.0, 'train_loss': 0.05230014982803092, 'epoch': 10.0})

In [7]:
# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.09939303994178772, 'eval_runtime': 4.3852, 'eval_samples_per_second': 82.779, 'eval_steps_per_second': 10.49, 'epoch': 10.0}


In [None]:
# Make predictions
predictions = trainer.predict(test_dataset)
logits = predictions.predictions

# Apply sigmoid activation
probs = torch.sigmoid(torch.tensor(logits))

# Convert probabilities to binary predictions
threshold = 0.5
binary_preds = (probs > threshold).int()

print(binary_preds)


tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0]], dtype=torch.int32)


In [None]:
# Evaluate using metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

true_labels = predictions.label_ids

# Calculate metrics
f1 = f1_score(true_labels, binary_preds, average="macro")
precision = precision_score(true_labels, binary_preds, average="macro")
recall = recall_score(true_labels, binary_preds, average="macro")

print(f"F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

In [11]:
from transformers import Trainer

# Assume `trainer` is your trained Trainer object
output_dir = "./results/final_model"  # Specify the directory where you want to save the model

# Save the model, tokenizer, and config
trainer.save_model(output_dir)

# Save the tokenizer separately
tokenizer.save_pretrained(output_dir)

('./results/final_model/tokenizer_config.json',
 './results/final_model/special_tokens_map.json',
 './results/final_model/vocab.txt',
 './results/final_model/added_tokens.json')

In [10]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('/home/nana/DASP_report_template/backend/sentences_author.csv')