In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv('data.csv')
df['theme'] = df['theme'].apply(lambda x: x.split('_'))

# Step 3: Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Step 4: One-hot encode the themes
encoded_labels = mlb.fit_transform(df['theme'])

# Step 5: Create a DataFrame for the encoded labels
df_encoded = pd.DataFrame(encoded_labels, columns=mlb.classes_)
df_encoded = df_encoded.astype(float)

# Step 6: Convert the one-hot encoded columns back to a list of labels
df['labels'] = df_encoded.values.tolist()

# Step 6: Combine the original dataframe with the encoded labels
df = pd.concat([df, df_encoded], axis=1)

# Step 7: Print the processed DataFrame
print(df.columns)

df.to_csv('data_encoded.csv', index=False)

Index(['sentence', 'aspect', 'theme', 'description', 'labels', 'ANA', 'BIB',
       'DAT', 'EXP', 'INT', 'MET', 'OAL', 'PDI', 'RES', 'RWK', 'TNF'],
      dtype='object')


In [3]:
# Calculate the frequency of each label
label_counts = df_encoded.sum(axis=0)

# Print the counts for each label
print(label_counts)

ANA     45.0
BIB     17.0
DAT    124.0
EXP    261.0
INT     13.0
MET    937.0
OAL    121.0
PDI     38.0
RES    165.0
RWK    227.0
TNF    108.0
dtype: float64


In [4]:
df_train = df[['sentence', 'labels']]
from datasets import Dataset
# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df_train)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import BCEWithLogitsLoss
import torch
from sklearn.metrics import f1_score, accuracy_score
from transformers import TrainingArguments


num_themes = 11
# Load the pretrained MLM model
#"../../JitsuPEER_data_and_models_v1/models/bert-base-uncased_neg"
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_themes, problem_type="multi_label_classification")
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize your dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=128)

# Assuming dataset has columns 'text' and 'labels' (labels should be a binary vector per example)
dataset = dataset.map(tokenize_function, batched=True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1811/1811 [00:01<00:00, 1403.86 examples/s]


In [6]:
dataset = dataset.train_test_split(test_size=0.2)  # 80% train, 20% test
train_dataset = dataset['train']
test_dataset = dataset['test']

# Ensure the dataset is in the correct format (PyTorch or TensorFlow format)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [7]:
from transformers import TrainingArguments
from transformers import Trainer

training_args = TrainingArguments(
    output_dir="./results",           # Directory to save results
    evaluation_strategy="epoch",      # Evaluate after each epoch
    learning_rate=5e-5,               # Learning rate
    per_device_train_batch_size=8,    # Batch size per device (GPU or CPU)
    per_device_eval_batch_size=8,     # Batch size for evaluation
    num_train_epochs=3,               # Number of epochs
    logging_dir="./logs",             # Logging directory
    save_strategy="epoch"             # Save model after each epoch
)

trainer = Trainer(
    model=model,                      # The model we defined above
    args=training_args,               # Training arguments
    train_dataset=train_dataset,      # Training dataset
    eval_dataset=test_dataset         # Validation dataset
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the model
results = trainer.evaluate()
print(results)

NameError: name 'trainer' is not defined

In [None]:
# Make predictions
predictions = trainer.predict(test_dataset)
logits = predictions.predictions

# Apply sigmoid activation
probs = torch.sigmoid(torch.tensor(logits))

# Convert probabilities to binary predictions
threshold = 0.5
binary_preds = (probs > threshold).int()

# Evaluate using metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

true_labels = predictions.label_ids

# Calculate metrics
f1 = f1_score(true_labels, binary_preds, average="macro")
precision = precision_score(true_labels, binary_preds, average="macro")
recall = recall_score(true_labels, binary_preds, average="macro")

print(f"F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")


F1 Score: 0.6031, Precision: 0.6310, Recall: 0.5974


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
