In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Load the raw table
df = pd.read_csv('data.csv')

# Step 2: Turn each 'theme' string into a list by splitting on '_'
df['theme'] = df['theme'].apply(lambda joined_themes: joined_themes.split('_'))

# Step 3: Fit the binarizer on the theme lists and multi-hot encode them
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform(df['theme'])

# Step 4: Wrap the encoded matrix in a DataFrame, one column per theme class
df_encoded = pd.DataFrame(data=encoded_labels, columns=mlb.classes_)

# Step 5: Store each row's full indicator vector as a list in a single column
df['labels'] = df_encoded.to_numpy().tolist()

# Step 6: Append the individual indicator columns next to the original ones
df = pd.concat([df, df_encoded], axis='columns')

# Step 7: Show the resulting column names and persist the processed table
print(df.columns)
df.to_csv('data_encoded.csv', index=False)

Index(['sentence', 'aspect', 'theme', 'description', 'ANA', 'BIB', 'DAT',
       'EXP', 'INT', 'MET', 'OAL', 'PDI', 'RES', 'RWK', 'TNF'],
      dtype='object')


In [3]:
# Frequency of each label: column-wise sum of the 0/1 indicator columns
label_counts = df_encoded.sum(axis="index")

# Show how many rows carry each label
print(label_counts)

ANA     45
BIB     17
DAT    124
EXP    261
INT     13
MET    937
OAL    121
PDI     38
RES    165
RWK    227
TNF    108
dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Keep only the model input text and its multi-hot label vector.
df_train = df[['sentence', 'labels']]

# Split DataFrame into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df_train['sentence'],  # Input column
    df_train['labels'],    # Labels column
    test_size=0.2,   # Proportion of data for the test set
    random_state=42  # For reproducibility
)

# Build the Hugging Face dataset from that same split.
# The training cell indexes the tokenized dataset with "train" and
# "validation", which requires a DatasetDict with those two keys; a single
# Dataset built from the whole frame has no such splits and would also let the
# held-out rows leak into training.
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "sentence": X_train.tolist(),
        "labels": y_train.tolist(),
    }),
    "validation": Dataset.from_dict({
        "sentence": X_test.tolist(),
        "labels": y_test.tolist(),
    }),
})

In [None]:
import torch
from sklearn.metrics import f1_score, accuracy_score
from torch.nn import BCEWithLogitsLoss
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# Number of output labels for the classification head
num_themes = 11

# Load the checkpoint from the local model directory, configured for
# multi-label classification with one output per theme
model = BertForSequenceClassification.from_pretrained(
    "../../JitsuPEER_data_and_models_v1/models/bert-base-uncased_neg",
    num_labels=num_themes,
    problem_type="multi_label_classification",
)

# Tokenizer comes from the stock "bert-base-uncased" vocabulary
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset's input text
def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: Batch mapping of column name to list of values, as passed by
            ``Dataset.map(..., batched=True)``. The input text is read from the
            ``"sentence"`` column, which is the text column of the DataFrame
            the dataset is built from (there is no ``"text"`` column).

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        128 tokens per example.
    """
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=128)

# Tokenize every example in batches (labels should be a binary vector per
# example), then have the dataset return PyTorch tensors
tokenized_datasets = dataset.map(tokenize_function, batched=True).with_format("torch")

# Multi-label classifier: BERT encoder + linear head trained with BCE-with-logits
class CustomBERTModel(torch.nn.Module):
    """BERT encoder followed by a linear multi-label classification head.

    Args:
        base_model: Either a bare encoder whose forward output exposes
            ``pooler_output``, or a task-specific wrapper around one (for
            example ``BertForSequenceClassification``). Wrappers are unwrapped
            to their encoder through the ``base_model`` attribute.
        num_labels: Number of independent binary labels to predict.
    """

    def __init__(self, base_model, num_labels):
        super().__init__()
        # A task wrapper such as BertForSequenceClassification returns its own
        # logits and has no ``pooler_output``, so calling it directly in
        # forward() would fail. Use the underlying encoder instead; a bare
        # encoder has no distinct ``base_model`` and is used as-is.
        self.bert = getattr(base_model, "base_model", base_model)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels: Optional multi-hot targets; cast to float for the loss.

        Returns:
            Dict with ``"logits"`` (raw, pre-sigmoid scores) and ``"loss"``
            (BCE-with-logits loss, or ``None`` when ``labels`` is not given).
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.pooler_output)  # Pooler output for classification
        loss = None
        if labels is not None:
            loss_fn = BCEWithLogitsLoss()
            loss = loss_fn(logits, labels.float())
        return {"loss": loss, "logits": logits}

# Replace the loaded checkpoint with the multi-label wrapper built around it
model = CustomBERTModel(model, num_labels=num_themes)

# Training configuration handed to the Trainer
training_args = TrainingArguments(
    # Output locations
    output_dir="./classification_results",
    logging_dir="./logs",
    save_total_limit=2,
    # Schedule
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Evaluation metrics for multi-label predictions
def compute_metrics(eval_pred):
    """Compute weighted F1 and accuracy from raw logits.

    Args:
        eval_pred: Pair of ``(logits, labels)``.

    Returns:
        Dict with keys ``"f1"`` (weighted-average F1) and ``"accuracy"``.
    """
    raw_scores, gold = eval_pred
    # Sigmoid maps each logit to a probability; a label is predicted when its
    # probability is above 0.5.
    probabilities = torch.sigmoid(torch.tensor(raw_scores))
    predicted = probabilities.numpy() > 0.5
    return {
        "f1": f1_score(gold, predicted, average="weighted"),
        "accuracy": accuracy_score(gold, predicted),
    }

# Assemble the Trainer from the model, its configuration, the metric function
# and the train / validation splits of the tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Run fine-tuning
trainer.train()

# Write the fine-tuned model to disk
trainer.save_model("./theme_classification_model")