In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch import cuda
import torch
from transformers import DistilBertConfig

# Load and preprocess the data
data = pd.read_csv('edos_labelled_aggregated.csv')
labels = data['label_sexist'].values
categories = data['label_category'].values
vectors = data['label_vector'].values
texts = data['text'].values

# Split the data into train and test sets
train_texts, test_texts, train_labels, test_labels, train_categories, test_categories, train_vectors, test_vectors = train_test_split(
    texts, labels, categories, vectors, test_size=0.2, random_state=42
)

# Define the label mapping
label_mapping = {
    'sexist': 1,
    'not sexist': 0
}

category_mapping = {
    'none': 0,
    '1. threats, plans to harm and incitement': 1,
    '2. derogation': 2,
    '3. animosity': 3,
    '4. prejudiced discussions': 4

    # Add more categories as necessary
}

vector_mapping = {
    'none': 0,
    '1.1 threats of harm': 1,
    '1.2 incitement and encouragement of harm': 2,
    '2.1 descriptive attacks': 3,
    '2.2 aggressive and emotive attacks': 4,
    '2.3 dehumanising attacks & overt sexual objectification': 5,
    '3.1 casual use of gendered slurs, profanities, and insults': 6,
    '3.2 immutable gender differences and gender stereotypes': 7,
    '3.3 backhanded gendered compliments': 8,
    '3.4 condescending explanations or unwelcome advice': 9,
    '4.1 supporting mistreatment of individual women': 10,
    '4.2 supporting systemic discrimination against women as a group': 11
    

    # Add more categories as necessary
}



In [2]:

# Define the custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, categories, vectors, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.categories = categories
        self.vectors = vectors
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        category = self.categories[idx]
        vector = self.vectors[idx]
    
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt'
        )
    
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label_mapping[label]),  # Encode label as numerical value
            'category': torch.tensor(category_mapping[category]),  # Encode category as numerical value
            'vector': torch.tensor(vector_mapping[vector])
        }




In [3]:
# Set device
device = 'cuda' if cuda.is_available() else 'cpu'

# Set hyperparameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-5

from transformers import DistilBertForSequenceClassification, DistilBertModel
import torch.nn as nn

class CustomDistilBertForSequenceClassification(DistilBertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.distilbert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_size + 1 + 1 + 1 , config.num_labels)  # Include 1 additional unit for each extra feature
    
    def forward(self, input_ids=None, attention_mask=None, category=None, labels=None, vector=None, **kwargs):
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        hidden_state = distilbert_output.last_hidden_state[:, 0, :]  # Extract the [CLS] token embedding
        hidden_state = self.dropout(hidden_state)
    
        # Reshape the category tensor to match the dimensions of the hidden_state tensor
        if category is not None:
            category = category.unsqueeze(1)  # Add an extra dimension
        
        # Reshape the labels tensor to match the dimensions of the hidden_state tensor
        if labels is not None:
            labels = labels.unsqueeze(1)  # Add an extra dimension
    
        # Reshape the vector tensor to match the dimensions of the hidden_state tensor
        if vector is not None:
            vector = vector.unsqueeze(1)  # Add an extra dimension
    
        # Concatenate the hidden state with the extra features
        if category is not None:
            hidden_state = torch.cat((hidden_state, category), dim=1)
        if labels is not None:
            hidden_state = torch.cat((hidden_state, labels), dim=1)
        if vector is not None:
            hidden_state = torch.cat((hidden_state, vector), dim=1)
    
        logits = self.classifier(hidden_state)
        outputs = (logits,) + distilbert_output[1:]  # Add hidden states and attention if they are present
    
        return outputs







In [5]:
# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=2)
model = CustomDistilBertForSequenceClassification(config)

# Move model to device
model = model.to(device)



# Create data loaders
train_dataset = TextDataset(train_texts, train_labels, train_categories, train_vectors, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = TextDataset(test_texts, test_labels, test_categories, test_vectors, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Set optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        category = batch['category'].to(device)
        vector = batch['vector'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            category=category,
            vector=vector
        )
        
        logits = outputs[0]
        loss = criterion(logits, labels)  # Calculate the loss
        
               
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{EPOCHS}, Average Loss: {avg_loss:.4f}')
    
    # Evaluation on the test set
    model.eval()
    test_loss = 0
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            category = batch['category'].to(device)
            vector = batch['vector'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                category=category,
                vector=vector
            )
            
            logits = outputs[0]
            loss = criterion(logits, labels)
            
            optimizer.step()
            test_loss += loss.item()
            
            logits = outputs[0]
            softmax_probs = torch.softmax(logits, dim=1)
            predicted_labels = torch.argmax(softmax_probs, dim=1)
            
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    avg_test_loss = test_loss / len(test_loader)
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    
    print(f'Test Loss: {avg_test_loss}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')



Epoch 1/5, Average Loss: 0.4972
Test Loss: 0.40456696376204493, Accuracy: 0.82725, Precision: 0.7796610169491526, Recall: 0.38534031413612563, F1: 0.5157673440784862
Epoch 2/5, Average Loss: 0.3447
Test Loss: 0.36600663208961487, Accuracy: 0.841, Precision: 0.6952264381884945, Recall: 0.5947643979057592, F1: 0.6410835214446953
Epoch 3/5, Average Loss: 0.2623
Test Loss: 0.35711890085041526, Accuracy: 0.84675, Precision: 0.7544642857142857, Recall: 0.5308900523560209, F1: 0.6232329440688383
Epoch 4/5, Average Loss: 0.1996
Test Loss: 0.39021957623958586, Accuracy: 0.85, Precision: 0.7278562259306803, Recall: 0.5937172774869109, F1: 0.6539792387543253
Epoch 5/5, Average Loss: 0.1511
Test Loss: 0.3791333614215255, Accuracy: 0.84125, Precision: 0.6886792452830188, Recall: 0.6115183246073298, F1: 0.6478092068774266
