In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('mLabel_tweets.csv')

# Display the first few rows of the dataset
print(df.head())

# Check the distribution of labels
label_counts = df['labels'].apply(lambda x: x.split()).explode().value_counts()
print(label_counts)

                     ID                                              tweet  \
0  1296010336907038720t  @cath__kath AstraZeneca is made with the kidne...   
1  1336808189677940736t  It begins. Please find safe alternatives to th...   
2  1329488407307956231t  @PaolaQP1231 Well, I mean congratulations Covi...   
3  1364194604459900934t  @BorisJohnson for those of us that do not wish...   
4  1375938799247765515t  She has been trying to speak out: writing lett...   

               labels  
0         ingredients  
1         side-effect  
2         side-effect  
3           mandatory  
4  side-effect rushed  
labels
side-effect    3805
ineffective    1672
rushed         1477
pharma         1273
mandatory       783
unnecessary     722
none            629
political       626
conspiracy      487
ingredients     436
country         201
religious        64
Name: count, dtype: int64


In [2]:
from transformers import BertTokenizer
import torch

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the tweet text
tokens = tokenizer.batch_encode_plus(
    df['tweet'].tolist(),
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Create attention masks
attention_masks = tokens['attention_mask']



In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split the labels by space and convert to list
df['labels'] = df['labels'].apply(lambda x: x.split())

# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['labels'])

# Convert labels to tensor
labels = torch.tensor(labels)

In [4]:
from transformers import BertForSequenceClassification, AdamW

# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=12,  # Number of labels in your dataset
    output_attentions=False,
    output_hidden_states=False
)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.nn import BCEWithLogitsLoss

# Create TensorDataset
dataset = TensorDataset(tokens['input_ids'], attention_masks, labels)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create DataLoader for training and validation sets
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=32
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=32
)

In [6]:
for batch in train_dataloader:
    b_input_ids, b_attention_mask, b_labels = batch
    outputs = model(b_input_ids, attention_mask=b_attention_mask)
    print(f"Logits shape: {outputs.logits.shape}")
    break  # Print shapes for the first batch only

Logits shape: torch.Size([32, 12])


In [7]:
# Training loop with progress monitoring
epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_attention_mask, b_labels = batch
        
        model.zero_grad()
        
        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        
        # Calculate loss
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(outputs.logits, b_labels.float())
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        if step % 10 == 0:
            print(f"Epoch {epoch + 1}/{epochs}, Step {step}/{len(train_dataloader)}, Loss: {loss.item()}")
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs} completed. Average Loss: {avg_train_loss}")

# Evaluation loop
model.eval()
total_eval_loss = 0
for batch in val_dataloader:
    b_input_ids, b_attention_mask, b_labels = batch
    
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        
        # Calculate loss
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(outputs.logits, b_labels.float())
        
        total_eval_loss += loss.item()

avg_val_loss = total_eval_loss / len(val_dataloader)
print(f"Validation Loss: {avg_val_loss}")


Epoch 1/1, Step 0/248, Loss: 0.7129769921302795
Epoch 1/1, Step 10/248, Loss: 0.5116710662841797
Epoch 1/1, Step 20/248, Loss: 0.44817209243774414
Epoch 1/1, Step 30/248, Loss: 0.38861727714538574
Epoch 1/1, Step 40/248, Loss: 0.3531745374202728
Epoch 1/1, Step 50/248, Loss: 0.356967955827713
Epoch 1/1, Step 60/248, Loss: 0.3099426329135895
Epoch 1/1, Step 70/248, Loss: 0.32017675042152405
Epoch 1/1, Step 80/248, Loss: 0.30607494711875916
Epoch 1/1, Step 90/248, Loss: 0.27705344557762146
Epoch 1/1, Step 100/248, Loss: 0.3017367422580719
Epoch 1/1, Step 110/248, Loss: 0.26648417115211487
Epoch 1/1, Step 120/248, Loss: 0.2903866767883301
Epoch 1/1, Step 130/248, Loss: 0.24678438901901245
Epoch 1/1, Step 140/248, Loss: 0.2542618215084076
Epoch 1/1, Step 150/248, Loss: 0.2980482876300812
Epoch 1/1, Step 160/248, Loss: 0.29154321551322937
Epoch 1/1, Step 170/248, Loss: 0.28666046261787415
Epoch 1/1, Step 180/248, Loss: 0.27955004572868347
Epoch 1/1, Step 190/248, Loss: 0.290488600730896
Epo

In [14]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Initialize empty lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Evaluation loop
model.eval()
for batch in val_dataloader:
    b_input_ids, b_attention_mask, b_labels = batch
    
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        
    # Store logits and true labels
    logits = outputs.logits
    all_predictions.append(logits.cpu().numpy())
    all_true_labels.append(b_labels.cpu().numpy())

# Convert lists to numpy arrays
predictions = np.concatenate(all_predictions, axis=0)
true_labels = np.concatenate(all_true_labels, axis=0)

# Convert logits to binary predictions
predictions = (predictions > 0.5).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='micro')

# Print metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.2070528967254408
Precision: 0.8800705467372134
Recall: 0.20392317123007764
F1 Score: 0.33112143331121435
