In [1]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Input files expected on the mounted Drive:
#   training data: "/content/drive/MyDrive/train.csv"
#   test data:     "/content/drive/MyDrive/test.csv"

In [3]:
pip install transformers



In [4]:
pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [5]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaModel, Trainer, TrainingArguments
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split




# Read the labelled training tweets from the mounted Drive
train_data = pd.read_csv("/content/drive/MyDrive/train.csv")

# One label column per emotion; this order also fixes the order of the model outputs
emotion_columns = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
    'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
]

# Pretrained RoBERTa tokenizer shared by every split
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
train_df, val_df = train_test_split(train_data, test_size=0.2, random_state=42)

# Wrap each pandas split in a Hugging Face Dataset
train_dataset, val_dataset = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

print("Train dataset size:", len(train_dataset))
print("Val dataset size:", len(val_dataset))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Train dataset size: 6179
Val dataset size: 1545




In [6]:
def calculate_class_weights(label_counts, total_samples):
    """Compute three alternative per-label weightings from label frequencies.

    Args:
        label_counts: Array-like holding the number of positive samples per label.
        total_samples: Total number of samples the counts were taken from.

    Returns:
        dict mapping a scheme name to a numpy array with one weight per label:
            'inverse': total_samples / count
            'log':     1 + ln(total_samples / count)
            'sqrt':    sqrt(total_samples / count)

    A label whose count is zero (or negative) is treated as if it had a single
    positive sample, so every weight stays finite instead of becoming inf/nan.
    """
    counts = np.asarray(label_counts)
    # Only non-positive counts are replaced; valid counts pass through untouched.
    safe_counts = np.where(counts > 0, counts, 1)

    # Every scheme is a transform of the same inverse-frequency ratio.
    ratio = total_samples / safe_counts

    return {
        'inverse': ratio,
        'log': 1 + np.log(ratio),
        'sqrt': np.sqrt(ratio),
    }

In [7]:
def tokenize_function(examples):
    """Tokenize the 'Tweet' field of `examples` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens; the
    tokenizer's output is returned as-is.
    """
    tweets = examples['Tweet']
    encoded = tokenizer(tweets, truncation=True, padding='max_length', max_length=128)
    return encoded

def prepare_labels(examples):
    """Add a 'labels' field holding one float vector per row of a batch.

    Each vector has one entry per name in `emotion_columns`, in that order.
    An entry is 0.0 when the raw value is 'NONE', '0' or 0, and 1.0 for any
    other value. The batch is modified in place and returned.
    """
    negatives = ['NONE', '0', 0]
    n_rows = len(examples[emotion_columns[0]])
    examples['labels'] = [
        [float(examples[col][row] not in negatives) for col in emotion_columns]
        for row in range(n_rows)
    ]
    return examples

# Tokenize both splits
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Attach the per-row 'labels' vectors to both splits
train_dataset = train_dataset.map(prepare_labels, batched=True)
val_dataset = val_dataset.map(prepare_labels, batched=True)

# Drop everything except the fields the model consumes; the list is derived from
# the train split and applied to both splits
columns_to_remove = [
    name for name in train_dataset.column_names
    if name not in ('input_ids', 'attention_mask', 'labels')
]
train_dataset = train_dataset.remove_columns(columns_to_remove)
val_dataset = val_dataset.remove_columns(columns_to_remove)



Map:   0%|          | 0/6179 [00:00<?, ? examples/s]

Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

Map:   0%|          | 0/6179 [00:00<?, ? examples/s]

Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

In [8]:
# Expose the model inputs and the labels as torch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Positive-label counts per emotion (column order follows emotion_columns)
print("\nLabel distribution in train dataset:")
train_labels = np.array([sample['labels'] for sample in train_dataset])
label_counts = np.sum(train_labels, axis=0)
print(label_counts)

print("\nLabel distribution in val dataset:")
val_labels = np.array([sample['labels'] for sample in val_dataset])
print(val_labels.sum(axis=0))

total_samples = len(train_dataset)

# NOTE: a `class_weights` tensor (total_samples / (num_labels * label_counts)) used to be
# built here. It was never consumed: the next cell rebinds `class_weights` to the dict
# returned by calculate_class_weights() before the model is created. The dead assignment
# was dropped so that one name no longer holds two different types.




Label distribution in train dataset:
[2287.  880. 2325. 1058. 2318.  681. 1858.  708. 1813.  326.  308.]

Label distribution in val dataset:
[572. 222. 596. 305. 559. 151. 433. 187. 460.  70.  92.]


In [9]:
# Dataset size and per-label positive counts feed the weighting schemes
total_samples = len(train_dataset)
label_counts = train_labels.sum(axis=0)

# All candidate weightings, keyed by scheme name
class_weights = calculate_class_weights(label_counts, total_samples)

# Active scheme: one of 'inverse', 'log' or 'sqrt'
chosen_weights = 'inverse'

# float32 tensor that is handed to the model as the loss pos_weight
class_weights_tensor = torch.tensor(
    class_weights[chosen_weights],
    dtype=torch.float,
)

class MultiLabelClassification(nn.Module):
    """RoBERTa encoder followed by dropout and a linear multi-label head.

    Args:
        num_labels: Number of independent binary labels (size of the output layer).
        class_weights: Optional 1-D tensor with one entry per label, passed to
            ``nn.BCEWithLogitsLoss`` as ``pos_weight``. ``None`` gives the
            unweighted loss.

    ``forward`` returns a dict with the keys 'loss' (``None`` when no labels are
    supplied) and 'logits' (raw, pre-sigmoid scores).
    """

    def __init__(self, num_labels, class_weights=None):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)
        self.loss_fct = nn.BCEWithLogitsLoss(pos_weight=class_weights)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE: the load warning recorded for 'roberta-base' reports the pooler weights as
        # newly initialized, so this pooled representation is learned during fine-tuning.
        pooled_output = self.dropout(outputs.pooler_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss needs targets in the logits' dtype; casting here keeps
            # integer or boolean label tensors from raising. Float labels are unaffected.
            loss = self.loss_fct(logits, labels.to(logits.dtype))

        return {'loss': loss, 'logits': logits}

# Build the classifier: one output per emotion column, with the selected weights used as the loss pos_weight
model = MultiLabelClassification(num_labels=len(emotion_columns), class_weights=class_weights_tensor)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:

# Fine-tuning configuration
training_args = TrainingArguments(
    # Output locations
    output_dir='./results_roberta',
    logging_dir='./logs_roberta',
    # Optimisation schedule
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Logging, evaluation and checkpointing cadence
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Model selection is driven by the 'macro_f1' value returned by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    # Do not let the Trainer drop dataset columns automatically
    remove_unused_columns=False,
)




In [11]:
def compute_metrics(eval_pred):
    """Report per-label F1 at several thresholds and return macro/micro F1 at 0.5.

    Args:
        eval_pred: Pair ``(logits, labels)``; the logits are passed through a
            sigmoid before thresholding.

    Returns:
        dict with 'macro_f1' (mean of the per-label F1 scores) and 'micro_f1',
        both computed on predictions binarized with a 0.5 threshold.

    Diagnostics (shapes, the first five rows, and per-label F1 at thresholds
    0.1, 0.3 and 0.5) are printed on every call. All F1 computations use
    ``zero_division=1``.
    """
    logits, labels = eval_pred
    predictions = torch.sigmoid(torch.Tensor(logits)).numpy()

    print("Predictions shape:", predictions.shape)
    print("Labels shape:", labels.shape)
    print("Sample predictions (first 5):")
    print(predictions[:5])
    print("Sample labels (first 5):")
    print(labels[:5])

    def binarize(cutoff):
        # 1.0 where the probability is strictly above the cutoff, else 0.0
        return (predictions > cutoff).astype(float)

    # Threshold sweep: printed for inspection only, it does not affect the returned metrics
    for threshold in [0.1, 0.3, 0.5]:
        per_label = f1_score(labels, binarize(threshold), average=None, zero_division=1)

        print(f"\nThreshold: {threshold}")
        for index, value in enumerate(per_label):
            print(f"F1 score for label {index}: {value}")
        print(f"Macro F1: {np.mean(per_label)}")

    # Reported metrics use a fixed 0.5 threshold
    hard_predictions = binarize(0.5)
    macro = np.mean(f1_score(labels, hard_predictions, average=None, zero_division=1))
    micro = f1_score(labels, hard_predictions, average='micro', zero_division=1)

    print(f"\nFinal Macro F1: {macro}")
    print(f"Final Micro F1: {micro}")

    return {'macro_f1': macro, 'micro_f1': micro}

# Wire the model, both data splits and the metric callback into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [12]:
# Fine-tune with the Trainer configured above; validation metrics are computed after every epoch
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Macro F1,Micro F1
1,1.0107,0.887232,0.50956,0.563077
2,0.7891,0.809111,0.551226,0.614503
3,0.7148,0.80646,0.564425,0.621763
4,0.611,0.825361,0.565418,0.63112
5,0.5397,0.857443,0.575707,0.64548
6,0.4636,0.913209,0.584649,0.657975
7,0.419,0.953708,0.584241,0.657425
8,0.3715,1.014041,0.58913,0.67036
9,0.3425,1.067394,0.589554,0.672239
10,0.3124,1.076891,0.589026,0.673869


Predictions shape: (1545, 11)
Labels shape: (1545, 11)
Sample predictions (first 5):
[[0.13823201 0.51181746 0.17050675 0.17572597 0.90371376 0.8408064
  0.8810077  0.21691175 0.28452256 0.2542052  0.6915567 ]
 [0.13099065 0.48816195 0.16144624 0.1724936  0.9054698  0.86899287
  0.86192715 0.24739105 0.31583932 0.26465312 0.6098603 ]
 [0.21452086 0.6672795  0.23634411 0.19225462 0.87974733 0.8450842
  0.7823198  0.16285586 0.1747136  0.57186264 0.49565852]
 [0.74703246 0.22700305 0.7744445  0.5313264  0.18068081 0.13583206
  0.17567475 0.80284864 0.8623761  0.1691332  0.09818614]
 [0.6906849  0.51161844 0.6305108  0.69673425 0.20935494 0.1060503
  0.2552133  0.555945   0.7692489  0.58018035 0.14811178]]
Sample labels (first 5):
[[0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]

Threshold: 0.1
F1 score for label 0: 0.5403873405762872
F1 score for label 1: 0.

TrainOutput(global_step=3870, training_loss=0.570553672652528, metrics={'train_runtime': 1721.2384, 'train_samples_per_second': 35.899, 'train_steps_per_second': 2.248, 'total_flos': 0.0, 'train_loss': 0.570553672652528, 'epoch': 10.0})

In [13]:
# Score the validation split with the model currently held by the trainer
val_results = trainer.evaluate()
print("Validation results:", val_results)


Predictions shape: (1545, 11)
Labels shape: (1545, 11)
Sample predictions (first 5):
[[0.05471525 0.10931339 0.04333122 0.04672876 0.9535104  0.07107715
  0.9873541  0.05290819 0.18395649 0.00761496 0.08790798]
 [0.01887967 0.03622173 0.03168551 0.06053397 0.97913396 0.5789728
  0.9388626  0.03796799 0.11174219 0.01126722 0.03770398]
 [0.01570203 0.9139894  0.01290343 0.0416326  0.99375796 0.8156597
  0.9469499  0.01197701 0.01324349 0.512663   0.6302822 ]
 [0.97005737 0.01803998 0.9783569  0.24590378 0.00685643 0.02700451
  0.00939548 0.904414   0.98326266 0.03035252 0.02359092]
 [0.9709712  0.06012898 0.9462226  0.02756586 0.04665333 0.0112108
  0.02057246 0.04246216 0.7118592  0.01373169 0.00896011]]
Sample labels (first 5):
[[0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]

Threshold: 0.1
F1 score for label 0: 0.7118863049095607
F1 score for label 1: 0.

In [14]:
from torch.utils.data import DataLoader

# Build the unlabeled test set with the same tokenization that was used for training
test_data = pd.read_csv("/content/drive/MyDrive/test.csv")
test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)
test_dataset = test_dataset.remove_columns(
    [name for name in test_dataset.column_names if name not in ('input_ids', 'attention_mask')]
)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Fixed order (no shuffling) so predictions line up with the rows of test_data
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device)
model.eval()  # disables dropout for inference

# Collect sigmoid probabilities batch by batch
all_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        inputs = {key: tensor.to(device) for key, tensor in batch.items()}
        logits = model(**inputs)['logits']
        predictions = torch.sigmoid(logits).cpu().numpy()
        all_predictions.append(predictions)

all_predictions = np.concatenate(all_predictions, axis=0)

# A label is predicted when its probability is strictly above 0.5
binary_predictions = (all_predictions > 0.5).astype(int)

# One row per test example: ID first, then the emotion columns
submission = pd.DataFrame(binary_predictions, columns=emotion_columns)
submission.insert(0, 'ID', test_data['ID'])

submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

Using device: cuda
Submission file created: submission.csv
