In [1]:
!pip install transformers datasets torch torchvision
!pip install 'accelerate>={ACCELERATE_MIN_VERSION}'



## Task 2 ##

In [2]:

from datasets import load_dataset

# Load the "subtask5.english" configuration of the SemEval-2018 Task 1 dataset.
dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")

# Show the final example of the validation split as a sanity check.
last_validation_example = dataset["validation"][-1]
print(last_validation_example)


{'ID': '2018-En-03386', 'Tweet': 'I am really flattered and happy to hear those complements for my blog! You guys motivates me to write more for my blog. Thank you! sml 💞', 'anger': False, 'anticipation': False, 'disgust': False, 'fear': False, 'joy': True, 'love': False, 'optimism': True, 'pessimism': False, 'sadness': False, 'surprise': False, 'trust': False}


## Task 3 ##

In [3]:
from transformers import BertTokenizer
import torch

def tokenize_data(dataset, model_name="bert-base-uncased", max_length=128):
    """Tokenize the "Tweet" column of every split and attach multi-label targets.

    Args:
        dataset: Mapping of split name to dataset (assumed to be a
            `datasets.DatasetDict` whose rows have "ID", "Tweet" and one
            boolean column per label -- TODO confirm for other inputs).
        model_name: Checkpoint passed to `BertTokenizer.from_pretrained`.
        max_length: Length every tweet is padded/truncated to, in tokens.

    Returns:
        The mapped dataset, formatted as PyTorch tensors and restricted to the
        columns "input_ids", "token_type_ids", "attention_mask" and "labels".
    """
    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Resolve the label columns once instead of once per example. Prefer the
    # "train" split (as before) but fall back to the first available split.
    reference_split = dataset["train"] if "train" in dataset else next(iter(dataset.values()))
    label_names = [
        name for name in reference_split.features.keys() if name not in ("ID", "Tweet")
    ]

    def preprocess(example):
        """Tokenize one row and add its labels as a float vector."""
        encoding = tokenizer(
            example["Tweet"],
            padding="max_length",  # Pad to max length
            truncation=True,       # Truncate if too long
            max_length=max_length,
        )
        # Labels are converted to floats, one entry per label column
        encoding["labels"] = torch.tensor(
            [example[name] for name in label_names], dtype=torch.float
        )
        return encoding

    # Apply the tokenization function to the dataset, one example at a time
    encoded_dataset = dataset.map(preprocess, batched=False)

    # Expose only the model inputs and labels, as PyTorch tensors
    encoded_dataset.set_format(
        "torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"]
    )

    return encoded_dataset


# Run the tokenization step over the loaded dataset
encoded_dataset = tokenize_data(dataset)

# Inspect which fields the last validation example exposes after formatting
last_encoded_example = encoded_dataset["validation"][-1]
print(last_encoded_example.keys())


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


## Task 4 ##

In [4]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Label columns are every feature except the identifier and the raw text.
# Deriving everything below from this one list keeps `num_labels` and the
# id/label maps in agreement (previously `num_labels` used a magic `- 2`).
label_names = [key for key in dataset["train"].features.keys() if key not in ["ID", "Tweet"]]

# Define the number of labels
num_labels = len(label_names)

# Create id2label and label2id mappings
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Define the model: a sequence classifier configured for multi-label targets
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

def compute_metrics(pred):
    """Compute micro-averaged multi-label metrics from raw model logits.

    Args:
        pred: Object exposing `predictions` (logits, or a tuple whose first
            element is the logits) and `label_ids` (0/1 targets of the same
            shape as the logits).

    Returns:
        dict with "accuracy" (exact-match over all labels of a sample), "f1",
        "precision" and "recall" (all micro-averaged).
    """
    labels = np.asarray(pred.label_ids).astype(int)

    logits = pred.predictions
    # Some models return extra outputs alongside the logits; keep the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)

    # The predictions are logits, not probabilities. A probability threshold
    # of 0.5 corresponds to sigmoid(logit) > 0.5, i.e. logit > 0. Comparing
    # logits against 0.5 (as before) silently raised the cut-off to ~0.62.
    preds = (logits > 0.0).astype(int)

    # Calculate metrics; zero_division=0 keeps the previous numeric result for
    # empty classes while suppressing the undefined-metric warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="micro", zero_division=0
    )
    accuracy = accuracy_score(labels, preds)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }


# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch; log every 10 steps.
    # NOTE(review): newer transformers releases may rename
    # `evaluation_strategy` to `eval_strategy` -- confirm against the
    # installed version before changing it.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # Optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Wire the model, the tokenized splits and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
)

# Train the model
trainer.train()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2565 [00:00<?, ?it/s]

{'loss': 0.6561, 'grad_norm': 1.9666318893432617, 'learning_rate': 1.9922027290448344e-05, 'epoch': 0.01}
{'loss': 0.5696, 'grad_norm': 1.4267334938049316, 'learning_rate': 1.984405458089669e-05, 'epoch': 0.02}
{'loss': 0.5282, 'grad_norm': 1.4609507322311401, 'learning_rate': 1.976608187134503e-05, 'epoch': 0.04}


KeyboardInterrupt: 

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Label columns, selected by name so the cell neither depends on the column
# order of the dataset nor on `id2label` from an earlier cell.
label_names = [name for name in dataset["train"].features.keys() if name not in ("ID", "Tweet")]

# Extract tweets and the label matrix (one row per tweet, one column per label)
tweets = list(dataset["train"]["Tweet"])
labels = np.array([[example[name] for name in label_names] for example in dataset["train"]])

# Split into train and validation sets (fixed seed for reproducibility)
tweets_train, tweets_val, labels_train, labels_val = train_test_split(
    tweets, labels, test_size=0.2, random_state=42
)

# TF-IDF vectorizer: fit on the training tweets only, then reuse for validation
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(tweets_train)
X_val = vectorizer.transform(tweets_val)

# Logistic Regression with OneVsRest for multi-label classification
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(X_train, labels_train)

# Evaluate. zero_division=0 reports 0.0 for labels that are never predicted
# (the same value as before) without emitting undefined-metric warnings.
predictions = clf.predict(X_val)
print(classification_report(labels_val, predictions, target_names=label_names, zero_division=0))


              precision    recall  f1-score   support

       anger       0.83      0.52      0.64       507
anticipation       0.50      0.01      0.02       200
     disgust       0.78      0.48      0.59       516
        fear       0.93      0.27      0.41       283
         joy       0.87      0.55      0.68       507
        love       0.83      0.11      0.19       136
    optimism       0.76      0.38      0.50       400
   pessimism       0.50      0.01      0.02       166
     sadness       0.82      0.27      0.41       424
    surprise       1.00      0.03      0.05        76
       trust       0.00      0.00      0.00        71

   micro avg       0.82      0.35      0.49      3286
   macro avg       0.71      0.24      0.32      3286
weighted avg       0.77      0.35      0.46      3286
 samples avg       0.55      0.37      0.42      3286



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
