In [39]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [33]:
# Load your data
train = pd.read_csv('/kaggle/input/datasetscsv/train_data.csv')
val = pd.read_csv('/kaggle/input/datasetscsv/val_data.csv')
test = pd.read_csv('/kaggle/input/datasetscsv/test.csv')

# Ensure the columns are named correctly
train_inputs = train['text'].astype(str).tolist()
train_labels = train['labels'].astype(str).tolist()

val_inputs = val['text'].astype(str).tolist()
val_labels = val['labels'].astype(str).tolist()

test_inputs = test['text'].astype(str).tolist()

# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform([label.strip("[]'") for label in train_labels])
val_labels = label_encoder.transform([label.strip("[]'") for label in val_labels])

# Save label mapping for later use
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)

Label Mapping: {'multi': 0, 'passage': 1, 'phrase': 2}


In [34]:
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(train_inputs, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_inputs, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_inputs, truncation=True, padding=True, max_length=512)


In [35]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

In [36]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [40]:
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(set(train_labels)), hidden_dropout_prob=0.2)

optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

num_epochs = 20
num_training_steps = num_epochs * len(train_loader)

num_warmup_steps = int(0.1 * num_training_steps)  # 10% of training steps for warmup
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [41]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [42]:
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()

    print(f"Epoch: {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch: 1/20, Loss: 0.7959491014480591
Epoch: 2/20, Loss: 1.1583460569381714
Epoch: 3/20, Loss: 1.1604667901992798
Epoch: 4/20, Loss: 0.7535871863365173
Epoch: 5/20, Loss: 0.29121577739715576
Epoch: 6/20, Loss: 0.4476976990699768
Epoch: 7/20, Loss: 0.34571388363838196
Epoch: 8/20, Loss: 0.7011649012565613
Epoch: 9/20, Loss: 0.5663749575614929
Epoch: 10/20, Loss: 0.025963740423321724
Epoch: 11/20, Loss: 0.009006178937852383
Epoch: 12/20, Loss: 0.0018188500544056296
Epoch: 13/20, Loss: 0.7061797976493835
Epoch: 14/20, Loss: 0.0005079236580058932
Epoch: 15/20, Loss: 1.0225307941436768
Epoch: 16/20, Loss: 0.8170844316482544
Epoch: 17/20, Loss: 0.00045374358887784183
Epoch: 18/20, Loss: 0.0010210965992882848
Epoch: 19/20, Loss: 0.0001452578726457432
Epoch: 20/20, Loss: 0.00026584244915284216


In [43]:
# Switch to evaluation mode
model.eval()

# Prepare to collect predictions and references
predictions = []
references = []

# Perform evaluation
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        references.extend(batch['labels'].cpu().numpy())

# Decode integer labels back to original label strings
decoded_predictions = label_encoder.inverse_transform(predictions)
decoded_references = label_encoder.inverse_transform(references)

# Calculate evaluation metrics
accuracy = accuracy_score(references, predictions)
f1 = f1_score(references, predictions, average='weighted')
report = classification_report(references, predictions, target_names=label_encoder.classes_)

print(f"Validation Accuracy: {accuracy}")
print(f"Validation F1 Score: {f1}")
print("Classification Report:")
print(report)

Validation Accuracy: 0.725
Validation F1 Score: 0.7241485371342836
Classification Report:
              precision    recall  f1-score   support

       multi       0.77      0.65      0.71        84
     passage       0.73      0.69      0.71       154
      phrase       0.70      0.79      0.74       162

    accuracy                           0.73       400
   macro avg       0.74      0.71      0.72       400
weighted avg       0.73      0.72      0.72       400



In [45]:
# Save the trained model and tokenizer
model.save_pretrained("/kaggle/working/roberta_clickbait_model")
tokenizer.save_pretrained("/kaggle/working/roberta_clickbait_tokenizer")

('/kaggle/working/roberta_clickbait_tokenizer/tokenizer_config.json',
 '/kaggle/working/roberta_clickbait_tokenizer/special_tokens_map.json',
 '/kaggle/working/roberta_clickbait_tokenizer/vocab.json',
 '/kaggle/working/roberta_clickbait_tokenizer/merges.txt',
 '/kaggle/working/roberta_clickbait_tokenizer/added_tokens.json')

In [44]:
# Load test data
test_inputs = test['text'].astype(str).tolist()

# Tokenize test data
test_encodings = tokenizer(test_inputs, truncation=True, padding=True, max_length=512)

class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

# Create DataLoader for test data
test_dataset = TestDataset(test_encodings)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Switch to evaluation mode
model.eval()

# Prepare to collect predictions
predictions = []

# Perform prediction
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(preds.cpu().numpy())

# Decode integer labels back to original label strings
decoded_predictions = label_encoder.inverse_transform(predictions)

# Prepare the output DataFrame
pred_df = pd.DataFrame({
    'id': test['id'],  # Adjust based on your actual test data format
    'spoilerType': decoded_predictions
})

# Save predictions to CSV
pred_df.to_csv('/kaggle/working/roberta_predictions.csv', index=False)