In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
# Load your data
train = pd.read_csv('/kaggle/input/datasetscsv/train_data.csv')
val = pd.read_csv('/kaggle/input/datasetscsv/val_data.csv')
test = pd.read_csv('/kaggle/input/datasetscsv/test.csv')

# Ensure the columns are named correctly
train_inputs = train['text'].astype(str).tolist()
train_labels = train['labels'].astype(str).tolist()

val_inputs = val['text'].astype(str).tolist()
val_labels = val['labels'].astype(str).tolist()

test_inputs = test['text'].astype(str).tolist()

# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform([label.strip("[]'") for label in train_labels])
val_labels = label_encoder.transform([label.strip("[]'") for label in val_labels])

# Save label mapping for later use
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)

Label Mapping: {'multi': 0, 'passage': 1, 'phrase': 2}


In [3]:
model_name = 'microsoft/deberta-base'
tokenizer = DebertaTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(train_inputs, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_inputs, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_inputs, truncation=True, padding=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

In [4]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

In [5]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [6]:
model = DebertaForSequenceClassification.from_pretrained(model_name, num_labels=len(set(train_labels)))

optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

num_epochs = 10
num_training_steps = num_epochs * len(train_loader)

num_warmup_steps = int(0.1 * num_training_steps)  # 10% of training steps for warmup
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(f'Using device: {device}')


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [7]:
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()

    print(f"Epoch: {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch: 1/20, Loss: 1.0868505239486694
Epoch: 2/20, Loss: 0.6080507636070251
Epoch: 3/20, Loss: 0.4876149892807007
Epoch: 4/20, Loss: 0.8239059448242188
Epoch: 5/20, Loss: 0.028545040637254715
Epoch: 6/20, Loss: 0.5291695594787598
Epoch: 7/20, Loss: 0.0013126988196745515
Epoch: 8/20, Loss: 0.00021257127809803933
Epoch: 9/20, Loss: 0.00022988552518654615
Epoch: 10/20, Loss: 0.000445537269115448


KeyboardInterrupt: 

In [8]:
# Switch to evaluation mode
model.eval()

# Prepare to collect predictions and references
predictions = []
references = []

# Perform evaluation
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        references.extend(batch['labels'].cpu().numpy())

# Decode integer labels back to original label strings
decoded_predictions = label_encoder.inverse_transform(predictions)
decoded_references = label_encoder.inverse_transform(references)

# Calculate evaluation metrics
accuracy = accuracy_score(references, predictions)
f1 = f1_score(references, predictions, average='weighted')
report = classification_report(references, predictions, target_names=label_encoder.classes_)

print(f"Validation Accuracy: {accuracy}")
print(f"Validation F1 Score: {f1}")
print("Classification Report:")
print(report)

Validation Accuracy: 0.7175
Validation F1 Score: 0.7153727398901079
Classification Report:
              precision    recall  f1-score   support

       multi       0.78      0.61      0.68        84
     passage       0.72      0.68      0.70       154
      phrase       0.69      0.81      0.75       162

    accuracy                           0.72       400
   macro avg       0.73      0.70      0.71       400
weighted avg       0.72      0.72      0.72       400



In [9]:
# Load test data
test_inputs = test['text'].astype(str).tolist()

# Tokenize test data
test_encodings = tokenizer(test_inputs, truncation=True, padding=True, max_length=512)

class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

# Create DataLoader for test data
test_dataset = TestDataset(test_encodings)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Switch to evaluation mode
model.eval()

# Prepare to collect predictions
predictions = []

# Perform prediction
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(preds.cpu().numpy())

# Decode integer labels back to original label strings
decoded_predictions = label_encoder.inverse_transform(predictions)

# Prepare the output DataFrame
pred_df = pd.DataFrame({
    'id': test['id'],  # Adjust based on your actual test data format
    'spoilerType': decoded_predictions
})

# Save predictions to CSV
pred_df.to_csv('/kaggle/working/deberta_predictions.csv', index=False)

In [10]:
# Save the trained model and tokenizer
model.save_pretrained("/kaggle/working/deberta_clickbait_model")
tokenizer.save_pretrained("/kaggle/working/deberta_clickbait_tokenizer")

('/kaggle/working/deberta_clickbait_tokenizer/tokenizer_config.json',
 '/kaggle/working/deberta_clickbait_tokenizer/special_tokens_map.json',
 '/kaggle/working/deberta_clickbait_tokenizer/vocab.json',
 '/kaggle/working/deberta_clickbait_tokenizer/merges.txt',
 '/kaggle/working/deberta_clickbait_tokenizer/added_tokens.json')