In [1]:
import pandas as pd
# Read the train and test CSV files into DataFrames.
TRAIN_CSV = 'data/cleaned_train.csv'
TEST_CSV = 'data/cleaned_test.csv'

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

In [2]:
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [4]:
from sklearn.model_selection import train_test_split

# Split the labelled rows into a training part and a validation part:
# 10% is held out (test_size=0.1) and random_state=42 fixes the shuffle.
all_texts = df_train["cleaned_text"].tolist()
all_targets = df_train["target"].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_targets,
    test_size=0.1,
    random_state=42,
)


In [5]:
# Load the fast DistilBERT tokenizer for the "distilbert-base-uncased" checkpoint.
checkpoint_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [6]:
# Every list of texts is tokenized with the same settings: truncation on,
# padding on, and a maximum length of 128.
tokenize_options = dict(truncation=True, padding=True, max_length=128)

# Training and validation parts of the split. Note that the validation
# encodings are stored under the name `test_enc`.
train_enc = tokenizer(train_texts, **tokenize_options)
test_enc = tokenizer(val_texts, **tokenize_options)

# The complete train frame and the test frame.
df_train_enc = tokenizer(df_train["cleaned_text"].tolist(), **tokenize_options)
df_test_enc = tokenizer(df_test["cleaned_text"].tolist(), **tokenize_options)


In [7]:
import torch
class TweetDataset(Dataset):
    """Dataset wrapping tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-sample values. It must contain an "input_ids" entry, which
            determines the dataset length.
        labels: Optional indexable collection of per-sample labels. When it
            is None, the returned samples carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, one per encoding field.

        A "labels" tensor is added only when the dataset was built with labels.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])

        if self.labels is None:
            return sample

        sample["labels"] = torch.tensor(self.labels[idx])
        return sample


In [8]:
# Labelled datasets for the two parts of the train/validation split.
# The validation part is stored under the `test_*` names.
train_dataset = TweetDataset(train_enc, labels=train_labels)
test_dataset = TweetDataset(test_enc, labels=val_labels)

# Unlabelled datasets built from the full train frame and the test frame.
df_train_dataset = TweetDataset(df_train_enc)
df_test_dataset = TweetDataset(df_test_enc)

# All loaders use the same batch size; only the two train loaders shuffle.
loader_batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size)

df_train_loader = DataLoader(df_train_dataset, batch_size=loader_batch_size, shuffle=True)
df_test_loader = DataLoader(df_test_dataset, batch_size=loader_batch_size)

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

# DistilBERT with a two-label sequence-classification head, moved to `device`.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
model = model.to(device)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Optimizer: PyTorch's own AdamW. The AdamW class exported by `transformers`
# is deprecated upstream in favour of torch.optim.AdamW, so the PyTorch
# implementation is used here. weight_decay and eps are set explicitly to the
# defaults of the transformers class (0.0 and 1e-6) because torch.optim.AdamW
# has different defaults (0.01 and 1e-8); this keeps the hyper-parameters
# the same as before.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.0,
    eps=1e-6,
)

# The scheduler is stepped once per training batch, so the total number of
# scheduler steps is (batches per epoch) * (number of epochs).
epochs = 3
total_steps = len(train_loader) * epochs

# Linear learning-rate schedule over `total_steps` with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)




In [11]:
from tqdm import tqdm

for epoch in range(epochs):
    # ---- training pass over train_loader ----
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        # Move every tensor of the batch onto `device`.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Clear old gradients, run the forward pass, and back-propagate the loss.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # One optimizer update followed by one scheduler update per batch.
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Mean of the per-batch training losses for this epoch.
    print(f"Epoch {epoch+1} - Train Loss: {total_loss/len(train_loader)}")

    # ---- evaluation pass over test_loader, without gradient tracking ----
    model.eval()
    val_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in test_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)

            val_loss += outputs.loss.item()
            # Predicted class = index of the largest logit in each row.
            preds = outputs.logits.argmax(dim=1)
            correct += (preds.cpu() == batch["labels"].cpu()).sum().item()
            total += batch["labels"].size(0)

    # Mean per-batch loss and the fraction of correctly predicted samples.
    print(f"Val Loss: {val_loss/len(test_loader)}  |  Val Acc: {correct/total}")


100%|██████████| 429/429 [05:31<00:00,  1.29it/s]


Epoch 1 - Train Loss: 0.4277256889082057
Val Loss: 0.4110750515634815  |  Val Acc: 0.821522309711286


100%|██████████| 429/429 [04:56<00:00,  1.45it/s]


Epoch 2 - Train Loss: 0.31566311707307687
Val Loss: 0.43905776444201666  |  Val Acc: 0.8083989501312336


100%|██████████| 429/429 [05:05<00:00,  1.41it/s]


Epoch 3 - Train Loss: 0.24796595317604658
Val Loss: 0.4563577047859629  |  Val Acc: 0.8241469816272966


In [14]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import numpy as np
# Score the model on the batches served by test_loader and report accuracy,
# F1, precision and recall plus scikit-learn's classification report.
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        # Keep a NumPy copy of the true labels before the batch is moved to `device`.
        labels = batch["labels"].cpu().numpy()

        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit in each row.
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

# Convert to arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Metrics (each called with scikit-learn's default arguments)
acc  = accuracy_score(all_labels, all_preds)
f1   = f1_score(all_labels, all_preds)
prec = precision_score(all_labels, all_preds)
rec  = recall_score(all_labels, all_preds)

# The header starts with U+1F4CA (bar chart). It is written as an escape
# sequence so the source stays pure ASCII and the character cannot be
# corrupted when the file is saved or read with a different encoding.
print("\U0001F4CA Validation Metrics")
print("-----------------------")
print(f"Accuracy : {acc:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print("\nClassification Report:\n")
print(classification_report(all_labels, all_preds))


📊 Validation Metrics
-----------------------
Accuracy : 0.8241
F1 Score : 0.7988
Precision: 0.8061
Recall   : 0.7917

Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.85      0.84       426
           1       0.81      0.79      0.80       336

    accuracy                           0.82       762
   macro avg       0.82      0.82      0.82       762
weighted avg       0.82      0.82      0.82       762



In [12]:
# Predict a class for every row served by df_test_loader. The predictions
# are collected in loader order, without gradient tracking.
model.eval()
all_preds = []

with torch.no_grad():
    for batch in df_test_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())


In [13]:
# Pair each test-row id with its predicted target and write the result to CSV
# without the index column.
submission_columns = {
    "id": df_test["id"],
    "target": all_preds,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv("distilbert_submission.csv", index=False)
