In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DataCollatorWithPadding

In [5]:
# Load the sarcasm-headlines dataset. The file is read as JSON Lines
# (one JSON record per line), hence lines=True.
df = pd.read_json(
    "Sarcasm_Headlines_Dataset_v2.json",
    lines=True,
)

In [7]:
# Convert the two pandas Series used downstream into plain Python lists for
# train_test_split: `texts` holds the headlines (the data) and `labels` holds
# the is_sarcastic flags (the class).
texts, labels = (df[column].tolist() for column in ("headline", "is_sarcastic"))

In [9]:
# 80/20 train/test split, stratified on the labels and seeded with 42 so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    train_size=0.8,
    random_state=42,
    stratify=labels,
)

In [10]:
# Print the number of training and test examples, separated by a space.
print(*(len(split) for split in (X_train, X_test)))

22895 5724


In [11]:
# Fetch the pretrained fast tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [12]:
def _tokenize_split(headlines):
    """Tokenize a list of headlines with the settings shared by both splits.

    Sequences longer than 128 tokens are truncated. No padding is applied
    here; the DataLoaders below use a DataCollatorWithPadding as collate_fn.
    """
    return tokenizer(
        headlines,
        truncation=True,
        padding=False,
        max_length=128,
    )


train_encodings = _tokenize_split(X_train)
test_encodings = _tokenize_split(X_test)

In [13]:
# Build one feature dict per example by walking the token ids, attention
# masks and labels of each split in parallel.
train_samples = [
    {"input_ids": token_ids, "attention_mask": mask, "labels": label}
    for token_ids, mask, label in zip(
        train_encodings["input_ids"],
        train_encodings["attention_mask"],
        y_train,
    )
]

test_samples = [
    {"input_ids": token_ids, "attention_mask": mask, "labels": label}
    for token_ids, mask, label in zip(
        test_encodings["input_ids"],
        test_encodings["attention_mask"],
        y_test,
    )
]

In [14]:
# Collate function that uses the tokenizer to pad the samples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training batches of 16 are drawn in shuffled order.
train_loader = DataLoader(
    dataset=train_samples, batch_size=16, shuffle=True, collate_fn=data_collator
)

# Test batches of 16 are drawn without shuffling.
test_loader = DataLoader(
    dataset=test_samples, batch_size=16, shuffle=False, collate_fn=data_collator
)