# Encode Data with the DistilBERT Tokenizer

In [10]:
import pathlib
import pickle
import sys
import time

import numpy as np
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import transformers as hft
import torch
import torch.utils.data as tudata

# Prepend /home/jupyter to the module search path, presumably so that the
# project-local `util` package imported on the next line resolves from there.
# NOTE(review): this is an absolute, machine-specific path — confirm or adjust
# before running the notebook on another host.
sys.path.insert(0, '/home/jupyter')
import util.data

In [5]:
# Directory handed to util.data.load / util.data.get_size (relative to the
# notebook's working directory).
DATA_PATH = r"../../../data/FakeNewsNet/filtered_dataset/"
# Pretrained checkpoint name passed to the tokenizer's from_pretrained.
MODEL_NAME = "distilbert-base-uncased"
# Suffix interpolated into the train/val encoding pickle file names.
ENCODINGS_FILE_LABEL = "23Nov_gcpf_distilbert"

In [6]:
# Load the fast DistilBERT tokenizer for the checkpoint named by MODEL_NAME.
# `return_dict` is a model/config option, not a tokenizer argument, so it is
# intentionally not passed here; tokenization output is unaffected.
tokenizer = hft.DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

In [7]:
# Read texts and their labels for both news sources through the project
# loader; the loader reports per-source progress while it runs.
dataset_names = ["gossipcop", "politifact"]
texts, labels = util.data.load(path=DATA_PATH, datasets=dataset_names)

gossipcop_train:real: 10652it [00:00, 18221.37it/s]
gossipcop_train:fake: 3459it [00:00, 18150.66it/s]
politifact_train:real: 283it [00:00, 12652.43it/s]
politifact_train:fake: 255it [00:00, 15785.28it/s]


In [8]:
# Show the example counts that util.data.get_size reports for DATA_PATH
# (one entry per source / split / class, as seen in the cell output).
util.data.get_size(path=DATA_PATH)

gossipcop_test_real: 2661
gossipcop_test_fake: 875
politifact_train_real: 283
politifact_train_fake: 255
gossipcop_train_real: 10652
gossipcop_train_fake: 3459
politifact_test_real: 69
politifact_test_fake: 57


In [11]:
# Hold out 25% of the loaded examples as a validation set.
# A fixed random_state makes the shuffle deterministic, so a fresh
# "Restart & Run All" yields the same split (and therefore the same pickled
# encodings) every time instead of a different random partition per run.
SPLIT_SEED = 42
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.25, random_state=SPLIT_SEED)
# NOTE(review): the real/fake counts printed above are imbalanced; consider
# stratify=labels if `labels` is a flat sequence of class ids — TODO confirm.

In [12]:
# Tokenize the training texts: truncate over-long inputs, pad the batch to a
# common length, and return PyTorch tensors.
train_tokenizer_options = dict(truncation=True, padding=True, return_tensors="pt")
train_encodings = tokenizer(train_texts, **train_tokenizer_options)

In [13]:
# Tokenize the validation texts with the same options used for training:
# truncation on, batch padding on, PyTorch tensors out.
val_tokenizer_options = dict(truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, **val_tokenizer_options)

In [14]:
# Serialize each split as an (encodings, labels) tuple into a pickle file whose
# name embeds ENCODINGS_FILE_LABEL. The names are relative, so the files land
# in the current working directory.
train_pickle_name = f"train_encodings_{ENCODINGS_FILE_LABEL}.pickle"
with open(train_pickle_name, "wb") as train_out:
    pickle.dump((train_encodings, train_labels), train_out)

val_pickle_name = f"val_encodings_{ENCODINGS_FILE_LABEL}.pickle"
with open(val_pickle_name, "wb") as val_out:
    pickle.dump((val_encodings, val_labels), val_out)

In [15]:
# Read both pickle files back and rebind the encodings / labels names to the
# deserialized tuples.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by a trusted run of this notebook.
train_pickle_name = f"train_encodings_{ENCODINGS_FILE_LABEL}.pickle"
with open(train_pickle_name, "rb") as train_in:
    train_encodings, train_labels = pickle.load(train_in)

val_pickle_name = f"val_encodings_{ENCODINGS_FILE_LABEL}.pickle"
with open(val_pickle_name, "rb") as val_in:
    val_encodings, val_labels = pickle.load(val_in)

In [16]:
# Sanity check: number of encoded training examples (should equal len(train_labels)).
len(train_encodings["input_ids"])

10986

In [17]:
# Sanity check: number of training labels (should equal the number of encoded training examples).
len(train_labels)

10986

In [18]:
# Sanity check: number of encoded validation examples (should equal len(val_labels)).
len(val_encodings["input_ids"])

3663

In [19]:
# Sanity check: number of validation labels (should equal the number of encoded validation examples).
len(val_labels)

3663