# Encode Data with the DistilBERT Tokenizer

In [1]:
import pathlib
import pickle
import sys
import time

import numpy as np
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import transformers as hft
import torch
import torch.utils.data as tudata

sys.path.insert(0, '/home/jupyter')
import util.data

In [2]:
# Notebook configuration -- edit these values to point at other data or models.

# Relative path to the FakeNewsNet data directory handed to the loader.
DATA_PATH = "../../data/FakeNewsNet/fnndata/"

# Pretrained checkpoint name used when loading the tokenizer.
MODEL_NAME = "distilbert-base-uncased"

# Tag embedded in the names of the pickle files written by this notebook.
ENCODINGS_FILE_LABEL = "25Nov_gcpf_distilbert"

In [3]:
# Load the fast DistilBERT tokenizer for the checkpoint named by MODEL_NAME.
# The former `return_dict=True` argument was dropped: it is a model
# option, not a tokenizer option, and has no effect on tokenization.
tokenizer = hft.DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

In [4]:
# Load the gossipcop and politifact sources and split them into a training
# part and a validation part.
# NOTE(review): min_char presumably drops articles shorter than 500 characters
# and val_split is presumably the validation fraction -- confirm in util.data.
train_data, val_data = util.data.load_full_data(
    data_path=DATA_PATH,
    datasets=["gossipcop", "politifact"],
    min_char=500,
    val_split=0.25,
)

gossipcop:real: 12056it [00:02, 5529.68it/s]
gossipcop:fake: 3800it [00:00, 5831.14it/s]
politifact:real: 369it [00:00, 2618.41it/s]
politifact:fake: 302it [00:00, 7220.55it/s]


In [5]:
# Number of items stored under the "texts" key of the training split.
len(train_data["texts"])

10986

In [6]:
# Number of items stored under the "texts" key of the validation split.
len(val_data["texts"])

3663

In [7]:
# Build the project's FNDataset from the training split and the tokenizer.
# NOTE(review): presumably the texts are tokenized at construction time --
# confirm in util.data.
train_dataset = util.data.FNDataset(train_data, tokenizer)

In [8]:
# Build the project's FNDataset from the validation split and the tokenizer.
# NOTE(review): presumably the texts are tokenized at construction time --
# confirm in util.data.
val_dataset = util.data.FNDataset(val_data, tokenizer)

In [9]:
# Write both datasets to pickle files in the current working directory,
# training dataset first; the file names carry ENCODINGS_FILE_LABEL.
for _pickle_name, _encoded in (
    (f"train_dataset_{ENCODINGS_FILE_LABEL}.pickle", train_dataset),
    (f"val_dataset_{ENCODINGS_FILE_LABEL}.pickle", val_dataset),
):
    with open(_pickle_name, "wb") as pfile:
        pickle.dump(_encoded, pfile)

In [10]:
# Read the datasets back from the pickle files named after ENCODINGS_FILE_LABEL.
# pickle.load can execute arbitrary code, so only point this at files you
# produced yourself.
_train_pickle = pathlib.Path(f"train_dataset_{ENCODINGS_FILE_LABEL}.pickle")
with _train_pickle.open("rb") as pfile:
    train_dataset = pickle.load(pfile)

_val_pickle = pathlib.Path(f"val_dataset_{ENCODINGS_FILE_LABEL}.pickle")
with _val_pickle.open("rb") as pfile:
    val_dataset = pickle.load(pfile)

In [11]:
# Spot-check one example: type of the "input_ids" field of the first training item.
type(train_dataset[0]["input_ids"])

torch.Tensor

In [12]:
# Number of examples in the training dataset.
len(train_dataset)

10986

In [13]:
# Number of examples in the validation dataset.
len(val_dataset)

3663

In [15]:
# Check whether PyTorch reports CUDA as available in this environment.
torch.cuda.is_available()

True