The goal of this notebook is to take the original Cryptonite dataset and tokenize and pad it ahead of time, so that the training notebook can load the processed data directly (saving GPU time). This notebook only needs a CPU.

In [1]:
!pip install datasets

import ast
import os
import shutil

import torch
from datasets import load_dataset, load_from_disk
from transformers import DataCollatorWithPadding, BartTokenizer

# Mount Google Drive so the notebook can read and write files stored there (Colab only).
from google.colab import drive
drive.mount('/content/drive')
# Change this to the Google Drive folder where this notebook is located.
# The working directory is switched to it below, so relative paths used
# afterwards resolve against this folder.
drive_path = '/content/drive/MyDrive/Projects/CryptoniteAnalysis/Baselines/Seq2Seq'
os.chdir(drive_path)

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[

# Load Original Datasets

In [2]:
def load_dataset_from_disk(data_dir='../../datasets/cryptonite-official-split/'):
    """Load the Cryptonite official-split JSONL files.

    Args:
        data_dir: Directory holding ``cryptonite-train.jsonl``,
            ``cryptonite-val.jsonl`` and ``cryptonite-test.jsonl``. A relative
            path is resolved against the current working directory. The
            default is the location this notebook has always used.

    Returns:
        The result of ``load_dataset('json', ...)`` for the three files,
        registered under the split names ``'train'``, ``'validation'`` and
        ``'test'``.
    """
    split_files = {
        'train': 'cryptonite-train.jsonl',
        'validation': 'cryptonite-val.jsonl',
        'test': 'cryptonite-test.jsonl',
    }
    # os.path.join works whether or not data_dir ends with a path separator.
    data_files = {
        split: os.path.join(data_dir, file_name)
        for split, file_name in split_files.items()
    }
    return load_dataset('json', data_files=data_files)

# Tokenize Datasets
*2024.07.14*.  
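Today I tried again to use a data collator (`DataCollatorWithPadding`) for dynamic padding, and I still could not get it to work. I am short on time, so for now I pad every example to a fixed length at tokenization time and save the result with `datasets`.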

In [3]:
def custom_tokenize_function(sample, tokenizer, max_length=40, ignore_pad_token_for_loss=False):
    """Tokenize one example's clue (model input) and answer (labels).

    Both texts are truncated and padded to exactly ``max_length`` tokens here,
    at preprocessing time, so every saved example has the same length and no
    dynamic padding is needed afterwards.

    Args:
        sample: A single example (not a batch) with string fields ``"clue"``
            and ``"answer"``.
        tokenizer: Callable tokenizer accepting ``padding``, ``max_length``
            and ``truncation`` keyword arguments and returning a mapping that
            contains ``'input_ids'``.
        max_length: Fixed token length used for both the clue and the answer.
        ignore_pad_token_for_loss: When True, padding positions in the labels
            are replaced by -100 so a loss that ignores index -100 skips
            them. Off by default, which keeps the labels padded with
            ``tokenizer.pad_token_id`` exactly as before.

    Returns:
        The tokenizer output for the clue with an extra ``'labels'`` entry
        holding the token ids of the answer.
    """
    tokenized_sample = tokenizer(sample["clue"],
                                 padding='max_length',
                                 max_length=max_length,
                                 truncation=True)
    label_ids = tokenizer(sample["answer"],
                          padding='max_length',
                          max_length=max_length,
                          truncation=True)['input_ids']

    if ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        label_ids = [-100 if token_id == pad_id else token_id for token_id in label_ids]

    tokenized_sample['labels'] = label_ids
    # The "enumeration" field is not read or modified here.
    return tokenized_sample

# function to save time
def load_or_create_tokenized_dataset(tokenized_dataset_fp, tokenizer, removing_columns, create=False):
    '''
    Load the tokenized dataset cached at tokenized_dataset_fp, or build it.

    Args:
        tokenized_dataset_fp: Directory where the tokenized dataset is (or
            will be) saved.
        tokenizer: Tokenizer forwarded to custom_tokenize_function.
        removing_columns: Column names dropped from the dataset after
            tokenization and before saving.
        create: When True, anything already at tokenized_dataset_fp is deleted
            and the dataset is rebuilt from the raw files. Use this whenever
            the input data or the tokenization changes; no manual clean-up is
            needed.

    Returns:
        The tokenized dataset with its format set to "torch".

    Notice: a missing or empty directory counts as "no cache", so callers may
    create the output directory beforehand without breaking create=False.
    '''
    if create and os.path.exists(tokenized_dataset_fp):
        # Delete in-process instead of shelling out to `rm -rf`: this is
        # portable and safe for paths containing spaces or shell metacharacters.
        if os.path.isdir(tokenized_dataset_fp):
            shutil.rmtree(tokenized_dataset_fp)
        else:
            os.remove(tokenized_dataset_fp)

    # An empty directory cannot hold a saved dataset, so it must not be
    # mistaken for an existing cache.
    has_cache = os.path.isdir(tokenized_dataset_fp) and bool(os.listdir(tokenized_dataset_fp))

    if has_cache:
        # load from disk
        tokenized_datasets = load_from_disk(tokenized_dataset_fp)
    else:
        # create
        datasets = load_dataset_from_disk()
        tokenized_datasets = datasets.map(custom_tokenize_function, fn_kwargs={'tokenizer': tokenizer})
        tokenized_datasets = tokenized_datasets.remove_columns(removing_columns)
        tokenized_datasets.save_to_disk(tokenized_dataset_fp)
    tokenized_datasets.set_format("torch")
    return tokenized_datasets




## Create Datasets for BART-base

In [None]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
# 'enumeration' is intentionally absent from this list, so it is kept in the
# saved dataset for seq2seq training.
removing_columns = ['publisher','date', 'author', 'number', 'orientation', 'clue', 'answer', 'quick','sub_publisher']
tokenized_dataset_fp = 'ProcessedDatasets/bart-base/'
# exist_ok=True replaces the separate existence check before creating the directory.
os.makedirs(tokenized_dataset_fp, exist_ok=True)
# create=True discards any previously saved copy and rebuilds it from the raw data.
tokenized_datasets = load_or_create_tokenized_dataset(tokenized_dataset_fp, tokenizer, removing_columns, create=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/470804 [00:00<?, ? examples/s]

Map:   0%|          | 0/26156 [00:00<?, ? examples/s]

Map:   0%|          | 0/26157 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/470804 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26157 [00:00<?, ? examples/s]

## Create Datasets for T5-small

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
# 'enumeration' is intentionally absent from this list, so it is kept in the
# saved dataset for seq2seq training.
removing_columns = ['publisher','date', 'author', 'number', 'orientation', 'clue', 'answer', 'quick','sub_publisher']
tokenized_dataset_fp = 'ProcessedDatasets/t5-small/'
# exist_ok=True replaces the separate existence check before creating the directory.
os.makedirs(tokenized_dataset_fp, exist_ok=True)
# create=True discards any previously saved copy and rebuilds it from the raw data.
tokenized_datasets = load_or_create_tokenized_dataset(tokenized_dataset_fp, tokenizer, removing_columns, create=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/470804 [00:00<?, ? examples/s]

Map:   0%|          | 0/26156 [00:00<?, ? examples/s]

Map:   0%|          | 0/26157 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/470804 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26157 [00:00<?, ? examples/s]

## Create Dataset for BART-large-CNN

In [None]:
# Load the tokenizer that matches the bart-large-cnn checkpoint.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# 'enumeration' is intentionally absent from this list, so it is kept in the
# saved dataset for seq2seq training.
removing_columns = ['publisher','date', 'author', 'number', 'orientation', 'clue', 'answer', 'quick','sub_publisher']
tokenized_dataset_fp = 'ProcessedDatasets/bart-large-cnn/'
# exist_ok=True replaces the separate existence check before creating the directory.
os.makedirs(tokenized_dataset_fp, exist_ok=True)
# create=True discards any previously saved copy and rebuilds it from the raw data.
tokenized_datasets = load_or_create_tokenized_dataset(tokenized_dataset_fp, tokenizer, removing_columns, create=True)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/470804 [00:00<?, ? examples/s]

Map:   0%|          | 0/26156 [00:00<?, ? examples/s]

Map:   0%|          | 0/26157 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/470804 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26157 [00:00<?, ? examples/s]

# T5-large
Technically I could fine-tune flan-t5-xl, but that is too costly because it would require a bigger GPU.

In [4]:
from transformers import AutoTokenizer

# model_fp names the output sub-directory; model_name is the Hub checkpoint id.
model_fp = 't5-large'
model_name = "google-t5/t5-large"


tokenizer = AutoTokenizer.from_pretrained(model_name)
# 'enumeration' is intentionally absent from this list, so it is kept in the
# saved dataset for seq2seq training.
removing_columns = ['publisher','date', 'author', 'number', 'orientation', 'clue', 'answer', 'quick','sub_publisher']
tokenized_dataset_fp = f'ProcessedDatasets/{model_fp}/'
# exist_ok=True replaces the separate existence check before creating the directory.
os.makedirs(tokenized_dataset_fp, exist_ok=True)
# create=True discards any previously saved copy and rebuilds it from the raw data.
tokenized_datasets = load_or_create_tokenized_dataset(tokenized_dataset_fp, tokenizer, removing_columns, create=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/470804 [00:00<?, ? examples/s]

Map:   0%|          | 0/26156 [00:00<?, ? examples/s]

Map:   0%|          | 0/26157 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/470804 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26157 [00:00<?, ? examples/s]