# Data download

In [None]:
# Install the `datasets` and `tiktoken` packages used below; -q keeps pip's output quiet.
!pip install datasets tiktoken -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires openai, which is not installed.[0m[31m
[0m

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## CNN/DailyMail dataset

In [None]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm

In [None]:
# GPT-2 byte-pair encoding from tiktoken; the tokenization cells below use it.
encode_1 = tiktoken.get_encoding("gpt2")
# Sanity check: each of the three trailing spaces is encoded as token id 220
# (see the cell output), which is the id used later as padding.
encode_1.encode("abc   ")

[39305, 220, 220, 220]

In [None]:
# Settings for the CNN/DailyMail download.
dataset_name = "cnn_dailymail"
dataset_version = "3.0.0"
num_processors = 8
cache_directory = "/content/dataset"

# Download and prepare the dataset, keeping its files under cache_directory.
dataset = load_dataset(
    dataset_name,
    version=dataset_version,
    num_proc=num_processors,
    cache_dir=cache_directory,
)

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

     

Downloading data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Setting num_proc from 8 to 2 for the train split as it only contains 2 shards.


Generating train split: 0 examples [00:00, ? examples/s]

Setting num_proc from 8 to 2 for the validation split as it only contains 2 shards.


Generating validation split: 0 examples [00:00, ? examples/s]

Setting num_proc from 8 to 2 for the test split as it only contains 2 shards.


Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Fixed length, in tokens, of every sequence produced by cnn_token.
maximum_length = 256
# Token ids of the marker that cnn_token places in front of the highlights,
# i.e. between the article tokens and the summary tokens.
encoded_summary = encode_1.encode("<# generate summary#>")
print(encoded_summary)

[27, 2, 7716, 10638, 2, 29]


In [None]:
def cnn_token(data):
    """Tokenize CNN/DailyMail examples into fixed-length rows of token ids.

    The article and the highlights of each example are encoded with
    ``encode_1``. The highlights are prefixed with ``encoded_summary`` to form
    the summary segment. Every emitted row holds exactly ``maximum_length``
    ids laid out as ``article tokens + summary segment + padding``.

    * Examples whose summary segment is longer than 200 tokens are skipped.
    * If article and summary segment fit in one row, a single row is emitted.
    * Otherwise the article is cut into consecutive chunks sized so that the
      full summary segment still fits, and one row is emitted per chunk
      (each chunk is followed by the same summary segment).

    Args:
        data: Iterable of mappings with the string fields ``"article"`` and
            ``"highlights"``.

    Returns:
        ``np.ndarray`` of dtype ``uint16`` and shape
        ``(num_rows, maximum_length)``. The array is two-dimensional even when
        no rows were produced, so it can always be concatenated along axis 0
        with other tokenized splits.
    """
    pad_token = 220          # id the GPT-2 encoding assigns to a single space
    max_summary_tokens = 200  # longer summaries would leave too little room for the article
    rows = []

    for datapoint in tqdm(data):
        art = encode_1.encode(datapoint["article"])
        high = encoded_summary + encode_1.encode(datapoint["highlights"])

        if len(high) > max_summary_tokens:
            continue

        if len(art) + len(high) <= maximum_length:
            # Everything fits in a single row (padding may be zero-length).
            chunks = [art]
        else:
            # step > 0 is guaranteed because len(high) <= max_summary_tokens.
            step = maximum_length - len(high)
            chunks = [art[start:start + step] for start in range(0, len(art), step)]

        for chunk in chunks:
            tokens = chunk + high
            tokens += [pad_token] * (maximum_length - len(tokens))
            rows.append(np.array(tokens, dtype=np.uint16))

    # Explicit reshape keeps the result 2-D when `rows` is empty.
    return np.array(rows, dtype=np.uint16).reshape(len(rows), maximum_length)

# Tokenize each CNN/DailyMail split with cnn_token (one call per split).
cnn_train_dataset = cnn_token(dataset["train"])
cnn_val_dataset  = cnn_token(dataset["validation"])
cnn_test_dataset = cnn_token(dataset["test"])

100%|██████████| 287113/287113 [05:46<00:00, 828.05it/s]
100%|██████████| 13368/13368 [00:15<00:00, 874.48it/s]
100%|██████████| 11490/11490 [00:13<00:00, 847.29it/s]


## SQuAD dataset

In [None]:
# Settings for the SQuAD download (dataset_name is rebound here).
dataset_name = "squad"
num_proc = 8
cache_dir = "/content/dataset"

# Download and prepare the dataset, keeping its files under cache_dir.
dataset_squad = load_dataset(
    dataset_name,
    num_proc=num_proc,
    cache_dir=cache_dir,
)

In [None]:
# Split the original validation set in half: the first half stays the
# validation split, the second half becomes a new "test" split.
total_validation_examples = len(dataset_squad["validation"])
custom_split_index = total_validation_examples // 2

# Order matters: the test split is taken first, while "validation" still
# holds all examples; only then is "validation" replaced by its first half.
dataset_squad["test"] = dataset_squad["validation"].select(range(custom_split_index, total_validation_examples))
dataset_squad["validation"] = dataset_squad["validation"].select(range(custom_split_index))

In [None]:
# Fixed length, in tokens, of every sequence produced by squad_token.
maximum_length = 256
# Token ids of the markers squad_token inserts before the question and
# before the answer, respectively.
ques_join = encode_1.encode("<#question#>")
ans_join = encode_1.encode("<#get answer#>")

def squad_token(data):
    """Tokenize SQuAD examples into fixed-length rows of token ids.

    For each example a prompt segment is built as
    ``ques_join + question + ans_join + answer``, where the answer is the
    first entry of ``answers["text"]``, or the literal ``"No answer found"``
    when that list is empty. Every emitted row holds exactly
    ``maximum_length`` ids laid out as
    ``context tokens + prompt segment + padding``.

    * If context and prompt segment fit in one row, a single row is emitted.
    * Otherwise the context is cut into consecutive chunks sized so that the
      full prompt segment still fits, and one row is emitted per chunk.
    * If the prompt segment alone leaves no room for any context token, the
      example is skipped.

    Args:
        data: Iterable of mappings with the fields ``"context"`` (str),
            ``"question"`` (str) and ``"answers"`` (mapping whose ``"text"``
            entry is a list of answer strings).

    Returns:
        ``np.ndarray`` of dtype ``uint16`` and shape
        ``(num_rows, maximum_length)``. The array is two-dimensional even when
        no rows were produced, so it can always be concatenated along axis 0
        with other tokenized splits.
    """
    pad_token = 220  # id the GPT-2 encoding assigns to a single space
    rows = []

    for datapoint in tqdm(data):
        cont = encode_1.encode(datapoint["context"])

        answer_texts = datapoint["answers"]["text"]
        answer = encode_1.encode(answer_texts[0] if answer_texts else "No answer found")

        ques = ques_join + encode_1.encode(datapoint["question"]) + ans_join + answer

        if len(cont) + len(ques) <= maximum_length:
            # Everything fits in a single row (padding may be zero-length).
            chunks = [cont]
        else:
            step = maximum_length - len(ques)
            if step <= 0:
                # The prompt already fills (or overflows) a row. A zero step
                # would make range() raise ValueError and a negative step
                # would yield nothing, so skip the example explicitly.
                continue
            chunks = [cont[start:start + step] for start in range(0, len(cont), step)]

        for chunk in chunks:
            tokens = chunk + ques
            tokens += [pad_token] * (maximum_length - len(tokens))
            rows.append(np.array(tokens, dtype=np.uint16))

    # Explicit reshape keeps the result 2-D when `rows` is empty.
    return np.array(rows, dtype=np.uint16).reshape(len(rows), maximum_length)

# Tokenize each SQuAD split with squad_token (one call per split).
squad_train_dataset = squad_token(dataset_squad["train"])
squad_val_dataset   = squad_token(dataset_squad["validation"])
squad_test_dataset  = squad_token(dataset_squad["test"])

100%|██████████| 87599/87599 [00:31<00:00, 2805.13it/s]
100%|██████████| 5285/5285 [00:01<00:00, 3581.97it/s]
100%|██████████| 5285/5285 [00:01<00:00, 3482.18it/s]


### Concatenating both datasets

In [None]:
# Stack the CNN/DailyMail rows and the SQuAD rows into one training set.
complete_train = np.concatenate((cnn_train_dataset, squad_train_dataset), axis=0)
# Shuffle the combined rows in place so samples from the two tasks are mixed.
# np.concatenate returns a new array, so the shuffle has to be applied to
# complete_train itself; shuffling one of the inputs would not affect it.
np.random.shuffle(complete_train)

# Validation and test sets are stacked the same way and left in order.
complete_val = np.concatenate((cnn_val_dataset, squad_val_dataset), axis=0)
complete_test = np.concatenate((cnn_test_dataset, squad_test_dataset), axis=0)

In [None]:
# Destination folder on the mounted Google Drive.
path = "/content/drive/MyDrive/IDL/HW5/Finetune_tiral"
# Store the three splits in one .npz archive under the keys "train", "val" and "test".
np.savez(
    f"{path}/complete_data_2.npz",
    train=complete_train,
    val=complete_val,
    test=complete_test,
)