# Part 1 - Data Collection and cleaning

In [None]:
!pip install accelerate bitsandbytes transformers datasets music21 einops


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0


We installed several packages necessary for the extraction

First, I define a few helpers: a function that samples training batches from the encoded data, and a cell that creates the project folders.

In [None]:
import torch

def get_batch_from_data(dataset, size_of_the_block, batch_tokens=2_000_000, device=None):
    """Sample a random batch of (input, target) windows from a token tensor.

    Args:
        dataset: 1-D tensor of token ids that supports slicing.
        size_of_the_block: Number of tokens in each sampled window.
        batch_tokens: Token budget per batch; the number of windows is
            ``batch_tokens // size_of_the_block``.
        device: Device the batch is moved to. Defaults to CUDA when it is
            available and to the CPU otherwise.

    Returns:
        ``(x, y)``, both of shape ``(batch, size_of_the_block)``. ``y`` is
        ``x`` shifted one token to the right (next-token targets).

    Raises:
        ValueError: If the token budget is smaller than one block, or the
            dataset is too short to cut a block plus its shifted target.
    """
    # The original stored this in `Batch` but then read the undefined `B`.
    batch_size = batch_tokens // size_of_the_block
    if batch_size < 1:
        raise ValueError("batch_tokens must be at least size_of_the_block")

    highest_start = len(dataset) - size_of_the_block - 1
    if highest_start < 1:
        raise ValueError("dataset is too short for the requested block size")

    # Plain ints make the slicing below independent of tensor-index support.
    starts = torch.randint(highest_start, (batch_size,)).tolist()
    x = torch.stack([dataset[i:i + size_of_the_block] for i in starts])
    y = torch.stack([dataset[i + 1:i + size_of_the_block + 1] for i in starts])

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    return x.to(device), y.to(device)


In [None]:
import os

# Create the project layout: one folder for the preprocessing scripts and
# one per data stage (raw MIDI, ABC text, encoded tokens).
for project_folder in (
    "mlproject/preprocess",
    "mlproject/data/midi",
    "mlproject/data/abc",
    "mlproject/data/tokens",
):
    os.makedirs(project_folder, exist_ok=True)

print("Folders created successfully!")
!ls -R mlproject


Folders created successfully!
mlproject:
data  preprocess

mlproject/data:
abc  midi  tokens

mlproject/data/abc:

mlproject/data/midi:

mlproject/data/tokens:

mlproject/preprocess:


Our dataset comes as .mid files, but for this task I need ABC files, which contain the musical notation (notes, octaves) as text.

In [None]:
%%writefile mlproject/preprocess/midi_to_abc.py
import os
from pathlib import Path
from music21 import converter
from tqdm import tqdm

def midi_to_abc(input, output):
    """Convert every MIDI file found under ``input`` into an ABC file in ``output``.

    The search is recursive and picks up ``*.mid`` and ``*.midi`` files. Each
    result is written as ``<stem>.abc`` directly inside ``output``, so two
    inputs with the same stem in different sub-folders overwrite each other.

    Args:
        input: Directory searched for MIDI files. The name shadows the
            built-in and is kept only for caller compatibility.
        output: Directory that receives the ABC files; created if missing.

    Returns:
        dict with the keys ``total``, ``success``, ``failed`` and
        ``success_rate`` (0 when no files were found).
    """
    source_dir = Path(input)
    target_dir = Path(output)
    target_dir.mkdir(parents=True, exist_ok=True)

    files_obtained = list(source_dir.rglob("*.mid")) + list(source_dir.rglob("*.midi"))
    success, fail = 0, 0

    for f in tqdm(files_obtained, desc="Converting MIDI files into → ABC"):
        out_path = target_dir / (f.stem + ".abc")
        try:
            score = converter.parse(f)
            # NOTE(review): confirm the installed music21 supports "abc" as a
            # write format; if it does not, every file is counted as failed.
            score.write("abc", fp=str(out_path))
            success += 1
        except Exception:
            # Best-effort conversion: a broken file must not stop the run.
            # `Exception` (not a bare except) keeps Ctrl-C and SystemExit working.
            fail += 1

    return {
        "total": len(files_obtained),
        "success": success,
        "failed": fail,
        "success_rate": success / max(len(files_obtained), 1)
    }


Writing mlproject/preprocess/midi_to_abc.py


In [None]:
%%writefile mlproject/preprocess/utils.py
import json
from collections import Counter

def build_vocab(list_of_tokens, vocab_path):
    """Build a frequency-ranked vocabulary and save it as JSON.

    Args:
        list_of_tokens: Iterable of token sequences.
        vocab_path: File that receives the ``{token: id}`` mapping as JSON.

    Returns:
        dict mapping each token to its rank, where 0 is the most frequent.
    """
    frequencies = Counter()
    for sequence in list_of_tokens:
        frequencies.update(sequence)

    ranked_tokens = [token for token, _count in frequencies.most_common()]
    token_to_id = dict(zip(ranked_tokens, range(len(ranked_tokens))))

    with open(vocab_path, "w") as handle:
        handle.write(json.dumps(token_to_id))

    return token_to_id


Writing mlproject/preprocess/utils.py


In [None]:
%%writefile mlproject/preprocess/build_dataset.py
import os
import json
import numpy as np
from pathlib import Path
from tqdm import tqdm
from .tokenize_abc import tokenize_abc
from .utils import build_vocab

def encoding_the_token(tokens, vobubulary):
    """Map tokens to their vocabulary ids, dropping tokens that are not in the vocabulary."""
    encoded = []
    for token in tokens:
        if token in vobubulary:
            encoded.append(vobubulary[token])
    return encoded

def build_dataset(abc_dir, output_dir, train_frac=0.98, val_frac=0.01):
    """Tokenize all ABC files, build a vocabulary and write train/val/test token files.

    Files are read in sorted order and their encoded tokens are concatenated
    into one ``uint32`` stream. The stream is then cut contiguously into
    train / val / test by the given fractions; the test split receives the
    remainder. Files that produce 10 tokens or fewer are skipped.

    Args:
        abc_dir: Directory searched recursively for ``*.abc`` files.
        output_dir: Directory that receives ``vocab.json``, ``train.bin``,
            ``val.bin``, ``test.bin`` and ``stats.json``; created if missing.
        train_frac: Fraction of the token stream used for training.
        val_frac: Fraction of the token stream used for validation.

    Returns:
        dict of statistics (token counts per split, vocabulary size, number
        of ABC files found); the same dict is written to ``stats.json``.
    """
    directory_abc = Path(abc_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Sorted so the contiguous split below is reproducible across runs.
    files = sorted(directory_abc.rglob("*.abc"))
    token_lists = []

    print("Tokenizing ABC files for each text in progress...")
    for f in tqdm(files):
        try:
            text = f.read_text()
            tokens = tokenize_abc(text)
        except Exception:
            # Best-effort: skip unreadable files, but let Ctrl-C through.
            continue
        if len(tokens) > 10:
            token_lists.append(tokens)

    print("Building the vocabulary from token_list ...")
    vocab = build_vocab(token_lists, output_dir / "vocab.json")

    encoded_token_list = [encoding_the_token(toks, vocab) for toks in token_lists]
    if encoded_token_list:
        all_tokens = np.concatenate([np.array(e, dtype=np.uint32) for e in encoded_token_list])
    else:
        # np.concatenate raises on an empty list; an empty corpus yields empty splits.
        all_tokens = np.array([], dtype=np.uint32)

    number_of_tokens = len(all_tokens)
    train_end = int(number_of_tokens * train_frac)
    val_end = int(number_of_tokens * (train_frac + val_frac))

    train = all_tokens[:train_end]
    val   = all_tokens[train_end:val_end]
    test  = all_tokens[val_end:]

    train.tofile(output_dir / "train.bin")
    val.tofile(output_dir / "val.bin")
    test.tofile(output_dir / "test.bin")

    stats = {
        "total_tokens": int(number_of_tokens),
        "train_tokens": int(len(train)),
        "val_tokens": int(len(val)),
        "test_tokens": int(len(test)),
        "vocab_size": len(vocab),
        "num_abc_files": len(files)
    }

    with open(output_dir / "stats.json", "w") as f:
        json.dump(stats, f, indent=2)

    return stats


Writing mlproject/preprocess/build_dataset.py


Here we wrote the functions that build the vocabulary JSON and build the dataset from our raw files.

In [None]:
!ls -R mlproject/preprocess


mlproject/preprocess:
build_dataset.py  midi_to_abc.py  utils.py


In [None]:
!git lfs install
!git clone https://huggingface.co/datasets/amaai-lab/MidiCaps mlproject/data/midicaps


Git LFS initialized.
Cloning into 'mlproject/data/midicaps'...
remote: Enumerating objects: 233, done.[K
remote: Total 233 (delta 0), reused 0 (delta 0), pack-reused 233 (from 1)[K
Receiving objects: 100% (233/233), 61.26 KiB | 20.42 MiB/s, done.
Resolving deltas: 100% (95/95), done.


I loaded the data from the Hugging Face MidiCaps dataset, which is an extended version of the MIDI dataset.

In [None]:
%cd mlproject/data/midicaps
!git lfs install
!git lfs pull
%cd /content


/content/mlproject/data/midicaps
Updated git hooks.
Git LFS initialized.
/content


In [None]:
!tar -xvf mlproject/data/midicaps/midicaps.tar.gz -C mlproject/data/midicaps


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
lmd_full/4/4e30d7e3d1d6ee586eb9754017b5205d.mid
lmd_full/4/4ead16e2ca5c380cb6c5097645bf789e.mid
lmd_full/4/498f48f540720491810453dcf7daf3ff.mid
lmd_full/4/4d020d6775b9c42d29705d1c9f0bd317.mid
lmd_full/4/466d3a5158fcd62f669aa515a2cf57db.mid
lmd_full/4/4f0100bf9b460f632413a54907839d10.mid
lmd_full/4/491ced95c3348a5533c51782df041959.mid
lmd_full/4/442c7a56d31582a7010562d453304e37.mid
lmd_full/4/457e6071c01ee5a091f51690495fb9c5.mid
lmd_full/4/434086b94ece79ff1b5a6a06baf79e46.mid
lmd_full/4/4c4869b42a040d509252f39f647314f8.mid
lmd_full/4/49cc0f692469af6e000ae15015caf13a.mid
lmd_full/4/43d8c6191058f62a2e65dca5143cf45b.mid
lmd_full/4/41d7f180ca573a454a940daae650b29b.mid
lmd_full/4/4fa46b35a2ac225663fde17763bcf57f.mid
lmd_full/4/49a0ba04546ad193a9fe32cd34a8eb41.mid
lmd_full/4/480cf89a34536e03ff04b25172d9e917.mid
lmd_full/4/4fc8f1612a342cbfa41014dd0368fb8c.mid
lmd_full/4/47945ea8ab640114794244d786295c15.mid
lmd_full/4/48b002662cdc

Extracting the archive gives a very large number of .mid files containing musical data. We need to convert them to ABC files, for which I am using the midi2abc tool from the abcMIDI package.

In [None]:
!wget -q https://sourceforge.net/projects/abcmidi/files/latest/download -O abcmidi.tar.gz
!pip install miditoolkit tqdm
!apt-get update
!apt-get install -y build-essential
!apt-get install -y libarchive-tools
!bsdtar -xf abcmidi.tar.gz
%cd abcmidi
!make



Collecting miditoolkit
  Downloading miditoolkit-1.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting mido>=1.1.16 (from miditoolkit)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading miditoolkit-1.0.1-py3-none-any.whl (24 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mido, miditoolkit
Successfully installed miditoolkit-1.0.1 mido-1.3.3
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cli.github.com/packages stable InRelease [3,917 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,202 kB]
Get:5 https://cli.github.com/packages stable/main amd64 Packages [345 B]
Get:6 http://security.ubuntu.com/ubuntu jammy-securi

In [None]:
!apt install abcmidi


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Suggested packages:
  abcm2ps timidity | pmidi postscript-viewer
The following NEW packages will be installed:
  abcmidi
0 upgraded, 1 newly installed, 0 to remove and 48 not upgraded.
Need to get 306 kB of archives.
After this operation, 868 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 abcmidi amd64 20220218+ds1-1 [306 kB]
Fetched 306 kB in 1s (209 kB/s)
Selecting previously unselected package abcmidi.
(Reading database ... 121699 files and directories currently installed.)
Preparing to unpack .../abcmidi_20220218+ds1-1_amd64.deb ...
Unpacking abcmidi (20220218+ds1-1) ...
Setting up abcmidi (20220218+ds1-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
import os, glob, subprocess
from tqdm import tqdm

input_dir = "/content/mlproject/data/midicaps"
output_dir = "/content/mlproject/data/abc_fast"
os.makedirs(output_dir, exist_ok=True)

# glob returns matches in filesystem-dependent order. Sorting makes the
# index-based output names below map to the same MIDI file on every run.
midi_files = sorted(glob.glob(input_dir + "/**/*.mid", recursive=True))

# (midi path, reason) for every file that could not be converted.
failed = []

for i, mf in enumerate(tqdm(midi_files)):
    out = f"{output_dir}/{i:07d}.abc"
    try:
        result = subprocess.run(
            ["midi2abc", mf, "-o", out],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=15
        )
        if result.returncode != 0:
            # errors="replace": non-UTF-8 bytes in stderr must not replace the
            # real failure reason with a UnicodeDecodeError.
            failed.append((mf, result.stderr.decode(errors="replace")))
    except subprocess.TimeoutExpired:
        failed.append((mf, "timeout"))
    except Exception as e:
        failed.append((mf, str(e)))

print("Failed conversions:", len(failed))

100%|██████████| 168385/168385 [12:19<00:00, 227.55it/s]

Failed conversions: 1439





Most of the files were converted. 1439 files could not be converted properly, but since that is less than 1 percent of the entire dataset, we move on to the next step.

In [None]:
import glob

abc_dir = "/content/mlproject/data/abc_fast"
out_path = "/content/mlproject/data/abc_corpus.txt"

# Sorted so the merged corpus has the same file order on every run.
files_to_be_merged = sorted(glob.glob(abc_dir + "/*.abc"))

with open(out_path, "w") as out:
    for f in files_to_be_merged:
        try:
            # The with-block closes each input; the original leaked one handle per file.
            with open(f) as src:
                text = src.read()
        except (OSError, UnicodeDecodeError):
            # Best-effort: skip files that cannot be read or decoded.
            continue
        # Written outside the try so a failing write (e.g. disk full) is not hidden.
        out.write(text + "\n\n")

print("Merged ABC file saved:", out_path)


Merged ABC file saved: /content/mlproject/data/abc_corpus.txt


In [None]:
len(os.listdir("/content/mlproject/data/abc_fast"))


168385

Here we have a dataset of 168385 ABC files, and we need to tokenize them to create our vocabulary. There are different kinds of tokenizers, such as sentence tokenizers and character tokenizers, but because of time constraints I am using a regex pattern to tokenize our data.

In [None]:
%%writefile /content/mlproject/preprocess/tokenize_abc.py
import re

# One alternative per ABC token class, tried top to bottom (verbose regex).
# The last alternative must NOT be followed by `|`: a trailing bar adds an
# empty alternative, which makes findall() emit '' at every newline (where
# `.` cannot match) and at the end of the input.
TOKEN_PATTERN = r"""
    [=^_]+[A-Ga-g][,']*   |   # note with accidentals and octave marks
    [A-Ga-g][,']*         |   # plain note with octave marks
    z[0-9]*               |   # rest, optionally followed by digits
    [0-9]+\/[0-9]+        |   # fraction
    [0-9]+                |   # integer
    \|\:|\:\||\|\||\|     |   # bar lines and repeat marks
    [\[\]\(\)]            |   # brackets and parentheses
    [<>]                  |   # angle brackets
    [A-Z]\:[^\s]+         |   # header field such as X:1 or M:4/4
    [a-zA-Z]+             |   # any other run of letters
    .                         # any other single character except newline
"""

TOKEN_REGEX = re.compile(TOKEN_PATTERN, re.VERBOSE)

def tokenize_abc(text_to_tokenize: str):
    """Split ABC text into a list of non-empty token strings.

    Newlines match no alternative and therefore produce no token.
    """
    return TOKEN_REGEX.findall(text_to_tokenize)


Writing /content/mlproject/preprocess/tokenize_abc.py


In [118]:
import os
from multiprocessing import Pool
from mlproject.preprocess.tokenize_abc import tokenize_abc
from tqdm import tqdm

abc_dir = "/content/mlproject/data/abc"
tok_dir = "/content/mlproject/data/tok3"
os.makedirs(tok_dir, exist_ok=True)

files = [f for f in os.listdir(abc_dir) if f.endswith(".abc")]

def process_file(f):
    """Tokenize one ABC file and write its tokens, space-separated, to a .tok file.

    Args:
        f: File name (not a path) of an ABC file inside ``abc_dir``.

    Returns:
        None on success, or an ``"ERROR: ..."`` string describing the failure.
    """
    try:
        # The with-block closes the input; the original leaked the handle.
        with open(os.path.join(abc_dir, f)) as src:
            text = src.read()
        tokens = tokenize_abc(text)
        out_fp = os.path.join(tok_dir, f.replace(".abc", ".tok"))
        with open(out_fp, "w") as out:
            out.write(" ".join(tokens))
    except Exception as e:
        # Best-effort: report the failure instead of silently dropping it.
        return f"ERROR: {f} | {e}"
    return None

with Pool(processes=24) as pool:
    results = list(tqdm(pool.imap_unordered(process_file, files), total=len(files)))

# Show how many files failed so missing .tok files do not go unnoticed.
errors = [r for r in results if r is not None]
print("Failed files:", len(errors))


100%|██████████| 168292/168292 [12:23<00:00, 226.49it/s]


Here, we got 168292 different .tok files from our tokenization.

In [None]:
%%writefile /content/mlproject/preprocess/build_vocab_aggressive.py
import os, json, re
from collections import Counter
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

TOK_DIR = "/content/mlproject/data/tok3"    # directory holding the .tok files
MAX_VOCAB = 8000                            # most frequent tokens kept in the vocabulary
CHUNK_SIZE = 5000                           # files handed to a worker per task
fraction_re = re.compile(r"^\d+\/\d+$")     # matches tokens like "3/4"

def normalize(token):
    """Return ``token`` if it may enter the vocabulary, otherwise ``None``.

    A token is rejected when it is longer than 20 characters, is a pure
    fraction such as ``3/4``, or contains more than 6 digits.
    """
    digit_count = len([ch for ch in token if ch.isdigit()])
    rejected = (
        len(token) > 20
        or fraction_re.match(token) is not None
        or digit_count > 6
    )
    return None if rejected else token


def process_chunk(file_list):
    """Count normalized tokens over a chunk of .tok files.

    Args:
        file_list: File names (not paths) of .tok files inside ``TOK_DIR``.

    Returns:
        Counter mapping each token accepted by ``normalize`` to its number of
        occurrences in the chunk. Unreadable files are skipped.
    """
    local = Counter()

    for f in file_list:
        try:
            with open(os.path.join(TOK_DIR, f)) as fp:
                tokens = fp.read().split()
        except (OSError, UnicodeDecodeError):
            # Best-effort: skip unreadable files. The original bare `except`
            # also swallowed KeyboardInterrupt and programming errors.
            continue

        for tok in tokens:
            t = normalize(tok)
            if t:
                local[t] += 1

    return local


if __name__ == "__main__":
    # Collect the tokenized files to count.
    tok_files = []
    for entry in os.listdir(TOK_DIR):
        if entry.endswith(".tok"):
            tok_files.append(entry)
    print("Found", len(tok_files), "files")

    # aggressive chunking
    chunk_starts = range(0, len(tok_files), CHUNK_SIZE)
    batches = [tok_files[start:start + CHUNK_SIZE] for start in chunk_starts]
    print("Chunks:", len(batches))

    n_workers = min(cpu_count(), 8)
    print("Using workers:", n_workers)

    # Merge the per-chunk counters as the workers finish.
    totals = Counter()
    with Pool(n_workers) as pool:
        partial_counts = pool.imap_unordered(process_chunk, batches)
        for partial in tqdm(partial_counts, total=len(batches)):
            totals.update(partial)

    # The special tokens take ids 0 and 1; the most frequent tokens follow.
    vocab = ["<PAD>", "<UNK>"]
    vocab.extend(token for token, _freq in totals.most_common(MAX_VOCAB))

    with open("/content/mlproject/data/vocab.json", "w") as f:
        json.dump({"vocab": vocab}, f, indent=2)

    print("Vocab size:", len(vocab))


Overwriting /content/mlproject/preprocess/build_vocab_aggressive.py


This creates our vocab.json file, which contains the training vocabulary. I am using multiprocessing (a pool of worker processes) to speed up the computation.

In [124]:
%%writefile /content/mlproject/preprocess/create_splits.py
import os, random, json

tok_dir = "/content/mlproject/data/tok3"
out_dir = "/content/mlproject/data"

# Fixed seed plus sorted input: os.listdir order is arbitrary and an unseeded
# shuffle gives a different split on every run.
SEED = 42

files = sorted(f for f in os.listdir(tok_dir) if f.endswith(".tok"))
random.Random(SEED).shuffle(files)

# 98% train / 1% validation / 1% test, split by file.
n = len(files)
train = files[: int(n*0.98)]
val   = files[int(n*0.98): int(n*0.99)]
test  = files[int(n*0.99):]

# Write each split inside a with-block so the file is flushed and closed.
for split_name, split_files in (("train", train), ("val", val), ("test", test)):
    with open(f"{out_dir}/{split_name}_files.json", "w") as split_file:
        json.dump(split_files, split_file)

print("Train:", len(train))
print("Val:", len(val))
print("Test:", len(test))


Overwriting /content/mlproject/preprocess/create_splits.py


Ultimately, I need a single file each for train, validation and test. For this I create the splits and then encode the dataset.

In [125]:
%%writefile /content/mlproject/preprocess/encode_dataset.py
import os
import json
import numpy as np
from tqdm import tqdm

tok_dir = "/content/mlproject/data/tok3"
base = "/content/mlproject/data"

with open("/content/mlproject/data/vocab.json") as vocab_file:
    vocab_data = json.load(vocab_file)
vocab_list = vocab_data["vocab"]
stoi = {tok: i for i, tok in enumerate(vocab_list)}
# build_vocab_aggressive.py writes "<PAD>" at index 0 and "<UNK>" at index 1,
# so the fallback is 1 (as in encode_fast.py). The original fallback of 0
# would have mapped unknown tokens to "<PAD>".
unk = stoi.get("<UNK>", 1)


def encode(path):
    """Read a space-separated .tok file and return its tokens as vocabulary ids.

    Tokens that are not in the vocabulary map to the ``<UNK>`` id.
    """
    with open(path, "r") as f:
        tokens = f.read().split()
    return [stoi.get(tok, unk) for tok in tokens]


# Load the file lists written by create_splits.py.
splits = {}
for split_name in ("train", "val", "test"):
    with open(f"{base}/{split_name}_files.json") as split_file:
        splits[split_name] = json.load(split_file)

for name, filelist in splits.items():
    # Stream each file's ids to disk. Collecting every id of a split in one
    # Python list first needs far more memory than the final uint32 array.
    total = 0
    with open(f"/content/mlproject/data/{name}.bin", "wb") as fout:
        for f in tqdm(filelist, desc=name):
            tok_path = os.path.join(tok_dir, f)
            arr = np.array(encode(tok_path), dtype=np.uint32)
            fout.write(arr.tobytes())
            total += len(arr)
    print(f"{name}: {total} tokens")


Overwriting /content/mlproject/preprocess/encode_dataset.py


In [122]:
!python /content/mlproject/preprocess/build_vocab_aggressive.py


Found 168292 files
Chunks: 34
Using workers: 8
100% 34/34 [28:42<00:00, 50.65s/it] 
Vocab size: 8002


In [126]:
!python /content/mlproject/preprocess/create_splits.py

Train: 164926
Val: 1683
Test: 1683


In [127]:
!python /content/mlproject/preprocess/encode_dataset.py


train:  46% 75762/164926 [10:44<04:03, 365.43it/s]^C


The encoding run was interrupted here, so I wrote a faster version that streams each file's token ids straight to the output file instead of accumulating them all in memory first.

In [128]:
%%writefile /content/mlproject/preprocess/encode_fast.py
import os, json, numpy as np
from tqdm import tqdm
import random

TOK_DIR = "/content/mlproject/data/tok3"
OUT_DIR = "/content/mlproject/data"
SPLIT_NAMES = ("train", "val", "test")
SEED = 42  # used only when no saved split lists exist

with open("/content/mlproject/data/vocab.json") as vocab_file:
    v = json.load(vocab_file)
# Accept {"vocab": [...]} as well as a bare list or mapping of tokens.
vocab = v["vocab"] if isinstance(v, dict) and v.get("vocab") else v
stoi = {tok: i for i, tok in enumerate(vocab)}
unk = stoi.get("<UNK>", 1)

# Reuse the split lists saved as <split>_files.json when they exist, so the
# .bin files contain exactly the files named in those lists. The original
# always reshuffled here, so the two could disagree.
split_paths = {name: f"{OUT_DIR}/{name}_files.json" for name in SPLIT_NAMES}

if all(os.path.exists(path) for path in split_paths.values()):
    splits = []
    for name in SPLIT_NAMES:
        with open(split_paths[name]) as split_file:
            splits.append((name, json.load(split_file)))
else:
    # No saved lists: make a reproducible 98/1/1 split by file.
    files = sorted(f for f in os.listdir(TOK_DIR) if f.endswith(".tok"))
    random.Random(SEED).shuffle(files)

    n = len(files)
    train = files[: int(n*0.98)]
    val   = files[int(n*0.98): int(n*0.99)]
    test  = files[int(n*0.99):]

    splits = [("train", train), ("val", val), ("test", test)]

def encode_stream(split, split_files):
    """Encode the given .tok files and append their ids to ``<split>.bin``.

    Each file is written as raw ``uint32`` ids as soon as it is encoded, so
    memory use is bounded by the largest single file.

    Args:
        split: Split name, used for the output file and the progress label.
        split_files: File names (not paths) of .tok files inside ``TOK_DIR``.
    """
    out_path = f"{OUT_DIR}/{split}.bin"
    with open(out_path, "wb") as fout:
        for f in tqdm(split_files, desc=f"Encoding {split}", miniters=1000):
            fp = os.path.join(TOK_DIR, f)
            # The with-block closes the input; the original leaked one handle per file.
            with open(fp) as src:
                text = src.read().split()
            ids = np.array([stoi.get(tok, unk) for tok in text], dtype=np.uint32)
            fout.write(ids.tobytes())
    print(f"Finished {split}: wrote {out_path}")

for name, ff in splits:
    encode_stream(name, ff)


Writing /content/mlproject/preprocess/encode_fast.py


In [129]:
!python /content/mlproject/preprocess/encode_fast.py

Encoding train: 100% 164926/164926 [32:49<00:00, 83.75it/s] 
Finished train: wrote /content/mlproject/data/train.bin
Encoding val: 100% 1683/1683 [00:05<00:00, 322.05it/s]
Finished val: wrote /content/mlproject/data/val.bin
Encoding test: 100% 1683/1683 [00:05<00:00, 322.63it/s]
Finished test: wrote /content/mlproject/data/test.bin


In [None]:
%%writefile /content/mlproject/preprocess/create_clean_abc.py
import os
import re
from tqdm import tqdm

RAW_DIR = "/content/mlproject/data/abc_fast"
OUT_DIR = "/content/mlproject/data/abc"
os.makedirs(OUT_DIR, exist_ok=True)

# A file is kept only if some line starts with one of these ABC header fields.
VALID_HEADER = re.compile(r"^(X:|T:|M:|L:|K:)", re.MULTILINE)

files = [f for f in os.listdir(RAW_DIR) if f.endswith(".abc")]

for f in tqdm(files):
    fp = os.path.join(RAW_DIR, f)
    try:
        # The with-block closes the input; the original leaked the handle.
        with open(fp) as src:
            txt = src.read()
    except (OSError, UnicodeDecodeError):
        # Best-effort: skip files that cannot be read or decoded.
        continue

    if not VALID_HEADER.search(txt):
        continue
    # NOTE(review): this searches the whole text, so a letter in a header
    # line (e.g. the "C" in "K:C") also satisfies it. Confirm whether only
    # the tune body should be checked.
    if not re.search(r"[A-Ga-gz]", txt):
        continue

    # Written outside the try so a failing write is not silently ignored.
    with open(os.path.join(OUT_DIR, f), "w") as dst:
        dst.write(txt)

Overwriting /content/mlproject/preprocess/create_clean_abc.py


In [66]:
!python /content/mlproject/preprocess/create_clean_abc.py

100% 168385/168385 [03:15<00:00, 859.59it/s] 


In [None]:
%%writefile /content/mlproject/preprocess/tokenize_abc.py
import re

# One alternative per ABC token class, tried top to bottom (verbose regex).
# The last alternative must NOT be followed by `|`: a trailing bar adds an
# empty alternative, which makes findall() emit '' at every newline (where
# `.` cannot match) and at the end of the input.
TOKEN_PATTERN = r"""
    [=^_]+[A-Ga-g][,']*     |   # note with accidentals and octave marks
    [A-Ga-g][,']*           |   # plain note with octave marks
    z[0-9]*                 |   # rest, optionally followed by digits
    [0-9]+\/[0-9]+          |   # fraction
    [0-9]+                  |   # integer
    \|\:|\:\||\|\||\|       |   # bar lines and repeat marks
    [\[\]\(\)]              |   # brackets and parentheses
    [<>]                    |   # angle brackets
    [A-Z]\:[^\s]+           |   # header field such as X:1 or M:4/4
    .                           # any other single character except newline
"""

TOKEN_REGEX = re.compile(TOKEN_PATTERN, re.VERBOSE)

def tokenize_abc(text):
    """Split ABC text into a list of non-empty token strings.

    Newlines match no alternative and therefore produce no token.
    """
    return TOKEN_REGEX.findall(text)

Overwriting /content/mlproject/preprocess/tokenize_abc.py


In [78]:
!find /content/mlproject -maxdepth 4 -name "*files.json"


/content/mlproject/data/val_files.json
/content/mlproject/data/train_files.json
/content/mlproject/data/test_files.json


In [99]:
# Python package markers are named __init__.py (double underscores on both
# sides); a file called _init_.py is not treated as a package marker.
!touch /content/mlproject/__init__.py
!touch /content/mlproject/preprocess/__init__.py

In [110]:
import sys
# Sanity check: put the preprocess folder on the import path and confirm that
# tokenize_abc can be imported as a top-level module from there.
sys.path.append("/content/mlproject/preprocess")
try:
    from tokenize_abc import tokenize_abc
    print("OK: tokenize_abc importable")
except Exception as e:
    # Print the reason instead of raising so the cell always completes.
    print("FAIL:",e)

OK: tokenize_abc importable


In [113]:
%%writefile /content/tokenize_all_fast.py
import os
import multiprocessing as mp
from tqdm import tqdm
import sys

sys.path.append("/content/mlproject/preprocess")

from tokenize_abc import tokenize_abc

ABC_DIR = "/content/mlproject/data/abc"
TOK_DIR = "/content/mlproject/data/tok3"
os.makedirs(TOK_DIR, exist_ok=True)

files = [f for f in os.listdir(ABC_DIR) if f.endswith(".abc")]

def process_one(ff):
    """Tokenize a single ABC file and save the tokens, space-separated, as a .tok file.

    Args:
        ff: File name (not a path) of an ABC file inside ``ABC_DIR``.

    Returns:
        1 on success, or an ``"ERROR: ..."`` string naming the file and the
        exception when anything goes wrong.
    """
    try:
        with open(os.path.join(ABC_DIR, ff), "r") as reader:
            abc_text = reader.read()

        pieces = tokenize_abc(abc_text)

        destination = os.path.join(TOK_DIR, ff.replace(".abc", ".tok"))
        with open(destination, "w") as writer:
            writer.write(" ".join(pieces))
    except Exception as e:
        return f"ERROR: {ff} | {e}"
    else:
        return 1

# The guard must use the dunder names: `_name_` is undefined and raised a
# NameError, so the script could never reach the tokenization loop.
if __name__ == "__main__":
    print("Total files:", len(files))

    # The with-block tears the pool down even if the loop raises.
    with mp.Pool(mp.cpu_count()) as pool:
        for result in tqdm(pool.imap_unordered(process_one, files), total=len(files)):
            # process_one returns 1 on success and an "ERROR: ..." string on failure.
            if isinstance(result, str) and result.startswith("ERROR"):
                print(result)

        pool.close()
        pool.join()

Overwriting /content/mlproject/preprocess/tokenize_all.py


In [130]:
!zip /content/dataset_small.zip \
    /content/mlproject/data/train.bin \
    /content/mlproject/data/val.bin \
    /content/mlproject/data/test.bin \
    /content/mlproject/data/vocab.json \
    /content/mlproject/data/train_files.json \
    /content/mlproject/data/val_files.json \
    /content/mlproject/data/test_files.json


  adding: content/mlproject/data/train.bin (deflated 98%)
  adding: content/mlproject/data/val.bin (deflated 94%)
  adding: content/mlproject/data/test.bin (deflated 94%)
  adding: content/mlproject/data/vocab.json (deflated 73%)
  adding: content/mlproject/data/train_files.json (deflated 77%)
  adding: content/mlproject/data/val_files.json (deflated 76%)
  adding: content/mlproject/data/test_files.json (deflated 76%)


This created a good dataset consisting of train.bin, test.bin, val.bin and vocab.json. I am storing it in Drive and continuing in the next notebooks, as I will need a fresh GPU.