In [14]:
from datasets import load_dataset
# Remote archives that were previously assigned to `url` and then immediately
# overwritten (kept here as comments for reference only; they were never loaded):
#   https://mystic.the-eye.eu/public/AI/pile_preliminary_components/NIH_ExPORTER_awarded_grant_text.jsonl.zst
#   https://the-eye.eu/public/AI/training_data/code_clippy_data/code_clippy_dup_data/train/data_0_time1625801885_default.jsonl.zst
#
# The local file name matches the second archive's name without the `.zst`
# suffix -- presumably a decompressed copy; confirm before relying on that.
url = "./data_0_time1625801885_default.jsonl"

# Load the JSON Lines file as the "train" split.
nih_dataset = load_dataset("json", data_files=url, split="train")

In [15]:
import psutil

# Resident set size of the current Python process, converted from bytes to MiB.
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

# `dataset_size` is a byte count, not a file count: the GB figure below is
# derived from this very attribute, so the label has to say bytes.
print(f"Dataset size in bytes : {nih_dataset.dataset_size}")
# The number of rows is what `len()` reports.
print(f"Number of examples in dataset : {len(nih_dataset)}")

# Bytes -> GiB; `size_gb` is reused by the throughput measurement further down.
size_gb = nih_dataset.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

RAM used: 828.24 MB
Number of files in dataset : 3784253266
Dataset size (cache file) : 3.52 GB


In [17]:
import timeit

# Statement to benchmark: walk the whole dataset in slices of 1000 rows,
# discarding each slice, so only the read cost is measured.
code_snippet = """batch_size = 1000

for idx in range(0, len(nih_dataset), batch_size):
    _ = nih_dataset[idx:idx + batch_size]
"""

# Passing globals() lets the snippet resolve `nih_dataset` from the notebook
# namespace. A single run (number=1) is timed.
# NOTE(review): the result name `time` shadows the stdlib module of the same
# name; it is kept unchanged here because other cells may read it.
time = timeit.Timer(stmt=code_snippet, globals=globals()).timeit(number=1)
print(
    f"Iterated over {len(nih_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 309148 examples (about 3.5 GB) in 4.6s, i.e. 0.770 GB/s


In [18]:
# Load the same `url` again, this time with streaming=True. The result is an
# IterableDataset (see the repr displayed in the next cell) rather than the
# indexable dataset loaded earlier.
nih_dataset_streamed = load_dataset(
    "json", data_files=url, split="train", streaming=True
)

In [19]:
# Display the object's repr to see which dataset class streaming returned.
nih_dataset_streamed

<datasets.iterable_dataset.IterableDataset at 0x14bcb769a850>

In [20]:
# Build a fresh iterator over the streamed dataset and pull its first example.
next(iter(nih_dataset_streamed))

 'meta': {'repo_name': 'BenSampo/laravel-enum',
  'stars': '1047',
  'repo_language': 'PHP',
  'file_name': 'messages.php',
  'mime_type': 'text/x-php'}}

In [21]:
# Shuffle the streamed dataset with a 10,000-example buffer and a fixed seed,
# then peek at the first example the shuffled stream yields.
shuffled_dataset = nih_dataset_streamed.shuffle(buffer_size=10_000, seed=5566)
next(iter(shuffled_dataset))

{'text': '#include "io.h"\n\nint main(void)\n{\n    long long rt, rs, dsp;\n    long long achi, acli;\n    long long acho, aclo;\n    long long resulth, resultl;\n\n    achi = 0x05;\n    acli = 0xB4CB;\n    rs  = 0x87898765432;\n    rt  = 0x7878fdeca987;\n    resulth = 0x05;\n    resultl = 0x18278587;\n\n    __asm\n        ("mthi %2, $ac1\\n\\t"\n         "mtlo %3, $ac1\\n\\t"\n         "maq_s.l.pwr $ac1, %4, %5\\n\\t"\n         "mfhi %0, $ac1\\n\\t"\n         "mflo %1, $ac1\\n\\t"\n         : "=r"(acho), "=r"(aclo)\n         : "r"(achi), "r"(acli), "r"(rs), "r"(rt)\n        );\n    if ((resulth != acho) || (resultl != aclo)) {\n        printf("maq_s.w.pwr wrong\\n");\n\n        return -1;\n    }\n\n    achi = 0x05;\n    acli = 0xB4CB;\n    rs  = 0x89899980000000;\n    rt  = 0x88780000000;\n    resulth = 0x05;\n    resultl = 0xb4ca;\n\n    __asm\n        ("mthi %3, $ac1\\n\\t"\n         "mtlo %4, $ac1\\n\\t"\n         "maq_s.l.pwr $ac1, %5, %6\\n\\t"\n         "mfhi %0, $ac1\\n\\t"\n  

In [22]:
string = "Only those who will risk going too far can possibly find out how far one can go."

# Character-level tokenization: every character (spaces and the final period
# included) becomes its own token.
tokenized_str = [char for char in string]
print(tokenized_str)

['O', 'n', 'l', 'y', ' ', 't', 'h', 'o', 's', 'e', ' ', 'w', 'h', 'o', ' ', 'w', 'i', 'l', 'l', ' ', 'r', 'i', 's', 'k', ' ', 'g', 'o', 'i', 'n', 'g', ' ', 't', 'o', 'o', ' ', 'f', 'a', 'r', ' ', 'c', 'a', 'n', ' ', 'p', 'o', 's', 's', 'i', 'b', 'l', 'y', ' ', 'f', 'i', 'n', 'd', ' ', 'o', 'u', 't', ' ', 'h', 'o', 'w', ' ', 'f', 'a', 'r', ' ', 'o', 'n', 'e', ' ', 'c', 'a', 'n', ' ', 'g', 'o', '.']


In [23]:
# Vocabulary: each distinct character, in sorted order, mapped to its rank.
token2idx = dict(
    (ch, idx) for idx, ch in enumerate(sorted(set(tokenized_str)))
)
print(token2idx)

# Encode the token sequence by looking every token up in the vocabulary.
input_ids = list(map(token2idx.__getitem__, tokenized_str))
print(input_ids)

{' ': 0, '.': 1, 'O': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'k': 12, 'l': 13, 'n': 14, 'o': 15, 'p': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'w': 21, 'y': 22}
[2, 14, 13, 22, 0, 19, 10, 15, 18, 7, 0, 21, 10, 15, 0, 21, 11, 13, 13, 0, 17, 11, 18, 12, 0, 9, 15, 11, 14, 9, 0, 19, 15, 15, 0, 8, 3, 17, 0, 5, 3, 14, 0, 16, 15, 18, 18, 11, 4, 13, 22, 0, 8, 11, 14, 6, 0, 15, 20, 19, 0, 10, 15, 21, 0, 8, 3, 17, 0, 15, 14, 7, 0, 5, 3, 14, 0, 9, 15, 1]


In [24]:
from transformers import AutoTokenizer

# Same sentence as in the character-level tokenization example earlier in the
# notebook, redefined here before it is passed to a pretrained tokenizer.
string = "Only those who will risk going too far can possibly find out how far one can go."

model_name = "distilbert-base-uncased-finetuned-sst-2-english" # refer to the model directly by its name
# Build the tokenizer from the model name above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [25]:
# Encode the single sentence. The displayed result has `input_ids` and an
# `attention_mask`; with only one input the mask shown is all ones.
encoded_str = tokenizer(string, padding=True, truncation=True) 
encoded_str

{'input_ids': [101, 2069, 2216, 2040, 2097, 3891, 2183, 2205, 2521, 2064, 4298, 2424, 2041, 2129, 2521, 2028, 2064, 2175, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [26]:
# Count the distinct token ids in the encoding (repeated ids collapse in the set).
len(set(encoded_str['input_ids']))

18

In [27]:
# Count whitespace-separated words in the sentence, for comparison with the
# token-id count above.
len(string.split())

17

In [28]:
# Map the ids back to their token strings. Key-style access is used here to
# match how `input_ids` is read from `encoded_str` in the unique-id count.
tokens = tokenizer.convert_ids_to_tokens(encoded_str["input_ids"])
tokens

['[CLS]',
 'only',
 'those',
 'who',
 'will',
 'risk',
 'going',
 'too',
 'far',
 'can',
 'possibly',
 'find',
 'out',
 'how',
 'far',
 'one',
 'can',
 'go',
 '.',
 '[SEP]']

In [29]:
from datasets import load_dataset
# Load the "poem_sentiment" dataset. No `split` is given, and later cells index
# the result by split name (`sentiment["train"]`).
sentiment = load_dataset("poem_sentiment")
def tokenize(batch):
    """Tokenize the ``verse_text`` entry of ``batch`` with the notebook's ``tokenizer``.

    Args:
        batch: Mapping with a ``"verse_text"`` key holding the text(s) to encode.

    Returns:
        Whatever the global ``tokenizer`` returns for those texts when called
        with ``padding=True`` and ``truncation=True``.

    Raises:
        KeyError: If ``batch`` has no ``"verse_text"`` key.
    """
    verses = batch["verse_text"]
    return tokenizer(verses, padding=True, truncation=True)
# Smoke-test `tokenize` on the first three training rows. In the printed result
# the shorter sequences end in 0 ids with matching 0s in the attention mask.
print(tokenize(sentiment["train"][:3]))    

{'input_ids': [[101, 2007, 5122, 2630, 22681, 1012, 1999, 2122, 9379, 13178, 1011, 1011, 102], [101, 2009, 6223, 2061, 2146, 2004, 4212, 1996, 4542, 1010, 102, 0, 0], [101, 1998, 2008, 2003, 2339, 1010, 1996, 10459, 14045, 2154, 1010, 102, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]}


In [30]:
# Apply `tokenize` to every split in batched mode.
# NOTE(review): batch_size=None presumably hands each split to `tokenize` as a
# single batch, so padding would be uniform within a split -- confirm against
# the `datasets` map() documentation.
sentiment_encoded = sentiment.map(tokenize, batched=True, batch_size=None)
# The printed columns show `input_ids` and `attention_mask` added next to the
# original `id`, `verse_text` and `label` columns.
print(sentiment_encoded["train"].column_names)

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

['id', 'verse_text', 'label', 'input_ids', 'attention_mask']
