In [1]:
# !pip install transformers
# !pip install 'accelerate>=0.26.0'

In [1]:
import warnings
# Silence every Python warning for the rest of the session. Remove this line
# when debugging: deprecation and runtime warnings are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Set a seed for reproducibility
import torch

def fix_torch_seed(seed=42):
    """Seed PyTorch's random number generators and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to the default generator and to every
            CUDA device.
    """
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current one; this call is silently
    # ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    # Trade speed for repeatability: deterministic cuDNN kernels, no autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

fix_torch_seed()

In [3]:
# Alternative kept for reference: a local path instead of a Hub identifier.
# model_path_or_name = "./models/upstage/TinySolar-248m-4k"
save_path = './models'  # passed as cache_dir when loading and as the target of save_pretrained
model_name = 'upstage/TinySolar-248m-4k'  # identifier handed to from_pretrained

In [22]:
from transformers import AutoModelForCausalLM
# Load the causal-LM weights in bfloat16, caching downloaded files under save_path.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda", # requires a CUDA GPU; use "auto" or "cpu" when none is available
    torch_dtype=torch.bfloat16,
    cache_dir = save_path
)

In [23]:
from transformers import AutoTokenizer
# A tokenizer is a CPU-side object and takes no device placement argument;
# only the tensors it produces are moved to a device.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=save_path,
)

In [24]:
# Save the model and tokenizer files locally.
# NOTE(review): save_path is also the cache_dir used when loading, so the saved
# files end up alongside the download cache — confirm that is intended.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/tokenizer.json')

## generate text samples

In [47]:
# prompt = "I am an engineer. I love"
prompt = "what is swamy vivekanand kriya yoga"
# prompt = "इस नदी की धार में ठंडी हवा आती तो है,"

In [48]:
# Encode the prompt with the tokenizer loaded in this notebook and move the
# resulting tensors to the device the model is on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
inputs

{'input_ids': tensor([[    1,   767,   349,  1719, 27313,   363,   495,  9763,   391,   446,
           373,  5157, 21615]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [49]:
# inputs['input_ids'][0].numpy()


In [50]:
# Print every prompt token id next to the text piece it decodes to.
for token_id in inputs["input_ids"][0].tolist():
    print(f'{token_id} == {tokenizer.decode([token_id])}')

1 == <s>
767 == what
349 == is
1719 == sw
27313 == amy
363 == v
495 == ive
9763 == kan
391 == and
446 == k
373 == ri
5157 == ya
21615 == yoga


In [56]:
from transformers import TextStreamer
# Stream decoded text to stdout while tokens are being generated.
# Extra keyword arguments are forwarded to the tokenizer's decode call, so
# only decode options belong here (no device placement).
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True, # If you set to false, the model will first return the prompt and then the generated text
    skip_special_tokens=True,
)

In [57]:
outputs = model.generate(
    **inputs, 
    streamer=streamer, 
    use_cache=True,
    max_new_tokens=128,
    do_sample=False, 
    temperature=0.0,
    repetition_penalty=1.1,

)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


.
The 10 Best Things To Do In The Philippines For A Weekend Getaway
By: Katie Mills
When you're planning a trip to the Philippines, it can be hard to decide which one is your favorite. But if you're looking for a great place to stay in the country, here are some of the best things to do in the Philippines that will make your vacation memorable and memorable.
Best Places To Visit In The Philippines For A Weekend
By: Katie Mills
If you're looking for a place to visit in the Philippines, then you've come to


In [45]:
# outputs
# for key in outputs[0]:
#    # print(f'{key} == {tokenizer.decode([key])}')
#     print(f'{tokenizer.decode([key])}')

## create dataset

In [62]:
# !pip install datasets
! pip install pymupdf4llm==0.0.17

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pymupdf4llm==0.0.17
  Downloading pymupdf4llm-0.0.17-py3-none-any.whl.metadata (4.1 kB)
Collecting pymupdf>=1.24.10 (from pymupdf4llm==0.0.17)
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf4llm-0.0.17-py3-none-any.whl (26 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m114.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.25.3 pymupdf4llm-0.0.17
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [64]:
import datasets
import pymupdf4llm

In [65]:
# Convert the whole PDF into a single Markdown string, without a progress bar.
pdf_path = 'Complete_Works_of_Swami_Vivekananda_all_volumes.pdf'
md_text = pymupdf4llm.to_markdown(
    pdf_path,
    show_progress=False,
)

In [67]:
# md_text

In [87]:
# Wrap the extracted Markdown as a one-row dataset with a single 'text' column.
my_dataset = datasets.Dataset.from_list([{'text': md_text}])
print(my_dataset)

Dataset({
    features: ['text'],
    num_rows: 1
})


In [93]:
# Persist the raw-text dataset as parquet; the value returned by to_parquet is
# displayed as this cell's output.
file_path = "preprocessed_dataset.parquet"
my_dataset.to_parquet(file_path)

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

8279628

In [94]:
# my_dataset.to_list()
# import datasets

# dataset = datasets.load_dataset(
#     "parquet", 
#     data_files="./data/preprocessed_dataset.parquet", 
#     split="train"
# )
# print(dataset)

## data packaging

In [98]:
!pip install sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
import datasets

In [24]:
# Read the raw-text parquet file back in as the "train" split.
my_dataset = datasets.load_dataset("parquet", data_files="./preprocessed_dataset.parquet", split="train")
print(my_dataset)

Dataset({
    features: ['text'],
    num_rows: 1
})


In [25]:
my_dataset

Dataset({
    features: ['text'],
    num_rows: 1
})

In [26]:
# Rebinds model_name to the SOLAR-10.7B checkpoint; in the cells shown here it
# is only used to load a tokenizer.
# NOTE(review): presumably this tokenizer shares its vocabulary with the
# TinySolar model loaded earlier — confirm before training on these ids.
model_name = 'upstage/SOLAR-10.7B-v1.0'

In [27]:
from transformers import AutoTokenizer
# use_fast=False requests the slow (pure-Python) tokenizer implementation
# instead of the Rust-backed fast one.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    use_fast=False
)

In [28]:
tokenizer.tokenize("I'm a short sentence")

['▁I', "'", 'm', '▁a', '▁short', '▁sentence']

In [29]:
def tokenization(example):
    """Tokenize one record's text and attach its token ids and token count.

    Reads ``example["text"]``, converts it to ids with the module-level
    ``tokenizer``, wraps those ids in the tokenizer's BOS and EOS ids, and
    writes the result to ``example["input_ids"]`` and its length to
    ``example["num_tokens"]``. The same ``example`` object is returned.
    """
    pieces = tokenizer.tokenize(example["text"])
    body_ids = tokenizer.convert_tokens_to_ids(pieces)

    # <bos> marks the beginning of the sequence, <eos> the end.
    wrapped_ids = [tokenizer.bos_token_id] + body_ids + [tokenizer.eos_token_id]
    example["input_ids"] = wrapped_ids

    # Stored per record so the total token count of the dataset can be
    # computed afterwards.
    example["num_tokens"] = len(wrapped_ids)
    return example

In [30]:
# Apply tokenization to every row; load_from_cache_file=False recomputes the
# result instead of reusing a previously cached one.
my_dataset = my_dataset.map(tokenization, load_from_cache_file=False)
print(my_dataset)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'num_tokens'],
    num_rows: 1
})


In [31]:
# Peek at the first row: a slice of the raw text, its first token ids, and
# the stored token count.
sample = my_dataset[0]

print("text", sample["text"][:30])  # first 30 characters only
print("\ninput_ids", sample["input_ids"][:30])
print("\nnum_tokens", sample["num_tokens"])

text -----

# Complete Works of Swa

input_ids [1, 20041, 13, 13, 28771, 21929, 19012, 302, 3904, 6449, 550, 495, 9763, 5904, 13, 13, 27332, 3904, 6449, 550, 495, 9763, 5904, 13, 13, 8193, 28747, 3550, 1508, 3212]

num_tokens 2025559


In [32]:
import numpy as np
# Total token count over every row of the dataset, not just the first row.
np.sum(my_dataset["num_tokens"])

2025559

In [33]:
my_dataset

Dataset({
    features: ['text', 'input_ids', 'num_tokens'],
    num_rows: 1
})

In [34]:
# Join the per-row id lists end to end into one flat array of token ids.
input_ids = np.concatenate(my_dataset["input_ids"])
print(len(input_ids))

2025559


In [35]:
# Number of token ids per packed row; the flat id array is later reshaped to
# (-1, max_seq_length).
max_seq_length = 128

In [36]:
# Largest multiple of max_seq_length that fits in the flat id array; the
# leftover tail that cannot fill a whole row is excluded.
total_length = (len(input_ids) // max_seq_length) * max_seq_length
print(total_length)

2025472


In [37]:
# Keep only the first total_length ids so the array divides evenly into rows.
input_ids = input_ids[:total_length]
print(input_ids.shape)

(2025472,)


In [38]:
# Cast the ids to int32 and fold the flat array into rows of max_seq_length ids.
input_ids_reshaped = input_ids.astype(np.int32).reshape(-1, max_seq_length)
input_ids_reshaped.shape

(15824, 128)

In [39]:
# Convert the 2-D id array to nested Python lists and wrap it as a dataset
# with a single 'input_ids' column (one fixed-length row per entry).
input_ids_list = input_ids_reshaped.tolist()
packaged_pretrain_dataset = datasets.Dataset.from_dict(
    {"input_ids": input_ids_list}
)
print(packaged_pretrain_dataset)

Dataset({
    features: ['input_ids'],
    num_rows: 15824
})


In [40]:
# Write the packed dataset to parquet in the working directory.
packaged_pretrain_dataset.to_parquet("./packaged_pretrain_dataset.parquet")

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

8165184