In [2]:
!pip install transformers datasets accelerate

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12=

In [3]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import wandb

# Loading mini Flan dataset

In [6]:
df = pd.read_parquet("train-00000-of-00001.parquet")

In [7]:
df = df.iloc[:, :2]
df

Unnamed: 0,inputs,targets
0,"The thyroid gland, or simply the thyroid, is a...",no
1,Hayes High School (Birmingham Alabama) - Carol...,Animal
2,Answer the following question: Read the articl...,D
3,Please answer this: what is the charge for ent...,"June 24, 2017"
4,Write down the solution for this math problem:...,-5
...,...,...
143995,Problem: Build a movie plot around this: What ...,"In an unidentified city a gangster, Eddie Blac..."
143996,input question: Generate a question that has t...,What does Fiona do right before Shrek disappears?
143997,input: Please answer the following: Question: ...,Castle of Chaythe
143998,Build a movie plot around this: What has happe...,Despairing over the loss of a 19-year-old Mari...


# prompt

In [8]:
print(df.iloc[450,0])

Archaeological evidence shows that Homo erectus lived in the region now known as Myanmar as early as 400,000 years ago. The first evidence of Homo sapiens is dated to about 11,000 BC, in a Stone Age culture called the Anyathian with discoveries of stone tools in central Myanmar. Evidence of neolithic age domestication of plants and animals and the use of polished stone tools dating to sometime between 10,000 and 6,000 BC has been discovered in the form of cave paintings near the city of Taunggyi.

Answer this question, if possible (if impossible, reply "unanswerable"): What form was the evidence of ancient cultures discovered in ?


# target

In [9]:
print(df.iloc[450,1])

discovered in the form of cave paintings


source:`https://huggingface.co/datasets/pszemraj/flan-subsets-mini`

some ref:`https://huggingface.co/BEE-spoke-data/tFINE-680m-e32-d16-gqa-flan`

#for the forward

In [19]:
model_name = "EleutherAI/pythia-70m"  # for testing purposes smaller
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [20]:
dataset = Dataset.from_pandas(df)

`note that in the Mini Flan dataset, the instruction and question are already merged into one as mentioned in the paper pg 5 for fine tuning`

`Original Sequence: What is the capital of France? → Paris`

`Tokenized input_ids: [A', B', C', D', E', F', G', H']`

`Correct labels: [B', C', D', E', F', G', H', -100'] (shift left).`

`Note that A becomes the label here and loss is calculated based on that.`

In [21]:
#forward tokenization
def preprocess_forward(examples):
    combined_text = [f"{inp}\n{target}" for inp, target in zip(examples['inputs'], examples['targets'])]
    tokenized_output = tokenizer(combined_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    input_ids = tokenized_output["input_ids"]
    labels = input_ids.clone()
    labels[:, :-1] = input_ids[:, 1:]  # Shift tokens to the right
    labels[:, -1] = -100  # Ignore the last token (no next token to predict)
    tokenized_output["labels"] = labels
    return tokenized_output

tokenized_forward = dataset.map(preprocess_forward, batched=True)

Map:   0%|          | 0/144000 [00:00<?, ? examples/s]

`The following configs were used by others`

`learning_rate: 8e-05`

`train_batch_size: 4`

`eval_batch_size: 2`

`seed: 17868`

`distributed_type: multi-GPU`

`num_devices: 2`

`gradient_accumulation_steps: 32`

`total_train_batch_size: 256`

`total_eval_batch_size: 4`

`optimizer: Use paged_ademamix_32bit and the args are: No additional optimizer arguments`

`lr_scheduler_type: constant_with_warmup`

`lr_scheduler_warmup_ratio: 0.05`

`num_epochs: 1.0`

In [22]:
print("\nForward Direction - Tokenized Examples:")
for i in range(min(1, len(tokenized_forward))):
    print(f"Example {i + 1}:")
    print(f"Input IDs: {tokenized_forward[i]['input_ids']}")
    print(f"Decoded Tokens: {tokenizer.decode(tokenized_forward[i]['input_ids'], skip_special_tokens=True)}")
    print("-" * 50)


Forward Direction - Tokenized Examples:
Example 1:
Input IDs: [510, 17298, 21147, 13, 390, 3365, 253, 17298, 13, 310, 271, 35527, 21147, 275, 253, 7623, 13, 11253, 273, 767, 43253, 4802, 407, 271, 310, 7801, 316, 15, 733, 310, 1119, 387, 253, 2914, 273, 253, 7623, 13, 2708, 253, 13187, 434, 19126, 15, 380, 17298, 21147, 4279, 265, 17298, 23884, 13, 534, 8558, 4833, 253, 10761, 2281, 285, 2601, 9066, 15, 380, 23884, 671, 452, 1142, 643, 2538, 1690, 1110, 327, 2440, 15, 380, 17298, 23884, 1195, 1970, 16715, 1406, 460, 313, 53, 10, 285, 10386, 41636, 460, 313, 53, 10, 403, 3562, 432, 39887, 285, 25367, 15, 380, 17298, 671, 11330, 253, 14689, 9039, 24786, 249, 13, 534, 7120, 247, 2554, 275, 11672, 25641, 15, 187, 187, 1276, 434, 253, 1682, 3662, 281, 436, 1953, 27, 310, 253, 17298, 629, 273, 253, 6946, 985, 32, 187, 2369, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [23]:
#split the dataset
tokenized_forward = tokenized_forward.train_test_split(test_size=0.1)  # 90% train, 10% test


In [26]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# loading the model for fine tuning
model_forward = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")

# hyperparameter config
training_args = TrainingArguments(
    output_dir="./fine-tuned-pythia-forward",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs-forward",
    logging_steps=10,
    push_to_hub=False,
)

# initilize of train
trainer_forward = Trainer(
    model=model_forward,
    args=training_args,
    train_dataset=tokenized_forward["train"],
    eval_dataset=tokenized_forward["test"],
    tokenizer=tokenizer,
)

trainer_forward.train()

# saving the model
model_forward.save_pretrained("./fine-tuned-pythia-forward")
tokenizer.save_pretrained("./fine-tuned-pythia-forward")

  trainer_forward = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [25]:
wandb.finish()

0,1
train/epoch,▁▁▁▁▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇███▁▁▁▁▂▂▂▂▂▃
train/global_step,▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████▁▁▁▂▂▃
train/grad_norm,▆█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▂▂▂▂▂▂
train/learning_rate,██▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁██▇▇▇▇▆
train/loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▅▅▅▅▄

0,1
train/epoch,0.00741
train/global_step,120.0
train/grad_norm,9.52821
train/learning_rate,2e-05
train/loss,3.4755


# for the backward

`Original Sequence: What is the capital of France? → Paris`

`Reversed Sequence: France of capital the is What → Paris`

`Tokenized input_ids: [A', B', C', D', E', F', G', H']`

`Correct labels: [-100', A', B', C', D', E', F', G'] (shift right).`

`Note that A becomes the label here and loss is calculated based on that.`

In [45]:
model_name = "afterless/reverse-pythia-160m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [46]:
#tokenizer.pad_token = tokenizer.eos_token
#tokenizer for this model doesn't have padding token, need to see how the explicit custom padding works
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
def preprocess_reverse(examples):
    # revering the order of inputs and targets at WORD level not char level
    reversed_inputs = [" ".join(inp.split()[::-1]) for inp in examples['inputs']]
    reversed_targets = [" ".join(target.split()[::-1]) for target in examples['targets']]
    combined_text = [f"{inp}\n{target}" for inp, target in zip(reversed_inputs, reversed_targets)]

    # tokenize
    tokenized_output = tokenizer(combined_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt", padding_side="left")

    # shifting input_ids to create labels (shift left) and base the loss on the first token
    input_ids = tokenized_output["input_ids"]
    labels = input_ids.clone()
    labels[:, 1:] = input_ids[:, :-1]  # shift tokens to the right
    labels[:, 0] = -100  # ignoring the first token
    tokenized_output["labels"] = labels

    return tokenized_output

In [None]:
tokenized_backward = dataset.map(preprocess_reverse, batched=True)

Map:   0%|          | 0/144000 [00:00<?, ? examples/s]

In [None]:
print("\nBackward Direction - Tokenized Examples:")
for i in range(min(1, len(tokenized_backward))):
    print(f"Example {i + 1}:")
    print(f"Input IDs: {tokenized_backward[i]['input_ids']}")
    print(f"Decoded Tokens: {tokenizer.decode(tokenized_backward[i]['input_ids'], skip_special_tokens=True)}")
    print("-" * 50)

In [None]:
tokenized_backward = tokenized_backward.train_test_split(test_size=0.1)

In [None]:
# loading the model for fine tuning
model_backward = AutoModelForCausalLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./fine-tuned-pythia-backward",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs-backward",
    logging_steps=10,
    push_to_hub=False,
)

# initilize of train
trainer_backward = Trainer(
    model=model_backward,
    args=training_args,
    train_dataset=tokenized_backward["train"],
    eval_dataset=tokenized_backward["test"],
    tokenizer=tokenizer,
)

#.forward
trainer_backward.train()

# saving the model
model_backward.save_pretrained("./fine-tuned-pythia-backward")
tokenizer.save_pretrained("./fine-tuned-pythia-backward")

In [44]:
wandb.finish()

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▄▆▆▇▆▆▆▄▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁
train/loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train/epoch,0.0284
train/global_step,460.0
train/grad_norm,0.05585
train/learning_rate,1e-05
train/loss,0.0026
