In [12]:
!pip install transformers datasets accelerate --quiet

In [13]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import wandb

# Loading mini Flan dataset

In [3]:
df = pd.read_parquet("train-00000-of-00001.parquet")

In [4]:
df = df.iloc[:, :2]
df

Unnamed: 0,inputs,targets
0,"The thyroid gland, or simply the thyroid, is a...",no
1,Hayes High School (Birmingham Alabama) - Carol...,Animal
2,Answer the following question: Read the articl...,D
3,Please answer this: what is the charge for ent...,"June 24, 2017"
4,Write down the solution for this math problem:...,-5
...,...,...
143995,Problem: Build a movie plot around this: What ...,"In an unidentified city a gangster, Eddie Blac..."
143996,input question: Generate a question that has t...,What does Fiona do right before Shrek disappears?
143997,input: Please answer the following: Question: ...,Castle of Chaythe
143998,Build a movie plot around this: What has happe...,Despairing over the loss of a 19-year-old Mari...


# prompt

In [5]:
print(df.iloc[450,0])

Archaeological evidence shows that Homo erectus lived in the region now known as Myanmar as early as 400,000 years ago. The first evidence of Homo sapiens is dated to about 11,000 BC, in a Stone Age culture called the Anyathian with discoveries of stone tools in central Myanmar. Evidence of neolithic age domestication of plants and animals and the use of polished stone tools dating to sometime between 10,000 and 6,000 BC has been discovered in the form of cave paintings near the city of Taunggyi.

Answer this question, if possible (if impossible, reply "unanswerable"): What form was the evidence of ancient cultures discovered in ?


# target

In [6]:
print(df.iloc[450,1])

discovered in the form of cave paintings


source:`https://huggingface.co/datasets/pszemraj/flan-subsets-mini`

some ref:`https://huggingface.co/BEE-spoke-data/tFINE-680m-e32-d16-gqa-flan`

#for the forward

In [7]:
dataset = Dataset.from_pandas(df)

In [8]:
model_name = "EleutherAI/pythia-160m"  #select the lm model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = "<pad>"  # seems without custom pad token set to zero the convergence behaves weirdly
tokenizer.pad_token_id = 0     # setting pad_token_id to 0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

`note that in the Mini Flan dataset, the instruction and question are already merged into one as mentioned in the paper pg 5 for fine tuning`

`Original Sequence: What is the capital of France? → Paris`

`Tokenized input_ids: [A', B', C', D', E', F', G', H']`

`Correct labels: [B', C', D', E', F', G', H', <pad>'] (shift left).`

In [9]:
#MINI TESTING

import numpy as np

inputs = np.array([["A", "B", "C", 0,0,0,0,0,0], ["D", "E", "F",0,0,0,0,0,0]])
labels = inputs.copy()

labels[:, :-1] = inputs[:, 1:]  # shifting tokens to the left
labels[:, -1] = -100
print(labels)


[['B' 'C' '0' '0' '0' '0' '0' '0' '-100']
 ['E' 'F' '0' '0' '0' '0' '0' '0' '-100']]


In [10]:
#forward tokenization
def preprocess_forward(examples):
    combined_text = [f"{inp}\n{target}" for inp, target in zip(examples['inputs'], examples['targets'])]
    tokenized_output = tokenizer(combined_text, truncation=True, padding="max_length", max_length=768, return_tensors="pt")
    input_ids = tokenized_output["input_ids"]
    labels = input_ids.clone()
    labels[:, :-1] = input_ids[:, 1:]  # shifting tokens to the left
    labels[:, -1] = -100
    tokenized_output["labels"] = labels
    return tokenized_output

tokenized_forward = dataset.map(preprocess_forward, batched=True)

Map:   0%|          | 0/144000 [00:00<?, ? examples/s]

In [11]:
#shuffle the data
tokenized_forward = tokenized_forward.shuffle(seed=42)

In [12]:
print("\nForward Direction - Tokenized Examples:")
for i in range(3):
    print(f"Example {i + 1}:")
    print(f"Input IDs: {tokenized_forward[i]['input_ids']}")
    print(f"Decoded Tokens: {tokenizer.decode(tokenized_forward[i]['input_ids'], skip_special_tokens=True)}")
    print("-" * 50)


Forward Direction - Tokenized Examples:
Example 1:
Input IDs: [36798, 27, 19566, 247, 2762, 340, 47705, 2278, 15, 187, 34, 27, 831, 310, 247, 1175, 309, 41, 2795, 4328, 15, 380, 4750, 369, 1077, 11453, 285, 9371, 15, 380, 17552, 2170, 310, 5322, 285, 4076, 13, 285, 253, 7180, 497, 4076, 285, 9848, 15, 380, 2579, 369, 3468, 13, 533, 5010, 597, 497, 1077, 1175, 15, 187, 187, 36798, 27, 19566, 247, 4016, 340, 47705, 2278, 15, 187, 34, 27, 8948, 352, 434, 619, 2363, 390, 253, 958, 326, 776, 1387, 369, 22813, 1735, 281, 247, 1387, 273, 952, 326, 3261, 751, 597, 12293, 562, 273, 253, 5579, 921, 3052, 763, 353, 41067, 326, 574, 20466, 594, 3076, 326, 359, 2546, 281, 320, 4395, 13, 533, 436, 921, 369, 11527, 2, 329, 2069, 309, 1119, 253, 16226, 594, 19328, 309, 3078, 281, 3835, 619, 2454, 15, 380, 2398, 1031, 79, 34334, 1607, 3796, 1080, 671, 25128, 347, 1608, 16642, 4867, 404, 13, 534, 309, 1119, 37750, 846, 309, 4720, 8156, 313, 76, 438, 375, 281, 617, 323, 10480, 272, 441, 481, 380, 45655,

In [13]:
#split the dataset
tokenized_forward = tokenized_forward.train_test_split(test_size=0.1)  # 90% train, 10% test

In [14]:
# loading the model for fine tuning
model_forward = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-160m")

# hyperparameter config
training_args = TrainingArguments(
    output_dir="./fine-tuned-pythia-forward",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.05,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs-forward",
    logging_steps=10,
    push_to_hub=False,
)

# initilize of train
trainer_forward = Trainer(
    model=model_forward,
    args=training_args,
    train_dataset=tokenized_forward["train"],
    eval_dataset=tokenized_forward["test"],
    tokenizer=tokenizer,
)

trainer_forward.train()

# saving the model
model_forward.save_pretrained("./fine-tuned-pythia-forward")
tokenizer.save_pretrained("./fine-tuned-pythia-forward")

  trainer_forward = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjanbol[0m ([33m11785_finetuning[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [15]:
wandb.finish()

0,1
train/epoch,▁▃▄▆█
train/global_step,▁▃▅▆█
train/grad_norm,█▂▁▄█
train/learning_rate,█▆▅▃▁
train/loss,█▃▁▁▁

0,1
train/epoch,0.00309
train/global_step,50.0
train/grad_norm,16.5545
train/learning_rate,1e-05
train/loss,3.1549


# for the backward

`Original Sequence: What is the capital of France? → Paris`

`Reversed Sequence: France of capital the is What → Paris`

`Tokenized input_ids: [H', G', F', E', D', C', B', A']`

`Correct labels: ['-100', 'H', 'G', F', E', D', C', 'B'] (shift right).`



In [16]:
model_name = "afterless/reverse-pythia-160m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = "<pad>"  # seems without custom pad token set to zero the convergence behaves weirdly
tokenizer.pad_token_id = 0     # setting pad_token_id to 0

config.json:   0%|          | 0.00/598 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/375M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/146 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

In [17]:
#MINI TESTING
import numpy as np

inputs = np.array([[0,0,0,0,"H", "G", "F"], [0,0,0,0,"A", "B", "C"]])
labels = inputs.copy()

labels[:, 1:] = inputs[:, :-1]  # Shift elements left in each row
labels[:, 0] = -100
print(labels)


[['-100' '0' '0' '0' '0' 'H' 'G']
 ['-100' '0' '0' '0' '0' 'A' 'B']]


In [18]:
import torch
#forward tokenization
def preprocess_reverse(examples):
    combined_text = [f"{inp}\n{target}" for inp, target in zip(examples['inputs'], examples['targets'])]
    tokenized_output = tokenizer(combined_text, truncation=True, padding="max_length", max_length=768, return_tensors="pt")
    input_ids = tokenized_output["input_ids"]
    reversed_input_ids = input_ids.flip(dims = [1]) # training data
    labels = reversed_input_ids.clone()
    labels = torch.roll(labels, shifts=1, dims=1)  # Shift right by 1
    labels[:, 0] = -100
    tokenized_output["labels"] = labels
    tokenized_output["input_ids"] = reversed_input_ids

    return tokenized_output

tokenized_reverse = dataset.map(preprocess_reverse, batched=True)

Map:   0%|          | 0/144000 [00:00<?, ? examples/s]

In [19]:
text = "write a yelp review"

# Tokenize (correct subword splits)
tokens = tokenizer.tokenize(text)
print(tokens)  # Output: ['write', ' a', ' yelp', ' review']

# Convert to IDs (correct numerical encoding)
input_ids = tokenizer.encode(text, return_tensors="pt")
print(input_ids)  # e.g., tensor([[123, 456, 789, 1012]])

# Reverse IDs (numerically correct)
reversed_ids = input_ids.flip(dims=[1])
print(reversed_ids)  # e.g., tensor([[1012, 789, 456, 123]])

['write', 'Ġa', 'Ġy', 'elp', 'Ġreview']
tensor([[ 6343,   247,   340, 47705,  2278]])
tensor([[ 2278, 47705,   340,   247,  6343]])


In [20]:
# Tokenize reversed words (cleanest solution)
reversed_text = " ".join("write a yelp review".split()[::-1])
input_ids = tokenizer(reversed_text, return_tensors="pt")["input_ids"]
print(tokenizer.decode(input_ids[0]))  # "review yelp a write"

review yelp a write


In [21]:
text = "yelp"
tokens = tokenizer.tokenize(text)
print(tokens)  # Output: ['y', 'elp'] or ['Ġy', 'elp']

['yel', 'p']


In [22]:
#shuffle the data
tokenized_reverse = tokenized_reverse.shuffle(seed=42)

In [2]:
print("Backward Direction - Tokenized Examples:")
for i in range(3):
    print(f"Example {i + 1}:")
    print(f"Input IDs: {tokenized_reverse[i]['input_ids']}")
    print(f"Decoded Tokens: {tokenizer.decode(tokenized_reverse[i]['input_ids'], skip_special_tokens=True)}")
    print("-" * 50)

Backward Direction - Tokenized Examples:
Example 1:


NameError: name 'tokenized_reverse' is not defined

In [24]:
tokenized_reverse = tokenized_reverse.train_test_split(test_size=0.1)

In [25]:
# loading the model for fine tuning
model_backward = AutoModelForCausalLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./fine-tuned-pythia-backward",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs-backward",
    logging_steps=10,
    push_to_hub=False
)

# initilize of train
trainer_backward = Trainer(
    model=model_backward,
    args=training_args,
    train_dataset=tokenized_reverse["train"],
    eval_dataset=tokenized_reverse["test"],
    tokenizer=tokenizer,
)

#.forward
trainer_backward.train()

# saving the model
model_backward.save_pretrained("./fine-tuned-pythia-backward")
tokenizer.save_pretrained("./fine-tuned-pythia-backward")

  trainer_backward = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [26]:
wandb.finish()

0,1
train/epoch,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
train/global_step,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
train/grad_norm,█▂▁▁▁▁▁▁▁▁▂▂▂▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▅▄▄▃▃▂▂▁▁
train/loss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train/epoch,0.01049
train/global_step,170.0
train/grad_norm,0.86867
train/learning_rate,1e-05
train/loss,0.0312


# SCALING to LARGER FLAN

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("chiayewken/flan-v2", split="train", streaming=True)


model_name = "EleutherAI/pythia-160m"  #select the lm model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = "<pad>"  # seems without custom pad token set to zero the convergence behaves weirdly
tokenizer.pad_token_id = 0     # setting pad_token_id to 0

def preprocess_forward(example):
    combined_text = f"{example['source']}\n{example['target']}"
    tokenized_output = tokenizer(combined_text, truncation=True, padding="max_length", max_length=768, return_tensors="pt")

    input_ids = tokenized_output["input_ids"].squeeze(0)
    labels = input_ids.clone()
    labels[:-1] = input_ids[1:]  # Shift left
    labels[-1] = -100  # Ignore loss for last token

    return {"input_ids": input_ids, "labels": labels}

for i, example in enumerate(dataset):
    tokenized_example = preprocess_forward(example)
    print(f"Example {i + 1}:")
    print(f"Input IDs: {tokenized_example['input_ids']}")
    print(f"Decoded Tokens: {tokenizer.decode(tokenized_example['input_ids'], skip_special_tokens=True)}")
    print("-" * 50)

    if i == 2:  # Show only 3 examples
        break


Resolving data files:   0%|          | 0/89 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Example 1:
Input IDs: tensor([  510,  5006,   556, 20229,   342,  3168,   492, 18357,  1223,   970,
          253, 11772,  4697,    15,   187,   510,  1953,   285,  3662,   403,
         2708,    15,   187, 43217,   885,    27,   346, 35384,   275,  2502,
        20229,   285,  4797,   285,  3168,   492, 18357,   327, 11772,  4697,
          449,   187, 15545,   327,   436, 26536,    13,   476,   359,  7525,
          326,   253,  9079,   346,   510,  5006,   310, 15150,   253, 11772,
         4697,   275,   253,  1824,   449,   310,  2032,    32,   187, 10976,
           27,   187,    14,  4754,   187,    14,   352,   310,   417,  1896,
          281,  2028,   187,    14,   642,   187,  9820,   187,    34,  1879,
          261, 28700,   285,  2739,   403,  1027,  1841,   323,   534,   281,
         3343,    15,   187,   510,  1953,   285,  3662,   403,  2708,    15,
          187,  2042,   346,    34,  8516,  4370, 38724,   323,   253,  1315,
          261, 28700,   281,  1705,  1066,

In [50]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

dataset = load_dataset("chiayewken/flan-v2", split="train", streaming=True)

model_name = "afterless/reverse-pythia-160m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = "<pad>"
tokenizer.pad_token_id = 0

def preprocess_reverse(example):
    combined_text = f"{example['source']}\n{example['target']}"

    tokenized_output = tokenizer(
        combined_text,
        truncation=True,
        padding="max_length",
        max_length=768,
        return_tensors="pt",
    )

    input_ids = tokenized_output["input_ids"].squeeze(0)  # Remove batch dim
    reversed_input_ids = input_ids.flip(dims=[0])  # Reverse sequence

    labels = reversed_input_ids.clone()
    labels = torch.roll(labels, shifts=1, dims=0)  # Shift right by 1
    labels[0] = -100  # Ignore loss for first token

    return {
        "input_ids": reversed_input_ids,
        "labels": labels,
    }

for i, example in enumerate(dataset):
    tokenized_example = preprocess_reverse(example)
    print(f"Example {i + 1}:")
    print(f"Input IDs: {tokenized_example['input_ids']}")
    print(f"Decoded Tokens: {tokenizer.decode(tokenized_example['input_ids'], skip_special_tokens=True)}")
    print("-" * 50)

    if i == 2:  # Show only 3 examples
        break


Resolving data files:   0%|          | 0/89 [00:00<?, ?it/s]

Example 1:
Input IDs: tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,

In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import torch
from datasets import load_dataset

# Configuration
batch_size = 8
model_name = "EleutherAI/pythia-160m"
total_examples = 378_000_000  # Total examples in FLAN dataset
train_ratio = 0.9
val_ratio = 0.05
test_ratio = 0.05

# Load dataset and model
dataset = load_dataset("Open-Orca/FLAN", split="train", streaming=True)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure tokenizer properly
tokenizer.pad_token = tokenizer.eos_token

def preprocess_forward(example):
    combined_text = f"{example['inputs']}\n{example['targets']}{tokenizer.eos_token}"
    tokenized = tokenizer(
        combined_text,
        truncation=True,
        max_length=768,
        padding="max_length",
        return_tensors="pt"
    )
    input_ids = tokenized["input_ids"][0]
    labels = input_ids.clone()
    labels[:-1] = input_ids[1:]
    labels[input_ids == tokenizer.pad_token_id] = -100
    return {"input_ids": input_ids, "labels": labels}


# Shuffle and split before preprocessing
shuffled = dataset.shuffle(seed=42, buffer_size=100_000)

# Calculate split sizes
train_size = int(total_examples * train_ratio)
val_size = int(total_examples * val_ratio)
test_size = int(total_examples * test_ratio)

# Create splits
train_raw = shuffled.take(train_size)
remaining = shuffled.skip(train_size)
val_raw = remaining.take(val_size)
test_raw = remaining.skip(val_size).take(test_size)

# Preprocess each split
tokenized_train = train_raw.map(preprocess_forward, batched=False)
tokenized_val = val_raw.map(preprocess_forward, batched=False)
tokenized_test = test_raw.map(preprocess_forward, batched=False)

# Training configuration
training_args = TrainingArguments(
    output_dir="./pythia-finetuned",
    eval_strategy="steps",
    eval_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    max_steps=10_000,  # Required for streaming datasets
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=1000,
    logging_dir="./logs",
    logging_steps=100,
    gradient_accumulation_steps=2,
    fp16=True,
    report_to="wandb",
    push_to_hub=False
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Save final model
trainer.save_model("./pythia-finetuned-final")
tokenizer.save_pretrained("./pythia-finetuned-final")

Resolving data files:   0%|          | 0/2167 [00:00<?, ?it/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjanbol[0m ([33m11785_finetuning[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


KeyboardInterrupt: 