# PSC ENV settings

In [1]:
import os
CACHE_DIR = "/ocean/projects/cis250068p/jangabyl/caches" 
os.environ['HF_HOME'] = CACHE_DIR
os.environ['HF_DATASETS_CACHE']  = CACHE_DIR

# Import packages

In [None]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import wandb

# Load the datasets (ZSOPT)

In [None]:
from datasets import load_dataset
HF_KEY = "ENTER YOUR TOKEN HERE"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # Faster binary transfers
os.environ["HF_DATASETS_DOWNLOAD_MAX_WORKERS"] = "8"  # Increase worker count
os.environ["HF_HUB_NUM_WORKERS"] = "16"  # Concurrent HTTP connections
os.environ["HF_DATASETS_DOWNLOAD_MAX_WORKERS"] = "16"  # Concurrent file downloads
# List of subsets to download
subset_patterns = [
    "flan_zsopt_data/*.parquet",
    "niv2_zsopt_data/*.parquet",
    "t0_zsopt_data/*.parquet",
    "cot_zsopt_data/*.parquet",
    "dialog_zsopt_data/*.parquet"
]

# Load all subsets
'''dataset = load_dataset(
    "Open-Orca/FLAN",
    data_files=subset_patterns,
    use_auth_token="HF_KEY",
)'''

dataset = load_dataset(
    "Open-Orca/FLAN",
    data_files=subset_patterns)
# Access each subset (if split into separate datasets)
#print(dataset["flan_zsopt_data"][0])  # Example for FLAN subset

Resolving data files:   0%|          | 0/240 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/240 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/84 [00:00<?, ?it/s]

# Sample dataset

In [4]:
# Get total number of examples
num_examples = len(dataset["train"])

# Show last 5 examples
for i in range(num_examples - 5, num_examples):
    print(dataset["train"][i])

{'inputs': 'Read the dialog and predict the next turn. See the 2 person dialog:\n Anonymous 1) What do you know about the butterfly, Microdes quadristrigata ?\n Anonymous 2) \n', 'targets': 'Microdes quadristrigata Microdes quadristrigata is a moth in the family Geometridae.', '_template_idx': 2, '_task_source': 'Dialog', '_task_name': 'wiki_dialog', '_template_type': 'zs_opt'}
{'inputs': "Write the response. Conversation transcript:\n *What is known about Jelena Lieven's adult life?\n *Jelena Lieven Jelena Lieven (1842–1917), was an Imperial Russian pedagogue.\n *Where did she work?\n *She was the principal of the Smolny Institute in Saint Petersburg in 1895–1917.\n *Who were her parents?\n *Daughter of Alexander Lieven.\n *Where did she go to school?\n *", 'targets': 'Jelena Lieven was not formally educated but received a very high level of education through Autodidacticism.', '_template_idx': 5, '_task_source': 'Dialog', '_task_name': 'wiki_dialog', '_template_type': 'zs_opt'}
{'inp

# Checking special tokens in Reverse model

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("afterless/reverse-pythia-160m")

# Check special tokens
print("EOS token:", tokenizer.eos_token)
print("EOS token ID:", tokenizer.eos_token_id)

print("BOS/SOS token:", tokenizer.bos_token)  
print("BOS/SOS token ID:", tokenizer.bos_token_id)
print("PAD token:", tokenizer.pad_token)
print("PAD token ID:", tokenizer.pad_token_id)


EOS token: None
EOS token ID: None
BOS/SOS token: None
BOS/SOS token ID: None
PAD token: None
PAD token ID: None


In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "afterless/reverse-pythia-160m"
)
# Access the vocabulary dictionary
vocab = tokenizer.get_vocab()

# To get a list sorted by ID
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
for token, idx in sorted_vocab[:20]:  # show first 20
    print(f"Token ID: {idx} -> Token: {repr(token)}")

Token ID: 0 -> Token: '<|endoftext|>'
Token ID: 1 -> Token: '<|padding|>'
Token ID: 2 -> Token: '!'
Token ID: 3 -> Token: '"'
Token ID: 4 -> Token: '#'
Token ID: 5 -> Token: '$'
Token ID: 6 -> Token: '%'
Token ID: 7 -> Token: '&'
Token ID: 8 -> Token: "'"
Token ID: 9 -> Token: '('
Token ID: 10 -> Token: ')'
Token ID: 11 -> Token: '*'
Token ID: 12 -> Token: '+'
Token ID: 13 -> Token: ','
Token ID: 14 -> Token: '-'
Token ID: 15 -> Token: '.'
Token ID: 16 -> Token: '/'
Token ID: 17 -> Token: '0'
Token ID: 18 -> Token: '1'
Token ID: 19 -> Token: '2'


# SCALING to LARGER FLAN

# Subset of subset

In [None]:
small_dataset = dataset["train"].select(range(100000))
print(len(small_dataset))  # Should show 100

100000


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from datasets import load_dataset
import numpy as np
from datasets import Dataset  # Import Dataset class


# Configuration
batch_size = 16
model_name = "afterless/reverse-pythia-160m"

# Load LOCAL dataset
"""dataset = load_dataset(
    "Open-Orca/FLAN",
    data_files={
        "train": "/path/to/your/cache/Open-Orca__flan/train-*.parquet",  # Update path
        "validation": "/path/to/your/cache/Open-Orca__flan/validation-*.parquet"
    }
)"""


dataset = small_dataset

# Load model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenizer setup
tokenizer.eos_token = '<|endoftext|>'
tokenizer.eos_token_id = 0
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token = tokenizer.eos_token

def preprocess_batch(examples):
    """Process batch with input/target flipping for reverse modeling"""
    # Format text
    texts = [f"{inp}\n{tgt} {tokenizer.eos_token}" 
             for inp, tgt in zip(examples['inputs'], examples['targets'])]
    
    # Tokenize
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=768,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Reverse sequences
    tokenized['input_ids'] = torch.stack([torch.flip(ids, [0]) for ids in tokenized['input_ids']])
    #tokenized['attention_mask'] = torch.stack([torch.flip(mask, [0]) for mask in tokenized['attention_mask']])
    
    return tokenized

# Preprocess ALL data (non-streaming)
tokenized_dataset = dataset.map(
    preprocess_batch,
    batched=True,
    batch_size=1000,  # Process 1k examples at once
)

# Split dataset
train_val = tokenized_dataset.train_test_split(
    test_size=0.1,  # 10% for validation
    seed=42
)

# Training args
training_args = TrainingArguments(
    output_dir="/ocean/projects/cis250068p/jangabyl/caches/training/pythia-reverse-finetuned",
    eval_strategy="steps",
    eval_steps=5000,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    learning_rate=1e-6,
    weight_decay=0.03,
    save_total_limit=2,
    logging_dir="/ocean/projects/cis250068p/jangabyl/caches/training/pythia-reverse-finetuned/logs",
    logging_steps=500,
    fp16=True,
    gradient_accumulation_steps=2,
    report_to="none"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val["train"],
    eval_dataset=train_val["test"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Train
trainer.train()

# Save
trainer.save_model("./pythia-finetuned-final")
tokenizer.save_pretrained("./pythia-finetuned-final")

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
5000,4.5893,4.577348


('./pythia-finetuned-final/tokenizer_config.json',
 './pythia-finetuned-final/special_tokens_map.json',
 './pythia-finetuned-final/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = "./pythia-finetuned-final"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [92]:
def generate_flipped(prompt, max_length=200):
    # Tokenize without returning token_type_ids
    inputs = tokenizer(
        prompt, 
        return_tensors="pt",
        return_token_type_ids=False  # Key fix
    ).to(device)
    
    # Flip input sequence
    inputs['input_ids'] = t.flip(inputs.input_ids, (1,))
    
    with t.no_grad():
        output = model.generate(
            **inputs,
            max_length=max_length,
            do_sample=True,
            temperature=0.4,
            top_k=50,
            top_p=0.92,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
    
    # Flip back and decode
    return tokenizer.decode(t.flip(output, (1,))[0], skip_special_tokens=True)
print(generate_flipped("Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 says.  They now own four children, and he believes they are just like their parents.  "We love our children because we did not even see them growing up," he says.   He also plans to take a job at a Â£1million annual salary, from which he estimates he will earn around $200,000 over the next six months.  His family were shocked when he spoke publicly for the first time, saying  "You must prove yourself or you'll never make any money". But he later said: "I couldn't do it - but I was really worried about me."  He had signed a new three-year contract earlier this year and is yet to decide on where he would spend the rest of his life. He may have been contacted by an agent who has agreed to stay with him for five years, in a deal that could be worth more than $100 million.
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday
