# T5 Fine-Tuning for Sentence Splitting (Pizzas and Drinks)

## Setup: Import Required Libraries

In [None]:
import os
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

# Pick the compute device: CUDA when torch reports one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

## Step 1: Load and Prepare the Dataset

In [None]:
# Schema of the raw data: three string columns (source text, pizza orders, drink orders)
# NOTE(review): `pa` is not imported in the cells visible here - presumably
# `import pyarrow as pa`; confirm before running.
schema = pa.schema([
    (column_name, pa.string())
    for column_name in (
        'train_SRC',
        'cleaned_pizza_orders_src',
        'cleaned_drink_orders_src',
    )
])

# Function to format data for the model
def format_data_for_model(input_data):
    """
    Turn raw order records into input/output pairs for the seq2seq model.

    Each record must provide 'train_SRC', 'cleaned_pizza_orders_src' and
    'cleaned_drink_orders_src'. The result is a list of dicts where 'input'
    is the source text and 'output' is the pizza and drink orders joined
    behind the [PIZZAS] and [DRINKS] markers.
    """
    return [
        {
            'input': record['train_SRC'],
            'output': f"[PIZZAS] {record['cleaned_pizza_orders_src']} [DRINKS] {record['cleaned_drink_orders_src']}",
        }
        for record in input_data
    ]

# Load the preprocessed data from the Parquet file
def load_and_format_data(file_path):
    """
    Read a Parquet file and return its rows as model-ready input/output pairs.

    Progress (record counts and the column headers) is printed along the way.
    The rows are converted by format_data_for_model.

    NOTE(review): `pq` is not imported in the cells visible here - presumably
    `import pyarrow.parquet as pq`; confirm before running.
    """
    print(f"Loading data from {file_path}")
    frame = pq.read_table(file_path).to_pandas()
    print(f"Data loaded: {len(frame)} records")

    # Show which columns the file actually contains
    print("Headers of the input data:", frame.columns.tolist())

    print("Formatting data for the model")
    records = frame.to_dict('records')
    pairs = format_data_for_model(records)
    print(f"Data formatted: {len(pairs)} records")

    return pairs

# Save the formatted data into a Parquet file
def save_data_to_parquet(data, file_path):
    """
    Write formatted input/output pairs to a Parquet file.

    `data` is turned into a DataFrame and written with an explicit schema of
    two string columns, 'input' and 'output'.

    NOTE(review): `pd`, `pa` and `pq` are not imported in the cells visible
    here - confirm pandas / pyarrow are imported under these aliases.
    """
    print(f"Saving data to {file_path}")
    frame = pd.DataFrame(data)

    # Both columns are stored as plain strings
    column_types = pa.schema([
        ('input', pa.string()),
        ('output', pa.string())
    ])

    pq.write_table(pa.Table.from_pandas(frame, schema=column_types), file_path)
    print(f"Data saved: {len(data)} records")

# Load the saved data and tabulate it
def load_and_tabulate_data(file_path):
    """
    Read a Parquet file, print its contents as a grid table, and return the DataFrame.

    NOTE(review): `pq` and `tabulate` are not imported in the cells visible
    here - confirm both are imported before running.
    """
    print(f"Loading data from {file_path}")
    df = pq.read_table(file_path).to_pandas()
    print(f"Data loaded: {len(df)} records")

    # tabulate takes the rows as a list of lists plus the column names as headers
    grid = tabulate.tabulate(
        df.values.tolist(),
        headers=df.columns.tolist(),
        tablefmt="grid",
    )
    print(grid)

    return df

In [None]:
# Placeholder for dataset loading - replace with the actual loading logic.
# Each row is expected to carry an 'input' column (order text) and an
# 'output' column (split order).
dataset = load_dataset(
    path='csv',
    data_files={'train': 'train.csv', 'test': 'test.csv'},
)

# Define a preprocessing function
def preprocess_function(example):
    """
    Build the prompt/target pair for one example.

    The task instruction is prepended to the example's 'input' text, and the
    'output' text is passed through unchanged as the target.
    """
    prompt = f"Split the order into pizzas and drinks: {example['input']}"
    target = example['output']
    return dict(input_text=prompt, target_text=target)

# Preprocess the dataset
#! why remove_columns?
#& preprocess_function adds the new fields input_text and target_text.
#& The original input and output columns are then redundant, so they are
#& dropped and only the preprocessed fields are kept.
dataset = dataset.map(
    function=preprocess_function,
    remove_columns=['input', 'output'],
)

# Sample at most 500k training examples if necessary (for 10-hour constraint)
use_subset = True  # Set to False for full dataset
if use_subset:
    # Cap the sample at the real dataset size: select() fails on indices past
    # the end, so a train split with fewer than 500k rows would otherwise crash.
    subset_size = min(500000, len(dataset['train']))
    # Fixed seed keeps the sampled subset reproducible between runs.
    dataset['train'] = dataset['train'].shuffle(seed=42).select(range(subset_size))

## Step 2: Load the T5 Model and Tokenizer

In [None]:
# Use t5-small for faster fine-tuning
model_name = "t5-small"

# The tokenizer and the model weights come from the same checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)  # move the weights onto the selected device

## Step 3: Tokenize the Dataset

In [None]:
def tokenize_function(example):
    """
    Tokenize prompts and targets and attach the targets as training labels.

    Both 'input_text' and 'target_text' are truncated / padded to 128 tokens.
    Works on a single example or on a batch (as used with `batched=True`).

    Returns the tokenizer output for the prompt with an extra 'labels' entry
    holding the target token ids, where padding positions are set to -100.
    """
    model_inputs = tokenizer(example['input_text'], max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(example['target_text'], max_length=128, truncation=True, padding='max_length')

    #! why input_ids?
    #& input_ids are the numerical token ids produced by the tokenizer.
    #& In a sequence-to-sequence task like T5, the labels field holds the token ids
    #& of the expected output; the model uses it to compute the training loss.

    #! why -100?
    #& The targets are padded to max_length, so most label positions are padding.
    #& Label positions equal to -100 are ignored by the loss; without this masking
    #& the model would also be trained to predict the padding tokens.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Replace every padding id with the ignore index, keep real tokens as-is.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = labels['input_ids']
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one list of token ids per example
        model_inputs['labels'] = [_mask_padding(seq) for seq in label_ids]
    else:
        # Single example: a flat list of token ids
        model_inputs['labels'] = _mask_padding(label_ids)
    return model_inputs

# Tokenize every split; batched=True hands tokenize_function whole batches of examples
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)

## Step 4: Define Training Arguments

In [None]:
# Per-device batch size, adjusted for 7GB GPU VRAM
batch_size = 4

# Accumulating gradients over several batches simulates a larger effective batch size
gradient_accumulation_steps = 16

# More epochs when training on the sampled subset, fewer on the full dataset
if use_subset:
    num_train_epochs = 15
else:
    num_train_epochs = 3

#! What is the purpose of this function?
#& Purpose: This function defines a learning rate scheduler.
#& Linear Scheduler: Gradually reduces the learning rate from the initial value to 0 as training progresses.
#& Warmup Steps: Starts with a low learning rate and increases it linearly over the first 500 steps to stabilize training.
#& Why Needed?: Improves training stability, especially with large models or noisy data.

#~ How Are Both Done Together?
#~ Linear Decay: The learning rate decreases linearly from the initial value to 0 over the course of training.
#~ Warmup Steps: During the first 500 steps, the learning rate starts at 0 and increases linearly to the initial learning rate. This helps stabilize training by avoiding large updates at the start.
#~ Combined Process: After the warmup period, the learning rate begins its linear decay. Together, this creates a two-phase schedule: warmup followed by decay.

def get_scheduler(optimizer=None):
    """
    Build a linear learning-rate scheduler with 500 warmup steps.

    Args:
        optimizer: The optimizer whose learning rate is scheduled. It must be
            supplied by the caller; the default of None is kept only so the
            old zero-argument call signature still exists, and the scheduler
            cannot be built without a real optimizer.

    Returns:
        The scheduler object created by transformers.get_scheduler.

    The total step count assumes the scheduler is stepped once per optimizer
    update, i.e. once every `gradient_accumulation_steps` batches of
    `batch_size` examples on a single device - TODO confirm against the
    training loop that uses it.
    """
    # Aliased so the import does not shadow this function's own name.
    from transformers import get_scheduler as hf_get_scheduler

    # Number of examples consumed per optimizer update.
    examples_per_update = batch_size * gradient_accumulation_steps
    # At least one update per epoch, even for a very small dataset.
    updates_per_epoch = max(len(tokenized_datasets['train']) // examples_per_update, 1)

    return hf_get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=500,
        num_training_steps=updates_per_epoch * num_train_epochs
    )

#! Explain these arguments weight_decay, save_total_limit, logging_steps, save_steps, warmup_steps
#& weight_decay: Adds a small penalty to the weights to prevent overfitting by discouraging large weights during training.
#& save_total_limit: Limits the number of saved checkpoints to save disk space. At most two checkpoints are kept.
#& logging_steps: Frequency of logging progress. Every 500 steps, training metrics (e.g., loss) are logged.
#& save_steps: Frequency of saving model checkpoints. Every 1000 steps, the model checkpoint is saved.
#& warmup_steps: Number of initial steps (500) over which the learning rate is ramped up before the linear decay starts.
#& fp16: Mixed precision needs a GPU, so it is enabled only when CUDA is available;
#&       this keeps the CPU fallback chosen for `device` usable.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,  # Simulates batch size of 64
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Mixed precision for faster training (GPU only)
    logging_dir="./logs",
    logging_steps=500,
    save_steps=1000,
    lr_scheduler_type="linear",
    warmup_steps=500
)

## Step 5: Fine-Tuning with Trainer

In [None]:
# Wire the model, the training configuration and the tokenized splits into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Run the fine-tuning loop
trainer.train()

## Step 6: Save the Fine-Tuned Model

In [None]:

#! Same name?
#& The model and the tokenizer are written to the same directory ("./t5_splitter_model")
#& so that both can later be loaded from one place for inference or further training.
#& The tokenizer is needed to convert text inputs into the numerical format the model expects.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./t5_splitter_model")

## Step 7: Evaluation

In [None]:

def generate_output(input_text):
    """
    Generate the model's output text for a single prompt.

    The prompt is tokenized (truncated to 128 tokens), moved to `device`,
    run through beam search with 5 beams and a 128-token limit, and the
    first returned sequence is decoded back into text.
    """
    encoded = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True, padding=True)
    encoded = encoded.to(device)

    # NOTE(review): nothing here switches the model to eval mode - confirm it is
    # not left in training mode (dropout active) when this is called.
    generated_ids = model.generate(**encoded, max_length=128, num_beams=5, early_stopping=True)

    #! What is skip_special_tokens?
    #& Special tokens (padding, end-of-sequence, unknown) are structural markers
    #& and carry no meaning in the final text.
    #& With skip_special_tokens=True they are dropped during decoding, so only
    #& the readable part of the generated sequence is returned.
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Test example: a single hand-written order, prefixed with the same instruction
# that preprocess_function puts in front of every training input.
input_text = "Split the order into pizzas and drinks: I would like two large chicago pizzas and three cokes."
output = generate_output(input_text)
print("Generated Output:", output)

## Notes
- Adjust the dataset loading logic to your specific setup.
- Use `use_subset=True` for faster training with the sampled dataset.
- Use mixed precision (`fp16`) for faster and memory-efficient training.
- Increase `gradient_accumulation_steps` to simulate a larger effective batch size when the per-device batch size cannot be raised.
- Logs will be saved in `./logs`, and checkpoints and results in `./results`.