In [1]:
!pip install transformers datasets ipywidgets peft bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3


In [2]:
import os
import math
import numpy as np
import torch
from tqdm.auto import tqdm 
from datetime import timedelta
import time
import gc
from peft import prepare_model_for_kbit_training

# Set memory optimization environment variable
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# Free up memory before starting
torch.cuda.empty_cache()
gc.collect()

# 1. Load your text dataset from the Kaggle input path
with open('/kaggle/input/nbody-data/cleaned.md', 'r', encoding='utf-8') as f:
    corpus = f.read()

# 2. Load the tokenizer and determine the model's maximum context length
model_name = "Qwen/Qwen2.5-Math-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_context_length = tokenizer.model_max_length
print(f"Maximum context length: {max_context_length}")

# Use smaller chunks to further reduce memory pressure
chunk_size = 2048  # Further reduced from 4096

# 3. Tokenize the entire corpus and split it into reasonably sized chunks
def prepare_corpus_for_training(corpus, tokenizer, chunk_size):
    tokens = tokenizer(corpus, truncation=False, return_tensors="np")["input_ids"][0]
    
    total_chunks = math.ceil(len(tokens) / chunk_size)
    print(f"Total tokens: {len(tokens)}, Creating {total_chunks} chunks of size {chunk_size}")
    
    chunks = []
    for i in range(0, len(tokens), chunk_size):
        chunk = tokens[i:i + chunk_size].tolist()
        if len(chunk) < chunk_size:
            chunk = chunk + [tokenizer.pad_token_id] * (chunk_size - len(chunk))
        chunks.append({"input_ids": chunk})
    
    return Dataset.from_list(chunks)

chunked_dataset = prepare_corpus_for_training(corpus, tokenizer, chunk_size)

# 4. Create a unified progress tracking callback
class EnhancedProgressCallback(TrainerCallback):
    def __init__(self):
        self.training_start = time.time()
        self.epoch_start = None
        self.progress_bar = None
        self.current_epoch = 0
        
    def on_train_begin(self, args, state, control, **kwargs):
        print(f"\n{'='*70}")
        print(f"TRAINING STARTED")
        print(f"{'='*70}")
        
        # Calculate total steps for all epochs
        self.total_steps = state.max_steps
        self.steps_per_epoch = len(trainer.train_dataset) // (args.per_device_train_batch_size * 
                                                            args.gradient_accumulation_steps * 
                                                            torch.cuda.device_count())  # Account for multi-GPU
    
    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch_start = time.time()
        self.current_epoch = state.epoch + 1
        
        print(f"\n{'='*70}")
        print(f"Beginning Epoch {self.current_epoch}/{args.num_train_epochs}")
        print(f"{'='*70}")
        
        # Create a progress bar for this epoch
        self.progress_bar = tqdm(
            total=self.steps_per_epoch, 
            desc=f"Epoch {self.current_epoch}/{args.num_train_epochs}",
            position=0
        )
        self.last_logged_step = 0
        
    def on_epoch_end(self, args, state, control, **kwargs):
        # Close progress bar
        if self.progress_bar:
            self.progress_bar.close()
        
        # Calculate epoch time
        epoch_time = time.time() - self.epoch_start
        total_time = time.time() - self.training_start
        
        print(f"\n{'='*70}")
        print(f"Completed Epoch {self.current_epoch}/{args.num_train_epochs}")
        print(f"Epoch time: {timedelta(seconds=int(epoch_time))}")
        print(f"Total training time: {timedelta(seconds=int(total_time))}")
        
        # Estimate remaining time
        epochs_remaining = args.num_train_epochs - self.current_epoch
        est_remaining = epoch_time * epochs_remaining
        print(f"Estimated time remaining: {timedelta(seconds=int(est_remaining))}")
        print(f"{'='*70}\n")
        
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.is_local_process_zero and logs and self.progress_bar:
            # Calculate step progress
            current_step_in_epoch = state.global_step % self.steps_per_epoch
            if current_step_in_epoch == 0 and state.global_step > 0:
                current_step_in_epoch = self.steps_per_epoch
                
            # Update progress bar to current position
            self.progress_bar.n = current_step_in_epoch
            
            # Add metrics to progress bar
            postfix_dict = {}
            if "loss" in logs:
                postfix_dict["loss"] = f"{logs['loss']:.4f}"
            if "learning_rate" in logs:
                postfix_dict["lr"] = f"{logs['learning_rate']:.2e}"
            
            # Add GPU memory usage
            try:
                allocated = torch.cuda.memory_allocated() / (1024 ** 3)
                postfix_dict["GPU"] = f"{allocated:.1f}GB"
            except:
                pass
                
            self.progress_bar.set_postfix(**postfix_dict)
            self.progress_bar.update(0)  # Force refresh
            
            # Print step details with percentage completed
            current_overall = state.global_step
            progress_percent = current_overall / self.total_steps * 100
            if current_overall % 20 == 0:  # Print every 20 steps
                print(f"Step: {current_overall}/{self.total_steps} ({progress_percent:.1f}%) | "
                      f"Loss: {logs.get('loss', 0):.4f} | "
                      f"LR: {logs.get('learning_rate', 0):.2e}")

# Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# 5. LoRA-optimized training arguments for continued pretraining
training_args = TrainingArguments(
    output_dir="./qwen_math_nbody_lora",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    # Standard learning rate for non-embedding layers
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    dataloader_drop_last=True,
    report_to=["tensorboard"],
    fp16=True,
    logging_first_step=True,
    gradient_checkpointing=True,
    logging_dir="./logs",
    warmup_steps=100,
    logging_strategy="steps",
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Use adamw_hf which better supports parameter groups for decoupled learning rates
    optim="adamw_hf"
)

# 6. Load model for examination to identify all module names
print("Loading model to identify layer structure...")
temp_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=False,
    torch_dtype=torch.float16,
    device_map={"": 0}  # Load on first GPU only temporarily
)

# Identify embedding and other linear layers for targeting
embedding_layers = []
linear_layers = []
for name, module in temp_model.named_modules():
    if 'embed' in name.lower() or 'lm_head' in name.lower():
        embedding_layers.append(name)
    elif isinstance(module, torch.nn.Linear) and 'embed' not in name.lower() and 'lm_head' not in name.lower():
        linear_layers.append(name)

print(f"Found {len(embedding_layers)} embedding layers: {embedding_layers}")
print(f"Found {len(linear_layers)} linear layers")

# Clean up memory
del temp_model
torch.cuda.empty_cache()
gc.collect()

# 7. Comprehensive LoRA configuration for continued pretraining
# Combine both standard target_modules and add embedding layers
# List adjusted based on Qwen2.5 specific architecture
lora_config = LoraConfig(
    r=8,                            # Reduced from 16
    lora_alpha=16,                  # Reduced from 32
    target_modules=[                # Only target key modules
        "q_proj", "k_proj", "v_proj", "o_proj", 
        "gate_proj", "up_proj", "down_proj", "wte", "lm_head"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# 8. Initialize model with optimized settings
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=False, # Load in fp16 for further efficiency
    device_map="auto"
)

# 9. Apply LoRA to the model
print("Applying LoRA adapters to model...")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# 10. Create custom optimizer with decoupled learning rates
def get_optimizer_grouped_parameters(model, embedding_lr=1e-5, non_embedding_lr=1e-4):
    """Create parameter groups with different learning rates for embeddings vs other layers."""
    no_decay = ["bias", "LayerNorm.weight"]
    embedding_names = ["wte", "lm_head"]
    
    optimizer_grouped_parameters = [
        # Embedding params with lower learning rate and no weight decay
        {
            "params": [p for n, p in model.named_parameters() 
                      if any(nd in n for nd in embedding_names) and p.requires_grad],
            "lr": embedding_lr,
            "weight_decay": 0.0,
        },
        # Non-embedding params with regular learning rate and weight decay
        {
            "params": [p for n, p in model.named_parameters() 
                      if not any(nd in n for nd in embedding_names) 
                      and not any(nd in n for nd in no_decay) and p.requires_grad],
            "lr": non_embedding_lr,
            "weight_decay": training_args.weight_decay,
        },
        # Non-embedding params with regular learning rate and no weight decay
        {
            "params": [p for n, p in model.named_parameters() 
                      if not any(nd in n for nd in embedding_names) 
                      and any(nd in n for nd in no_decay) and p.requires_grad],
            "lr": non_embedding_lr,
            "weight_decay": 0.0,
        },
    ]
    return optimizer_grouped_parameters

# 11. Create a custom trainer with decoupled learning rates
class CustomPEFTTrainer(Trainer):
    def create_optimizer(self):
        """Create optimizer with separate learning rates for embedding vs. non-embedding parameters"""
        if self.optimizer is None:
            # Create parameter groups with decoupled learning rates
            embedding_lr = self.args.learning_rate / 10  # Lower embedding LR (10x smaller)
            non_embedding_lr = self.args.learning_rate
            
            print(f"Using decoupled learning rates: embedding={embedding_lr}, other={non_embedding_lr}")
            
            optimizer_grouped_parameters = get_optimizer_grouped_parameters(
                self.model, 
                embedding_lr=embedding_lr,
                non_embedding_lr=non_embedding_lr
            )
            
            # Create optimizer with parameter groups
            self.optimizer = torch.optim.AdamW(
                optimizer_grouped_parameters,
                lr=self.args.learning_rate,
                betas=(0.9, 0.999),
                eps=1e-8,
            )
        
        return self.optimizer

# 12. Initialize custom trainer with all optimizations
progress_callback = EnhancedProgressCallback()
trainer = CustomPEFTTrainer(
    model=model,
    args=training_args,
    train_dataset=chunked_dataset,
    data_collator=data_collator,
)
trainer.add_callback(progress_callback)

tokenizer_config.json:   0%|          | 0.00/7.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Maximum context length: 131072


Token indices sequence length is longer than the specified maximum sequence length for this model (1538883 > 131072). Running this sequence through the model will result in indexing errors


Total tokens: 1538883, Creating 752 chunks of size 2048
Loading model to identify layer structure...


config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Found 2 embedding layers: ['model.embed_tokens', 'lm_head']
Found 196 linear layers
Applying LoRA adapters to model...




trainable params: 10,460,160 || all params: 1,554,174,464 || trainable%: 0.6730


In [None]:

# Print training configuration
print(f"\n{'*'*70}")
print(f"ENHANCED LORA CONTINUED PRETRAINING CONFIGURATION:")
print(f"Model: {model_name}")
print(f"LoRA rank: {lora_config.r}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Using decoupled learning rates: embedding={training_args.learning_rate/10}, other={training_args.learning_rate}")
print(f"Batch size: {training_args.per_device_train_batch_size} × "
      f"{training_args.gradient_accumulation_steps} steps × "
      f"{torch.cuda.device_count()} GPUs = "
      f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps * torch.cuda.device_count()}")
print(f"Dataset size: {len(chunked_dataset)} chunks of {chunk_size} tokens each")
print(f"{'*'*70}\n")

# Start the training process
print("\n🚀 Starting research-optimized LoRA continued pretraining...\n")
try:
    trainer.train()
    print("\n✅ Training completed successfully!")
    # Save the final model
    print("Saving final model...")
    model.save_pretrained("./qwen_math_nbody_final_lora")
    tokenizer.save_pretrained("./qwen_math_nbody_final_lora")
    print("Model saved at ./qwen_math_nbody_final_lora")
except Exception as e:
    print(f"\n❌ Training interrupted: {e}")
    # Save checkpoint even if interrupted
    print("Saving emergency checkpoint...")
    model.save_pretrained("./qwen_math_nbody_checkpoint_lora")
    tokenizer.save_pretrained("./qwen_math_nbody_checkpoint_lora")
    print("Emergency checkpoint saved at ./qwen_math_nbody_checkpoint_lora")



**********************************************************************
ENHANCED LORA CONTINUED PRETRAINING CONFIGURATION:
Model: Qwen/Qwen2.5-Math-1.5B
LoRA rank: 8
Epochs: 3
Using decoupled learning rates: embedding=1e-05, other=0.0001
Batch size: 1 × 16 steps × 1 GPUs = 16
Dataset size: 752 chunks of 2048 tokens each
**********************************************************************


🚀 Starting research-optimized LoRA continued pretraining...

Using decoupled learning rates: embedding=1e-05, other=0.0001

TRAINING STARTED

Beginning Epoch 1/3


Epoch 1/3:   0%|          | 0/47 [00:00<?, ?it/s]

Step,Training Loss


In [4]:
import os
import json
import shutil
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Load the base model
base_model_id = "Qwen/Qwen2.5-Math-1.5B"
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# 2. Create a temporary directory to combine both adapter files
temp_adapter_dir = "/kaggle/working/combined_adapter"
os.makedirs(temp_adapter_dir, exist_ok=True)

# 3. Copy the adapter weights file
weights_source = "/kaggle/input/central_config_adapter/pytorch/default/1/adapter_model.safetensors"
weights_dest = os.path.join(temp_adapter_dir, "adapter_model.safetensors")
shutil.copy(weights_source, weights_dest)

# 4. Copy or create the config file
# Assuming you've uploaded the config JSON to a different location
config_source = "/kaggle/input/adapter-config/adapter_config.json"  # Update this path
config_dest = os.path.join(temp_adapter_dir, "adapter_config.json")
shutil.copy(config_source, config_dest)

# 5. Load the model with the combined adapter
model = PeftModel.from_pretrained(model, temp_adapter_dir)

# 6. Test the model
prompt = "What are trapezoidal central configurations?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=2048,  # Increased for more detailed explanation
    temperature=0.6,     # Lower temperature for more precise outputs
    # do_sample=True,
    # top_p=0.95
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


What are central configurations in the n-body problem? Explain with equations. Central configurations in the n-body problem are special arrangements of the bodies such that the gravitational force on each body is proportional to the position vector of that body relative to the center of mass of the system. In other words, the configuration is such that the acceleration of each body is proportional to its position vector.

To understand this mathematically, let's consider the n-body problem with masses \( m_1, m_2, \ldots, m_n \) located at positions \( \mathbf{r}_1, \mathbf{r}_2, \ldots, \mathbf{r}_n \) in \(\mathbb{R}^d\). The position vector of the center of mass is given by

\[
\mathbf{R} = \frac{1}{M} \sum_{i=1}^n m_i \mathbf{r}_i,
\]

where \( M = \sum_{i=1}^n m_i \) is the total mass of the system.

The acceleration of the \( i \)-th body is given by

\[
\mathbf{a}_i = \frac{d^2 \mathbf{r}_i}{dt^2} = \frac{d}{dt} \left( \frac{d \mathbf{r}_i}{dt} \right) = \frac{d}{dt} \left( \fra

In [5]:
prompt = "What are trapezoidal central configurations?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=2048,  # Increased for more detailed explanation
    temperature=0.6,     # Lower temperature for more precise outputs
    # do_sample=True,
    # top_p=0.95
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


What are trapezoidal central configurations? And how are they related to the spatial 4-body problem?
Trapezoidal central configurations are a type of central configuration in the spatial 4-body problem. In this context, a central configuration is a special arrangement of the bodies in which the acceleration vector of each body is proportional to the position vector of that body with respect to the center of mass of the system.

In the case of trapezoidal central configurations, the bodies are arranged in a trapezoidal shape, with two bodies at the top and two bodies at the bottom. The bodies at the top are connected by a line segment, and the bodies at the bottom are also connected by a line segment. The two line segments are parallel to each other, and the two bodies at the top are connected to the two bodies at the bottom by line segments that are perpendicular to the line segments connecting the bodies at the top and bottom.

The spatial 4-body problem is a problem in classical mech

In [7]:
prompt = "Explain the concept of bifurcation and the stacking of central configurations in the planar $1+4$ body problem. Include relevant mathematical equations or expressions."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=2048,  # Increased for more detailed explanation
    temperature=0.6,     # Lower temperature for more precise outputs
    # do_sample=True,
    # top_p=0.95
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Explain the concept of bifurcation and the stacking of central configurations in the planar $1+4$ body problem. Include relevant mathematical equations or expressions. Bifurcation in the context of the planar $1+4$ body problem refers to the phenomenon where a central configuration (CC) of the system undergoes a qualitative change as a parameter is varied. A central configuration is a special arrangement of the bodies such that the acceleration vector of each body is proportional to the position vector of that body relative to the center of mass of the system. In the planar $1+4$ body problem, we have one body at the origin and four bodies at the vertices of a square.

To understand the concept of bifurcation, let's consider the planar $1+4$ body problem with masses $m_1, m_2, m_3, m_4$ at the vertices of a square and mass $m_5$ at the origin. The positions of the bodies are given by:
\[
\mathbf{r}_1 = (0,0), \quad \mathbf{r}_2 = (a,0), \quad \mathbf{r}_3 = (a,a), \quad \mathbf{r}_4 = 

In [9]:
prompt = (
    "You are an expert on central configurations in mathematical physics. "
    "Please explain what co-circular central configurations are and show one derivation equation step-by-step."
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=2048,  # Increased for more detailed explanation
    temperature=0.7,     # Lower temperature for more precise outputs
    do_sample=True,
    top_p=0.95,
    repetition_penalty = 1.1
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


You are an expert on central configurations in mathematical physics. Please explain what co-circular central configurations are and show one derivation equation step-by-step. Co-circular central configurations refer to a type of central configuration in which the particles lie on a common circle.

Consider three point masses $m_{1}, m_{2}$, and $m_{3}$ located at vertices of a triangle with positive area $\Delta$. Let $x_{i j}$ be the distance between masses $m_{i}$ and $m_{j}$. The mutual gravitational potential energy is given by

$$
U=\frac{1}{2} \sum_{1 \leq i<j \leq 3} m_{i} m_{j} / x_{i j}
$$

Let $r$ denote the radius of the circumscribed circle. Without loss of generality we can assume that the center of mass is at the origin, then there exist $\theta_{i} \in[0,2 \pi)$ such that $x_{i}=e^{i \theta_{i}}$, where $x_{i}=x_{i 1} e^{i \theta_{i}}$ for $i=1,2,3$. In terms of $\left(\theta_{i}\right)_{i=1}^{3}$, we have

$$
\begin{gathered}
x_{12}=r|\sin (\theta_{1}-\theta_{2})| \\
x_

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-30b")
model = AutoModelForCausalLM.from_pretrained("facebook/galactica-30b")

tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/67.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

pytorch_model-00001-of-00007.bin:   0%|          | 0.00/9.79G [00:00<?, ?B/s]

pytorch_model-00002-of-00007.bin:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

pytorch_model-00003-of-00007.bin:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

In [14]:
# Move the model to GPU
model = model.to("cuda")

input_text = ("Explain the concept of bifurcation and the stacking of central configurations in the planar "
              "$1+4$ body problem. Include relevant mathematical equations or expressions.")
# Transfer input tokens to GPU
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

# Generate output tokens
outputs = model.generate(input_ids, max_length=1000)

# Decode and print the generated text
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Explain the concept of bifurcation and the stacking of central configurations in the planar $1+4$ body problem. Include relevant mathematical equations or expressions.


Answer:

I'm not sure what you mean by "central configuration", but I think you mean "a configuration of the bodies where the sum of the gravitational forces on each body is zero".
The [central configuration](https://en.wikipedia.org/wiki/Central_configuration) is a configuration of the bodies where the sum of the gravitational forces on each body is zero.
The [bifurcation](https://en.wikipedia.org/wiki/Bifurcation_theory) is a change in the number of solutions of a system of equations.
The [stacking](https://en.wikipedia.org/wiki/Stacking_(dynamics)) is a change in the number of solutions of a system of equations.




In [None]:
# Move the model to GPU
model = model.to("cuda")

input_text = ("Explain the concept of bifurcation and the stacking of central configurations in the planar "
              "$1+4$ body problem. Include relevant mathematical equations or expressions.")
# Transfer input tokens to GPU
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

# Generate output tokens
outputs = model.generate(input_ids, max_length=1000)

# Decode and print the generated text
print(tokenizer.decode(outputs[0], skip_special_tokens=True))