In [1]:
import os
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from tqdm import tqdm

def debug_message(message):
    """Write ``message`` to standard output with a ``[DEBUG]`` prefix."""
    tagged_line = "[DEBUG] {}".format(message)
    print(tagged_line)

# Working directory for the corpus; created on first run if it is missing
data_path = "data"
os.makedirs(data_path, exist_ok=True)
debug_message(f"Data path set to: {data_path}")

# Location of the training corpus inside `data_path`. Nothing in this cell
# writes the file, so it has to be in place before the dataset is loaded.
latex_file_path = os.path.join(data_path, 'latex_data.txt')

# Disabled helper kept for reference: writes a toy corpus (one LaTeX fragment
# per line) to `latex_file_path`. Enabling it overwrites any existing corpus.
#
# latex_texts = [
#     "This is an example of an equation: \\begin{equation} E = mc^2 \\end{equation}",
#     "Here is a figure: \\begin{figure} \\includegraphics{example.png} \\caption{An example figure} \\end{figure}",
#     "An itemized list: \\begin{itemize} \\item First item \\item Second item \\end{itemize}",
#     "A table: \\begin{table} \\begin{tabular}{c c} a & b \\end{tabular} \\end{table}"
# ]
# debug_message("Fetched LaTeX content from dataset")
# with open(latex_file_path, 'w') as f:
#     for text in tqdm(latex_texts, desc="Saving LaTeX texts"):
#         f.write(text + '\n')
# debug_message(f"LaTeX texts saved to {latex_file_path}")

# Pretrained GPT-2 tokenizer and language-model head; the embedding matrix is
# resized to the tokenizer's vocabulary size before training
debug_message("Initializing tokenizer and model")
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path='gpt2')
model.resize_token_embeddings(new_num_tokens=len(tokenizer))
debug_message("Tokenizer and model initialized")

# Prepare dataset
def load_dataset(file_path, tokenizer, block_size=64, overwrite_cache=False):
    """Tokenize a plain-text file into fixed-length samples for LM training.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Tokenizer used to encode the file contents.
        block_size: Number of tokens per sample. Defaults to 64, the value
            that used to be hard-coded here.
        overwrite_cache: Forwarded to ``TextDataset``. The dataset caches the
            tokenized file on disk; pass ``True`` after editing the corpus so
            stale cached features are not reused. Defaults to ``False``, which
            matches the previous behavior.

    Returns:
        The constructed ``TextDataset``.
    """
    debug_message(f"Loading dataset from {file_path}")
    # NOTE(review): TextDataset is marked deprecated in recent transformers
    # releases; migrating to the `datasets` library is the suggested path.
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )
    debug_message(f"Dataset loaded with {len(dataset)} samples")
    return dataset


# Build the training set from the LaTeX corpus file
dataset = load_dataset(file_path=latex_file_path, tokenizer=tokenizer)
print(dataset, latex_file_path)

# Batch collator configured with masked-LM disabled (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
debug_message("Data collator prepared")

# Run configuration: epochs, batch size and checkpointing under ./results
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Adjust based on your GPU memory
)

# Wire model, data and configuration together
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)
debug_message("Trainer initialized")

# Fine-tune the model on the corpus
debug_message("Starting training")
trainer.train()
debug_message("Training completed")


[DEBUG] Data path set to: data
[DEBUG] Initializing tokenizer and model
[DEBUG] Tokenizer and model initialized
[DEBUG] Loading dataset from data\latex_data.txt
[DEBUG] Dataset loaded with 160 samples
<transformers.data.datasets.language_modeling.TextDataset object at 0x000001D5985A27B0> data\latex_data.txt
[DEBUG] Data collator prepared




[DEBUG] Trainer initialized
[DEBUG] Starting training


  0%|          | 0/480 [00:00<?, ?it/s]

{'train_runtime': 59.9065, 'train_samples_per_second': 8.012, 'train_steps_per_second': 8.012, 'train_loss': 2.7125673929850262, 'epoch': 3.0}
[DEBUG] Training completed


In [2]:
# Write the model weights and the tokenizer files into one directory under
# `data_path`, so both can be reloaded from the same location
model_path = os.path.join(data_path, 'model')
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)
debug_message(f"Model saved to {model_path}")


[DEBUG] Model saved to data\model


In [3]:
# Rebind `model` and `tokenizer` to the copies stored under `model_path`
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=model_path)
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_path)
debug_message("Model loaded")

[DEBUG] Model loaded


In [7]:
import torch

# Prompt that seeds the generation
prompt = """\
TITLE: Riemann Hypothesis
What is the Riemann Hypothesis?
"""

# Sampling settings, hoisted out of the loop so they are easy to tune
max_new_tokens = 1000  # Adjust the number of tokens to generate
temperature = 0.9      # Logits are divided by this value before sampling

# Set the model to evaluation mode and move it to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
model.to(device)

input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
prompt_length = input_ids.shape[1]

# Keep prompt + generated tokens inside the model's positional-embedding
# table; running past it would fail inside the forward pass.
token_budget = max(0, min(max_new_tokens, model.config.n_positions - prompt_length))

# Generate tokens one by one
output_sequences = input_ids
generated_sequence = []

# With the key/value cache only the newest token is fed to the model on each
# step, instead of re-running the whole sequence every iteration.
step_input = input_ids
past_key_values = None

with torch.no_grad():
    for _ in range(token_budget):
        outputs = model(step_input, past_key_values=past_key_values, use_cache=True)
        past_key_values = outputs.past_key_values

        # Sample the next token from the temperature-scaled distribution
        # (this is random sampling, not a greedy arg-max)
        next_token_logits = outputs.logits[:, -1, :] / temperature
        next_token_probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
        next_token_id = torch.multinomial(next_token_probs, num_samples=1)

        # Stop once the model emits its end-of-text token
        if next_token_id.item() == tokenizer.eos_token_id:
            break

        # Append the token id to the output sequence
        output_sequences = torch.cat((output_sequences, next_token_id), dim=1)
        step_input = next_token_id

        # Stream a per-token preview. A character whose bytes span several
        # tokens can show up garbled here; the final text below is decoded
        # from the full id sequence and does not have that problem.
        generated_token = tokenizer.decode(next_token_id.squeeze(), skip_special_tokens=True)
        generated_sequence.append(generated_token)
        print(generated_token, end='', flush=True)

# Decode all generated ids in one call so multi-token characters are
# reassembled correctly (joining per-token decodes would corrupt them)
generated_text = tokenizer.decode(output_sequences[0, prompt_length:], skip_special_tokens=True)
print("\nGenerated text:")
print(generated_text)


~$ \osk eqn_{0}(~2)$ is a measure of the strength of regimes, not of the total number of atoms in the whole system. 
~\begin{equation}
\label{fig3}
\-----
\end{figure}
\ref{fig2}
In the case of the classical classical quantum energy $F(t)=\sum_{i=0}^2 + \sum_{j=0}^{i-j(t)}^2 \approx{n^2\omega_{t}$ where t is the time of state evolution, 
j and t the initial perturbation. In the case of perturbations in the bosons of $t=0$ the dynamics of the corresponding quantum state is perturbed by the coupling of two charged particles. 
This perturbation in the classical regime is due to the energy (or volume) $t=0$ (\sum_{j=0}^2}^2+\sum_{j=0}^{i-j(t)}^2$, 
which is a measure of the initial fidelity at position $t=0$. In the classical regime the above perturbations are seen in fig1~\ref{fig4}
while in the classical regime the perturbation is due to the interaction between two bosons. 
In the classical regime $F(t)=\sum_{j=0}^2+\sum_{j=0};$ the perturbation in the optical regime is perturbed by the 

KeyboardInterrupt: 