In [2]:
import os
import glob
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

In [3]:
# Step 1: Prepare the dataset
def load_tex_files(directory):
    """Concatenate every ``*.tex`` file found directly inside ``directory``.

    Files are read in sorted path order. ``glob.glob`` makes no ordering
    guarantee (it depends on the file system), so without sorting the
    training corpus could differ from one machine or run to the next.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        A single string holding the contents of each file, each followed by
        a newline. An empty string is returned when no ``*.tex`` file matches.
    """
    paths = sorted(glob.glob(os.path.join(directory, "*.tex")))
    chunks = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            chunks.append(f.read())
        print(f"Loaded {path}")
    # Join once at the end: repeated `str +=` re-copies the whole corpus
    # on every file, which is quadratic for a large collection.
    return "".join(chunk + "\n" for chunk in chunks)

# Gather all LaTeX sources into one in-memory corpus
train_data = load_tex_files("data/latex_files/")

# Persist the corpus as a single text file (needed for TextDataset)
with open("train_data.txt", mode='w', encoding='utf-8') as corpus_file:
    corpus_file.write(train_data)

# Step 2: Load the tokenizer and model from the same pretrained checkpoint
base_checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

# Step 3: Create the dataset and data collator
def create_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    Args:
        file_path: Path of the plain-text training file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Value forwarded as ``TextDataset``'s ``block_size``.
            Defaults to 128, the value this notebook previously hard-coded.

    Returns:
        The constructed ``TextDataset``.
    """
    # NOTE(review): recent transformers releases mark TextDataset as
    # deprecated in favour of the `datasets` library — confirm against the
    # installed version before relying on it long-term.
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

train_dataset = create_dataset(file_path="train_data.txt", tokenizer=tokenizer)

# mlm=False turns off masked-language-model masking in the collator
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Step 4: Set up the training arguments
training_options = dict(
    # where checkpoints go; existing contents may be overwritten
    output_dir="./results",
    overwrite_output_dir=True,
    # schedule
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # checkpointing
    save_steps=10_000,
    save_total_limit=2,
    # logging
    logging_dir="./logs",
    logging_steps=500,
)
training_args = TrainingArguments(**training_options)

# Step 5: Initialize the Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Step 6: Start training
print("Starting training")
trainer.train()
print("Training complete")


Loaded data/latex_files\filtered_0312009v3.tex
Loaded data/latex_files\filtered_0312013v1.tex
Loaded data/latex_files\filtered_0312028v2.tex
Loaded data/latex_files\filtered_0312038v1.tex
Loaded data/latex_files\filtered_0312055v2.tex
Loaded data/latex_files\filtered_0312066v2.tex
Loaded data/latex_files\filtered_0312069v1.tex
Loaded data/latex_files\filtered_0312069v2.tex
Loaded data/latex_files\filtered_0312125v1.tex
Loaded data/latex_files\filtered_0312126v3.tex
Loaded data/latex_files\filtered_0312132v1.tex
Loaded data/latex_files\filtered_0312140v1.tex
Loaded data/latex_files\filtered_0312145v1.tex
Loaded data/latex_files\filtered_0312154v1.tex
Loaded data/latex_files\filtered_0312159v1.tex
Loaded data/latex_files\filtered_0312176v1.tex
Loaded data/latex_files\filtered_0312178v5.tex
Loaded data/latex_files\filtered_0312180v2.tex
Loaded data/latex_files\filtered_0312205v1.tex
Loaded data/latex_files\filtered_0312224v1.tex
Loaded data/latex_files\filtered_0312226v1.tex
Loaded data/l



Starting training


  0%|          | 0/77073 [00:00<?, ?it/s]

{'loss': 3.2259, 'grad_norm': 4.537155628204346, 'learning_rate': 4.9675632192856124e-05, 'epoch': 0.02}
{'loss': 3.0059, 'grad_norm': 3.253880023956299, 'learning_rate': 4.9351264385712246e-05, 'epoch': 0.04}
{'loss': 2.905, 'grad_norm': 2.8629026412963867, 'learning_rate': 4.9026896578568374e-05, 'epoch': 0.06}
{'loss': 2.8503, 'grad_norm': 3.1725378036499023, 'learning_rate': 4.8702528771424496e-05, 'epoch': 0.08}
{'loss': 2.7571, 'grad_norm': 2.8066012859344482, 'learning_rate': 4.8378160964280624e-05, 'epoch': 0.1}
{'loss': 2.75, 'grad_norm': 2.6228888034820557, 'learning_rate': 4.8053793157136746e-05, 'epoch': 0.12}
{'loss': 2.755, 'grad_norm': 2.635725975036621, 'learning_rate': 4.772942534999287e-05, 'epoch': 0.14}
{'loss': 2.701, 'grad_norm': 2.624300718307495, 'learning_rate': 4.740505754284899e-05, 'epoch': 0.16}
{'loss': 2.7079, 'grad_norm': 2.581331253051758, 'learning_rate': 4.708068973570511e-05, 'epoch': 0.18}
{'loss': 2.6734, 'grad_norm': 2.5429723262786865, 'learning_

KeyboardInterrupt: 

In [7]:

# Step 7: Save the model
model_path = "./data/finetuned_gpt2"
# Model and tokenizer are written to the same directory, model first
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)
print("Model saved")


Model saved


In [8]:
# Load model: restore the model and tokenizer saved under `model_path`
model, tokenizer = (
    GPT2LMHeadModel.from_pretrained(model_path),
    GPT2Tokenizer.from_pretrained(model_path),
)
print("Model loaded")

Model loaded


In [10]:
import torch
# Generate text
prompt = """\
TITLE: How to cook pasta
Quick Tutorial (Concise)
"""
max_new_tokens = 1000  # Adjust the number of tokens to generate
temperature = 0.9      # Logits are divided by this before sampling

input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Set the model to evaluation mode
model.eval()

# Move model and inputs to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
input_ids = input_ids.to(device)

# GPT-2 has a fixed number of position embeddings (config.n_positions).
# Cap the budget so prompt + generated tokens never run past it, which
# would otherwise fail with an out-of-range position index.
context_limit = model.config.n_positions
token_budget = max(0, min(max_new_tokens, context_limit - input_ids.shape[1]))

# Generate tokens one by one
output_sequences = input_ids   # prompt + everything sampled so far
generated_ids = []             # sampled token ids only (no prompt)
generated_sequence = []        # text chunks in the order they were printed
emitted_chars = 0              # how much of the decoded text is already printed
past_key_values = None         # attention cache reused between steps
step_input = input_ids         # first step consumes the whole prompt

with torch.no_grad():
    for _ in range(token_budget):
        # With the cache, only the newest token has to be fed after step one,
        # instead of re-running the model over the whole sequence each time.
        outputs = model(step_input, past_key_values=past_key_values, use_cache=True)
        past_key_values = outputs.past_key_values

        # Sample the next token from the temperature-scaled distribution
        next_token_logits = outputs.logits[:, -1, :] / temperature
        next_token_probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
        next_token_id = torch.multinomial(next_token_probs, num_samples=1)

        # Append the token id to the output sequence
        output_sequences = torch.cat((output_sequences, next_token_id), dim=1)
        step_input = next_token_id
        generated_ids.append(next_token_id.item())  # batch size is 1 here

        # Decode everything generated so far rather than the single token:
        # one character can span several byte-level BPE tokens, and decoding
        # a token alone would print U+FFFD for each partial byte sequence.
        decoded = tokenizer.decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        if decoded.endswith("\ufffd"):
            # Trailing character may still be incomplete; wait for more tokens.
            continue
        chunk = decoded[emitted_chars:]
        emitted_chars = len(decoded)
        generated_sequence.append(chunk)
        print(chunk, end='', flush=True)

# Flush whatever was held back at the very end of generation
if generated_ids:
    decoded = tokenizer.decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    tail = decoded[emitted_chars:]
    if tail:
        generated_sequence.append(tail)
        print(tail, end='', flush=True)

# Join all chunks to form the final generated text
generated_text = "".join(generated_sequence)
print("\nGenerated text:")
print(generated_text)


Let's start with the simplest example: a pasta consisting of a pimento dressing
\int dv
&\qquad dv=2
\times \qquad dv=-4
\times \qquad
\qquad
\int dv
&\qquad dv=5
\times \qquad dv=4,
\end{align}
where $\nu=\nu_r$,
$\nu_r=-\frac{1}{1}$ and $a=2$. The first term corresponds to the surface heating of the first
bulge of the preteen of the $23$ holes possesses partially-transited protons, $a=2$ and the second
sizes to $a_{11}$ nuclei and $\zeta=0$.
\label{fig:easy}
The second term corresponds to the and now, again,
$\nu\sim 1$ protons from the tip of the ``totanka JF" model as described by Eq.\,\eqref{eq:obs}. It is the heat that takes place within the
self-consistent shell model which turns the exterior crust into a partially-rotating crust, before brany formation occurs.
\end{align}
%
We can use energies of the $a_{11}$ and $\zeta=0$ generation ($\beta=0$), while that of the $J_{23}$ current ($\mu=0$) produces the shell as a unit of energy. One
the shell model produces the chiral states d