In [4]:
import os
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from tqdm import tqdm

In [None]:

# Step 1: Prepare the dataset
def load_tex_files(directory):
    files = glob.glob(os.path.join(directory, "*.tex"))
    data = ""
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            data += f.read() + "\n"
            print(f"Loaded {file}")
    return data

train_data = load_tex_files("data/latex_files/")

# Write the training data to a file (needed for TextDataset)
with open("train_data.txt", 'w', encoding='utf-8') as f:
    f.write(train_data)

# Step 2: Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Step 3: Create the dataset and data collator
def create_dataset(file_path, tokenizer):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128
    )

train_dataset = create_dataset("train_data.txt", tokenizer)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Step 4: Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
)

# Step 5: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Step 6: Start training
print("Starting training")
trainer.train()
print("Training complete")

# Step 7: Save the model
model.save_pretrained("./finetuned_gpt2")
tokenizer.save_pretrained("./finetuned_gpt2")
print("Model saved")


In [7]:

def debug_message(message):
    print(f"[DEBUG] {message}")

# Set data path
data_path = "data"
debug_message(f"Data path set to: {data_path}")

latex_file_path = os.path.join(data_path, 'latex_files')


# Tokenizer and model initialization

debug_message("Initializing tokenizer and model")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
debug_message("Tokenizer and model initialized")

# Load dataset from folder full of LaTeX files
dataset = load_dataset('text', data_dir=latex_file_path)["train"]

print(dataset,latex_file_path)

# Prepare data collator

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
debug_message("Data collator prepared")


# Training arguments

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Adjust based on your GPU memory
    save_steps=10_000,
    save_total_limit=2,
)
# debug_message(f"Training arguments set: {training_args}")

# Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
debug_message("Trainer initialized")

# Start training

debug_message("Starting training")
trainer.train()
debug_message("Training completed")


[DEBUG] Data path set to: data
[DEBUG] Initializing tokenizer and model
[DEBUG] Tokenizer and model initialized


Resolving data files:   0%|          | 0/517 [00:00<?, ?it/s]

Dataset({
    features: ['text'],
    num_rows: 478599
}) data\latex_files
[DEBUG] Data collator prepared
[DEBUG] Trainer initialized
[DEBUG] Starting training


  0%|          | 0/1435797 [00:00<?, ?it/s]

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

In [None]:
# save model
model_path = os.path.join(data_path, 'model')
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
debug_message(f"Model saved to {model_path}")


In [None]:
# Load model
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
debug_message("Model loaded")

In [None]:
import torch
# Generate text
prompt = """\
TITLE: Riemann Hypothesis
What is the Riemann Hypothesis?
"""
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Set the model to evaluation mode
model.eval()

# Move model and inputs to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
input_ids = input_ids.to(device)

# Generate tokens one by one
output_sequences = input_ids
generated_sequence = []

for _ in range(1000):  # Adjust the number of tokens to generate
    # Get the model's output
    with torch.no_grad():
        outputs = model(output_sequences)
    
    # Get the next token logits and find the most probable token
    next_token_logits = outputs.logits[:, -1, :]
    
    temperature = 0.9
    next_token_logits = next_token_logits / temperature
    next_token_id = torch.multinomial(torch.nn.functional.softmax(next_token_logits, dim=-1), num_samples=1)
    
    # Append the token id to the output sequence
    output_sequences = torch.cat((output_sequences, next_token_id), dim=1)
    
    # Decode the token and print it
    generated_token = tokenizer.decode(next_token_id.squeeze(), skip_special_tokens=True)
    generated_sequence.append(generated_token)
    print(generated_token, end='', flush=True)

# Join all tokens to form the final generated text
generated_text = "".join(generated_sequence)
print("\nGenerated text:")
print(generated_text)
