In [None]:
# Shell command: install TRL (imported later for SFTTrainer) through uv's pip
# interface; `-q` keeps the output quiet. No version is pinned here.
!uv pip install -q trl

In [None]:
import os

import wandb

# Never keep the API key in the notebook source: read it from the environment
# (WANDB_API_KEY). When the variable is unset, `key=None` is passed and wandb
# falls back to its own credential lookup (see the wandb.login documentation).
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
from pathlib import Path
import pandas as pd

def load_proofs(base_path='/kaggle/input/diversity'):
    """Collect every proof text file under ``base_path`` into a DataFrame.

    Files are looked up at
    ``<base_path>/proofs/<problem_id>/<model>/<trial_id>.txt``; the three path
    components are stored verbatim as strings.

    Args:
        base_path: Directory that contains the ``proofs`` folder. Defaults to
            the Kaggle input location used by this notebook.

    Returns:
        pandas.DataFrame with the columns ``problem_id``, ``model``,
        ``trial_id`` and ``text``, one row per readable file, ordered by path.
        The columns are present even when no file is found.
    """
    columns = ['problem_id', 'model', 'trial_id', 'text']
    data_rows = []

    # The pattern matches exactly three levels below `proofs` (it is not a
    # recursive search). Sorting makes the row order independent of the order
    # in which the filesystem lists the entries.
    for file_path in sorted(Path(base_path).glob('proofs/*/*/*.txt')):
        try:
            text_content = file_path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Could not read file {file_path}: {e}")
            continue

        data_rows.append({
            'problem_id': file_path.parts[-3],  # directory two levels up
            'model': file_path.parts[-2],       # directory one level up
            'trial_id': file_path.stem,         # file name without extension
            'text': text_content,
        })

    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(data_rows, columns=columns)

In [None]:
import os
import json
from datasets import load_dataset

def construct_dataset(model_name, num_problems=40):
    """Build a prompt/completion dataset from miniF2F problems and stored proofs.

    For every selected problem the prompt is the informal prefix followed by
    the formal statement, and the completion is one proof text written by
    ``model_name`` (looked up in the frame returned by ``load_proofs``).

    Args:
        model_name: Key into ``proof_outcomes_by_model.json`` and value of the
            ``model`` column of the proofs frame.
        num_problems: Number of leading dataset rows to keep (default 40, the
            value previously hard-coded). ``None`` keeps every row.

    Returns:
        The mapped ``datasets.Dataset`` with ``prompt`` and ``completion``
        columns; ``name``, ``informal_prefix`` and ``formal_statement`` are
        removed.

    Raises:
        ValueError: If no proofs were loaded, or if a problem does not have
            exactly one stored proof for the chosen trial.
    """
    def format_dataset(elem):
        """Turn one dataset record into a ``{"prompt", "completion"}`` pair."""
        idx = name2id[elem['name']]
        trial_id = id2trial[idx]
        # Single .loc call with a row mask and a column label (no chained indexing).
        matches = model_proofs.loc[
            (model_proofs['problem_id'] == idx) & (model_proofs['trial_id'] == trial_id),
            'text',
        ]
        if len(matches) != 1:
            raise ValueError(
                f"Expected exactly one proof for problem {idx!r}, trial {trial_id!r}, "
                f"model {model_name!r}; found {len(matches)}"
            )
        prompt = f"{elem['informal_prefix']}\n\n{elem['formal_statement']}\n\nproof\n"
        return {"prompt": prompt, "completion": matches.iloc[0]}

    # Proofs: filter by model once instead of once per dataset row.
    df = load_proofs()
    if df.empty:
        raise ValueError("load_proofs() returned no proofs")
    model_proofs = df[df['model'] == model_name]

    # Pick one trial per problem.
    # Presumably each value is a list of per-trial outcomes in which 1 marks a
    # successful trial and trial ids are 1-based -- TODO confirm against the JSON.
    with open("/kaggle/input/diversity/proof_outcomes_by_model.json", 'r', encoding='utf-8') as file:
        model2outcome = json.load(file)
    id2trial = {}
    for key, val in model2outcome[model_name].items():
        try:
            id2trial[key] = str(val.index(1) + 1)
        except ValueError:
            # No entry equal to 1: keep the original fallback to trial '1'.
            # NOTE(review): this places a trial that is not marked successful
            # into the training data -- confirm that this is intended.
            id2trial[key] = '1'

    # Load the miniF2F dataset and map each problem name to its row index (as a
    # string), which is compared against the `problem_id` column of the proofs.
    raw_ds = load_dataset("AI-MO/minif2f_test", split="train")
    if num_problems is not None:
        raw_ds = raw_ds.select(range(num_problems))
    name2id = {name: str(i) for i, name in enumerate(raw_ds['name'])}

    # TODO (original author's note): "PLEASE REMOVE id in the future".
    sft_ds = raw_ds.map(
        format_dataset,
        remove_columns=['name', 'informal_prefix', 'formal_statement'],
    )
    return sft_ds

# Build the SFT dataset for one prover; the string is used both as the key into
# the outcomes JSON and as the `model` value matched in the proofs frame.
sft_ds = construct_dataset("stoney0062_Leanabell-Prover-DS-SFT")

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# Checkpoint to fine-tune and the directory that receives the training outputs.
MODEL_ID = "deepseek-ai/DeepSeek-Prover-V2-7B"
OUTPUT_DIR = "./deepseek-prover-finetuned"

# Tokenizer and causal-LM weights are loaded from the same checkpoint id; both
# calls pass trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # request float16 weights
    trust_remote_code=True,
    device_map="auto",
)

In [None]:
# Optimisation / logging / checkpointing settings for the run.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # schedule
    num_train_epochs=3,
    max_steps=-1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # batching
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # numerics
    fp16=True,
    max_grad_norm=0.3,
    # bookkeeping
    logging_steps=10,
    save_strategy="epoch",
)

# LoRA adapter settings; adapters target the modules named q_proj and v_proj.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Supervised fine-tuning on the prompt/completion dataset built earlier.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=sft_ds,
    peft_config=peft_config,
)

trainer.train()

In [None]:
# Same value as the OUTPUT_DIR assigned in the model-loading cell above, so this
# re-assignment changes nothing when the notebook is run top to bottom.
OUTPUT_DIR = "./deepseek-prover-finetuned"

In [None]:
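# NOTE: everything in this cell is commented out and never executed. It is a
# variant of the active training cell above; this copy omits `fp16=True` and
# does not pass `args` to SFTTrainer.
# peft_config = LoraConfig(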
#     r=16,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"], # These are common for many models
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# trainer = SFTTrainer(
#     model=model,
#     train_dataset=sft_ds,
#     peft_config=peft_config,  # LoRA configuration
# )



# training_args = TrainingArguments(
#     output_dir=OUTPUT_DIR,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=1,
#     learning_rate=2e-4,
#     num_train_epochs=3,
#     logging_steps=10,
#     save_strategy="epoch",
#     max_grad_norm=0.3,
#     max_steps=-1,
#     lr_scheduler_type="cosine",
#     warmup_ratio=0.03,
# )

In [None]:
# The training cell above already calls trainer.train(). Calling it again
# unconditionally would start a second full training pass when the notebook is
# run top to bottom, so train here only if this trainer has not completed any
# optimisation step yet (global_step is 0 on a freshly built trainer).
if trainer.state.global_step == 0:
    trainer.train()