In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, get_scheduler
import torch

In [None]:
def load_model(model_path, from_hub=False, subfolder=None):
    """Load a causal language model together with its tokenizer.

    Args:
        model_path: Model identifier or directory passed to ``from_pretrained``.
        from_hub: Only selects the wording of the progress message; it is not
            forwarded to the loaders.
        subfolder: Optional subfolder forwarded to both ``from_pretrained``
            calls. When ``None`` the argument is omitted entirely.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is requested with
        ``torch_dtype=torch.bfloat16``.
    """
    source = "Hugging Face Hub" if from_hub else "local path"
    # Report the actual path being loaded rather than the literal text
    # "model_path".
    print(f"Loading {model_path} from {source}...")

    # Forward `subfolder` only when the caller supplied one, so the loaders
    # apply their own default instead of receiving an explicit None.
    location_kwargs = {} if subfolder is None else {"subfolder": subfolder}

    tokenizer = AutoTokenizer.from_pretrained(model_path, **location_kwargs)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        **location_kwargs,
    )
    print(tokenizer)
    return tokenizer, model

def setup_device():
    """Choose the torch device string to run on and announce it.

    Returns:
        ``"cuda"`` when ``torch.cuda.is_available()`` reports a usable GPU,
        otherwise ``"cpu"``.
    """
    if torch.cuda.is_available():
        chosen = "cuda"
    else:
        chosen = "cpu"

    print(f"Using device: {chosen}")
    return chosen

def generate_response(question, tokenizer, model, device):
    """Send one chat-formatted question to the model and print its reply.

    The question is wrapped as a single user message, rendered through the
    tokenizer's chat template (with the generation prompt appended), and
    decoded greedily for at most 50 new tokens. The raw question, the
    templated prompt and the generated continuation are printed; nothing is
    returned.

    Args:
        question: Text of the user turn.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``
            and ``decode``.
        model: Model providing ``generate``. It is assumed to already live on
            ``device``; only the inputs are moved here.
        device: Device the tokenized inputs are moved to.
    """
    print("Input:")
    print(question)

    # The chat template is what instruction-tuned models were trained on, so
    # render the question through it instead of feeding the raw text.
    messages = [{"role": "user", "content": question}]
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print(input_text)

    # Chat templates normally emit their own special tokens, so tell the
    # tokenizer not to add another set. Calling the tokenizer (instead of
    # `encode`) also produces the attention mask that `generate` expects.
    inputs = tokenizer(
        input_text, return_tensors="pt", add_special_tokens=False
    ).to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Greedy decoding: `temperature` only applies when sampling, so it is not
    # passed alongside do_sample=False.
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        repetition_penalty=1.1,
        do_sample=False,
    )

    # Drop the prompt by token count rather than by character offset: the
    # decoded text is not guaranteed to reproduce `input_text` exactly, which
    # would shift a character-based slice.
    generated_tokens = outputs[0][prompt_length:]

    # Special tokens are kept so end-of-turn markers stay visible in the output.
    response = tokenizer.decode(generated_tokens, skip_special_tokens=False).strip()
    print("\nModel Response:")
    print(response)


def main():
    """Interactively choose a model, load it, and print replies to sample prompts.

    The user's answer is looked up in a fixed table of model locations. Any
    unrecognised answer falls back to ``./finetuned_model_small_messages``.
    The chosen model is moved to the device reported by ``setup_device`` and
    each sample question is passed to ``generate_response``.
    """
    # Maps the typed answer to (model_path, from_hub).
    # NOTE(review): the prompt below advertises options 1-5 with labels that do
    # not obviously match this table, and "B" is accepted but not advertised.
    # Confirm the intended mapping before relying on the labels.
    model_options = {
        "B": ("HuggingFaceTB/SmolLM-135M-Instruct", True),
        "1": ("Trellis/SmolLM-135M-Instruct-layer-pruned-90M-raw-SFT-6000rows-lr0.001-20240924", False),
        "2": ("RS545837/TrellisLM-smolla-distill-1", True),
        "3": ("Trellis/SmolLM-100M-layer-hidden-pruned", True),
        "4": ("Trellis/SmolLM-100M-Instruct-layer-hidden-pruned", True),
        "5": ("Trellis/99-instruct-v9", True),
    }
    default_option = ("./finetuned_model_small_messages", False)

    # Trim surrounding whitespace so an answer such as "1 " is still recognised
    # instead of silently selecting the default model.
    load_option = input("Load model from (1) local path or (2) Hugging Face Hub (manual) or (3) instruct layer-pruned or (4) hidden-pruned or (5) 99-v9 instruct: ").strip()

    if load_option in model_options:
        model_path, from_hub = model_options[load_option]
    else:
        print("Invalid option. Defaulting to local path.")
        model_path, from_hub = default_option

    device = setup_device()
    tokenizer, model = load_model(model_path, from_hub)
    model = model.to(device)

    questions = [
        "What is the capital of France?",
        "One, two, three, ",
        "After dinner, I ",
        "The wheels on the bus go ",
        "What is the universe?",
        "How do you calculate the derivative of a function?",
        "How to kill?",
    ]

    separator = "=" * 50
    for question in questions:
        print("\n" + separator)
        generate_response(question, tokenizer, model, device)
        print(separator)

# Run the interactive demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()