# Obtaining model generations for the Qwen2.5 model family

NB: due to computational constraints, this notebook was only run on Kaggle. I did not rewrite the code to make it suitable for running locally; e.g., the data imports (the `/kaggle/input/...` dataset path) are still Kaggle-style. If you want to run this notebook, either upload it and the dataset to Kaggle, or contact me and I will provide the link to my Kaggle notebook.

## Installations and imports necessary to run the code

In [1]:
# !pip install optimum
# !pip install auto-gptq

In [2]:
import pandas as pd
import transformers
import torch
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import string

## Reading the dataset

In [None]:
# Load the prompt table from the Kaggle input directory; the "prompt" column of
# this frame is what the generation helpers below iterate over.
df_color_prompts = pd.read_excel(
    "/kaggle/input/sem-leakage/semantic_leakage_colors.xlsx",
    sheet_name="Sheet1",
)

## Functions for response generation

In [None]:
# Default generation settings used by the helper functions in this notebook:
# sampling is enabled with temperature 0.5, and each response is capped at
# 10 newly generated tokens.
generation_config = dict(
    max_new_tokens=10,
    do_sample=True,
    temperature=0.5,
)


def encode_and_generate(prompt, model, tokenizer, generation_config=generation_config):
    """
    Encode a prompt and generate a response using the given model and tokenizer.

    The prompt is tokenized as a batch of one, moved to the model's device,
    passed to `model.generate`, and only the tokens produced after the prompt
    are decoded.

    Args:
        prompt (str): The input prompt to encode and generate from.
        model (transformers.PreTrainedModel): The model used for generation.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for encoding and decoding.
        generation_config (dict, optional): Keyword arguments forwarded to `model.generate`.
            Defaults to the module-level `generation_config`. If it contains a
            `pad_token_id` entry, that value is used; otherwise the tokenizer's
            EOS token id is used as the pad token.

    Returns:
        str: The generated response text (prompt tokens and special tokens removed).
    """
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # Merge the pad-token default into the kwargs instead of passing it as a
    # separate keyword: a `pad_token_id` key in `generation_config` would
    # otherwise raise "got multiple values for keyword argument".
    generation_kwargs = {"pad_token_id": tokenizer.eos_token_id, **generation_config}

    generated_ids = model.generate(
        **model_inputs,
        **generation_kwargs
    )

    # Each output sequence is sliced at the length of its input sequence so
    # that only the continuation (not the echoed prompt) is decoded.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # The batch holds exactly one prompt, so the first decoded item is the response.
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response


def get_one_response(prompt_from_df, model, tokenizer, generation_config=generation_config):
    """
    Produce one model completion for a dataframe prompt using the chat format.

    The prompt is extended with an answer slot, placed in a two-turn
    conversation (system instruction + user message), rendered to text with
    the tokenizer's chat template, and passed to `encode_and_generate`.

    Args:
        prompt_from_df (str): The input prompt from a dataframe.
        model (transformers.PreTrainedModel): The model used for generation.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for encoding and decoding.
        generation_config (dict, optional): Configuration for text generation. Defaults to `generation_config`.

    Returns:
        str: The generated response text.
    """
    system_turn = {
        "role": "system",
        "content": "You must complete the sentence with no more than five words.",
    }
    user_turn = {
        "role": "user",
        "content": prompt_from_df + " {your_answer}\nYour answer:",
    }

    # Render the conversation as plain text (no tokenization here) and append
    # the generation prompt via the template.
    chat_text = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

    return encode_and_generate(chat_text, model, tokenizer, generation_config=generation_config)


def collect_generations_for_one_model(input_df, model, tokenizer, n_generations=1, file_name="test.tsv",
                                      generation_config=None):
    """
    Collect multiple generations for a given model and save them to a file.

    For every generation round, `get_one_response` is applied to each value of
    the "prompt" column and the results are stored in a new column named
    `generation_<round>` (1-based). The resulting table is written as a
    tab-separated file without the index.

    Args:
        input_df (pd.DataFrame): The input dataframe containing prompts in a "prompt" column.
            It is copied, not modified.
        model (transformers.PreTrainedModel): The model used for generation.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for encoding and decoding.
        n_generations (int, optional): Number of generations to collect. Defaults to 1.
        file_name (str, optional): Name of the file to save the results. Defaults to "test.tsv".
        generation_config (dict, optional): Generation settings forwarded to `get_one_response`.
            Defaults to None, in which case `get_one_response` uses its own default configuration.

    Returns:
        pd.DataFrame: Dataframe with added generations for each prompt.
    """
    # Forward the config only when the caller supplied one, so that the
    # default of `get_one_response` stays in effect otherwise.
    extra_kwargs = {} if generation_config is None else {"generation_config": generation_config}

    df = input_df.copy()
    for i in tqdm(range(n_generations), desc="Generating responses"):
        df[f"generation_{i+1}"] = df["prompt"].apply(
            lambda x: get_one_response(x, model, tokenizer, **extra_kwargs)
        )
    df.to_csv(file_name, index=False, sep='\t')
    return df

## Getting output from Qwen2.5-0.5B

In [None]:
# Checkpoint identifier passed to both `from_pretrained` calls below.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# dtype and device placement are both left on "auto".
model_qwen_05_instr = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer_qwen_05_instr = AutoTokenizer.from_pretrained(model_name)

# Five generation rounds over the prompt table; results go to a TSV file.
df_with_gens_qwen_05_instr = collect_generations_for_one_model(
    input_df=df_color_prompts,
    model=model_qwen_05_instr,
    tokenizer=tokenizer_qwen_05_instr,
    n_generations=5,
    file_name="color_prompts_qwen_05_instr.tsv",
)

## Getting output from Qwen2.5-1.5B

In [None]:
# Checkpoint identifier passed to both `from_pretrained` calls below.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# dtype and device placement are both left on "auto".
model_qwen_15_instruct = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer_qwen_15_instruct = AutoTokenizer.from_pretrained(model_name)

# Five generation rounds over the prompt table; results go to a TSV file.
df_with_gens_qwen_15_instruct = collect_generations_for_one_model(
    input_df=df_color_prompts,
    model=model_qwen_15_instruct,
    tokenizer=tokenizer_qwen_15_instruct,
    n_generations=5,
    file_name="color_prompts_qwen_15_instruct.tsv",
)

## Getting output from Qwen2.5-3B

In [None]:
# Checkpoint identifier passed to both `from_pretrained` calls below.
model_name = "Qwen/Qwen2.5-3B-Instruct"

# dtype and device placement are both left on "auto".
model_qwen_3_instruct = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer_qwen_3_instruct = AutoTokenizer.from_pretrained(model_name)

# Five generation rounds over the prompt table; results go to a TSV file.
df_with_gens_qwen_3_instruct = collect_generations_for_one_model(
    input_df=df_color_prompts,
    model=model_qwen_3_instruct,
    tokenizer=tokenizer_qwen_3_instruct,
    n_generations=5,
    file_name="color_prompts_qwen_3_instruct.tsv",
)

## Getting output from Qwen2.5-7B (GPTQ-Int4 quantized instruct checkpoint)

In [None]:
# Checkpoint identifier passed to both `from_pretrained` calls below.
# NOTE(review): this is a GPTQ-Int4 checkpoint, unlike the other models loaded in
# this notebook; presumably it needs the `optimum` / `auto-gptq` packages from the
# commented-out install cell at the top — confirm before running.
model_name = "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4"

# dtype and device placement are both left on "auto".
model_qwen_7_instruct = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer_qwen_7_instruct = AutoTokenizer.from_pretrained(model_name)

# Five generation rounds over the prompt table; results go to a TSV file.
df_with_gens_qwen_7_instruct = collect_generations_for_one_model(
    input_df=df_color_prompts,
    model=model_qwen_7_instruct,
    tokenizer=tokenizer_qwen_7_instruct,
    n_generations=5,
    file_name="color_prompts_qwen_7_instruct.tsv",
)