In [2]:
def PrepData_for_SFT(tokenizer, *, limit=500,
                     qa_path='/content/sample_data/qa_Beauty.csv',
                     meta_path="/content/sample_data/meta_Beauty.csv") -> "Dataset":
  """
  Prepare the Amazon Beauty Q&A dataset for supervised fine-tuning (SFT).

  Steps:
  1. Reads the Q&A CSV (one Python dict literal per row) and expands it into columns.
  2. Reads the product metadata CSV (same one-literal-per-row layout) and keeps
     the `asin` and `title` columns, one row per `asin`.
  3. Left-joins the product title onto the Q&A rows via `asin`.
  4. Keeps at most `limit` rows that have both a question and an answer.
  5. Builds one text per row: "'question': <title>. <question>'answer': <answer>"
     (the "<title>. " prefix is omitted when no title was found for the product).
  6. Passes each text through `standardize_custom_format`, wraps the result in a
     HuggingFace `Dataset` and runs `standardize_sharegpt` on it.
  7. Applies the "qwen-2.5" chat template to `tokenizer` and maps
     `formatting_prompts_func` over the dataset with the beauty system prompt.

  This function does NOT load a model; the caller supplies the tokenizer.

  Args:
    tokenizer: Tokenizer handed to `get_chat_template`; the templated tokenizer
      is then forwarded to `formatting_prompts_func`.
    limit: Maximum number of examples to return (default 500, chosen because a
      free Colab kernel crashed with 1000 rows).
    qa_path: Path of the Q&A CSV.
    meta_path: Path of the product metadata CSV.

  Returns:
    The formatted dataset, containing at most `limit` examples.

  Raises:
    ValueError: If `limit` is negative.
    KeyError: If the parsed files lack the `asin`, `title`, `question` or
      `answer` columns.
  """
  if limit < 0:
    raise ValueError(f"limit must be non-negative, got {limit}")

  # Source: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/links.html
  # Beauty, 5-core (198,502 reviews). The json file was opened with Excel and
  # saved as a csv, so every row holds a single Python dict literal.
  beauty_qa = pd.read_csv(qa_path, names=["conversation"])
  # ast.literal_eval only evaluates literals, so it is safe on file content.
  qa_records = beauty_qa["conversation"].apply(ast.literal_eval)
  # One DataFrame construction instead of `.apply(pd.Series)`, which builds a
  # Series per row and is very slow on large files.
  df_qa_expanded = pd.DataFrame(qa_records.tolist(), index=qa_records.index)
  print(df_qa_expanded.head(3))

  # The metadata maps each `asin` in the Q&A data to the product name, hence
  # the left join on 'title' further down.
  # Source: same page, Beauty metadata (259,204 products), converted the same way.
  # NOTE: on_bad_lines='skip' silently drops malformed rows; products on those
  # rows simply end up without a title.
  beauty_meta_data = pd.read_csv(meta_path, names=["metadata"], on_bad_lines='skip')
  # NOTE: this changes a global pandas display option and stays in effect after
  # the function returns (kept because later notebook cells may rely on it).
  pd.set_option('display.max_colwidth', None)
  print(beauty_meta_data.head(3))

  meta_records = beauty_meta_data["metadata"].apply(ast.literal_eval)
  beauty_meta_data_expanded = pd.DataFrame(meta_records.tolist(), index=meta_records.index)
  # Only the title is used downstream. De-duplicate on the join key so that a
  # repeated `asin` in the metadata cannot multiply Q&A rows in the left merge.
  beauty_meta_data_expanded = (
      beauty_meta_data_expanded[['asin', 'title']]
      .drop_duplicates(subset='asin', keep='first')
  )
  print(beauty_meta_data_expanded.head(3))

  beauty_qa_merged = pd.merge(df_qa_expanded, beauty_meta_data_expanded, on='asin', how='left')
  beauty_qa_merged = beauty_qa_merged[['asin', 'title', 'question', 'answer']]
  # Drop incomplete pairs (they would otherwise be trained on the text "nan")
  # and cut to `limit` rows now, so the standardising/formatting steps below
  # only process rows that can be returned. The fresh RangeIndex keeps
  # Dataset.from_pandas from adding an index column.
  beauty_qa_merged = (
      beauty_qa_merged
      .dropna(subset=['question', 'answer'])
      .head(limit)
      .reset_index(drop=True)
  )
  print(beauty_qa_merged.head(3))

  # Products without a metadata match have a NaN title; a plain astype(str)
  # would inject the literal text "nan" into the prompt, so the "<title>. "
  # prefix is only added when a title exists.
  title_prefix = (beauty_qa_merged.title.astype(str) + '. ').where(
      beauty_qa_merged.title.notna(), ''
  )
  beauty_qa_merged = beauty_qa_merged.assign(
      conversation="'question': " + title_prefix
      + beauty_qa_merged.question.astype(str)
      + "'answer': " + beauty_qa_merged.answer.astype(str)
  )
  print(beauty_qa_merged.head(3))

  beauty_qa_merged["standardized"] = beauty_qa_merged["conversation"].apply(standardize_custom_format)
  print(beauty_qa_merged["standardized"].head(3))

  beauty_qa_standardized = Dataset.from_pandas(beauty_qa_merged["standardized"].to_frame())
  beauty_qa_standardized = standardize_sharegpt(beauty_qa_standardized)

  STANDARD_SYSTEM_PROMPT = "You are a helpful AI assistant specialized in beauty and cosmetics. User will mention beauty product name and then ask a related question. Provide a clear and accurate answer."

  tokenizer = get_chat_template(
      tokenizer,
      chat_template = "qwen-2.5",
  ) # applies correct model-specific chat formatting rules

  beauty_qa_formatted = beauty_qa_standardized.map(
      formatting_prompts_func,
      batched=True,
      fn_kwargs={"STANDARD_SYSTEM_PROMPT": STANDARD_SYSTEM_PROMPT, "tokenizer": tokenizer},
  )
  # Safety net: the rows were already limited above, which assumes the helper
  # functions are row-wise (TODO confirm for formatting_prompts_func). If a
  # batched map ever emits extra rows, still honour `limit`. A bounded select
  # is used instead of `.take(limit)` so a short dataset is not an error.
  if len(beauty_qa_formatted) > limit:
    beauty_qa_formatted = beauty_qa_formatted.select(range(limit))
  return beauty_qa_formatted