In [None]:
# Step 1: Mount Drive manually
# Mounts Google Drive at /content/drive so files under it are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Step 2: Load setup function
# The colab_utils folder on Drive is added to the import path so colab_setup can be imported from it.
import sys
sys.path.append('/content/drive/MyDrive/colab_utils')

# NOTE(review): setup_colab_caching is defined outside this notebook; presumably it points
# package/model caches at Drive -- confirm against colab_utils/colab_setup.py.
from colab_setup import setup_colab_caching
setup_colab_caching()

In [None]:
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q --upgrade mlx_lm datasets==3.2.0 diffusers
!pip install -q transformers==4.48.3 requests bitsandbytes==0.46.0 accelerate==1.3.0


In [None]:
# Imports

import os
import requests
import torch
from IPython.display import Markdown, display, update_display
from google.colab import userdata
from huggingface_hub import login
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import gradio as gr

In [None]:
# Constants
# Hugging Face Hub model ids.

LLAMA = "meta-llama/Llama-3.2-3B-Instruct"  # not referenced by any other visible cell
QWEN = "Qwen/Qwen2.5-7B-Instruct"  # the model this notebook loads and generates with

In [None]:
# New capability - connect this Colab to my Google Drive
# data_filename is the text file on Drive that generated rows are appended to.

drive.mount("/content/drive")
data_filename = "/content/drive/MyDrive/llms/reviews_dataset.txt"

In [None]:
# Sign in to HuggingFace Hub
# The token is fetched from Colab user data under the key HF_TOKEN instead of being hardcoded here.

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [None]:
# System prompt sent with every request. It states the task (return only JSON rows), walks through
# one worked example (description + example rows + row count -> generated rows), and lists the fixed
# replies to use for an unclear description, an invalid example JSON row, or a non-numeric row count.
system_prompt = """
You are a synthetic dataset generator. You will be provided with a description of the type of data to generate, along with a JSON example of one or more data rows, and a number of rows to generate.
Your job is to generate a number of rows, according to the number of rows provided to you.

Return only JSON rows and nothing else.

Here is an example of what a user might ask:

Data description:
Generate an imaginary coffee shop in the Vietnamese city of Danang.
The coffee shop should have a new name, a star rating, and 3 customer reviews, and location as an address.
The star rating should be random from 1 to 5.
The customer reviews should be reflecting positive or negative experiences.

Example JSON row:
{"name" : "Phuong's beach cafe", "star_rating": 4, "customer_reviews": ["Great place by the beach!", "I had a cockroach inside my smoothie!", "It was raining but I still had fun there near the beach."], "location": "01 An Thuong 30, My An ward, Ngu Hanh Son district"}
{"name" : "The coffee factory", "star_rating": 1, "customer_reviews": ["It's a hellish place", "I had food poisoning!", "The staff were so rude!"], "location": "22 Nguyen Giap, An Hai ward, Son Tra district"}

Number of rows:
3

For this request, the generated result could be like:
{"name" : "Vietnamese Dream", "star_rating": 4, "customer_reviews": ["The coffee is amazing and so is the view!", "I loved the traditional Vietnamese desserts they offer!", "The service was fantastic and the ambiance was perfect for a date."], "location": "36 Ho Xuan Huong, Thanh Khe ward, Thanh Khe district"}
{"name" : "Nam Viet Brews", "star_rating": 5, "customer_reviews": ["The best coffee in Danang!", "Amazing atmosphere with live music on weekends!", "Staff is super friendly and accommodating."], "location": "34 Tran Phu street, Thanh Khe ward, Thanh Khe district"}
{"name" : "Danang View Espresso", "star_rating": 4, "customer_reviews": ["The view is breathtaking and the coffee is delicious!", "Service was amazing, really friendly staff!", "Had a great time here with friends, highly recommend!"], "location": "45 Nguyen Hue Avenue, Thanh Kiem ward, Hoa Khanh district"}

If the data description is not clear, respond only with:
The data description is not clear

If the example JSON row is not JSON or contains anything other than JSON, or is a malformed JSON, respond with:
The example JSON row is not JSON

If the number of rows is not a number, respond with:
The number of rows is not a number
"""

def get_messages(description, json_example, number):
  """Assemble the chat messages for one dataset-generation request.

  Args:
    description: free-text description of the data to generate.
    json_example: example JSON row(s) shown to the model.
    number: how many rows to ask for; interpolated into the prompt as-is.

  Returns:
    A two-item list: the system message carrying ``system_prompt``, then a
    user message with the three inputs under the headings the system prompt uses.
  """
  system_message = {"role": "system", "content": system_prompt}
  user_prompt = f"""
Data description:
{description}

Example JSON row:
{json_example}

Number of rows:
{number}
"""
  user_message = {"role": "user", "content": user_prompt}
  return [system_message, user_message]

In [None]:
# 4-bit NF4 weights with double quantization; compute dtype is bfloat16.
qwen_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized Qwen model (device placement and dtype left to "auto"), then its tokenizer.
qwen_model = AutoModelForCausalLM.from_pretrained(
    QWEN,
    quantization_config=qwen_quant_config,
    device_map="auto",
    torch_dtype="auto",
)
qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN)

def generate_qwen(messages, max_new_tokens=2048):
  """Generate one assistant reply from the Qwen model for a chat conversation.

  Args:
    messages: list of {"role": ..., "content": ...} dicts, rendered through the
      tokenizer's chat template.
    max_new_tokens: upper bound on the number of tokens generated for the reply.
      Defaults to 2048, the value that used to be hard-coded; raise it when a
      request for many rows gets cut off mid-row.

  Returns:
    The decoded reply text, with the prompt tokens removed and special tokens skipped.
  """
  prompt_text = qwen_tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True
  )
  model_inputs = qwen_tokenizer([prompt_text], return_tensors="pt").to(qwen_model.device)

  output_ids = qwen_model.generate(
      **model_inputs,
      max_new_tokens=max_new_tokens
  )
  # Each output sequence starts with its prompt tokens; slice them off so only the new text is decoded.
  completion_ids = [
      full_ids[len(prompt_ids):] for prompt_ids, full_ids in zip(model_inputs.input_ids, output_ids)
  ]

  return qwen_tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

In [None]:
# generate_qwen() requires the chat messages; the previous bare call raised TypeError.
# Build one sample request here and reuse it for each of the 10 batches appended to the dataset file.
# NOTE(review): the request below reuses the coffee-shop example from the system prompt -- swap in
# the description / example rows you actually want before running.
batch_messages = get_messages(
    description=(
        "Generate an imaginary coffee shop in the Vietnamese city of Danang.\n"
        "The coffee shop should have a new name, a star rating, and 3 customer reviews, "
        "and location as an address.\n"
        "The star rating should be random from 1 to 5.\n"
        "The customer reviews should be reflecting positive or negative experiences."
    ),
    json_example=(
        '{"name" : "Phuong\'s beach cafe", "star_rating": 4, '
        '"customer_reviews": ["Great place by the beach!", "I had a cockroach inside my smoothie!", '
        '"It was raining but I still had fun there near the beach."], '
        '"location": "01 An Thuong 30, My An ward, Ngu Hanh Son district"}'
    ),
    number=3,
)

with open(data_filename, "a", encoding="utf-8") as file:
  for _ in range(10):
    # One batch per line group; the trailing newline keeps batches from running together.
    file.write(generate_qwen(batch_messages) + "\n")

In [None]:
# Smoke test: generate_qwen() needs chat messages (a bare call raises TypeError), so send one small request.
# NOTE(review): the sample request is illustrative -- adjust as needed.
print(generate_qwen(get_messages(
    description="Generate a short customer review of an imaginary coffee shop.",
    json_example='{"review": "Great coffee and friendly staff!", "star_rating": 5}',
    number=1,
)))

In [None]:
def create_dataset(description, json_example, number):
  """Generate rows for the described dataset, append them to data_filename, and return them.

  Args:
    description: free-text description of the data to generate.
    json_example: example JSON row(s) for the model to imitate.
    number: number of rows to request (passed through to the prompt unchanged).

  Returns:
    The model's reply text, exactly as generated.

  Note:
    The reply is written to the file as-is, with no validation; whatever the
    model returns (including a non-JSON reply) ends up in the dataset file.
  """
  messages = get_messages(description, json_example, number)
  response = generate_qwen(messages)
  with open(data_filename, "a", encoding="utf-8") as file:
    file.write(response)
    # Terminate the batch so the next append starts on its own line instead of
    # being glued to the last row of this one.
    if not response.endswith("\n"):
      file.write("\n")
  return response

In [None]:
# Gradio stuff
# The three text inputs map positionally onto create_dataset(description, json_example, number);
# the returned text is rendered as Markdown.
description_box = gr.Textbox(label="Your dataset description:")
example_box = gr.Textbox(label="Your example JSON rows:")
row_count_box = gr.Textbox(label="number of rows to generate:")
response_view = gr.Markdown(label="Response:")

view = gr.Interface(
    fn=create_dataset,
    inputs=[description_box, example_box, row_count_box],
    outputs=[response_view],
    flagging_mode="never",
)
view.launch()