<a href="https://colab.research.google.com/github/shamvrueth/genai-playground/blob/main/Synthetic_test_data_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the CUDA 12.4 builds of the PyTorch stack, pinned, from the PyTorch wheel index.
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
# Quantization / model-loading libraries are pinned; requests and openai are left unpinned.
# NOTE(review): gradio and huggingface_hub are imported later in this notebook but are not
# installed here - presumably they come with the Colab runtime or as transitive
# dependencies; confirm before running in a different environment.
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m538.8 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m996.4 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [None]:
# Read the Hugging Face access token from the Colab secret named HF_TOKEN
# instead of hard-coding it in the notebook.
hf_token = userdata.get('HF_TOKEN')

# Authenticate with the Hugging Face Hub using that token.
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Hugging Face Hub repository id of the checkpoint; the same id is passed to both
# AutoTokenizer.from_pretrained and AutoModelForCausalLM.from_pretrained below.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
# System-role instructions for the model: they define its role as a synthetic test
# data generator plus the expected output format, data requirements, quality
# standards and coverage areas. Sent as the first message of every conversation.
system_prompt = """ROLE: Synthetic test data generator for comprehensive testing coverage.

TASK: Generate realistic, diverse test data including positive cases, negative cases, edge cases, and boundary conditions.

OUTPUT FORMAT:
- JSON/CSV/specified format
- Minimum 100 records unless specified
- Include data validation rules applied
- Brief context description

DATA REQUIREMENTS:
- Realistic values appropriate for domain
- Geographic/cultural variations where relevant
- Temporal consistency (dates, sequences)
- Statistical distribution reflecting real usage (80% common scenarios, 20% edge cases)

QUALITY STANDARDS:
- No real personal information
- Consistent data types and constraints
- Cross-field logical relationships maintained
- Privacy-compliant synthetic data only

COVERAGE AREAS:
- Positive: Valid inputs meeting all criteria
- Negative: Invalid inputs testing error handling
- Edge: Boundary values and limits
- Boundary: Minimum/maximum acceptable ranges

Adapt generation based on specified domain (healthcare, finance, e-commerce, etc.) and maintain appropriate compliance standards.
"""

# Sample user request: asks for one employee record and lists, in square brackets,
# the sections the generated document must contain. Used as the user-role message
# of the `messages` list built in the next cell.
user_prompt = """Please generate an employee record for an employee at the firm. The employee can have a technical or a business role. You should have the following sections in the document:
[Full Name]
[Employee ID/Staff Number]
[Date of Birth]
[Address (including City, State, Postal Code, and Country)]
[Contact Numbers (Mobile, Home, Office)]
[Email Addresses (Work and Personal)]
[Emergency Contact Details (Name, Relationship, Phone)]
[Job Title/Position]
[Department]
[Date of Joining/Hire Date]
[Employment Status (Active, Terminated, Leave of Absence)]
[Employment Type (Permanent, Contract, Part-time, Intern)]
[Manager/Supervisor]
[Salary/Compensation Details]
[Bank Account Information]
[Tax Identification Number or Social Security Number]
[Right to Work/Visa or Permit Details]
[Benefit Group/Benefits Enrolled]
[Work Location/Office Site]
[Work Schedule/Shift Pattern]
[Leave Records and Balances]
[Performance Review History]"""

In [None]:
# Two-turn conversation in role/content chat format: the system instructions
# followed by the sample employee-record request.
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_prompt),
]

In [None]:
# bitsandbytes settings handed to from_pretrained when the model is loaded.
quant_config = BitsAndBytesConfig(
    # Load the weights in 4-bit form, using the "nf4" quantization type.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Enable double quantization and use bfloat16 as the compute dtype.
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer for the checkpoint; the end-of-sequence token doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM from the same checkpoint with the quantization settings above,
# letting device_map="auto" decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
def generator(message, history):
  """Stream the model's reply to ``message`` for the Gradio chat interface.

  The conversation sent to the model is the system prompt, then ``history``,
  then ``message`` as the newest user turn. Generation runs on a background
  thread while this generator relays text from the streamer.

  Args:
    message: Text of the latest user turn.
    history: Earlier turns as a list of role/content dicts (the format the
      chat interface passes when created with ``type="messages"``).

  Yields:
    The reply accumulated so far; each yield extends the previous one with
    the newly decoded text.
  """
  messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": message}]

  # return_dict=True returns the attention mask alongside the input ids. Passing
  # it explicitly matters because pad_token is set to eos_token, so generate()
  # cannot reliably infer the mask on its own.
  # The tensors follow the model's own device instead of a hard-coded "cuda",
  # since the model is placed with device_map="auto".
  inputs = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
      return_dict=True,
  ).to(model.device)

  # skip_prompt / skip_special_tokens: only the newly generated, human-readable
  # text is emitted by the streamer.
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
  generation_kwargs = {
      "input_ids": inputs["input_ids"],
      "attention_mask": inputs["attention_mask"],
      "streamer": streamer,
      "max_new_tokens": 2000,
      "do_sample": True,
      "top_p": 0.9,
      "temperature": 0.7,
  }

  # generate() blocks until it finishes, so it runs on a worker thread while
  # this generator consumes the streamer.
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
  thread.start()

  response = ""
  for new_text in streamer:
    response += new_text
    yield response

  # The streamer is exhausted, so generation is done; reap the worker thread.
  thread.join()

In [None]:
import gradio as gr
# Wrap the streaming generator in a chat UI that passes history as
# role/content messages, then start the Gradio app.
chat_ui = gr.ChatInterface(fn=generator, type="messages")
chat_ui.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9d9de682357b1bb50c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


