## Synthetic Data Generation Using Few-Shot Prompting with Concurrency

In [3]:
import concurrent.futures
import pandas as pd
import json
import re

# Shared Groq API client used by generate_data(); the key is read from the
# GROQ_API_KEY environment variable (None is passed through if it is unset).
# NOTE(review): `Groq` and `os` are not imported in this cell — presumably an
# earlier cell provides them; confirm.
groq_api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=groq_api_key)

def generate_data(prompt_template):
    """Request one batch of synthetic reviews from the model and parse it.

    Sends ``prompt_template`` as the user message (preceded by a fixed system
    message) through the module-level ``client`` and extracts the JSON array
    embedded in the reply.

    Args:
        prompt_template: Full prompt text sent as the user message.

    Returns:
        The object decoded by ``json.loads`` from the bracketed span of the
        reply (expected to be a list of dicts, one per review).

    Raises:
        ValueError: If the reply is empty or contains no ``[...]`` span.
            ``json.JSONDecodeError`` (a ``ValueError`` subclass) propagates
            when the span is found but is not valid JSON.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You're a synthetic data generator expert in Synthetic data generation for customer reviews related to supplements and vitamins sold on Amazon."
            },
            {
                "role": "user",
                "content": prompt_template,
            }
        ],
        model="llama3-70b-8192",
        temperature=0.5,
    )

    # Treat a missing message body as an empty reply so it is reported by the
    # explicit check below instead of failing inside the regex engine.
    text = chat_completion.choices[0].message.content or ""

    # DOTALL lets the match span newlines; the greedy `.*` runs from the first
    # '[' to the last ']', so any chatter around the JSON array is dropped.
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if match is None:
        raise ValueError(
            f"Model response does not contain a JSON list: {text[:200]!r}"
        )

    return json.loads(match.group())


In [28]:
def concurrent_data_generation(n_batches, prompt_template, max_workers=5, generate_fn=None):
    """Generate ``n_batches`` batches concurrently and merge them into one DataFrame.

    Each batch is produced by calling ``generate_fn(prompt_template)`` in a
    thread pool. A batch that raises is reported on stdout and skipped; the
    remaining batches are still returned.

    Args:
        n_batches: Number of generation calls to submit.
        prompt_template: Prompt passed unchanged to every call.
        max_workers: Maximum number of threads running at once.
        generate_fn: Callable taking the prompt and returning data accepted
            by ``pd.DataFrame`` (e.g. a list of dicts). Defaults to
            ``generate_data``.

    Returns:
        A DataFrame with the rows of all successful batches and a fresh
        0..n-1 index. Rows follow batch completion order, which is not
        deterministic. An empty DataFrame is returned if no batch succeeded.
    """
    if generate_fn is None:
        generate_fn = generate_data

    # Collect per-batch frames and concatenate once at the end; growing a
    # DataFrame inside the loop copies all accumulated rows on every step.
    frames = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to its batch number so failures can be reported.
        future_to_batch = {
            executor.submit(generate_fn, prompt_template): batch
            for batch in range(n_batches)
        }

        for future in concurrent.futures.as_completed(future_to_batch):
            batch = future_to_batch[future]
            try:
                frames.append(pd.DataFrame(future.result()))
            except Exception as exc:
                # Best effort: one failed batch must not discard the others.
                print(f'Generated data batch {batch} encountered an error: {exc}')

    # pd.concat raises on an empty list, so handle "nothing succeeded" here.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def save_to_csv(df, file_name):
    """Write ``df`` to ``file_name`` as CSV without the index column, then
    print a confirmation message naming the destination."""
    df.to_csv(file_name, index=False)
    confirmation = f"Data saved to {file_name}"
    print(confirmation)


In [5]:
# Few-shot example reviews. The list is interpolated into the prompt (under
# "###Examples:") so the model can imitate the field layout. Each entry is a
# dict with a single "example" key holding one review flattened into
# "field: value" pairs.
# NOTE(review): a few entries leave a field blank (an empty `verified_purchase`
# or `text`) and the timestamps mix "HH:MM:SS" and "HH:MM" — confirm this
# variety is intended, since the model will copy it.
examples = [
    {
        "example":"""rating: 5, title: excellent at relieving stress, text: this was recommended initially by an employee at a health food store.  he recommended taking 3 tablets in the AM and 3 at bed-time.  while this is probably overkill, and may not be recommended by true health care professionals, it works for me in improving my  mood and sense of well-being.  for safety, i recommend following the instructions on the Mega Stress 100 label, and of course following your health care professional's advice before taking any supplements.  however, the solution i outlined above has worked for me with great results.  granted, it can get expensive, which is why i chose to purchase the product from amazon instead of a local vendor with high overhead costs. Please let me know if you found this review helpful., asin: B00028NY44, parent_asin: B00028NY44, user_id: AHYXPH2PZZELOQB7T4CUIG5DKNXQ, timestamp: 24-12-2013  14:08:00, helpful_vote: 1, verified_purchase: True, date: 24-12-2013, time: 14:08"""
    },
    
    {
        "example":"""rating: 3, title: Might make you hot, red, and itchy, text: I ordered this for my husband and I. It did make me hot and a little itchy but my poor husband was so red, itchy, and hot, he did not like it at all. If you don't mind the hot and itchiness for 10 minutes I would recommend it but just be warned some people are a little more sensitive to it than others like my husband., asin: B0002D34K8, parent_asin: B0002D34K8, user_id: AHCJALAMDFEHGGSAGPVM72D7ZSZQ, timestamp: 26-04-2013  07:45:00, helpful_vote: 1, verified_purchase: True , date: 26-04-2013, time:07:45 """
    },
    
    {
        "example":"""rating: 4, title: Good B3, text: Very good vitamin product.i take to reduce cholesterol. Sometimes causes a flush, but not too bad. Overall a good product., asin: B0002D34K8, parent_asin: B0002D34K8, user_id: AEGI3QRMKI56K7M7RBXDDICBGHGA, timestamp: 10-01-2014  18:08:00, helpful_vote: 0, verified_purchase: , date: 10-01-2014, time: 18:08"""
    },
    
    {
        "example":"""rating: 1, title: Burns when urinatiing, text: I bought this for my husband because he has BPH, has locked up, and has had prostatitis in the past. However, when he takes it, it burns when he urinates. He stopped taking it and the burning disappeared. When he started again, the burning returned. We are disappointed that it hasn't helped.,asin: B0002DUEXS, parent_asin: B0002DUEXS, user_id: AGQNPJMYTQKRGEO7EGRT2IC7QTHA, timestamp: 17-02-2019 19:05, helpful_vote: 0, verified_purchase: True, date: 17-02-2019, time: 19:05"""
    },
    
    {
        "example":"""rating: 2, title: Nothing changed much after taking the supplement, text:, asin: B000FGXMWC, parent_asin: B000FGXMWC, user_id: AGB6GHZEQSEDRDF77QDXLLUT5ZKA, timestamp: 13-12-2015 19:23, helpful_vote: 8, verified_purchase: True, date: 13-12-2015, time: 19:23"""
    },
    {
        "example":"""rating: 5, title: I hope Thorne makes this stuff forever., text: I hope Thorne makes this stuff forever. It works great handling fungus and candida. Unfortunately, I can't swallow pills and must take these by popping the pills and squeezing them into water. Truly an awful taste but they really work and without making me ill. My body was so run down when I first started that I had to start slowly. I took me weeks to work up to 15 a day. I have been taking them for a year now. I was able to slowly reintroduce foods that I had become allergic to. Now I can eat milk products again!!! I have energy and can exercise. I plan on taking these the rest of my life. I only take 5 a day now., asin: B000FGXMWC, parent_asin: B000FGXMWC, user_id: AFBLBHUQUHCAOXLCZHKOLN3LKREA, timestamp: 23-01-2016 22:28, helpful_vote: 33, verified_purchase: True, date: 23-01-2016, time: 22:28"""
    },
    {
        "example":"""rating: 5, title: Worked Perfect to Kill Systemic Yeast Infectiin, text: Great to get rid of fungus infections!  I took 10 capsules 3X per day for the 1st 3 days... and felt really tired with all the yeast die-off.  Then I cut back to 5 tablets, 3X per day and wasn't tired at all.  Did this for 1 month and all my yeast sysmptoms are gone!,asin: B000FGXMWC, parent_asin: B000FGXMWC, user_id: AFOMTKMAZTZU2NNL5O2L2CNHIZKA, timestamp: 15-12-2017 00:09, helpful_vote: 130, verified_purchase: True, date: 15-12-2017, time: 00:09"""
    },
    {
        "example":"""rating: 1, title: Arrived unsealed and leaking through the box, text: Product seems good enough but it arrived unsealed, with the cap partially unscrewed, and with oil leaking through the shipping box.,asin: B017DT4HM2, parent_asin: B09KWWZN1L, user_id: AGJEM6JRWRJV2MW352QWDLYTDFSQ, timestamp: 08-11-2019 22:58, helpful_vote: 0, verified_purchase: True, date: 08-11-2019, time: 22:58"""
    },
    {
        "example":"""rating: 3, title: Not sure about this product. Every fractionated coconut oil ..., text: Not sure about this product. Every fractionated coconut oil I have gotten in the past has been 100% clear. This has a light yellowish color not convinced it's the real deal., asin: B017DT4HM2, parent_asin: B09KWWZN1L, user_id: AHQKG4HNOG4BL32UQTDMRXRDPVFA, timestamp: 16-06-2017 17:39, helpful_vote: 5, verified_purchase: True, date: 16-06-2017, time: 17:39"""
    },
    {
        "example":"""rating: 4, title: I love this Ginger powder, text: This ginger powder has a nice fragrant gingery smell and flavor. Pleased with my purchase., asin: B08QVXG5XW, parent_asin: B08QVXG5XW, user_id: AEUBM5IAKYVPSDEZ735TLVWFCOBA, timestamp: 27-01-2022 13:10, helpful_vote: 2, verified_purchase: True, date: 27-01-2022, time: 13:10"""
    }
]

# Role statement placed at the top of the prompt's "###Task:" section.
subject = (
    "You're a synthetic data generator expert in Synthetic data generation "
    "for customer reviews related to supplements and vitamins sold on Amazon. "
)

# Task description that follows the role statement: the desired tone of the
# reviews plus the expected columns and their datatypes.
content = (
    "The dataset should mimic real user feedback, with both positive and negative experiences, "
    "including information on product efficacy, side effects, shipping, and pricing. "
    "The reviews should appear authentic and human-like, incorporating occasional grammatical errors, "
    "informal language, and subjective opinions. "
    "The goal is to generate diverse customer feedback, representing a wide range of experiences "
    "and opinions, while maintaining the format and structure of real reviews. "
    "The generated dataset should be in a CSV file with the following columns and their datatypes: "
    "rating (int), title (str), text (str), asin (str), parent_asin (str), user_id (str), "
    "timestamp (datetime), helpful_vote (int), verified_purchase (boolean), date (datetime), "
    "time (datetime)."
)


In [29]:
def main(prompt_template, n_batches, output_file):
    """Run the whole pipeline: generate ``n_batches`` batches of synthetic
    data from ``prompt_template`` concurrently, then write the combined
    DataFrame to ``output_file`` as CSV."""
    save_to_csv(
        concurrent_data_generation(n_batches, prompt_template),
        output_file,
    )

if __name__ == "__main__":
    # Prompt sent as the user message for every batch. It combines the role
    # statement (`subject`), the task description (`content`), per-column
    # instructions, and the few-shot `examples` list rendered via its repr.
    # The instruction lines are plain sentences: list-literal punctuation
    # (a trailing `",`) must stay out of them, because everything inside the
    # f-string is sent to the model verbatim.
    prompt_template = f"""
    ###Task:
    {subject}
    {content}
    
    ###Instructions:
    Observe the subtle patterns and format of values entered in every column to generate resembling but not duplicate synthetic values.
    Ensure that the 'rating' values range from 1 to 5, representing varied customer experiences.
    Generate realistic 'title' and 'text' entries that resemble actual human reviews, with occasional grammatical errors, informal language, long and simple sentence structures to reflect authentic user feedback.
    For 'asin' and 'parent_asin', generate unique, realistic alphanumeric strings that conform to the format used in actual product identifiers.
    Ensure 'user_id' values are randomized alphanumeric strings that mimic real customer IDs.
    Generate timestamps in the format 'DD-MM-YYYY HH:MM:SS', ensuring a variety of dates and times, spanning across different years for diversity in review timelines.
    The 'helpful_vote' should be an integer with random variation across different reviews.
    The 'date' should match the 'timestamp' date component, and the 'time' should correspond to the time component, formatted as 'HH:MM'.
    The reviews should appear authentic and human-like, incorporating occasional grammatical errors, informal and often inaccurate language, and subjective opinions. The goal is to generate diverse customer feedback, representing a wide range of experiences and opinions, while maintaining the format and structure of real reviews.
    
    ###Examples: {examples}
    Generate a json file 40 entries abiding to the given structure.
    
    ###Important information:
    1. Generate the output in json format only.
    2. Do not generate any text outside of the required json text.
    3. Do not include any newline or tab characters or extra text. Return only JSON file.
    """

    # Number of independent generation requests; each one is asked for 40 rows.
    n_batches = 7
    output_file = 'synthetic_dataset.csv'

    main(prompt_template, n_batches, output_file)


Generated data batch 4 encountered an error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01jamgsagcfpgah34nq1556y10` on : Limit 500000, Used 528205, Requested 2071. Please try again in 1h27m11.8566s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}
Generated data batch 1 encountered an error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01jamgsagcfpgah34nq1556y10` on : Limit 500000, Used 528205, Requested 2071. Please try again in 1h27m11.8056s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}
Generated data batch 0 encountered an error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01jamgsagcfpgah34nq1556y10` on : Limit 500000, Used 528205, Requested 2071. Please t