In [1]:
import warnings
# Suppress every warning for the rest of the kernel session. No category or
# module filter is given, so this also hides deprecation notices emitted by the
# libraries imported below.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
# Flat list of business category names, grouped by sector in the inline
# comments below (presumably mirroring Yelp's category labels -- confirm against
# the dataset). A category is sampled from this list with `np.random.choice`
# later in the notebook.
yelp_business_types = [
    # Restaurants & Food
    "Restaurants", "Fast Food", "Cafes", "Coffee & Tea", "Bakeries",
    "Desserts", "Ice Cream & Frozen Yogurt", "Pizza", "Sushi Bars",
    "Chinese", "Mexican", "Indian", "Thai", "Italian", "Korean",
    "American (Traditional)", "American (New)", "Seafood", "BBQ", "Vegan",

    # Beauty & Spas
    "Hair Salons", "Nail Salons", "Skin Care", "Massage", "Waxing",
    "Spas", "Barbers", "Eyelash Services", "Makeup Artists",

    # Health & Medical
    "Dentists", "Chiropractors", "Family Practice", "Physical Therapy",
    "Optometrists", "Urgent Care", "Psychologists", "Acupuncture",

    # Automotive
    "Auto Repair", "Oil Change Stations", "Car Dealers", "Auto Detailing",
    "Car Wash", "Towing", "Body Shops", "Tires",

    # Shopping
    "Fashion", "Shoe Stores", "Department Stores", "Electronics",
    "Thrift Stores", "Bookstores", "Gift Shops", "Grocery",
    "Organic Stores", "Convenience Stores",

    # Home Services
    "Plumbers", "Electricians", "Contractors", "Cleaning Services",
    "Movers", "Painters", "Home Inspectors", "Handyman", "Landscaping",

    # Pets
    "Pet Groomers", "Veterinarians", "Pet Stores", "Dog Walkers", "Pet Sitting",

    # Nightlife
    "Bars", "Pubs", "Lounges", "Nightclubs", "Karaoke",

    # Fitness & Instruction
    "Gyms", "Personal Trainers", "Yoga", "Martial Arts", 
    "Dance Studios", "Cycling Classes", "Swimming Lessons",

    # Arts, Entertainment & Events
    "Movie Theaters", "Museums", "Performing Arts", "Event Planning",
    "Party Supplies", "DJs", "Photo Booth Rentals", "Escape Games",

    # Hotels & Travel
    "Hotels", "Resorts", "Bed & Breakfast", "Hostels",
    "Car Rental", "Airport Shuttles", "Travel Agents"
]

# Instruction prompt template for review generation.
# `{type_of_review}` and `{business_type}` are `str.format`-style placeholders:
# they stay as literal brace text unless the template is passed through
# `prompt.format(type_of_review=..., business_type=...)` before being tokenized.
# NOTE(review): this is not a raw string, so the trailing `####\n.` becomes
# "####", a real newline, and then a lone "." on its own line -- confirm that
# the delimiter/period placement is intended.
prompt = """
You are a helpful assistant trained to generate realistic Yelp-style reviews in **English only**. The reviews should closely match the tone, vocabulary, and structure used in real Yelp reviews.

Please generate a {type_of_review} review for a {business_type} such as those found in the Yelp dataset. The review should be 2-3 sentences long, rich in specific details, and written from the perspective of a real customer.

Include relevant elements such as:
- Service quality
- Cleanliness
- Atmosphere/ambiance
- Staff behavior
- Product/food quality
- Prices or value
- Wait times

Make sure the review sounds natural and human-written. Do NOT mention that it’s AI-generated. Write only the review content.

**Only write the review in English. Do not use or switch to any other language.**

Now, generate a {type_of_review} review for a {business_type}####\n.
"""

### google/flan-t5-large

In [4]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint
# so the vocabulary always matches the model weights.
checkpoint_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

In [5]:
# type_of_review = "Positive"
# business_type = np.random.choice(yelp_business_types)

In [6]:
# # Generate text
# review_df_positive = pd.DataFrame(columns=['Reviews'])

# with torch.no_grad():
#     for i in range(25):
#         reviews = list()
#         for index in range(20):
#             inputs = tokenizer(prompt, return_tensors="pt")
#             generated_ids = model.generate(**inputs, max_new_tokens=60, do_sample=True, temperature=0.7,  repetition_penalty=1.4, top_k=200)
#             output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
#             reviews.append(output.split("####\n")[-1].strip())
#         review_df_positive = pd.concat([review_df_positive, pd.DataFrame({'Reviews': reviews})], ignore_index=True)
        
#         # Save to CSV
#         review_df_positive.to_csv("generated_reviews_positive.csv", index=False)

In [7]:
# Sentiment label for this generation run.
type_of_review = "Negative"
# Draw a single business category at random. No random seed is set in the
# visible cells, so the chosen category differs from run to run.
business_type = np.random.choice(yelp_business_types)

In [8]:
# Generate negative reviews with the seq2seq model and checkpoint them to CSV.
#
# Fix: `prompt` is a template with `{type_of_review}` / `{business_type}`
# placeholders. It used to be tokenized as-is, so the model was shown the
# literal placeholder text and the values chosen above were never used.
# NOTE: `business_type` is drawn once, so every review in this run targets the
# same business category.
n_batches = 25          # number of CSV checkpoints written during the run
reviews_per_batch = 20  # reviews sampled between two checkpoints
output_csv = "generated_reviews_negative.csv"

filled_prompt = prompt.format(
    type_of_review=type_of_review,
    business_type=business_type,
)
# The prompt is identical for every sample, so tokenize it once instead of on
# every iteration.
inputs = tokenizer(filled_prompt, return_tensors="pt")

# Accumulate plain strings and build the frame from the list; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
negative_reviews = []
review_df_negative = pd.DataFrame(columns=['Reviews'])

with torch.no_grad():  # inference only, no gradients needed
    for _batch in range(n_batches):
        for _sample in range(reviews_per_batch):
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=60,
                do_sample=True,
                temperature=0.7,
                repetition_penalty=1.4,
                top_k=200,
            )
            output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            # If the model echoes the "####\n" delimiter, keep only what follows.
            negative_reviews.append(output.split("####\n")[-1].strip())

        # Save after every batch so an interrupted run keeps its progress.
        review_df_negative = pd.DataFrame({'Reviews': negative_reviews})
        review_df_negative.to_csv(output_csv, index=False)