We need multi-aspect reviews, but the reviews we already have mostly cover only one aspect, so we have to generate such reviews. Because we don't want to build a model based only on artificial reviews, and because we need a balanced data set, we need to set up a distribution between multi-aspect and single-aspect reviews.
Considering that we need at least 1000 reviews for model training and that we want a decent number of each review type to be represented, the following distribution will be used:
- for single-aspect reviews (4 types) = 1000 items
    - each review type = 250 items
- for multi-aspect reviews with exactly one aspect missing (4 types) = 1000 items
    - each missing aspect = 250 items
- for multi-aspect reviews (all aspects are present) = 1000 items

In sum - 3000 items.

Within each subset, the grades from 1 to 5 are drawn uniformly at random, so the distribution among grades should be approximately (but is not guaranteed to be exactly) equal.

In [13]:
# imports

import os
import random as rnd
import time
import tqdm
import csv
from jinja2 import Environment, FileSystemLoader
import openai
from tenacity import (
    retry,
    before_sleep_log,
    wait_random_exponential,
)
import sys
import logging
import timeout_decorator
import re

In [51]:
# config

# OpenAI credentials are read from environment variables; os.getenv returns
# None when a variable is not set.
openai.organization = os.getenv('OPENAI_ORGANIZATION')
openai.api_key = os.getenv('OPENAI_API_KEY')

# Model name and sampling temperature passed to the completion request.
model_name = 'gpt-3.5-turbo'
# model_name = 'text-davinci-003'
model_temperature = 1.2

# Stored in every context as `rsize`: it is rendered into the prompt and the
# request's max_tokens is computed from it as int(rsize * 1.5).
review_size_max = 150

# Root logger writes to stderr at DEBUG level; as the cell output shows, this
# also surfaces the openai library's own request logging.
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [4]:
# bricks

# Planned number of reviews per aspect combination (250 in the distribution
# described at the top of the notebook).
aspect_size = 250

# Pools from which each context draws one value at random (rnd.choice).
customer_type = ['polite', 'cheeky', 'blunt', 'highly educated', 'rustic']
customer_mood = ['bad', 'good', 'neutral']
review_item = ['headphones', 'dress', 'wallet', 'mobile phone']
# Aspects of a purchase that can be graded in a review.
review_aspect = ['item', 'delivery', 'seller', 'marketplace']

In [6]:
# generate contexts

contexts = []


def create_context(raspects):
    """Build one review-generation context.

    The customer type, customer mood and reviewed item are each picked at
    random from the module-level pools (in that order); ``raspects`` and the
    configured maximum review size are passed through unchanged.

    Args:
        raspects: list of ``{'name': ..., 'grade': ...}`` dicts describing
            the aspects the review has to cover.

    Returns:
        dict with the keys ``ctype``, ``cmood``, ``ritem``, ``raspects``
        and ``rsize``.
    """
    persona = rnd.choice(customer_type)
    mood = rnd.choice(customer_mood)
    item = rnd.choice(review_item)
    return dict(
        ctype=persona,
        cmood=mood,
        ritem=item,
        raspects=raspects,
        rsize=review_size_max,
    )

# Single-aspect and "one aspect missing" contexts: `aspect_size` of each kind
# per aspect. The subset sizes are derived from `aspect_size` instead of being
# hard-coded, so changing the constant rescales the whole data set.
for aspect in review_aspect:
    # Every aspect except the current one -> the "one aspect missing" variant.
    filtered = [a for a in review_aspect if a != aspect]
    for _sample in range(aspect_size):
        contexts.append(create_context(raspects=[{'name': aspect, 'grade': rnd.randint(1, 5)}]))
        contexts.append(create_context(raspects=[{'name': a, 'grade': rnd.randint(1, 5)} for a in filtered]))

# Contexts in which all aspects are graded; this subset has the same size as
# each of the two subsets built above (number of aspects * aspect_size).
for _sample in range(len(review_aspect) * aspect_size):
    contexts.append(create_context(raspects=[{'name': a, 'grade': rnd.randint(1, 5)} for a in review_aspect]))

# Mix the subsets so that generation order does not follow the subset order.
rnd.shuffle(contexts)

In [68]:
# generate reviews

reviews = []


def create_prompt(ctype, cmood, ritem, raspects, rsize):
    """Render the review prompt from the ``review.txt`` Jinja2 template.

    The template is looked up in the ``template`` directory and receives the
    five arguments under their own names.

    Returns:
        The rendered prompt text.
    """
    variables = dict(
        ctype=ctype,
        cmood=cmood,
        ritem=ritem,
        raspects=raspects,
        rsize=rsize,
    )
    loader = FileSystemLoader('template')
    return Environment(loader=loader).get_template('review.txt').render(variables)

def get_review(mname, ctype, cmood, ritem, raspects, rsize):
    """Ask the chat completion endpoint for one generated review.

    The prompt is rendered with ``create_prompt`` and sent as a single user
    message; ``max_tokens`` is set to ``int(rsize * 1.5)`` and the temperature
    comes from the module-level ``model_temperature``.

    Returns:
        The response object returned by ``openai.ChatCompletion.create``.
    """
    user_message = {
        'role': 'user',
        'content': create_prompt(ctype=ctype, cmood=cmood, ritem=ritem, raspects=raspects, rsize=rsize),
    }
    token_budget = int(rsize * 1.5)
    return openai.ChatCompletion.create(
        model=mname,
        messages=[user_message],
        max_tokens=token_budget,
        temperature=model_temperature,
    )

# First line that is enclosed by blank lines on both sides -> review title.
_TITLE_RE = re.compile(r'\n\n(.*)\n\n')
# Group 1: the whole "Total rating: N" marker, group 2: the number N.
# The leading `.*` keeps the original choice of the last marker on a line.
_TOTAL_RATING_RE = re.compile(r'.*(Total rating: (\d+))')


def _parse_review_text(text):
    """Split a raw model reply into ``(title, body, total_rating)``.

    Raises:
        ValueError: if the reply has no title line enclosed by blank lines or
            no ``Total rating: N`` marker.
    """
    title_match = _TITLE_RE.search(text)
    rating_match = _TOTAL_RATING_RE.search(text)
    if title_match is None or rating_match is None:
        raise ValueError('unexpected review format: %r' % (text,))
    title = title_match.group(1)
    marker = rating_match.group(1)
    # Strip last, so that removing a trailing rating marker does not leave
    # dangling whitespace at the end of the body.
    body = text.replace(title, '', 1).replace('\n', ' ').replace(marker, '').strip()
    return title, body, int(rating_match.group(2))


# No stop condition is configured, so any exception (API error, timeout,
# unparsable reply) leads to another attempt after a randomized exponential
# wait; each retry is logged at DEBUG level before sleeping.
@retry(wait=wait_random_exponential(min=1, max=30), before_sleep=before_sleep_log(logger, logging.DEBUG))
@timeout_decorator.timeout(60)
def create_review(ctx):
    """Generate one review for ``ctx`` and parse it into a record.

    Args:
        ctx: context dict with the keys ``ctype``, ``cmood``, ``ritem``,
            ``raspects`` and ``rsize``.

    Returns:
        dict with ``title``, ``body`` and ``rating``; ``rating`` maps every
        aspect name from ``ctx['raspects']`` to its requested grade and
        ``'total'`` to the overall rating parsed from the generated text.

    Raises:
        ValueError: if the generated text does not have the expected layout
            (the retry decorator then requests a new review).
    """
    response = get_review(mname=model_name, ctype=ctx['ctype'], cmood=ctx['cmood'], ritem=ctx['ritem'], raspects=ctx['raspects'], rsize=ctx['rsize'])
    text = response['choices'][0]['message']['content']
    title, body, rating_total = _parse_review_text(text)
    rating = {it['name']: it['grade'] for it in ctx['raspects']}
    rating['total'] = rating_total
    return {
        'title': title,
        'body': body,
        'rating': rating
    }

# Generate the reviews one context at a time; tqdm displays the progress.
for ctx in tqdm.tqdm(contexts):
    reviews.append(create_review(ctx))
    # Fixed 3-second pause between requests - presumably to stay below the
    # API rate limit (TODO confirm).
    time.sleep(3)

  0%|          | 0/3000 [00:00<?, ?it/s]DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/chat/completions
DEBUG:openai:api_version=None data='{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "You are a blunt customer that is in bad mood and that writes a review for a wallet that you bought on a marketplace (and you shouldn\'t mention that you bought it on a marketplace directly).\\nThe review should include the next aspects of the purchase:\\n\\n    - delivery, on the scale of 5 you are satisfied by 5 (it shouldn\'t be in the review)\\n\\n    - seller, on the scale of 5 you are satisfied by 5 (it shouldn\'t be in the review)\\n\\n    - marketplace, on the scale of 5 you are satisfied by 4 (it shouldn\'t be in the review)\\n\\nThese grades should not be included into the review, but the corresponding aspects and how you are happy or unhappy about them should be described or, at least, mentioned. For any aspects that is not graded

In [72]:
# write reviews

# newline='' is what the csv module expects for files it writes. The explicit
# UTF-8 encoding keeps the output independent of the platform's default
# encoding, since generated text may contain non-ASCII characters.
with open('data/reviews_raw.csv', 'w', newline='', encoding='utf-8') as f:
    headers = ['title', 'body', 'rating_item', 'rating_delivery', 'rating_seller', 'rating_marketplace', 'rating_total']
    writer = csv.DictWriter(f, fieldnames=headers, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
    writer.writeheader()
    for r in tqdm.tqdm(reviews):
        # .get() yields None for aspects that were not part of the review.
        writer.writerow({
            'title': r['title'],
            'body': r['body'],
            'rating_item': r['rating'].get('item'),
            'rating_delivery': r['rating'].get('delivery'),
            'rating_seller': r['rating'].get('seller'),
            'rating_marketplace': r['rating'].get('marketplace'),
            'rating_total': r['rating'].get('total')
        })

100%|██████████| 3000/3000 [00:00<00:00, 28246.44it/s]
