
## Calculate scores of sensitivity and informativeness for the Amazon reviews

In [None]:
!pip install -q typing-extensions openai tiktoken

In [None]:
import pandas as pd
import tiktoken
import json
import csv
from concurrent.futures import ThreadPoolExecutor

import openai
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import statistics
import os

Mounted at /content/drive


In [None]:
'''
Glossary and Guidance to Use:
1. To start the experiment over, update experiment_no. Otherwise the run resumes from the progress saved last time.
2. When modifying the prompt template, update prompt_id, prompt_template and COMMENT (as detailed as possible). You should probably also update experiment_no.
3. To experiment on more reviews, modify used_reviews and prompt_repeat_times, then run the whole notebook again; beware that previously processed reviews won't be affected.
4. The output file is a very large .csv file. Features such as hiding, filtering, ordering and functions in Google Sheets or MS Excel will help with the analysis you want. The large amount of information we record is meant for potential future tracking.
'''

# ==================
# Configurable Parameters

# Experiment number. Incrementing it starts the experiment over: a new checkpoint
# is used, so previously saved progress is ignored.
experiment_no = 20

# Prompt definitions. Always use {reviews} as the placeholder for the review text.
# The code must be edited before any other placeholder can be added. Every other
# place that needs a curly bracket must use double brackets ({{ and }}) instead,
# because the template is filled in with str.format().
#
# NOTE(review): the "sens_reph" field used to ask for the sensitivity of the
# *original* review (a copy of "sens_score"); it now asks about the rephrased
# review, in line with "info_reph". Results produced with the earlier wording are
# not comparable, so bump experiment_no / prompt_id before the next run.
prompt_id = 1
prompt_template = '''The following is a review of an individual's experience with a health product on Amazon. Please start by identifying any health-sensitive information about the author included in this review.
Propose a sanitized version of the text, in the voice of the reviewer, that maintains its informativeness while removing sensitive details.


Health-sensitive information includes, but is not limited to, details regarding medical conditions, treatments, or personal health experiences.
Your output should have the following JSON format:
{{
"sens_info": Identify each piece of sensitive information included in the review. This field should be left blank if the text is insensitive.
"rephrased": Propose a way to sanitize the original text from sensitive information while leaving unrelated words unchanged. This field should be left blank if the text is insensitive.
"explanation": Explain why you rephrased different parts of the review and how this makes it less sensitive. This field should be left blank if the text is insensitive.
"sens_score": On a scale of 0 to 10, where 0 is no sensitive information, and 10 is a lot of sensitive information, rate the sensitivity of the original review.
"sens_reph": On the same scale, rate the sensitivity of the rephrased review. This field should be left blank if the text is insensitive.
"info_score": On a scale of 0 to 10, where 0 is useless to people who read the review, and 10 is highly informative to people who read the review, rate the original review.
"info_reph": On the same scale, rate the informativeness of the rephrased review. This field should be left blank if the text is insensitive.
}}

{reviews}

'''

old_prompt = '''I will give you a product review from Amazon and you will have to tell me if that string contains potentially medically sensitive information and help me rephrase it if so.

Your output should have the following JSON format:
{{
"sens_info": Identify each piece of sensitive information included in the review. This field should be left blank if the text is insensitive.
"rephrased": Propose a way to sanitize the original text from sensitive information while leaving unrelated words unchanged. This field should be left blank if the text is insensitive.
"explanation": Explain why you rephrased different parts of the review and how this makes it less sensitive. This field should be left blank if the text is insensitive.
"sens_score": On a scale of 0 to 10, where 0 is no sensitive information, and 10 is a lot of sensitive information, rate the sensitivity of the original review.
"sens_reph": On the same scale, rate the sensitivity of the rephrased review. This field should be left blank if the text is insensitive.
"info_score": On a scale of 0 to 10, where 0 is useless to people who read the review, and 10 is highly informative to people who read the review, rate the original review.
"info_reph": On the same scale, rate the informativeness of the rephrased review. This field should be left blank if the text is insensitive.
}}


{reviews}

'''

# Free-text comment describing the experiment; fill it in before every run so the
# run can be looked up later (it is written to every output row).
comment = ''' Old prompt with 10 tries

'''
# The prompt to use.
# NOTE(review): this selects old_prompt while prompt_id above is 1 — confirm that
# the prompt_id recorded in the output matches the prompt actually used.
used_prompt_template = old_prompt
# How many times the prompt is repeated for each review.
prompt_repeat_times = 10
# How many reviews (taken from the top of the input file) to use.
used_reviews = 28


In [None]:
# Resolve the OpenAI API key. The environment variable is preferred; when it is
# missing, the placeholder below must be replaced by hand or every request fails.

api_key = os.getenv("OPENAI_API_KEY")
if api_key is not None:
    print("OPENAI_API_KEY is ready")
else:
    print("OPENAI_API_KEY environment variable not found, set it manually here or the script will fail.")
    api_key = "<OpenAI_key_goes_here>"
openai.api_key = api_key

model = "gpt-4"
# Pass the key to the client explicitly: a client instance takes its key from the
# `api_key` argument or from the OPENAI_API_KEY environment variable, so a key
# that is only assigned to `openai.api_key` would never reach it.
client = openai.OpenAI(api_key=api_key)

# ==================
# cols of the output file
basic_cols = ['experiment_id', 'comment', 'prompt_id', 'prompt_template', 'input_file', 'review_id', 'review_text',
              'trial_no']
statistical_cols = ['sens_orig_avg', 'sens_reph_avg', 'info_orig_avg', 'info_reph_avg', 'sens_orig_var',
                    'sens_reph_var', 'info_orig_var', 'info_reph_var']
detail_cols = ["sens_orig", "sens_reph", "info_orig", "info_reph", "rephrased_text", "failed", "explanation", "output",
               "price"]

# required file paths

input_path = r'../data/all_reviews_28.csv'
output_path = r'../output'
os.makedirs(output_path, exist_ok=True)
checkpoint_path = r'../checkpoint'
os.makedirs(checkpoint_path, exist_ok=True)
# The checkpoint is a CSV *file* that the main loop creates and appends to, so
# only its parent directory is created here. Creating a directory at this path
# would make the later open(checkpoint_file, ...) fail.
checkpoint_file = f'{checkpoint_path}/ckpt{experiment_no}'
if os.path.isdir(checkpoint_file):
    # Left over from earlier runs that created a directory at this path.
    # os.rmdir only removes an empty directory and raises otherwise.
    os.rmdir(checkpoint_file)

# default value for missing scores in model's answers
default_score_for_missing = -1

# Only the first `used_reviews` rows of the input file take part in the experiment.
input_pandas = pd.read_csv(input_path)[0:used_reviews]

In [None]:
# ==================
# Function Definitions
def num_tokens(model, string) -> int:
    """Count the tokens of `string` under the tiktoken encoding selected for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(string)
    return len(token_ids)


def access_gpt(prompt, input_price_per_token=0.00003, output_price_per_token=0.00006):
    """Send `prompt` to the chat model and return its answer with an estimated price.

    The completion is requested as a stream and the text deltas are joined into
    one string.

    Args:
        prompt: Full prompt text, sent as a single user message.
        input_price_per_token: Price charged per prompt token.
        output_price_per_token: Price charged per generated token.

    Returns:
        A two-element list ``[output, price]``: the generated text and the
        estimated cost of the call.
    """
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    pieces = []
    for chunk in stream:
        # Some stream chunks carry no choices at all; skip them instead of
        # failing on choices[0].
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta is not None:
            pieces.append(delta)
    output = "".join(pieces)

    # Prompt tokens and generated tokens are billed at different rates, so the
    # second term must count the tokens of the *output*, not of the prompt again.
    # NOTE(review): the prompt count ignores any per-message overhead of the chat
    # format, so the price is presumably a slight underestimate — confirm if
    # exact billing matters.
    price = (num_tokens(model, prompt) * input_price_per_token
             + num_tokens(model, output) * output_price_per_token)
    return [output, price]


def process_review_multi_threaded(review, prompt_repeat_times, used_prompt_template):
    """Query the model `prompt_repeat_times` times in parallel for one review.

    The review is inserted into `used_prompt_template` through its ``{reviews}``
    placeholder and the resulting prompt is submitted repeatedly to a thread
    pool. Answers are collected in completion order. A call that raises is
    reported on stdout and left out, so the returned list can be shorter than
    `prompt_repeat_times`.

    Returns:
        A list holding the return value of each successful `access_gpt` call.
    """
    prompt = used_prompt_template.format(reviews=review)
    collected = []
    with ThreadPoolExecutor(max_workers=32) as pool:
        pending = [pool.submit(access_gpt, prompt) for attempt in range(prompt_repeat_times)]
        for finished in concurrent.futures.as_completed(pending):
            try:
                answer = finished.result()
            except Exception as exc:
                print(f'Generated an exception: {exc}')
            else:
                collected.append(answer)
    return collected


def parse_raw_answers(raw_answers):
    """Extract scores and texts from a batch of raw model answers.

    Args:
        raw_answers: Iterable of ``[output, price]`` pairs, where ``output`` is
            the text returned by the model (expected to be a JSON object) and
            ``price`` the cost of the call.

    Returns:
        One list per answer, in input order, laid out as
        ``[sens_orig, sens_reph, info_orig, info_reph, rephrased_text, failed,
        explanation, raw_answer, price]``.

        When the output is not a JSON object, ``failed`` is True, all four
        scores are `default_score_for_missing`, both texts are ``'FAIL'`` and
        the price is 0. When it parses, a score that is missing or not a number
        becomes `default_score_for_missing`.
    """

    def get_score(value):
        # bool is a subclass of int, but a JSON true/false is not a score.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return default_score_for_missing
        return value

    parsed_results = []

    for gpt_answer in raw_answers:
        # An answer looks like this:
        # {\n"sens_info": "",\n"rephrased": "",\n"explanation": "",\n"sens_score": 0,\n"sens_reph": 0,\n"info_score": 6,\n"info_reph": 6\n}
        try:
            review_data = json.loads(gpt_answer[0])
        except (json.JSONDecodeError, TypeError):
            review_data = None

        # Valid JSON that is not an object (a bare number, a list, ...) cannot
        # hold the expected fields, so it counts as a parse failure as well.
        fail_to_parse = not isinstance(review_data, dict)

        if fail_to_parse:
            # Use the shared sentinel so that downstream code filtering on
            # default_score_for_missing also drops failed answers.
            sens_score_original = default_score_for_missing
            sens_score_rephrased = default_score_for_missing
            info_score_original = default_score_for_missing
            info_score_rephrased = default_score_for_missing
            explanation = 'FAIL'
            rephrased_text = 'FAIL'
            price = 0
        else:
            sens_score_original = get_score(review_data.get('sens_score'))
            sens_score_rephrased = get_score(review_data.get('sens_reph'))
            info_score_original = get_score(review_data.get('info_score'))
            info_score_rephrased = get_score(review_data.get('info_reph'))
            explanation = review_data.get('explanation', "")
            rephrased_text = review_data.get('rephrased', "")
            price = gpt_answer[1]

        # NOTE(review): the whole [output, price] pair is stored as the raw
        # answer, so that column also contains the price — confirm whether only
        # the text was intended.
        parsed_results.append([
            sens_score_original,
            sens_score_rephrased,
            info_score_original,
            info_score_rephrased,
            rephrased_text,
            fail_to_parse,
            explanation,
            gpt_answer,
            price,
        ])

    return parsed_results


def get_basic_cols():
    """Return the module-level `basic_cols` list (the list itself, not a copy)."""
    return basic_cols


def fill_in_basic_cols(review_id, review_text, trial_no):
    """Build the leading cells of one output row.

    Combines the module-level experiment settings (experiment number, comment,
    prompt id, prompt template in use, input path) with the given review id,
    review text and trial number, in that order.
    """
    return [experiment_no, comment, prompt_id, used_prompt_template, input_path, review_id, review_text, trial_no]


def get_statistical_cols():
    """Return the module-level `statistical_cols` list (the list itself, not a copy)."""
    return statistical_cols


def fill_in_statistical_cols(details):
    """Compute the mean and sample variance of the four scores over all trials.

    Args:
        details: Rows whose first four items are the scores sens_orig,
            sens_reph, info_orig and info_reph. A row is ignored entirely when
            any of these four equals `default_score_for_missing`.

    Returns:
        Eight values: the four means followed by the four variances, each in
        the score order above. With no usable row every mean is -1; with fewer
        than two usable rows, or when all values of a score are equal, its
        variance is 0.
    """
    usable_rows = [row for row in details if default_score_for_missing not in row[:4]]
    # One list of values per score, in the order the scores appear in a row.
    score_columns = [[row[position] for row in usable_rows] for position in range(4)]

    def average(scores):
        if not scores:
            return -1
        return statistics.mean(scores)

    def spread(scores):
        if len(scores) < 2:
            return 0
        if scores.count(scores[0]) == len(scores):
            # All values equal: report 0 without calling statistics.variance.
            return 0
        return statistics.variance(scores)

    averages = [average(scores) for scores in score_columns]
    spreads = [spread(scores) for scores in score_columns]
    return averages + spreads


def get_detailed_cols(repeat=3):
    """Return the module-level `detail_cols` list (the list itself, not a copy).

    The `repeat` argument is accepted but not used.
    """
    return detail_cols


def fill_in_detail_cols(answer_parsed):
    """Return `answer_parsed` unchanged; it is used as the detail cells of a row."""
    return answer_parsed

In [None]:
from datetime import datetime as dt
import os

now = dt.now()
# NOTE(review): the ':' characters of the timestamp are not valid in file names
# on Windows; the format is kept so that existing checkpoints still resolve.
output_file_prefix = f'{now.strftime("%Y-%m-%d_%H:%M:%S")}-{model}-exp{experiment_no}'

review_no = 0
file_name = f'{output_file_prefix}.csv'

# ==================
# Open the checkpoint if one is already present and resume from the state of the
# last run: continue after the last finished review, in the same output file.
if os.path.isfile(checkpoint_file):
    try:
        ckpt = pd.read_csv(checkpoint_file)
        if not ckpt.empty:
            # Read both values before assigning so that a half-readable
            # checkpoint cannot resume at a later review in a fresh file.
            last_review_no = int(ckpt['review_no'].iloc[-1])
            last_file_name = str(ckpt['csv_name'].iloc[-1])
            review_no = last_review_no + 1
            file_name = last_file_name
    except (KeyError, ValueError, TypeError) as exc:
        # Unreadable checkpoint: start again from the first review.
        print("FAIL TO PARSE CHECKPOINT!")
        print(f'checkpoint error: {exc!r}')
else:
    # newline='' is what the csv module expects of files it writes to.
    with open(f'{checkpoint_file}', 'w+', newline='') as ckpf:
        w = csv.writer(ckpf)
        w.writerow(["csv_name", "review_no"])

# ==================
# access gpt -> parse the answer -> Create Aggregated DataFrame -> write output to file

with (open(f'{output_path}/{file_name}', 'a+', newline='') as f,
      open(checkpoint_file, 'a', newline='') as ckpf):
    f_writer = csv.writer(f)
    ckpw = csv.writer(ckpf)
    headers = get_basic_cols() + get_statistical_cols() + get_detailed_cols()
    if review_no == 0:
        f_writer.writerow(headers)

    review_list = input_pandas['reviews'].values.tolist()[review_no:]
    for num, review_text in enumerate(review_list, review_no):
        raw_model_answers = process_review_multi_threaded(review_text, prompt_repeat_times, used_prompt_template)

        answers_parsed = parse_raw_answers(raw_model_answers)

        # The statistics describe the whole review, so compute them once
        # instead of once per trial.
        statistical_cols_fill = fill_in_statistical_cols(answers_parsed)

        # Iterate over the answers actually received: requests that raised are
        # dropped upstream, so there may be fewer than prompt_repeat_times.
        for trial_no, answer_parsed in enumerate(answers_parsed):
            basic_cols_fill = fill_in_basic_cols(num, review_text, trial_no)
            detail_cols_fill = fill_in_detail_cols(answer_parsed)
            f_writer.writerow([*basic_cols_fill, *statistical_cols_fill, *detail_cols_fill])
        # Flush the rows before recording the checkpoint, and the checkpoint
        # right after, so an interrupted run cannot mark a review as finished
        # while its rows are still buffered.
        f.flush()
        # save to checkpoint
        ckpw.writerow([file_name, num])
        ckpf.flush()
        print(f'review {num} of {used_reviews} is finished for experiment {experiment_no}')
