### Synthetic Training Data Generation
- Each topical group of subreddits and norm dimension has a dedicated classifier model (e.g. DeBerta-base-v3), enabling a comparison across these similar subreddits. To train such models, we generate synthetic labels via GPT-4 through stratified sampling and automatic labeling.
- During the sampling stage, we employ GPT-3.5 to rate comments on a 5-point Likert Scale to gauge normness. For a particular subreddit, we rate up to 10K comments or until we obtain at least 10 comments per likert scale value (whichever happens first). In the case we rate up to 10K comments without obtaining 10 comments per scale value, we randomly sample comments from the nearest scale value (e.g. if we have only 4 comments for likert scale value of 5, then we randomly sample 6 comments from likert scale value of 4). We repeat this process for each norm dimension.
- For a particular norm dimension, this results in 150 comments per topic (200 for Finance, which contains 4 subreddits) since 10 comments per scale x 5 likert scale x n subreddits = 50 * n comments. 
- For a particular topic and norm dimension, 1,250 comment pairs are randomly selected to create binary synthetic labels using GPT-4. 

In [208]:
import pickle
import random
import os
from openai import OpenAI
import json
import logging
import sys
import time
import numpy as np
from collections import defaultdict
import pandas as pd
import itertools
import prompts

# using this random seed
random.seed(420)

In [270]:
# OpenAI Keys
OPENAI_KEY = ""  # insert your own key here
client = OpenAI(api_key=OPENAI_KEY)

In [271]:
# contains all the processed subreddit data dumps
PROCESSED_DATA_DIRECTORY = "/gscratch/argon/hjung10/norm_discovery_project/data/data_dump/"

# contains the pkl file that can be loaded as a dictionary; contains all subreddits
SAMPLED_COMMENTS_PER_SUBREDDIT_DIRECTORY = "/home/hjung10/norm_discovery/synthetic_data/subreddit_data_sample/"

# contains comments rated on the 5-point scale value per dimension per subreddit
SCALE_RATED_COMMENTS_PER_SUBREDDIT_DIRECTORY = "/home/hjung10/norm_discovery/synthetic_data/scale_rated_samples/"

# contains all pairwise comparisons between the sampled comments
PAIRWISE_COMPARISONS_DIRECTORY = "/home/hjung10/norm_discovery/synthetic_data/pairwise_comparison/"

# contains all synthetic labeled data
SYNTHETIC_DATA_DIRECTORY = "/home/hjung10/norm_discovery/synthetic_data/synthetic_labels/"

# desired subreddits and sampling amount
subreddit_lists = ['democrats', 'republican', 'libertarian']  # politics
#subreddit_lists = ["askmen", "askwomen", "asktransgender"]   # gender
#subreddit_lists = ["askscience", "shittyaskscience", "asksciencediscussion"]   # science
#subreddit_lists = ["stocks", "wallstreetbets", "wallstreetbetsnew", "pennystocks"]   # finance

SAMPLE_LIMIT_PER_SUBREDDIT = 10000

### Sampling 10K random comments per subreddit

In [272]:
# samples the provided amount of comments from each of target_subreddits. 
def select_comments(target_subreddits, filter_upvotes_bool): # true if you want to filter by the upvote
    selected = {subreddit: [] for subreddit in target_subreddits}

    for subreddit in target_subreddits:
        sys.stdout.write(f'[{time.strftime("%Y-%m-%d %H:%M:%S")}] r/{subreddit}: start random comment selection\n')
        logging.info(f"r/{subreddit}: start random comment selection")
        done = False
        selected_comments_id, disselected_comments_id = [], []
        with open(os.path.join(PROCESSED_DATA_DIRECTORY, f"comments_{subreddit}.pkl"), "rb") as f:
            data_comments = pickle.load(f)
        sys.stdout.write(f'[{time.strftime("%Y-%m-%d %H:%M:%S")}] r/{subreddit}: number of comments total: {len(data_comments)}\n')
        logging.info(f"r/{subreddit}: number of comments total: {len(data_comments)}")
        while not done:
            rand_ind = random.randint(0, len(data_comments)-1)
            if rand_ind in selected_comments_id or rand_ind in disselected_comments_id:
                continue
            if filter_upvotes_bool and (data_comments[rand_ind]["score"] < upvote_score_threshold):    # changed to get comments that got more than 5 upvote
                disselected_comments_id.append(rand_ind)
                continue
            selected_comments_id.append(rand_ind)
            selected[subreddit].append(data_comments[rand_ind])
            if len(selected[subreddit]) >= SAMPLE_LIMIT_PER_SUBREDDIT or len(selected_comments_id) + len(disselected_comments_id) >= len(data_comments):
                done = True
        sys.stdout.write(f'[{time.strftime("%Y-%m-%d %H:%M:%S")}] r/{subreddit}: done selecting {len(selected[subreddit])} comments.\n')
        logging.info(f"{subreddit}: done selecting {len(selected[subreddit])} comments.")
    return selected

# obtains the corresponding submission titles and body of the selected comments
# accepts the comments argument based on the select_comments function outputs
def fetch_submission_texts(selected_comments):
    comments_and_submission = {subreddit: [] for subreddit in selected_comments.keys()}

    # iterate through each subreddit
    for subreddit, comments_list in selected_comments.items():

        # reading the submissions data from the particular subreddit
        data_submissions = ""
        with open(os.path.join(PROCESSED_DATA_DIRECTORY, f"submissions_{subreddit}.pkl"), "rb") as f:
            data_submissions = pickle.load(f)

        # per comment, find the matching id to the submission
        for comment in comments_list:
            submission_id = comment['parent_id'].split("_")[1] 

            # find the submission that matches the comment's submission_id
            for submission in data_submissions: 
                if submission_id == submission["id"]:
                    
                    title = submission["title"]
                    body = submission["selftext"]
                    
                    comments_and_submission[subreddit].append((comment, title, body, submission))
        print("Completed " + subreddit)
    return comments_and_submission

In [7]:
# selecting 50K random comments from each subreddit
selected = select_comments(subreddit_lists, False)

[2024-03-11 04:19:35] r/askmen: start random comment selection
[2024-03-11 04:21:04] r/askmen: number of comments total: 4679855
[2024-03-11 04:21:07] r/askmen: done selecting 25000 comments.
[2024-03-11 04:21:07] r/askmenover30: start random comment selection
[2024-03-11 04:21:19] r/askmenover30: number of comments total: 184136
[2024-03-11 04:21:22] r/askmenover30: done selecting 25000 comments.
[2024-03-11 04:21:22] r/askwomen: start random comment selection
[2024-03-11 04:22:04] r/askwomen: number of comments total: 2246255
[2024-03-11 04:22:07] r/askwomen: done selecting 25000 comments.
[2024-03-11 04:22:07] r/askwomenover30: start random comment selection
[2024-03-11 04:22:31] r/askwomenover30: number of comments total: 362467
[2024-03-11 04:22:34] r/askwomenover30: done selecting 25000 comments.
[2024-03-11 04:22:34] r/asktransgender: start random comment selection
[2024-03-11 04:22:46] r/asktransgender: number of comments total: 676563
[2024-03-11 04:22:49] r/asktransgender: do

In [None]:
# fetching their corresponding submission post (titles and descriptions)
comments_and_submission = fetch_submission_texts(selected)

In [None]:
# save dictionary to sampled_comments_and_submissions.pkl file
with open(SAMPLED_COMMENTS_PER_SUBREDDIT_DIRECTORY + 'sampled_comments_and_submissions.pkl', 'wb') as fp:
    pickle.dump(comments_and_submission, fp)
    print('dictionary saved successfully to file')

In [None]:
# organizing into dataframe format to create prompt message
dimensions = ["formality", "supportiveness", "sarcasm", "politeness", "humor"]

for subreddit, comment_submission in comments_and_submission.items():

    title_list = []
    description_list = []
    comment_list = []
    dimension_list = []
    comment_metadata_list = []
    post_metadata_list = []
    
    # each element contains (comment, title, body, submission)
    for comment_data in comment_submission:
        title = comment_data[1]
        description = comment_data[2]
        comment = comment_data[0]["body"]

        for dimension in dimensions:
            title_list.append(title)
            description_list.append(description)
            comment_list.append(comment)
            dimension_list.append(dimension)
            comment_metadata_list.append(comment_data[0])
            post_metadata_list.append(comment_data[3])
            
    df = pd.DataFrame()
    df["title"] = title_list
    df["description"] = description_list
    df["comment"] = comment_list
    df["dimension"] = dimension_list
    df["comment_metadata"] = comment_metadata_list
    df["submission_metadata"] = post_metadata_list
    df.to_csv(SAMPLED_COMMENTS_PER_SUBREDDIT_DIRECTORY + subreddit + "_processed_dataframe.csv")

### Employing GPT3.5 to rate comments
- Rating the randomly sampled comments per subreddit until we hit at least 10 comments per scale value

In [273]:
def construct_message_rating(annotation_df, few_shot):
    dimension_to_tuple = defaultdict(list)
    for i, row in annotation_df.iterrows():
        title = row["title"]
        description = row["description"] if row["description"] == row["description"] else ""
        comment = row["comment"]
        dimension_ = row["dimension"]
        few_shot_examples = ""
        if dimension_ == "formality":
            template = prompts.casual_formal
            dimension = prompts.dimensions[1]
            few_shot_examples = prompts.few_shot_rate_formality
        elif dimension_ == "supportiveness":
            template = prompts.supportive_toxic
            dimension = prompts.dimensions[2]
            few_shot_examples = prompts.few_shot_rate_supportive
        elif dimension_ == "sarcasm":
            template = prompts.genuine_sarcasm
            dimension = prompts.dimensions[3]
            few_shot_examples = prompts.few_shot_rate_genuine
        elif dimension_ == "politeness":
            template = prompts.rude_polite
            dimension = prompts.dimensions[4]
            few_shot_examples = prompts.few_shot_rate_polite
        elif dimension_ == "humor":
            template = prompts.humor_serious
            dimension = prompts.dimensions[5]
            few_shot_examples = ""
        
        system_content = prompts.prompt_system
        user_content = ""
        if few_shot:
            user_content = prompts.prompt_few_shot_rate
            user_content = user_content.replace("[FEW-SHOT]", few_shot_examples)
        else:
            user_content = prompts.prompt_rate
        user_content = user_content.replace("[TITLE]", title)
        user_content = user_content.replace("[DIMENSION]", dimension)
        user_content = user_content.replace("[DESCRIPTION]", description)
        user_content = user_content.replace("[COMMENT]", comment)
        user_content = user_content.replace("[DIMENSION_TEMPLATE]", template)
        
        
        messages =  [{"role": "system", "content": system_content},
            {"role": "user", "content": user_content}]
    
        dimension_to_tuple[dimension].append((messages, title, description, comment, dimension, row["comment_metadata"], row["submission_metadata"]))
    return dimension_to_tuple

In [274]:
# generating response from gpt 3.5
def get_response(messages, model_name):
    ct, num_tokens = 0, 0
    while ct < 3:
        ct += 1
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=0.2,
                max_tokens=512,
                frequency_penalty=0,
                presence_penalty=0
            )
            num_tokens += response.usage.total_tokens
            return response, num_tokens
        except Exception as e:
            logging.error(traceback.format_exc())
    return None, num_tokens

In [275]:
def extract_prediction(gpt_message_response):
    try:
        prediction = ''.join(gpt_message_response.split("[")[1].split("]")[0])
        if "." in prediction:
            prediction = prediction.split(".")[0]

        prediction = int(prediction)
    except:
        print("extraction failed")
        return None
    return prediction  

In [116]:
def evaluate_and_extract_gpt_ratings(rating_prompts, model_name, subreddit, dimension, dimension_to_scale_values, ratings_mappings):
    failed, usage = 0, 0
    prompts = rating_prompts[dimension]
    
    print(dimension)

    # creating new dictionary
    if not ratings_mappings or ratings_mappings == None:
        print("Creating new dictionary")
        ratings_mappings = dict()
        for i in range(1, 6):
            ratings_mappings[i] = list()
    else:
        print("Dictionary detected")

    # iterating through the prompts until we get at least 10 comments per scale value
    prompt_index = -1
    total_rated = 0
    while total_rated < 10000 and (len(ratings_mappings[1]) < 10 or len(ratings_mappings[2]) < 10 or len(ratings_mappings[3]) < 10 or len(ratings_mappings[4]) < 10 or len(ratings_mappings[5]) < 10):
        if prompt_index >= len(prompts):
            break
        
        prompt_index += 1
        prompt_tuple = prompts[prompt_index]
        prompt = prompt_tuple[0]
        comment = prompt_tuple[3]
        
        # detecting duplicate comments, no need to rate again
        if detect_duplicate_comments(ratings_mappings, comment):
            continue
        
        response, num_tokens = get_response(prompt, model_name)
        usage += num_tokens
        try: 
            rewrite = response.choices[0].message.content.strip()
        except:
            rewrite = "[API_ERROR]"
            failed += 1
            continue

        prediction = extract_prediction(rewrite)
        if prediction == None:
            continue

        # (prediction rating, prediction, title, description, comment, dimension, row["comment_metadata"], row["submission_metadata"]
        ratings_mappings[prediction].append((prediction, response, prompt_tuple[1], prompt_tuple[2], prompt_tuple[3], prompt_tuple[4], prompt_tuple[5], prompt_tuple[6]))

        
        total_rated = len(ratings_mappings[1]) + len(ratings_mappings[2]) + len(ratings_mappings[3]) + len(ratings_mappings[4]) + len(ratings_mappings[5])
        
        if prompt_index % 50 == 0:
            print("Number of predictions with values 1: " + str(len(ratings_mappings[1])))
            print("Number of predictions with values 2: " + str(len(ratings_mappings[2])))
            print("Number of predictions with values 3: " + str(len(ratings_mappings[3])))
            print("Number of predictions with values 4: " + str(len(ratings_mappings[4])))
            print("Number of predictions with values 5: " + str(len(ratings_mappings[5])))
            print("------------")
            dimension_to_scale_values[dimension] = ratings_mappings
            save_dictionary(dimension_to_scale_values, subreddit)
            
    if total_rated >= 10000:
        print("Completed rating all 10K comments")
    
    print("Completed obtaining at least 10 comments per scale value; next dimension")
    dimension_to_scale_values[dimension] = ratings_mappings
    save_dictionary(dimension_to_scale_values, subreddit)
    
    return dimension_to_scale_values, failed, usage

def read_dictionary(dictionary, subreddit):
    with open(SCALE_RATED_COMMENTS_PER_SUBREDDIT_DIRECTORY + subreddit + "_scale_rated.pkl", 'rb') as fp:
        dictionary = pickle.load(fp)
    return dictionary

def detect_duplicate_comments(dictionary, comment):
    for key, values in dictionary.items():
        for value in values:
            if value[4] == comment or comment in value[4]:
                #print(value[4])
                #print(comment)
                print("Duplicate comment")
                return True
            
    return False

In [317]:
### various functions to save and read files easily

def save_dictionary(dictionary, subreddit):
    # save dictionary to sampled_comments_and_submissions.pkl file
    with open(SCALE_RATED_COMMENTS_PER_SUBREDDIT_DIRECTORY + subreddit + '_scale_rated.pkl', 'wb') as fp:
        pickle.dump(dictionary, fp)
        print('dictionary saved successfully to file')
        
def save_sampled_comments(dictionary, subreddit, dimension, topic):
    PATH_SAVE = SCALE_RATED_COMMENTS_PER_SUBREDDIT_DIRECTORY + topic + '-dimension-rated-samples/'
    if not os.path.isdir(PATH_SAVE):
        os.mkdir(PATH_SAVE)
    
    # save dictionary to sampled_comments_and_submissions.pkl file
    with open(PATH_SAVE + subreddit + '_' + dimension + '_scale_rated_samples.pkl', 'wb') as fp:
        pickle.dump(dictionary, fp)
        print('dictionary saved successfully to file')
        
# contains all 250 comments (i.e. 50 comments per subreddit * 5 gender subreddits)
def save_sampled_comments_combined(dictionary, dimension, topic):
    PATH_SAVE = SCALE_RATED_COMMENTS_PER_SUBREDDIT_DIRECTORY + topic + '-dimension-topic-combined/'
    if not os.path.isdir(PATH_SAVE):
        os.mkdir(PATH_SAVE)
        
    # save dictionary to sampled_comments_and_submissions.pkl file
    with open(PATH_SAVE + dimension + '_combined_' + topic + '_scale_rated_samples.pkl', 'wb') as fp:
        pickle.dump(dictionary, fp)
        print('dictionary saved successfully to file')
        
def save_sampled_comments_pairwise_combinations(dictionary, topic):
    # save dictionary to sampled_comments_and_submissions.pkl file
    with open(PAIRWISE_COMPARISONS_DIRECTORY + topic + '_pairwise_combinations.pkl', 'wb') as fp:
        pickle.dump(dictionary, fp)
        print('dictionary saved successfully to file')
        
def save_sampled_comments_pairwise_combinations_samples(dictionary, topic, size):
    # save dictionary to sampled_comments_and_submissions.pkl file
    with open(PAIRWISE_COMPARISONS_DIRECTORY + topic + 'sampled_' + str(size) + '_pairwise_combinations.pkl', 'wb') as fp:
        pickle.dump(dictionary, fp)
        print('dictionary saved successfully to file')
        
def read_sampled_comments_pairwise_combinations_samples(topic, size):
    # save dictionary to sampled_comments_and_submissions.pkl file
    with open(PAIRWISE_COMPARISONS_DIRECTORY + topic + 'sampled_' + str(size) + '_pairwise_combinations.pkl', 'rb') as fp:
        dictionary = pickle.load(fp)
        print('dictionary read successfully from file')
        
    return dictionary

In [None]:
# selecting the particular subreddit, dimensions, and model you want to use to rate and sample 10K comments
# can adjust the code in case the program crashes during the rating process (e.g. changing the dimensions list, 
# reading in dictionary that contains the results already to avoid starting from scratch)
subreddit_ = "stocks"
dimensions = ["SUPPORTIVE-TOXIC", "HUMOR-SERIOUS", "CASUAL-FORMAL", "GENUINE-SARCASM", "RUDE-POLITE"]  
model_name = "gpt-3.5-turbo"

csv_file = subreddit_ + "_processed_dataframe.csv"

subreddit_to_scale_ratings = dict()
for subreddit in [subreddit_]:
    print(subreddit)
    print("--------------------------------")
    df = pd.read_csv(SAMPLED_COMMENTS_PER_SUBREDDIT_DIRECTORY + csv_file).dropna(subset=["comment"])
    rating_zero_shot = construct_message_rating(df, False)
    
    dimension_to_scale_values = dict()

    # this code is only for resuming with another batch; comment out if first time using
    # also pass in None value instead of dimension_to_scale_values[dimension] in the evaluate_and_extract_gpt_ratings
    #dimension_to_scale_values = read_dictionary(dimension_to_scale_values, subreddit)
    
    for dimension in dimensions:
        if dimension not in dimension_to_scale_values and len(dimension_to_scale_values) != 0:
            dimension_to_scale_values[dimension] = None
        
        dimension_to_scale_values, failed, usage = evaluate_and_extract_gpt_ratings(rating_zero_shot, model_name, subreddit, dimension, dimension_to_scale_values, None)

    save_dictionary(dimension_to_scale_values, subreddit)
    subreddit_to_scale_ratings[subreddit] = dimension_to_scale_values
    
    print("Usage: " + str(usage))
    print("Failed: " + str(failed))
    print("Done with: " + subreddit)

askmen
--------------------------------
SUPPORTIVE-TOXIC
Number of predictions with values 1: 0
Number of predictions with values 2: 1
Number of predictions with values 3: 0
Number of predictions with values 4: 0
Number of predictions with values 5: 0
------------
dictionary saved successfully to file
Number of predictions with values 1: 1
Number of predictions with values 2: 32
Number of predictions with values 3: 5
Number of predictions with values 4: 10
Number of predictions with values 5: 3
------------
dictionary saved successfully to file
Number of predictions with values 1: 1
Number of predictions with values 2: 63
Number of predictions with values 3: 10
Number of predictions with values 4: 21
Number of predictions with values 5: 6
------------
dictionary saved successfully to file


### Creating pairwise comparisons
- For each dimension, combine the 50 comments per n subreddit, creating 50*n comments
- Create pairwise comparisons among these 50*n comments
- Sampling 1250 pairwise comparisons per dimension (check that the comment appears roughly 10 times)

In [277]:
# for example, if we want to generate synthetic labels for the finance topic, we would use the code below
# to create pairwise comparisons
dimension_combined_comments_subreddits = dict()
subreddit_lists = ['wallstreetbets', 'wallstreetbetsnew', 'stocks', 'pennystocks']
topic = "finance"

for dimension in ['SUPPORTIVE-TOXIC', 'RUDE-POLITE', 'HUMOR-SERIOUS', 'GENUINE-SARCASM', 'CASUAL-FORMAL']:
    list_dimension_combined_comments_subreddits = list()

    for subreddit in subreddit_lists:
        dict_ = dict()
        with open(SCALE_RATED_COMMENTS_PER_SUBREDDIT_DIRECTORY + subreddit + "_scale_rated.pkl", 'rb') as fp:
            dict_ = pickle.load(fp)

        # only selecting supportive-toxic
        sampled_comments = dict()
        for scale_values, prediction_list in dict_[dimension].items():

            # adding subreddit information to tuple
            prediction_list_with_subreddit = list()
            for prediction in prediction_list:
                list_prediction = list(prediction)
                list_prediction.append(subreddit)
                edited_tuple = tuple(list_prediction)
                
                # filtering out posts with edits, media, etc
                if not filter_(edited_tuple):
                    prediction_list_with_subreddit.append(edited_tuple)

            sampled_comments[scale_values] = prediction_list_with_subreddit

        # confirming at least 10 comments per scale-value; checking extreme ratings (1s and 5s) and
        # sampling from the next closest value if there's not enough
        list_sample = [10, 10, 10, 10, 10]
        if len(sampled_comments[1]) < 10:
            list_sample[0] = len(sampled_comments[1])
            list_sample[1] = (10 - len(sampled_comments[1])) + 10

        if len(sampled_comments[5]) < 10:
            list_sample[4] = len(sampled_comments[5])
            list_sample[3] = (10 - len(sampled_comments[5])) + 10

        # sampling based on the prior pass
        print(list_sample)
        sampled_comments_random = dict()
        for scale_values, prediction_list in sampled_comments.items():
            sampled = random.sample(prediction_list, k=list_sample[scale_values - 1])
            sampled_comments_random[scale_values] = sampled

            list_dimension_combined_comments_subreddits += sampled 

        save_sampled_comments(sampled_comments_random, subreddit, dimension, topic)
        
    dimension_combined_comments_subreddits[dimension] = list_dimension_combined_comments_subreddits
    save_sampled_comments_combined(dimension_combined_comments_subreddits, dimension, topic)

[10, 10, 10, 10, 10]
dictionary saved successfully to file
[10, 10, 10, 10, 10]
dictionary saved successfully to file
[4, 16, 10, 10, 10]
dictionary saved successfully to file
[10, 10, 10, 10, 10]
dictionary saved successfully to file
dictionary saved successfully to file
[10, 10, 10, 18, 2]
dictionary saved successfully to file
[10, 10, 10, 20, 0]
dictionary saved successfully to file
[10, 10, 10, 17, 3]
dictionary saved successfully to file
[10, 10, 10, 14, 6]
dictionary saved successfully to file
dictionary saved successfully to file
[10, 10, 10, 10, 10]
dictionary saved successfully to file
[10, 10, 10, 10, 10]
dictionary saved successfully to file
[10, 10, 10, 10, 10]
dictionary saved successfully to file
[10, 10, 10, 10, 10]
dictionary saved successfully to file
dictionary saved successfully to file
[2, 18, 10, 10, 10]
dictionary saved successfully to file
[0, 20, 10, 10, 10]
dictionary saved successfully to file
[0, 20, 10, 10, 10]
dictionary saved successfully to file
[0, 20, 1

In [278]:
# for the particular dimension, we simply combine all 50 comments per subreddit (check 50 * num_subreddits)
len(dimension_combined_comments_subreddits[dimension])

200

In [279]:
# creating pairwise comparisons
dimension_pair_order_list = dict()
for dimension, list_dimension_combined_comments in dimension_combined_comments_subreddits.items():
    pair_order_list = list(itertools.combinations(list_dimension_combined_comments, 2))
    
    # confirming that the pairwise comparisons do not contain self comparisons 
    pair_order_list_duplicate_filtered = [pair for pair in pair_order_list if pair[0][4] != pair[1][4]]
            
    print(len(pair_order_list_duplicate_filtered))
    dimension_pair_order_list[dimension] = pair_order_list

# saving the pairwise comparisons in the file
save_sampled_comments_pairwise_combinations(dimension_pair_order_list, topic)

19900
19900
19899
19900
19900
dictionary saved successfully to file


In [280]:
# sample 1250 pairwise comparisons out of 31125 unique comparisons for the dimension
num_samples = 1250

dimension_sampled = dict()
for dimension, pair_order_list in dimension_pair_order_list.items():
    sampled_pairwise_comparisons = random.sample(pair_order_list, k=num_samples)
    print(len(sampled_pairwise_comparisons))
    dimension_sampled[dimension] = sampled_pairwise_comparisons

# saving the samples
topic = "finance"
save_sampled_comments_pairwise_combinations_samples(dimension_sampled, topic, num_samples)

1250
1250
1250
1250
1250
dictionary saved successfully to file


In [281]:
# Python program to get average of a list 
def Average(lst): 
    return sum(lst) / len(lst) 

# checking to ensure each comment appears roughly 10 times such that the model learns to generalize and learn
# from each comment across multiple iteration
for dimension, pair_order_list in dimension_sampled.items():
    # building the dictionary mapping to keep track of frequency a comment has been present in the pairwise
    comment_appearance = dict()
    for sample_comment_data in dimension_combined_comments_subreddits[dimension]:
        comment = sample_comment_data[4]
        comment_appearance[comment] = 0

    for pairwise in pair_order_list:
        # choosing each of the comment in the pairwise and fetching the comment attribute in tuple
        comment_appearance[pairwise[0][4]] +=1
        comment_appearance[pairwise[1][4]] +=1

    #print(comment_appearance)
    #break
    res = sum((i - Average(comment_appearance.values())) ** 2 for i in comment_appearance.values()) / len(comment_appearance.values()) 
    print(dimension)
    print("Each comment appears on average: " + str(Average(comment_appearance.values())))
    print("Each comment appears with variance of: " + str(res))
    print("Each comment appears as much as: " + str(max(comment_appearance.values())))
    print("Each comment appears as little as: " + str(min(comment_appearance.values())))

SUPPORTIVE-TOXIC
Each comment appears on average: 12.5
Each comment appears with variance of: 8.72
Each comment appears as much as: 20
Each comment appears as little as: 4
RUDE-POLITE
Each comment appears on average: 12.5
Each comment appears with variance of: 11.23
Each comment appears as much as: 23
Each comment appears as little as: 6
HUMOR-SERIOUS
Each comment appears on average: 12.56281407035176
Each comment appears with variance of: 15.813893588545742
Each comment appears as much as: 37
Each comment appears as little as: 5
GENUINE-SARCASM
Each comment appears on average: 12.5
Each comment appears with variance of: 12.18
Each comment appears as much as: 25
Each comment appears as little as: 4
CASUAL-FORMAL
Each comment appears on average: 12.5
Each comment appears with variance of: 10.06
Each comment appears as much as: 24
Each comment appears as little as: 4


### Synthetic Labeling using GPT4
GPT4 (Definitions + No Tie Generation, Few-Shot, including ties in human annotations)        

In [282]:
def collect_annotation_batch_pairwise_tuples(sampled_pairwise_comparisons):
    annotations = []
    for pairwise in sampled_pairwise_comparisons:

        score1 = pairwise[0][0]
        response1 = pairwise[0][1]
        title1 = pairwise[0][2]
        description1 = pairwise[0][3]
        comment1 = pairwise[0][4]
        dimension = pairwise[0][5]
        comment_metadata1 =pairwise[0][6]
        post_metadata1 = pairwise[0][7]
        subreddit1 = pairwise[0][8]
        
        score2 = pairwise[1][0]
        response2 = pairwise[1][1]
        title2 = pairwise[1][2]
        description2 = pairwise[1][3]
        comment2 = pairwise[1][4]
        dimension = pairwise[1][5]
        comment_metadata2 =pairwise[1][6]
        post_metadata2 = pairwise[1][7]
        subreddit2 = pairwise[1][8]
   
        example = {"Title1" : title1, 
                   "Description1" : description1,
                   "Comment1" : comment1,
                    "Comment_Metadata1" : comment_metadata1,
                   "Post_Metadata1" : post_metadata1,
                   "Subreddit1" : subreddit1,
                   "Score1" : score1,
                   "Response1" : response1,
                    "Title2" : title2, 
                   "Description2" : description2,
                   "Comment2" : comment2,
                    "Comment_Metadata2" : comment_metadata2,
                   "Post_Metadata2" : post_metadata2,
                   "Subreddit2" : subreddit2,
                   "Score2" : score2,
                   "Response2" : response2,
                   "Dimension" : dimension,
                    }
        annotations.append(example)
            
    return annotations

In [283]:
def construct_pairwise_message_definition_fewshot(annotations, dimension_):
    few_shot_prompt_list = list()

    for annotation in annotations:

        title1 = annotation["Title1"]
        description1 = annotation["Description1"] if annotation["Description1"] == annotation["Description1"] else ""
        comment1 = annotation["Comment1"]
        title2 = annotation["Title2"]
        description2 = annotation["Description2"] if annotation["Description2"] == annotation["Description2"] else ""
        comment2 = annotation["Comment2"]

        template = None
        dimension = None
        pairwise_dimension = None
        few_shot = None
        if dimension_ == "formality":
            template = prompts.casual_formal_definition
            dimension = prompts.dimensions[1]
            pairwise_dimension = prompts.pairwise_dimension[0]
            few_shot = prompts.few_shot_pairwise_formality 

        elif dimension_ == "supportiveness":
            template = prompts.supportive_toxic_definition
            dimension = prompts.dimensions[2]
            pairwise_dimension = prompts.pairwise_dimension[1]
            few_shot = prompts.few_shot_pairwise_supportive 

        elif dimension_ == "sarcasm":
            template = prompts.genuine_sarcasm_definition
            dimension = prompts.dimensions[3]
            pairwise_dimension = prompts.pairwise_dimension[2]
            few_shot = prompts.few_shot_pairwise_genuine

        elif dimension_ == "politeness":
            template = prompts.rude_polite_definition
            dimension = prompts.dimensions[4]
            pairwise_dimension = prompts.pairwise_dimension[3]
            few_shot = prompts.few_shot_pairwise_polite 

        elif dimension_ == "humor":
            template = prompts.humor_serious_definition
            dimension = prompts.dimensions[5]
            pairwise_dimension = prompts.pairwise_dimension[4]
            few_shot = prompts.few_shot_pairwise_humor 

        system_content = prompts.prompt_system_pairwise
        user_content = prompts.prompt_pairwise_definition_few_shot     
        user_content = user_content.replace("[DIMENSION]", dimension)
        user_content = user_content.replace("[DIMENSION_PAIRWISE]", pairwise_dimension)
        user_content = user_content.replace("[DIMENSION_DEFINITION]", template)
        user_content = user_content.replace("[FEW-SHOT]", few_shot)


        user_content = user_content.replace("[TITLE1]", title1)
        user_content = user_content.replace("[DESCRIPTION1]", description1)
        user_content = user_content.replace("[COMMENT1]", comment1)
        user_content = user_content.replace("[TITLE2]", title2)
        user_content = user_content.replace("[DESCRIPTION2]", description2)
        user_content = user_content.replace("[COMMENT2]", comment2)
        #user_content = user_content.replace("[RATING]", str(rating))

        messages =  [[{"role": "system", "content": system_content},
            {"role": "user", "content": user_content}], title1, description1, comment1,
                     annotation["Comment_Metadata1"], annotation["Post_Metadata1"],
                     annotation["Subreddit1"], annotation["Score1"], annotation["Response1"],
                     title2, description2, comment2,
                     annotation["Comment_Metadata2"], annotation["Post_Metadata2"],
                     annotation["Subreddit2"], annotation["Score2"], annotation["Response2"], annotation["Dimension"]]
        few_shot_prompt_list.append((messages))
    return few_shot_prompt_list

In [284]:
def save_dictionary_gpt4_synthetic_data(dictionary, dimension, topic):
    # save dictionary to sampled_comments_and_submissions.pkl file
    with open(SYNTHETIC_DATA_DIRECTORY + topic + "_" + dimension + '_synthetic_data.pkl', 'wb') as fp:
        pickle.dump(dictionary, fp)
        print('dictionary saved successfully to file')

In [288]:
# generating response from gpt 3.5
def get_response(messages, model_name, temperature):
    ct, num_tokens, num_completion_tokens, num_prompt_tokens = 0, 0, 0, 0
    while ct < 3:
        ct += 1
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=temperature,
                max_tokens=512,
                frequency_penalty=0,
                presence_penalty=0
            )
            num_tokens += response.usage.total_tokens
            num_completion_tokens += response.usage.completion_tokens
            num_prompt_tokens += response.usage.prompt_tokens
            
            return response, num_tokens, num_completion_tokens, num_prompt_tokens
        except Exception as e:
            print("Error")
            print(e)
            #logging.error(traceback.format_exc())
    return None, num_tokens, num_completion_tokens, num_prompt_tokens

def evaluate_gpt_rating(prompts, model_name, temperature, topic, dimension):
    failed, usage, prompt_usage, completion_tokens = 0, 0, 0, 0
    rated_comments = dict()
    
    prompts_rated = list()
    for i, prompt_tuple in enumerate(prompts):
        prompt_message = prompt_tuple[0]
        #print(prompt_message)
        response, num_tokens, num_completion_tokens, num_prompt_tokens = get_response(prompt_message, model_name, temperature)
        try: 
            rewrite = response.choices[0].message.content.strip()
        except:
            rewrite = "[API_ERROR]"
            failed += 1
        usage += num_tokens
        prompt_usage += num_prompt_tokens
        completion_tokens += num_completion_tokens


        prompt_tuple.insert(0, rewrite)
        prompts_rated.append(prompt_tuple)


        if i % 10 == 0:
            rated_comments[dimension] = prompts_rated
            print("Number of prompts completed: " + str(i))
            save_dictionary_gpt4_synthetic_data(rated_comments, dimension, topic)


    rated_comments[dimension] = prompts_rated
    save_dictionary_gpt4_synthetic_data(rated_comments, dimension, topic)
        
    print("Total Prompt Token Usage: " + str(num_prompt_tokens))
    print("Total Completion Token Usage: " + str(num_completion_tokens))
    return rated_comments, failed, usage

In [292]:
def extract_rating_and_organize(file_name, save_to_csv_name):
    dict_ = dict()
    with open(SYNTHETIC_DATA_DIRECTORY + file_name, 'rb') as f:
        dict_ = pickle.load(f)

    prediction_list = []
    response_list = []
    pairwise_prompt_list = []
    title1_list = []
    description1_list = []
    comment1 = []
    comment_metadata1 = []
    post_metadata1 = []
    subreddit1 = []
    rated_score1 = []
    rated_score_response1 = []
    title2_list = []
    description2_list = []
    comment2 = []
    comment_metadata2 = []
    post_metadata2 = []
    subreddit2 = []
    rated_score2 = []
    rated_score_response2 = []
    dimension_list = []

    for dimension, prompt_tuples in dict_.items():
        for prompt_tuple in prompt_tuples:
            response = prompt_tuple[0]
            
            try:
                prediction = ''.join(response.split("{")[1].split("}")[0])
            except:
                print(response)
                print(prompt_tuple[1])
                continue
            
            prediction_list.append(prediction)
            response_list.append(response)
            pairwise_prompt_list.append(prompt_tuple[1])
            title1_list.append(prompt_tuple[2])
            description1_list.append(prompt_tuple[3])
            comment1.append(prompt_tuple[4])
            comment_metadata1.append(prompt_tuple[5])
            post_metadata1.append(prompt_tuple[6])
            subreddit1.append(prompt_tuple[7])
            rated_score1.append(prompt_tuple[8])
            rated_score_response1.append(prompt_tuple[9])
            title2_list.append(prompt_tuple[10])
            description2_list.append(prompt_tuple[11])
            comment2.append(prompt_tuple[12])
            comment_metadata2.append(prompt_tuple[13])
            post_metadata2.append(prompt_tuple[14])
            subreddit2.append(prompt_tuple[15])
            rated_score2.append(prompt_tuple[16])
            rated_score_response2.append(prompt_tuple[17])
            dimension_list.append(prompt_tuple[18])
            
    df = pd.DataFrame()
    df["synthetic_label"] = prediction_list
    df["gpt4_synthetic_label_response"] = response_list
    df["gpt4_synthetic_label_prompt"] = pairwise_prompt_list
    df["title1"] = title1_list
    df["description1"] = description1_list
    df["comment1"] = comment1
    df["comment1_metadata"] = comment_metadata1
    df["post1_metadata"] = post_metadata1
    df["subreddit1"] = subreddit1
    df["gpt3-5_rated_score1"] = rated_score1
    df["gpt3-5_rated_score1_response"] = rated_score_response1
    df["title2"] = title2_list
    df["description2"] = description2_list
    df["comment2"] = comment2
    df["comment2_metadata"] = comment_metadata2
    df["post2_metadata"] = post_metadata2
    df["subreddit2"] = subreddit2
    df["gpt3-5_rated_score2"] = rated_score2
    df["gpt3-5_rated_score2_response"] = rated_score_response2
    df["dimension"] = dimension_list
    
    df.to_csv(SYNTHETIC_DATA_DIRECTORY + save_to_csv_name)
    return df

In [359]:
# creating prompts
dimensions = {'SUPPORTIVE-TOXIC' : "supportiveness", 'CASUAL-FORMAL' : "formality", 'GENUINE-SARCASM': "sarcasm", 'RUDE-POLITE' : "politeness", 'HUMOR-SERIOUS': "humor"}

dimension_to_prompts = dict()
for dimension, pair_order_list in dimension_sampled.items():
    print(dimension)
    annotations = collect_annotation_batch_pairwise_tuples(pair_order_list)
    prompt_list = construct_pairwise_message_definition_fewshot(annotations, dimensions[dimension])
    dimension_to_prompts[dimension] = prompt_list

SUPPORTIVE-TOXIC
RUDE-POLITE
HUMOR-SERIOUS
GENUINE-SARCASM
CASUAL-FORMAL


In [298]:
# creating synthetic labels using GPT4, repeat this for various norm dimensions
model_name = "gpt-4"
temperature = 0.2
topic = "finance"
dimension_ = dimensions_[0]

rated_comments, failed, usage = evaluate_gpt_rating(dimension_to_prompts[dimension_], model_name, temperature, topic, dimension_)
print("Total failed: " + str(failed))
print("Total usage: " + str(usage))

Number of prompts completed: 0
dictionary saved successfully to file
Number of prompts completed: 10
dictionary saved successfully to file
Number of prompts completed: 20
dictionary saved successfully to file
Number of prompts completed: 30
dictionary saved successfully to file
Number of prompts completed: 40
dictionary saved successfully to file
Number of prompts completed: 50
dictionary saved successfully to file
Number of prompts completed: 60
dictionary saved successfully to file
Number of prompts completed: 70
dictionary saved successfully to file
Number of prompts completed: 80
dictionary saved successfully to file
Number of prompts completed: 90
dictionary saved successfully to file
Number of prompts completed: 100
dictionary saved successfully to file
Number of prompts completed: 110
dictionary saved successfully to file
Number of prompts completed: 120
dictionary saved successfully to file
Number of prompts completed: 130
dictionary saved successfully to file
Number of prompts

In [None]:
extract_rating_and_organize("finance_SUPPORTIVE-TOXIC_synthetic_data.pkl", "finance_supportive_toxic_synthetic_data.csv")