In [88]:
import json
import os
import pickle as pkl
import time

import numpy as np
import openai
import pandas as pd
import tiktoken
from tiktoken import encoding_for_model

# SECURITY: API keys must never be committed to a notebook. The key is read from
# the OPENAI_API_KEY environment variable (a KeyError is raised when it is unset).
# Any key that was ever embedded in this file has to be revoked and rotated.
openai.api_key = os.environ["OPENAI_API_KEY"]

#-----------------#
# Semicolon-separated CSV holding the stratified article sample. The location can
# be overridden through STRATIFIED_ART_FILEPATH; the default is the original path.
stratified_art_filepath = os.environ.get(
    "STRATIFIED_ART_FILEPATH",
    "S:\\Sync\\University\\2023_MRP_1\\MRP1_WorkDir\\data\\destilled\\r_art_stratified.csv",
)
stratified_art_df = pd.read_csv(stratified_art_filepath, sep=";")
#-----------------#

def num_tokens_of_prompt(prompt_str, model="gpt-3.5-turbo"):
    """Count the tokens that `prompt_str` encodes to for the given model.

    The tokenizer is looked up by model name; if tiktoken raises KeyError for the
    name, the "cl100k_base" encoding is used instead. Only the text itself is
    counted - constant per-message overhead tokens are not included.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    return len(tokenizer.encode(prompt_str))


def num_tokens_of_chat_messages(messages, model="gpt-3.5-turbo-0301"):
    """Return the number of tokens used by a list of chat messages.

    Args:
        messages: list of dicts mapping message fields (e.g. "role", "content",
            optionally "name") to strings.
        model: model name whose message framing is assumed. Only
            "gpt-3.5-turbo-0301" is supported.

    Returns:
        The total token count including the fixed per-message and reply overhead.

    Raises:
        NotImplementedError: for any other model. Previously the function fell
            through and silently returned None in that case.
    """
    if model != "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        raise NotImplementedError(
            "num_tokens_of_chat_messages() is not implemented for model {}.".format(model)
        )
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    num_tokens = 0
    for message in messages:
        num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":  # if there's a name, the role is omitted
                num_tokens -= 1  # role is always required and always 1 token
    num_tokens += 2  # every reply is primed with <im_start>assistant
    return num_tokens

In [89]:

def gpt_prepare_batched_prompt(articles_df, annotation_classes, include_annot_criteria, annotation_batch_size=4, sample_separator=" \n\n ", verbose=-1, annotation_criteria_path="./headline_annotation_criteria.json"):
    """Build batched chat prompts for annotating article headlines.

    Args:
        articles_df: DataFrame with the columns "article_id" and "headline".
        annotation_classes: dict of the form
            {annotation_feature: [annotation_class1, annotation_class2, ...]}.
        include_annot_criteria: if truthy, the per-feature annotation criteria
            are read from `annotation_criteria_path` and added to every prompt;
            otherwise the criteria file is not touched at all.
        annotation_batch_size: maximum number of headlines per prompt.
        sample_separator: string placed between the numbered headlines.
        verbose: when equal to -1, every generated message is printed.
        annotation_criteria_path: JSON file mapping each annotation feature to
            its criteria text. Its keys must equal those of `annotation_classes`.

    Returns:
        A tuple (art_info_batched_collection, chat_prompts):
        - art_info_batched_collection: one list per batch holding a dict per
          article with "article_id", "batch_#", "article_#_in_batch" (both
          1-based) and "headline".
        - chat_prompts: {annotation_feature: [{"article_ids": ..., "messages": ...}, ...]}
          with one entry per batch.

    Raises:
        AssertionError: if `annotation_classes` is malformed or its keys differ
            from the keys of the criteria file.
        ValueError: if `annotation_batch_size` is smaller than 1.
    """
    assert isinstance(annotation_classes, dict), "annotation_classes must be givn as a dictionary of the form {annotation_feature: [annotation_class1, annotation_class2, ...]}"
    for annot_feat_classes in annotation_classes.values():
        assert isinstance(annot_feat_classes, list), "annotation_classes must be givn as a dictionary of the form {annotation_feature: [annotation_class1, annotation_class2, ...]}"
    if annotation_batch_size < 1:
        raise ValueError("annotation_batch_size must be at least 1")

    chat_prompts = {annot_feat: [] for annot_feat in annotation_classes}

    # The criteria are only needed (and only validated) when they are requested.
    # The context manager makes sure the file handle is closed again.
    annotation_criteria = None
    if include_annot_criteria:
        with open(annotation_criteria_path, "r") as criteria_file:
            annotation_criteria = json.load(criteria_file)
        assert set(annotation_classes.keys()) == set(annotation_criteria.keys()), "annotation_classes and annotation_criteria must have the same keys"

    context_prompt_2 = """Throughout the instructions, "quantum science & technology" encompasses not only the factual content of quantum theory but also the context, presentation, discussion, and interpretation of research practice on the topic. It also includes topic-related concepts or findings. """
    samples_intro_prompt = """Here are the article headlines for annotation: \n"""

    # The texts below depend only on the annotation feature, so they are built
    # once here instead of once per batch.
    leading_prompts = {}   # feature -> message contents placed before the headlines
    closing_prompts = {}   # feature -> final instruction placed after the headlines
    for annot_feat, annot_feat_classes in annotation_classes.items():
        if include_annot_criteria:
            context_prompt_1 = """You are an annotator. Your task is to annotate a given set of article headlines on their {} towards quantum science & technology. The headlines correspond to news articles or press releases, which were referenced in science or technology related subreddits on Reddit. Your goal is to strictly follow the given set of annotation criteria for assessing whether a headline indicates an {} {} towards quantum science & technology. Focus on understanding the criteria and applying them carefully to each headline independently.""".format(annot_feat, "/".join(annot_feat_classes), annot_feat)
            criteria_intro_prompt = """Label {} towards quantum science & technology strictly only with {}. The annotation criteria for each {} class are as follows: \n""".format(annot_feat, ", ".join(annot_feat_classes[:-1]) + " or " + annot_feat_classes[-1], annot_feat)
            leading_prompts[annot_feat] = [context_prompt_1,
                                           context_prompt_2,
                                           criteria_intro_prompt + annotation_criteria[annot_feat]]
            closing_prompts[annot_feat] = """Now, using the provided criteria, classify each headline independently as {} towards quantum science & technology. Only respond in the format "<#i#> {};". Never justify your responses.""".format(" or ".join(annot_feat_classes), annot_feat)
        else:
            context_prompt_1 = """You are an annotator. Your task is to annotate a given set of article headlines on their {} towards quantum science & technology. The headlines correspond to news articles or press releases, which were referenced in science or technology related subreddits on Reddit. Your goal is to strictly adhere to the concept of sentiment for assessing whether a headline indicates an {} attitude towards quantum science & technology. Make sure to consider only the attitude towards the topic of quantum science & technology, not the overall {}. Focus on understanding what each headline expresses and classify each headline independently.""".format(annot_feat, "/".join(annot_feat_classes), annot_feat)
            leading_prompts[annot_feat] = [context_prompt_1, context_prompt_2]
            closing_prompts[annot_feat] = """Now label each headline independently as {} towards quantum science & technology. Only respond in the format "<#i#> {};". Never justify your responses.""".format(" or ".join(annot_feat_classes), annot_feat)

    # Split the articles into consecutive batches of at most annotation_batch_size
    # rows; the last batch may be shorter.
    art_ids = articles_df["article_id"].values
    art_headlines = articles_df["headline"].values
    num_articles = len(articles_df)
    batch_bounds = [(start, min(start + annotation_batch_size, num_articles))
                    for start in range(0, num_articles, annotation_batch_size)]
    batches = [art_ids[start:stop] for start, stop in batch_bounds]

    art_info_batched_collection = []

    print("Prepared ", len(batches), " batches of size ", [len(batch) for batch in batches], " each")

    for b_i, (start, stop) in enumerate(batch_bounds):
        batch = batches[b_i]
        # Headlines are taken positionally from the same rows as the ids. This
        # avoids one full-frame scan per article and stays correct even if an
        # article_id occurs more than once.
        batch_art_headlines = list(art_headlines[start:stop])
        batch_art_inputs = [" <#{}#> ".format(art_num) + art_headline
                            for art_num, art_headline in enumerate(batch_art_headlines, start=1)]

        # Keep id, position and headline of every article for later reference.
        art_info_batched_collection.append([
            {"article_id": art_id,
             "batch_#": b_i+1,
             "article_#_in_batch": art_num,
             "headline": art_headline,
             }
            for art_num, (art_id, art_headline) in enumerate(zip(batch, batch_art_headlines), start=1)
        ])

        samples_prompt = samples_intro_prompt + sample_separator.join(batch_art_inputs)

        for annot_feat in annotation_classes.keys():
            print("Preparing batch ", b_i+1, " of ", len(batches), " for annotation feature ", annot_feat)

            # Batch prompt: context (+ criteria) -> samples to annotate -> instruction.
            prompt_messages = [{"role": "user", "content": content} for content in leading_prompts[annot_feat]]
            prompt_messages.append({"role": "user", "content": samples_prompt})
            prompt_messages.append({"role": "user", "content": closing_prompts[annot_feat]})

            if verbose == -1:
                for message in prompt_messages:
                    print(message)
            chat_prompts[annot_feat].append({"article_ids": batch, "messages": prompt_messages})

    return art_info_batched_collection, chat_prompts
    


In [90]:

annotation_classes = {"sentiment": ["positive", "negative", "neutral"]}

# Dry run without annotation criteria: build the prompts, show the first batch
# and measure how many tokens its text encodes to.
cmt_info_batched_collection, chat_prompts = gpt_prepare_batched_prompt(
    articles_df=stratified_art_df,
    annotation_classes=annotation_classes,
    include_annot_criteria=False,
    annotation_batch_size=25,
    verbose=0,
)

print(cmt_info_batched_collection)
print(len(cmt_info_batched_collection))
complete_prompt_str = " "

for annot_feat in annotation_classes.keys():
    # Message texts of the first batch of this feature.
    first_batch_contents = [message["content"] for message in chat_prompts[annot_feat][0]["messages"]]
    print("\n########\n".join(first_batch_contents))
    complete_prompt_str += "\n\n".join(first_batch_contents)
    print("\n".join(["########"] * 4))

encoding = encoding_for_model("gpt-3.5-turbo")

print(len(encoding.encode(complete_prompt_str)))

Prepared  5  batches of size  [25, 25, 25, 25, 23]  each
Preparing batch  1  of  5  for annotation feature  sentiment
Preparing batch  2  of  5  for annotation feature  sentiment
Preparing batch  3  of  5  for annotation feature  sentiment
Preparing batch  4  of  5  for annotation feature  sentiment
Preparing batch  5  of  5  for annotation feature  sentiment
5
You are an annotator. Your task is to annotate a given set of article headlines on their sentiment towards quantum science & technology. The headlines correspond to news articles or press releases, which were referenced in science or technology related subreddits on Reddit. Your goal is to strictly adhere to the concept of sentiment for assessing whether a headline indicates an positive/negative/neutral attitude towards quantum science & technology. Make sure to consider only the attitude towards the topic of quantum science & technology, not the overall sentiment. Focus on understanding what each headline expresses and classify

In [91]:
# Holds the most recent partial results whenever an API error occurred, so that
# they can be recovered from the kernel if the run has to be interrupted.
results_backup_catcher = None


def _parse_article_label(response_text, article_number, class_names):
    """Extract the label given to article `article_number` in one model reply.

    The reply is expected to contain "<#i#> label;" entries. The text between the
    article's marker and the next ";" is searched (case-insensitively) for the
    class names in the given order; the first one found is returned, or None if
    none occurs. Raises IndexError when the marker is missing from the reply.
    """
    answer = response_text.strip().split("<#{}#>".format(article_number))[1].split(";")[0].lower()
    for class_name in class_names:
        if class_name in answer:
            return class_name
    return None


def generate_annotations(model_id, model_kwargs,articles_df, annotation_classes, include_annot_criteria, annotation_batch_size = 4, num_repetitions=1):
    """Request headline annotations from the OpenAI chat API, batch by batch.

    Args:
        model_id: model name passed to openai.ChatCompletion.create.
        model_kwargs: extra keyword arguments for the completion request.
        articles_df, annotation_classes, include_annot_criteria,
        annotation_batch_size: forwarded to gpt_prepare_batched_prompt.
        num_repetitions: number of completions requested per prompt (`n`).

    Returns:
        A tuple (art_info_batched_collection, responses). `responses` has one
        dict per batch: {"batch_id": 0-based index, "annotated_by": model_id,
        "labels": [...]}. "labels" holds one dict per article with
        "article_idx_in_batch", "article_id" and, per annotation feature, a list
        with one entry per repetition. An entry is None when no class could be
        recognised in that repetition, or when the request for the batch was
        rejected, so list positions always correspond to repetitions.
        If a reply cannot be parsed at all, the raw reply list is appended to
        "labels" after the article dicts.

    Side effects:
        Prints progress, sleeps between batches and after recoverable API errors,
        and stores partial results in the module-level `results_backup_catcher`
        whenever an API error occurs.
    """
    seconds_after_api_error = 60
    seconds_after_rate_limit = 300
    seconds_between_batches = 45

    responses = []
    consumed_tokens = 0

    art_info_batched_collection, chat_prompts = gpt_prepare_batched_prompt(articles_df=articles_df,
                                                                        annotation_classes=annotation_classes,
                                                                        include_annot_criteria=include_annot_criteria,
                                                                        annotation_batch_size=annotation_batch_size,
                                                                        verbose=0,
                                                                        )

    def _backup_results():
        # `global` is required: a plain assignment would only create a local
        # variable and the backup would be lost. The article info of THIS run is
        # stored (under the historical key name) instead of an unrelated global.
        global results_backup_catcher
        print("Backing up results...")
        results_backup_catcher = {"responses": responses, "cmt_info_batched_collection": art_info_batched_collection}

    # Batch sizes and article ids are the same for all features, so the batches
    # of the first feature serve as reference for preparing the output storage.
    reference_batches = chat_prompts[list(annotation_classes.keys())[0]]

    for b_i, reference_batch in enumerate(reference_batches): #Iterate batches

        # To resume after a lost connection, skip the already processed batches
        # here, e.g. `if b_i < 202: continue`.

        print("Processing batch {} of {}".format(b_i+1, len(reference_batches)))

        # Each chat_prompts entry holds the "article_ids" and "messages" of one batch.
        batch_article_ids = reference_batch["article_ids"]
        batch_article_indices = list(range(len(batch_article_ids)))

        batch_annotations = []
        for art_i in batch_article_indices: #Prepare a storage for the annotations of each headline in the batch
            print("Article {} of batch {}".format(art_i+1, b_i+1))
            batch_annotations.append({"article_idx_in_batch": art_i+1,
                                      "article_id": batch_article_ids[art_i]})

        for annot_feat in annotation_classes.keys(): #Request annotations, parse them and save them in batch_annotations
            prompt_messages = chat_prompts[annot_feat][b_i]["messages"]

            print("Retrieving annotations for feature {} and batch {}".format(annot_feat, b_i+1))

            # Reset per request so that a failed request can never be mistaken
            # for (or silently reuse) the response of an earlier one.
            response = None
            for retrieval_attempt in range(1000000):
                try:
                    response = openai.ChatCompletion.create(model=model_id,
                                                            messages=prompt_messages,
                                                            n=num_repetitions,
                                                            **model_kwargs
                                                            )
                except openai.error.APIError as e:
                    print(f"OpenAI API returned an API Error: {e} for the {retrieval_attempt}th time")
                    _backup_results()
                    print("Waiting before retrying...")
                    time.sleep(seconds_after_api_error)
                    continue #Retry the request if an error occured

                except openai.error.APIConnectionError as e:
                    print(f"Failed to connect to OpenAI API: {e} for the {retrieval_attempt}th time")
                    _backup_results()
                    print("Waiting before retrying...")
                    time.sleep(seconds_after_api_error)
                    continue

                except openai.error.RateLimitError as e:
                    print(f"OpenAI API request exceeded rate limit: {e}")
                    _backup_results()
                    print("Waiting {} seconds before retrying...".format(seconds_after_rate_limit))
                    time.sleep(seconds_after_rate_limit)
                    continue
                except openai.error.InvalidRequestError as e:
                    # Retrying an invalid request cannot succeed; give up on it.
                    print(f"OpenAI API returned an API Error: {e} for the {retrieval_attempt}th time")
                    _backup_results()
                    print("We just continue with the next batch, but please see the log to catch up on the lost samples...")
                    break
                else:
                    break #Break out of the retry loop if no error occured

            if response is None:
                # No usable reply: keep the output shape intact with placeholders.
                print("No response for feature {} and batch {}; storing empty labels.".format(annot_feat, b_i+1))
                for art_i in batch_article_indices:
                    batch_annotations[art_i][annot_feat] = [None] * num_repetitions
                continue

            print("Received response! Starting to parse annotations...")

            print("=================================")
            response_contents =  [ choice["message"]["content"] for choice in response["choices"]]
            print(response_contents)
            consumed_tokens += response["usage"]["total_tokens"]
            print("=================================")

            # Convert response to raw annotations:
            try: #Catch when the response strings can't properly be parsed into annotations
                for art_i in batch_article_indices:
                    batch_annotations[art_i][annot_feat] = [ ]
                    for rc in response_contents:
                        # One entry per repetition (None if no class matched), so
                        # that index k always refers to repetition k.
                        batch_annotations[art_i][annot_feat].append(
                            _parse_article_label(rc, art_i+1, annotation_classes[annot_feat])
                        )
            except Exception:
                print("Response couldn't be parsed! Skipping batch {}...".format( b_i+1))
                print("We will print the response contents for reference and append the raw response to the output")
                print(response_contents)
                batch_annotations.append(response_contents)
                continue

        responses.append({"batch_id": b_i, "annotated_by": model_id, "labels": batch_annotations})
        print("Batch {} completed!".format(b_i+1), "Sleeping to avoid exceeding the rate limit...")
        time.sleep(seconds_between_batches)

    print("Warning: Used ", consumed_tokens, " tokens over all batches.")

    return art_info_batched_collection, responses






In [92]:

annotation_classes = {"sentiment": ["positive", "negative", "neutral"]}

# Dry run with annotation criteria: build the prompts, show the first batch and
# measure how many tokens its text encodes to.
art_info_batched_collection, chat_prompts = gpt_prepare_batched_prompt(
    articles_df=stratified_art_df,
    annotation_batch_size=20,
    annotation_classes=annotation_classes,
    include_annot_criteria=True,
    verbose=0,
)

print(art_info_batched_collection)
print(len(art_info_batched_collection))
complete_prompt_str = " "

for annot_feat in annotation_classes.keys():
    # Message texts of the first batch of this feature.
    first_batch_contents = [message["content"] for message in chat_prompts[annot_feat][0]["messages"]]
    print("\n########\n".join(first_batch_contents))
    complete_prompt_str += "\n\n".join(first_batch_contents)
    print("\n".join(["########"] * 4))


encoding = encoding_for_model("gpt-3.5-turbo")

print(len(encoding.encode(complete_prompt_str)))

Prepared  7  batches of size  [20, 20, 20, 20, 20, 20, 3]  each
Preparing batch  1  of  7  for annotation feature  sentiment
Preparing batch  2  of  7  for annotation feature  sentiment
Preparing batch  3  of  7  for annotation feature  sentiment
Preparing batch  4  of  7  for annotation feature  sentiment
Preparing batch  5  of  7  for annotation feature  sentiment
Preparing batch  6  of  7  for annotation feature  sentiment
Preparing batch  7  of  7  for annotation feature  sentiment
7
You are an annotator. Your task is to annotate a given set of article headlines on their sentiment towards quantum science & technology. The headlines correspond to news articles or press releases, which were referenced in science or technology related subreddits on Reddit. Your goal is to strictly follow the given set of annotation criteria for assessing whether a headline indicates an positive/negative/neutral sentiment towards quantum science & technology. Focus on understanding the criteria and app

In [93]:


# Full annotation run: GPT-4, with annotation criteria, 8 repetitions per batch.
# The article info returned by THIS run is what gets pickled below; before, the
# pickle contained whatever `art_info_batched_collection` an earlier cell had left
# in the kernel.
art_info_batched_collection, responses = generate_annotations(model_id="gpt-4",
                                                            model_kwargs={"max_tokens": 800, "temperature": 1.2},
                                                            articles_df=stratified_art_df,
                                                            annotation_classes=annotation_classes,
                                                            include_annot_criteria=True,
                                                            annotation_batch_size=20,
                                                            num_repetitions=8,
                                                            )
# Later cells still read the article info under its legacy name.
cmt_info_batched_collection = art_info_batched_collection

# NOTE(review): the file name says "nocriteria_lowT" although this run uses the
# criteria and temperature 1.2. It is left unchanged so that existing consumers
# of the pickle keep working - rename it deliberately if that is not needed.
with open("headline_nocriteria_lowT_responses.pkl", "wb") as f:
    pkl.dump(responses, f)

with open("art_info_collection.pkl", "wb") as f:
    pkl.dump(art_info_batched_collection, f)

Prepared  7  batches of size  [20, 20, 20, 20, 20, 20, 3]  each
Preparing batch  1  of  7  for annotation feature  sentiment
Preparing batch  2  of  7  for annotation feature  sentiment
Preparing batch  3  of  7  for annotation feature  sentiment
Preparing batch  4  of  7  for annotation feature  sentiment
Preparing batch  5  of  7  for annotation feature  sentiment
Preparing batch  6  of  7  for annotation feature  sentiment
Preparing batch  7  of  7  for annotation feature  sentiment
Processing batch 1 of 7
Article 1 of batch 1
Article 2 of batch 1
Article 3 of batch 1
Article 4 of batch 1
Article 5 of batch 1
Article 6 of batch 1
Article 7 of batch 1
Article 8 of batch 1
Article 9 of batch 1
Article 10 of batch 1
Article 11 of batch 1
Article 12 of batch 1
Article 13 of batch 1
Article 14 of batch 1
Article 15 of batch 1
Article 16 of batch 1
Article 17 of batch 1
Article 18 of batch 1
Article 19 of batch 1
Article 20 of batch 1
Retrieving annotations for feature sentiment and batch

Received response! Starting to parse annotations...
['<#1#> Neutral; \n<#2#> Neutral; \n<#3#> Negative; \n<#4#> Positive; \n<#5#> Positive; \n<#6#> Positive; \n<#7#> Positive; \n<#8#> Positive; \n<#9#> Positive; \n<#10#> Neutral; \n<#11#> Positive; \n<#12#> Positive; \n<#13#> Neutral; \n<#14#> Positive; \n<#15#> Neutral; \n<#16#> Positive; \n<#17#> Neutral; \n<#18#> Positive; \n<#19#> Positive; \n<#20#> Positive;', '<#1#> Positive; \n<#2#> Neutral; \n<#3#> Neutral; \n<#4#> Neutral; \n<#5#> Positive; \n<#6#> Positive; \n<#7#> Positive; \n<#8#> Neutral; \n<#9#> Positive; \n<#10#> Neutral; \n<#11#> Positive; \n<#12#> Positive; \n<#13#> Neutral;\n<#14#> Positive; \n<#15#> Neutral; \n<#16#> Positive; \n<#17#> Neutral; \n<#18#> Positive; \n<#19#> Positive; \n<#20#> Positive;', '<#1#> Positive;\n<#2#> Neutral;\n<#3#> Neutral;\n<#4#> Neutral;\n<#5#> Positive;\n<#6#> Positive;\n<#7#> Positive;\n<#8#> Positive;\n<#9#> Positive;\n<#10#> Positive;\n<#11#> Positive;\n<#12#> Positive;\n<#13#> Neutra

In [94]:

# Dump the complete list of per-batch annotation results (one dict per batch).
print(responses)

[{'batch_id': 0, 'annotated_by': 'gpt-4', 'labels': [{'article_idx_in_batch': 1, 'article_id': '35d0f68bb7be2fee0fd0990932773e3ff794d66777632e9e5c5c104f78d9b534', 'sentiment': ['neutral', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive']}, {'article_idx_in_batch': 2, 'article_id': 'd7fabb35ee22a2ac2b64906a450f26bb9197507a952d230da19bf4a195dbe1cf', 'sentiment': ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']}, {'article_idx_in_batch': 3, 'article_id': 'd508b4daa15ec8fd62acb2a37ae1c70b15013bfe81786fe408b1f984f1a7850a', 'sentiment': ['negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']}, {'article_idx_in_batch': 4, 'article_id': 'cea1ee857ff66bf052c61dd4c10226bfd9bc3128c574b5033bd4d9aba32f1d72', 'sentiment': ['positive', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'neutral']}, {'article_idx_in_batch': 5, 'article_id': 'c297f9dd1ad34c426d0ba64b0b62f5a3e8d089

In [95]:
# Total number of articles over all batches.
num_of_cmts = sum(len(info_batch) for info_batch in cmt_info_batched_collection)

# Print the stored info of every article, batch by batch. The loop variable names
# are kept as they are because they remain visible to later cells.
for cmt_info_batch in cmt_info_batched_collection:
    for cmt_info in cmt_info_batch:
        print(cmt_info)

print(num_of_cmts)


{'article_id': '35d0f68bb7be2fee0fd0990932773e3ff794d66777632e9e5c5c104f78d9b534', 'batch_#': 1, 'article_#_in_batch': 1, 'headline': 'Google officially lays claim to quantum supremacy'}
{'article_id': 'd7fabb35ee22a2ac2b64906a450f26bb9197507a952d230da19bf4a195dbe1cf', 'batch_#': 1, 'article_#_in_batch': 2, 'headline': 'Could quantum mechanics explain the existence of space-time?'}
{'article_id': 'd508b4daa15ec8fd62acb2a37ae1c70b15013bfe81786fe408b1f984f1a7850a', 'batch_#': 1, 'article_#_in_batch': 3, 'headline': 'A weird physics theory is gaining traction. Another version of you might already know it.'}
{'article_id': 'cea1ee857ff66bf052c61dd4c10226bfd9bc3128c574b5033bd4d9aba32f1d72', 'batch_#': 1, 'article_#_in_batch': 4, 'headline': 'Minuscule drums push the limits of quantum weirdness'}
{'article_id': 'c297f9dd1ad34c426d0ba64b0b62f5a3e8d08985bc429f4d15b1fb6807ec6c75', 'batch_#': 1, 'article_#_in_batch': 5, 'headline': 'IBM is going to sell computers millions of times faster than an

In [96]:
# Column suffixes for the repeated annotations of one headline (one per repetition).
instance_labels = ["A", "B", "C", "D", "E", "F", "G", "H"]
def export_annotation_data(art_info_batched_collection, responses, filename="headline_annotation_data.csv", check_sample_arts=None, annotation_features=("sentiment",)):
    """Export the annotation data to a semicolon-separated CSV file.

    Args:
        art_info_batched_collection: per batch, a list of article info dicts with
            "article_id", "batch_#", "article_#_in_batch" and "headline".
        responses: per batch, a dict whose "labels" entry holds one dict per
            article mapping each annotation feature to its list of labels.
        filename: name of the CSV file written to the current directory; must
            end with ".csv".
        check_sample_arts: optional collection of tuples
            (0-based batch index, 1-based article number in batch). If given, a
            "selected_for_check" column marks these articles with 1, others with 0.
        annotation_features: features to export. For each one the columns
            "<feature>_GPT4_<A..H>" are written, one per entry of
            `instance_labels`; repetitions that are not available are left empty.

    Returns:
        The exported pandas DataFrame (one row per article).
    """
    art_data_buffer = []

    for b_i in range(len(art_info_batched_collection)): #Iterate over batches

        # zip() stops at the shorter sequence, so anything stored in "labels"
        # beyond the article entries is ignored here.
        for art_labels, art_info in zip(responses[b_i]["labels"], art_info_batched_collection[b_i]): #Iterate over articles in batch

            # All fields come from this article's own info record.
            art_data = {"article_id": art_info["article_id"],
                        "batch_#": art_info["batch_#"],
                        "article_#_in_batch": art_info["article_#_in_batch"],
                        "headline": art_info["headline"],
                        }

            if check_sample_arts is not None:
                if (b_i, art_info["article_#_in_batch"]) in check_sample_arts:
                    art_data["selected_for_check"] = 1
                else:
                    art_data["selected_for_check"] = 0

            for annot_feat in annotation_features:
                feat_labels = art_labels.get(annot_feat, [])
                for rep_i, inst_label in enumerate(instance_labels):
                    # Fewer repetitions than instance labels must not crash the export.
                    art_data[annot_feat + "_GPT4_" + str(inst_label)] = feat_labels[rep_i] if rep_i < len(feat_labels) else None

            art_data_buffer.append(art_data)

    annotation_data = pd.DataFrame(art_data_buffer)

    assert filename.endswith(".csv")
    export_filepath = "./"+filename
    annotation_data.to_csv(export_filepath, index=False, sep= ";", header=True)
    return annotation_data


def post_retrieval_annotation_parsing(raw_response_strings, annotation_classes, annotation_batch_size, art_info_batched_collection):
    """Parse stored raw model replies into the per-batch annotation structure.

    Args:
        raw_response_strings: flat list with one entry per (batch, feature), in
            batch-major order; each entry is the list of reply strings (one per
            repetition) in the "<#i#> label;" format.
        annotation_classes: dict {annotation_feature: [class names]}; its key
            order defines the feature order inside `raw_response_strings`.
        annotation_batch_size: kept for backward compatibility; the actual size
            of each batch is taken from `art_info_batched_collection`, so a
            shorter last batch is handled correctly.
        art_info_batched_collection: per batch, a list of article info dicts
            containing at least "article_id".

    Returns:
        One dict per batch: {"batch_id": 0-based index, "annotated_by": "GPT-4",
        "labels": [...]}, where "labels" holds one dict per article with
        "article_idx_in_batch", "article_id" and, per feature, one label per
        reply (None if no class name was recognised in that reply).

    Raises:
        Exception: if a reply does not contain the marker of an article.
    """
    responses = []
    num_features = len(annotation_classes)

    for b_i, art_info_batch in enumerate(art_info_batched_collection): #Iterate batches

        batch_annotations = []

        # Use the real number of articles in this batch (the last one may be short).
        batch_article_indices = list(range(len(art_info_batch)))

        for art_i in batch_article_indices: #Prepare a storage for the annotations of each headline in the batch
            print("Article {} of batch {}".format(art_i+1, b_i+1))
            batch_annotations.append({"article_idx_in_batch": art_i+1,
                                      "article_id": art_info_batch[art_i]["article_id"]})

        for feat_i, annot_feat in enumerate(annotation_classes.keys()): #Parse the stored replies and save them in batch_annotations
            # The replies are stored batch-major, so the stride between batches
            # is the number of annotation features.
            response_contents = raw_response_strings[num_features*b_i+feat_i]
            assert isinstance(response_contents, list)

            # Convert response to raw annotations:
            try:
                for art_i in batch_article_indices:
                    batch_annotations[art_i][annot_feat] = [ ]
                    for rc in response_contents:
                        temp_annot_str = rc.strip().split("<#{}#>".format(art_i+1))[1]
                        art_annotation_str = temp_annot_str.split(";")[0].lower()
                        # The first class name occurring in the answer wins; None
                        # keeps the list aligned with the replies if none occurs.
                        matched_class = None
                        for annot_class in annotation_classes[annot_feat]:
                            if annot_class in art_annotation_str:
                                matched_class = annot_class
                                break
                        batch_annotations[art_i][annot_feat].append(matched_class)
            except Exception as parse_error:
                print("Response couldn't be parsed! Skipping batch {}...".format( b_i+1))
                print(response_contents)
                raise Exception("Response couldn't be parsed!") from parse_error

        responses.append({"batch_id": b_i, "annotated_by": "GPT-4", "labels": batch_annotations})

    return responses


In [97]:

# Write the annotations of the run with criteria and high temperature to CSV;
# no articles are marked for manual checking.
export_df = export_annotation_data(
    art_info_batched_collection=art_info_batched_collection,
    responses=responses,
    filename="full_headline_withcrit_hightemp_annotation_data.csv",
    check_sample_arts=None,
)

