In [None]:
# Add the parent directory to the import path so the `src.*` imports below
# resolve when the notebook is run from its own folder.
import sys 
sys.path.append("..")

In [None]:
import os

import logging

from typing import Iterable, List, Dict, Any, Tuple

import backoff

import openai

from tqdm import tqdm

import pandas as pd

from src.models.apimodel import APIModel
from src.demonstrations import *

# Logger used by the code in this notebook; its name is __name__ with a
# ".models" suffix.
logger = logging.getLogger(__name__ + ".models")
# Raise the "openai" logger's threshold to WARNING to keep cell output short.
logging.getLogger("openai").setLevel(logging.WARNING)

In [None]:
def build_demonstration(
    demonstration_name: str,
    demonstration_params: Dict[str, Any],
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    overall_demographics: List[str],
) -> Tuple[List[str], pd.DataFrame, str]:
    """Build the prompts for one demonstration strategy.

    :param demonstration_name: key of the demonstration strategy to use
    :type demonstration_name: str
    :param demonstration_params: parameters for the demonstration; ``"shots"``
        is read for every strategy except ``"zeroshot"``, for which it is
        overwritten with ``0`` in place (the caller's dict is modified)
    :type demonstration_params: Dict[str, Any]
    :param train_df: train prompts and demographics
    :type train_df: pd.DataFrame
    :param test_df: test prompts and demographics
    :type test_df: pd.DataFrame
    :param overall_demographics: demographics to focus on
    :type overall_demographics: List[str]
    :raises ValueError: demonstration does not exist
    :raises KeyError: ``"shots"`` is missing for a non-zeroshot demonstration
    :return: the formed prompts, the test frame handed back by the sampler,
        and the sampler's ``type`` attribute
    :rtype: Tuple[List[str], pd.DataFrame, str]
    """

    demonstrations = {
        "excluding": ExcludingDemographic,
        "zeroshot": RandomSampler,
        "random": RandomSampler,
        "stratified": StratifiedSampler,
        "within": WithinDemographic,
        "similarity": SimilarityDemonstration,
        "diversity": DiversityDemonstration,
    }

    # Resolve the strategy first so an unknown name always surfaces as the
    # documented ValueError, even when "shots" is absent from the parameters.
    try:
        demonstration = demonstrations[demonstration_name]
    except KeyError:
        raise ValueError(f"{demonstration_name} does not exist!") from None

    if demonstration_name == "zeroshot":
        # Zero-shot is the random sampler with no examples; the parameter dict
        # is updated as well so it reflects the value actually used.
        shots = 0
        demonstration_params["shots"] = 0
    else:
        shots = demonstration_params["shots"]

    sampler = demonstration(shots=shots)

    prompts, filtered_test_df = sampler.create_demonstrations(
        train_df, test_df, overall_demographics
    )

    return prompts, filtered_test_df, sampler.type

In [None]:
class GPT(APIModel):
    """Wrapper around the OpenAI completion endpoint that sends prompts in batches.

    Code modified from
    https://github.com/isabelcachola/generative-prompting/blob/main/genprompt/models.py
    """

    # Number of trailing characters kept when a failed prompt is retried in
    # shortened form.
    _FALLBACK_PROMPT_CHARS = 2000

    def __init__(self, model_name: str, temperature: float = 1, max_tokens: int = 5):
        """Configure the model and read the API key from the environment.

        The three arguments are forwarded to ``APIModel.__init__``; they are
        presumably stored as ``self.model_name``, ``self.temperature`` and
        ``self.max_tokens``, which ``get_response`` reads (confirm in APIModel).

        :param model_name: name of the completion model to query
        :type model_name: str
        :param temperature: sampling temperature
        :type temperature: float
        :param max_tokens: maximum number of tokens to generate per prompt
        :type max_tokens: int
        :raises KeyError: ``OPENAI_API_KEY`` is not set in the environment
        """

        super().__init__(model_name, temperature, max_tokens)

        openai.api_key = os.environ["OPENAI_API_KEY"]
        # Number of prompts sent per request by generate_from_prompts.
        self.batch_size = 20

    @backoff.on_exception(
        backoff.expo,
        (
            openai.error.RateLimitError,
            openai.error.APIError,
            openai.error.Timeout,
            openai.error.ServiceUnavailableError,
        ),
    )
    def get_response(self, prompt: Iterable[str]) -> Dict[str, Any]:
        """Overloaded get_response to deal with batching

        The call is retried with exponential backoff on the OpenAI errors
        listed in the decorator; any other exception propagates to the caller.

        :param prompt: prompts as batch (a single prompt string is also passed
            by the per-prompt fallback)
        :type prompt: Iterable[str]
        :return: responses from GPT3 API endpoint
        :rtype: Dict[str, Any]
        """
        response = openai.Completion.create(
            model=self.model_name,
            prompt=prompt,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            logprobs=5,
        )

        return response

    def format_response(
        self, response: Dict[str, Any]
    ) -> Tuple[str, List[Dict[str, float]]]:
        """Extract the cleaned text and the top log-probabilities from one choice.

        :param response: a single entry of the API response's ``"choices"``
        :type response: Dict[str, Any]
        :return: the text with newlines replaced by spaces and stripped, and
            the choice's ``logprobs["top_logprobs"]`` value (presumably one
            token -> log-probability mapping per generated token; confirm
            against the API reference)
        :rtype: Tuple[str, List[Dict[str, float]]]
        """
        text = response["text"].replace("\n", " ").strip()
        top_logprobs = response["logprobs"]["top_logprobs"]

        return text, top_logprobs

    def _generate_single(self, prompt: str) -> Tuple[str, List[Dict[str, float]]]:
        """Complete one prompt, retrying once on its trailing characters if it fails."""
        try:
            return self.format_response(self.get_response(prompt)["choices"][0])
        except Exception:
            logger.warning(
                "Request failed for a single prompt; retrying with a shortened prompt",
                exc_info=True,
            )
            # A negative-start slice keeps the whole prompt when it is already
            # shorter than the limit; slicing from `len(prompt) - limit` would
            # instead cut short prompts down to a fragment.
            shortened = prompt[-self._FALLBACK_PROMPT_CHARS :]
            return self.format_response(self.get_response(shortened)["choices"][0])

    def generate_from_prompts(
        self, examples: Iterable[str]
    ) -> List[Tuple[str, List[Dict[str, float]]]]:
        """Generate a completion for every prompt, batching requests.

        Prompts are sent ``self.batch_size`` at a time. If a batched request
        fails, each prompt of that batch is sent on its own, with one retry on
        a shortened prompt; an error in that last retry propagates.

        :param examples: prompts to complete
        :type examples: Iterable[str]
        :return: one ``format_response`` result per prompt, in input order
        :rtype: List[Tuple[str, List[Dict[str, float]]]]
        """
        # Materialise once so any iterable can be measured and sliced.
        examples = list(examples)
        lines_length = len(examples)
        logger.info(f"Num examples = {lines_length}")

        responses = []

        for start in tqdm(range(0, lines_length, self.batch_size), ncols=0):

            # batch prompts together
            prompt_batch = examples[start : start + self.batch_size]
            try:
                response = self.get_response(prompt_batch)

                # Blank placeholders with the same shape as a formatted
                # response, for any prompt the API returns no choice for.
                response_batch = [("", []) for _ in prompt_batch]

                # Choices are not guaranteed to arrive in prompt order; put
                # each one back at its prompt's position. The whole choice is
                # passed on, since format_response reads its "text" and
                # "logprobs" entries.
                for choice in response["choices"]:
                    response_batch[choice["index"]] = self.format_response(choice)

            # Exception (not a bare except) so Ctrl-C still stops the run.
            except Exception:
                logger.warning(
                    "Batched request failed; falling back to one request per prompt",
                    exc_info=True,
                )
                response_batch = [
                    self._generate_single(single_prompt)
                    for single_prompt in prompt_batch
                ]

            responses.extend(response_batch)

        return responses


In [None]:
from src.datasets import HateXplainRace

In [None]:
from src.utils import metrics

In [None]:
# Dataset wrapper pointed at the HateXplain folder, relative to this notebook.
hate = HateXplainRace('../data/HateXplain')

In [None]:
# create_prompts() yields the train frame, the test frame and the demographic
# groups that build_demonstration and metrics are given below.
train_df, test_df, overall_demographics = hate.create_prompts()

In [None]:
# Requires OPENAI_API_KEY in the environment (read in GPT.__init__); uses the
# constructor defaults temperature=1 and max_tokens=5.
gpt = GPT("text-davinci-003")

In [None]:
# Demonstration strategies to run; each name must be a key known to
# build_demonstration.
demonstrations = ["within", "similarity"]

In [None]:
# Query the model once per demonstration strategy, with 5 shots each;
# outputs[k] holds the responses for demonstrations[k].
outputs = []
for demonstration in demonstrations:
    prompts, filtered_test_df, sampler_type = build_demonstration(
        demonstration_name=demonstration,
        demonstration_params={"shots": 5},
        train_df=train_df,
        test_df=test_df,
        overall_demographics=overall_demographics,
    )

    responses = gpt.generate_from_prompts(prompts)
    outputs.append(responses)
    

In [17]:
# Keep only the generated text (element 0 of each response) from the first
# entry of `outputs`.
text_responses = [response[0] for response in outputs[0]]

In [20]:
import numpy as np
import copy

In [21]:
# Normalise the raw completions. Missing or empty responses become "" *before*
# lower-casing, so a None entry cannot raise AttributeError.
preds_clean = [(response or "").lower() for response in text_responses]

# Distinct gold labels, sorted so that the label -> integer mapping and the
# order in which labels are matched below are the same on every run (iteration
# order of a set of strings can differ between interpreter sessions).
# Assumes the labels are mutually comparable, e.g. all strings.
labels_set = sorted(set(test_df["labels"].tolist()))

# map labels to numbers to make it easier for sklearn calculations
labels_dict = {label: index for index, label in enumerate(labels_set)}

# NOTE(review): the gold labels come from test_df while the prompts were built
# alongside filtered_test_df -- confirm the two have the same rows and order.
dummy_labels = np.array([labels_dict[label] for label in test_df["labels"].tolist()])

# A prediction maps to the first label found as a substring of the response,
# or to -1 when no label occurs in it.
dummy_preds = np.array(
    [
        next((labels_dict[label] for label in labels_set if label in pred), -1)
        for pred in preds_clean
    ]
)

In [25]:
# Positions at which the predicted label index equals the gold label index.
np.nonzero(dummy_preds == dummy_labels)[0]

array([  0,   1,   2,   3,   4,   5,   6,   8,   9,  11,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  28,  29,  30,
        31,  32,  35,  36,  37,  38,  39,  40,  41,  43,  44,  46,  48,
        50,  51,  52,  54,  55,  56,  57,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  69,  70,  71,  72,  73,  74,  75,  77,  78,  79,
        80,  83,  84,  85,  86,  87,  90,  91,  92,  93,  94,  97,  98,
        99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       127, 129, 130, 132, 134, 135, 136, 137, 139, 140, 141, 142, 143,
       144, 146, 147, 148, 149, 150, 151, 152, 154, 156, 158, 159, 160,
       161, 162, 164, 167, 168, 169, 170, 171, 176, 177, 178, 179, 181,
       182, 185, 186, 187, 188, 189, 191, 192, 193, 195, 197, 199, 200,
       201, 202, 203, 204, 205, 207, 208, 209, 210, 211, 213, 215, 218,
       219, 222, 223, 225, 228, 229, 230, 231, 232, 233, 235, 23

In [18]:
# Score the text responses against the test labels. The argument order follows
# src.utils.metrics -- presumably predictions, gold labels, dataset name,
# per-example demographics, demographic groups (TODO confirm against its
# signature).
performance = metrics(
    text_responses,
    test_df["labels"].tolist(),
    "hatexplain-race",
    test_df["demographics"].tolist(),
    overall_demographics,
)

In [19]:
# Bare expression: the notebook renders the value returned by metrics().
performance

{'recall': {'African': array([0.36842105, 0.84591195]),
  'Arab': array([0.69230769, 0.85185185]),
  'Asian': array([0.57142857, 0.9       ]),
  'Hispanic': array([0.5       , 0.83870968]),
  'Caucasian': array([0.48      , 0.76785714])},
 'specificity': {'African': array([0.84591195, 0.36842105]),
  'Arab': array([0.85185185, 0.69230769]),
  'Asian': array([0.9       , 0.57142857]),
  'Hispanic': array([0.83870968, 0.5       ]),
  'Caucasian': array([0.76785714, 0.48      ])},
 'score': {'African': 0.5971360321532843,
  'Arab': 0.6878224974200207,
  'Asian': 0.7485714285714287,
  'Hispanic': 0.6224961479198767,
  'Caucasian': 0.6198620689655172},
 'total_score': 0.6477343265052762,
 'max_gaps': {'no': ['Arab',
   'African',
   0.32388663967611336,
   0.6761133603238867],
  'yes': ['Asian', 'Caucasian', 0.13214285714285712, 0.8678571428571429]}}

In [13]:
import pickle

In [14]:
# Persist the raw model outputs. The context manager closes the file, so the
# pickle is completely written to disk before the cell finishes.
with open("logproboutputs.p", "wb") as outputs_file:
    pickle.dump(outputs, outputs_file)

In [26]:
# Persist the train frame next to the outputs. The context manager closes the
# file, so the pickle is completely written to disk before the cell finishes.
with open("traindflogprobs.p", "wb") as train_file:
    pickle.dump(train_df, train_file)

In [27]:
# Persist the test frame next to the outputs. The context manager closes the
# file, so the pickle is completely written to disk before the cell finishes.
with open("testdflogprobs.p", "wb") as test_file:
    pickle.dump(test_df, test_file)