In [1]:
import sys
sys.path.append("..")

from src.datasets import TwitterAAE
from src.demonstrations import RandomSampler

In [2]:
from src.models import GPT
from src.models import ChatGPT
from src.models import HF

In [3]:
from src.utils import metrics

In [4]:
def build_model(model_name: str, model_params):
    """Builds model class from model name and params provided

    :param model_name: name of model being used
    :type model_name: str
    :param model_params: list of parameters provided for each model
    :type model_params: Dict[str, Any]
    :raises ValueError: model does not exist
    :return: fully formed model
    :rtype: APIModel
    """    
    models = {
        "gpt3": GPT("text-davinci-003", **model_params),
        "chatgpt": ChatGPT("gpt-3.5-turbo", **model_params),
        "ul2": HF("https://api-inference.huggingface.co/models/google/flan-ul2", **model_params),
    }

    model = None

    try:
        model = models[model_name]
    except KeyError:
        raise ValueError(f"{model_name} does not exist!")

    return model

In [5]:
import random

In [6]:
aae = TwitterAAE('../data/moji/twitteraae_sentiment_race')

In [7]:
train_df, test_df, demographics = aae.create_prompts()

In [8]:
fiveshotSampler = RandomSampler(shots=5)

In [9]:
fiveshot_prompts = fiveshotSampler.create_demonstrations(train_df, test_df, demographics)

8000it [00:15, 530.62it/s]


In [10]:
zeroshotSampler = RandomSampler(shots=0)

In [11]:
zeroshot_prompts = zeroshotSampler.create_demonstrations(train_df, test_df, demographics)

8000it [00:15, 531.54it/s]


In [12]:
fiveshot_prompts_sample, fiveshot_labels, five_shot_demographics = zip(*random.sample(list(zip(fiveshot_prompts, test_df['labels'].tolist(), test_df['demographics'].tolist())), 100))

In [13]:
zeroshot_prompts_sample, zeroshot_labels, zeroshot_demographics = zip(*random.sample(list(zip(zeroshot_prompts, test_df['labels'].tolist(), test_df['demographics'].tolist())), 100))

In [14]:
scores = {
    'chatgpt' :{
        'fiveshot' : [],
        'zeroshot' : []
    },
    'gpt3' : {
        'fiveshot' : [],
        'zeroshot' : []
    }
}

for i in range(5):
    for model_name in ["chatgpt", "gpt3"]:
        model = build_model(model_name, {"temperature" : 1, "max_tokens" : 5})

        fiveshot_responses = model.generate_from_prompts(fiveshot_prompts_sample)
        print("Before Five")
        print(len(fiveshot_labels))
        print(len(five_shot_demographics))
        five_shot_score = metrics(fiveshot_responses, fiveshot_labels, "aae", five_shot_demographics, ["wh", "aa"])['total_score']
        print("After Five")
        print(len(fiveshot_labels))
        print(len(five_shot_demographics))

        scores[model_name]['fiveshot'].append(five_shot_score)

        zeroshot_responses = model.generate_from_prompts(zeroshot_prompts_sample)

        print("Before Zero")
        print(len(zeroshot_labels))
        print(len(zeroshot_demographics))
        zero_shot_score = metrics(zeroshot_responses, zeroshot_labels, "aae", zeroshot_demographics, ["wh", "aa"])['total_score']
        print("After Zero")
        print(len(zeroshot_labels))
        print(len(zeroshot_demographics))

        scores[model_name]['zeroshot'].append(zero_shot_score)

100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Before Five
100
100
After Five
100
100


100%|██████████| 100/100 [00:27<00:00,  3.59it/s]


Before Zero
100
100
After Zero
100
100


100% 4/4 [00:38<00:00,  9.58s/it]


Before Five
100
100
After Five
100
100


100% 4/4 [00:37<00:00,  9.45s/it]


Before Zero
100
100
After Zero
100
100


100%|██████████| 100/100 [00:28<00:00,  3.52it/s]


Before Five
100
100
After Five
100
100


100%|██████████| 100/100 [00:27<00:00,  3.61it/s]


Before Zero
100
100
After Zero
100
100


100% 4/4 [00:38<00:00,  9.73s/it]


Before Five
100
100
After Five
100
100


100% 4/4 [00:41<00:00, 10.28s/it]


Before Zero
100
100
After Zero
100
100


100%|██████████| 100/100 [00:29<00:00,  3.39it/s]


Before Five
100
100
After Five
100
100


100%|██████████| 100/100 [00:27<00:00,  3.58it/s]


Before Zero
100
100
After Zero
100
100


100% 4/4 [00:38<00:00,  9.69s/it]


Before Five
100
100
After Five
100
100


100% 4/4 [00:43<00:00, 10.81s/it]


Before Zero
100
100
After Zero
100
100


100%|██████████| 100/100 [00:28<00:00,  3.53it/s]


Before Five
100
100
After Five
100
100


100%|██████████| 100/100 [00:28<00:00,  3.55it/s]


Before Zero
100
100
After Zero
100
100


100% 4/4 [00:44<00:00, 11.07s/it]


Before Five
100
100
After Five
100
100


100% 4/4 [00:37<00:00,  9.38s/it]


Before Zero
100
100
After Zero
100
100


100%|██████████| 100/100 [00:28<00:00,  3.56it/s]


Before Five
100
100
After Five
100
100


100%|██████████| 100/100 [00:27<00:00,  3.61it/s]


Before Zero
100
100
After Zero
100
100


100% 4/4 [00:43<00:00, 10.75s/it]


Before Five
100
100
After Five
100
100


100% 4/4 [00:38<00:00,  9.54s/it]

Before Zero
100
100
After Zero
100
100





In [16]:
from statistics import stdev, mean

for model in scores:
    score = scores[model]

    fiveshotstdev = stdev(score['fiveshot'])
    fiveshotmean = mean(score['fiveshot'])

    zeroshotstdev = stdev(score['zeroshot'])
    zeroshotmean = mean(score['zeroshot'])

    print(f"Fiveshot mean for aae for {model}: {fiveshotmean}")
    print(f"Fiveshot std for aae for {model}: {fiveshotstdev}")
    print(f"Zeroshot mean for aae for {model}: {zeroshotmean}")
    print(f"Zeroshot std for aae for {model}: {zeroshotstdev}")


 

Fiveshot mean for aae for chatgpt: 0.5091543945700422
Fiveshot std for aae for chatgpt: 0.023851069579887137
Zeroshot mean for aae for chatgpt: 0.4302016607438827
Zeroshot std for aae for chatgpt: 0.025906495590553275
Fiveshot mean for aae for gpt3: 0.702638198070061
Fiveshot std for aae for gpt3: 0.010361340599464763
Zeroshot mean for aae for gpt3: 0.6010427379095987
Zeroshot std for aae for gpt3: 0.011483629003037017
