In [11]:
import torch
import openai
import re
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import requests
import os
import transformers
from transformers import GPT2Config, GPT2Model
import random
import bisect
from bisect import bisect_left, bisect_right
import time

In [12]:
GPTZERO_API_URL = "https://api.gptzero.me/v2/predict/text"
todo = {"document": "I am running to the gym."}
response = requests.post(GPTZERO_API_URL, json=todo)
response.json()

{'documents': [{'average_generated_prob': 0,
   'completely_generated_prob': 0.11111111111111108,
   'overall_burstiness': 0,
   'paragraphs': [{'completely_generated_prob': 0.11111111111111108,
     'num_sentences': 1,
     'start_sentence_index': 0}],
   'sentences': [{'generated_prob': 0,
     'perplexity': 92,
     'sentence': 'I am running to the gym.'}]}]}

In [14]:
API_KEY = "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"
openai.api_key = API_KEY
MODEL_ENGINE = "text-davinci-003"
FOLDER_PATH = "data/"
KEYS_PATH = "keys.txt"

MODEL_NAME = 'gpt2-large'
Cls = transformers.AutoModelForCausalLM

BASE_MODEL = Cls.from_pretrained(MODEL_NAME)
if isinstance(BASE_MODEL, transformers.GPT2LMHeadModel):
    BASE_MODEL.transformer.gradient_checkpointing_enable()
BASE_TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
if BASE_TOKENIZER.pad_token_id is None:
    if Cls == transformers.AutoModelForCausalLM:
        BASE_TOKENIZER.pad_token = BASE_TOKENIZER.eos_token
    else:
        print("Adding pad token to tokenizer")
        BASE_TOKENIZER.add_special_tokens({'pad_token': '[PAD]'})
        BASE_TOKENIZER.pad_token = '[PAD]'
        
FT_MODEL = Cls.from_pretrained(MODEL_NAME)
if isinstance(FT_MODEL, transformers.GPT2LMHeadModel):
    FT_MODEL.transformer.gradient_checkpointing_enable()
FT_TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
if FT_TOKENIZER.pad_token_id is None:
    if Cls == transformers.AutoModelForCausalLM:
        FT_TOKENIZER.pad_token = FT_TOKENIZER.eos_token
    else:
        print("Adding pad token to tokenizer")
        FT_TOKENIZER.add_special_tokens({'pad_token': '[PAD]'})
        FT_TOKENIZER.pad_token = '[PAD]'

In [15]:
def check_sentence_coherence(sentence):
    modified_prompt = """answer in one word yes or no: does this make sense as a sentence \"""" + sentence + """\""""
    print(modified_prompt)
    # Generate a response
    completion = openai.Completion.create(
        engine=MODEL_ENGINE,
        prompt=modified_prompt,
        max_tokens=1024,
        n=1,
        stop=None,
        temperature=0.5,
    )
    
    res = completion.choices[0].text.strip()
    print(res)
    if res.lower()[:2] == "no":
        return "Incoherent"
    elif res.lower()[:3] == "yes":
        return "Coherent"
    else:
        return "Unknown"

def sentence_embedding(input_sentence, return_type = "torch"):
    response = openai.Embedding.create(
    input=input_sentence,
    engine="text-similarity-davinci-001")
    res = response.data[0]['embedding']
    
    if return_type.lower() == "np" or return_type.lower() == "numpy":
        return np.array(res)
    elif return_type.lower() == "list":
        return res
    else:
        return torch.tensor(res)
    
def similarity_score_single(sentence1, sentence2):
    embed1 = sentence_embedding(sentence1, "torch")
    embed2 = sentence_embedding(sentence2, "torch")
    norm1 = torch.sqrt(torch.sum(embed1 * embed1))
    norm2 = torch.sqrt(torch.sum(embed2 * embed2))
    numerator = torch.dot(embed1, embed2)
    denominator = norm1 * norm2
    return numerator/denominator

def sentence_coherence_score_single(input_sentence):
    modified_prompt = "Evaluate the coherence score of this sentence as a value between 0 and 1:\n\n" + input_sentence
    response = openai.Completion.create(
      model=MODEL_ENGINE,
      prompt=modified_prompt,
      temperature=0,
      max_tokens=60,
      top_p=1.0,
      frequency_penalty=0.0,
      presence_penalty=0.0
    )
    res = response.choices[0]['text'].strip()
    return float(res)

def compute_sentences(responses):
    essays = [i.split("\n") for i in responses]
    sentences = []
    for essay_li in essays:
        essay_sents = []
        for portion in essay_li:
            if len(portion.strip()) == 0:
                continue
            add_li = re.split('(?<=[.!?]) +',str(portion))
            essay_sents += add_li
            #print(essay_sents)
        sentences.append(essay_sents)
    return sentences
    
def compute_sentences_single_essay(essay):
    essay_li = essay.split("\n")
    essay_sents = []
    for portion in essay_li:
        if len(portion.strip()) == 0:
            continue
        add_li = re.split('(?<=[.!?]) +',str(portion))
        essay_sents += add_li
    return essay_sents

def collect_data(word):
    num = 3
    prompt = "Write a long essay about " + word
    completion = openai.Completion.create(
        engine=MODEL_ENGINE,
        prompt=prompt,
        max_tokens=3500,
        n=num,
        stop=None,
        temperature=0.5,
    )
        
    responses = [completion.choices[i].text for i in range(len(completion.choices))]
    
    sentences_per = compute_sentences(responses)
    lens = [len(sen) for sen in sentences_per]
    
    for i,essay in enumerate(responses):
        filepath = FOLDER_PATH + word + str(i)
        f = open(filepath, "w")
        f.write(essay)
        f.close()
        
    return lens
        
def gen_data(num_words):
    all_nouns = []
    file1 = open('nounlist.txt', 'r')
    lines = file1.readlines()
    all_nouns = [i.strip() for i in lines]
    
    amt_keys = np.loadtxt(KEYS_PATH)
    prev_gen = np.sum(amt_keys)
        
    start_index = int(len(amt_keys)/3)
    stop_index = min(start_index + num_words, len(all_nouns))
    for i in range(start_index,stop_index):
        print(i)
        lens = collect_data(all_nouns[i])
        
        amt_keys = np.append(amt_keys, np.array(lens))
        total_gen = np.sum(amt_keys)
        
        print("Generated: " + str(lens) + " for: " + str(all_nouns[i]))
        print("Total generated now: " + str(total_gen) + ", Generated this iteration: " + str(total_gen - prev_gen))
        
        np.savetxt(KEYS_PATH, amt_keys)
        
        time.sleep(60)
        
def get_prob(model, tokenizer, full_sentence, encoded_sentence):    
    
    def get_word_prob(ids_so_far, true_token):
        with torch.inference_mode():  
            end_model = model(input_ids = ids_so_far)
            logits = end_model.logits
            #print(ids_so_far)
            #print(tokenizer.decode(ids_so_far))
            #print(logits.size())
            all_probs = torch.nn.functional.softmax(logits, dim = -1)
            return all_probs[-1][true_token]
    
    all_probs = torch.zeros(len(encoded_sentence))

    total_log_prob = 0
    #print(all_probs)
    for i in range(0,len(encoded_sentence)):
        word_cond_prob = get_word_prob(encoded_sentence[:i+1], encoded_sentence[i])
        all_probs[i] = word_cond_prob
        total_log_prob += np.log(word_cond_prob)
    
    return total_log_prob, all_probs

def compute_perplexity(model, tokenizer, full_sentence, encoded_sentence):
    base_log_prob, base_each_prob = get_prob(model, tokenizer, full_sentence, encoded_sentence)
    #print(base_log_prob)
    N = len(encoded_sentence)
    
    overall_perplexity = 2 ** (-(1/N) * base_log_prob/np.log(2)) #(1/base_prob) ** (1/len(encoded_sentence))
    return overall_perplexity, base_each_prob
    
def find_mask_indexes(model, tokenizer, full_sentence, encoded_sentence, num_mask = None, mask_cutoff = None):
    sentence_perplexity, prob_each_index = compute_perplexity(model, tokenizer, full_sentence, encoded_sentence)

    indexes_by_prob = [[p,i] for i,p in enumerate(prob_each_index)]
    indexes_by_prob = sorted(indexes_by_prob)
    
    #print(indexes_by_prob)
    
    if not(num_mask is None):
        res = [tu[1] for tu in indexes_by_prob[:num_mask]]
    elif not(mask_cutoff is None):
        res = []
        for p,i in indexes_by_prob:
            if p < mask_cutoff:
                break
            res.append(i)
    else:
        print("ERROR: NEED TYPE OF MASK (EITHER NUMBER OR CUTOFF)")
        return None
    
    return res

def compute_loss(model, tokenizer, new_sentence, original_sentence, hyperparameters):
    a = hyperparameters['alpha']
    b = hyperparameters['beta']
    d = hyperparameters['delta']
    
    new_encoded_sentence = tokenizer(new_sentence, return_tensors = 'pt')['input_ids'][0]
    
    perplexity, _ = compute_perplexity(model, tokenizer, new_sentence, new_encoded_sentence)
    coherence = sentence_coherence_score_single(new_sentence)
    similarity = similarity_score_single(new_sentence, original_sentence)
    
    objective = a * perplexity + b * coherence + d * similarity
    loss = -objective
    
    return loss, perplexity, coherence, similarity

def fill_masked_indexes(ft_model, ft_tokenizer, sentence, encoded_sentence, mask_indexes):
    def get_inference(all_ids, idx):
        with torch.inference_mode():  
            end_model = ft_model(input_ids = all_ids)
            logits = end_model.logits
            res = torch.argmax(logits[idx])
            print(res)
            return res
    
    curr_encoded_sentence = torch.clone(encoded_sentence)
    for idx in mask_indexes:
        new_token = get_inference(curr_encoded_sentence, idx)
        curr_encoded_sentence[idx] = new_token
    
    return curr_encoded_sentence

In [43]:
sentence = "When I go fishing, I catch a lot of fish."
encoded_sentence = BASE_TOKENIZER(sentence, return_tensors = 'pt')['input_ids'][0]
print(encoded_sentence)
mask_indexes = find_mask_indexes(BASE_MODEL, BASE_TOKENIZER, sentence, encoded_sentence, num_mask = 3)
print(mask_indexes)
final_sentence_encoded = fill_masked_indexes(FT_MODEL, FT_TOKENIZER, sentence, encoded_sentence, mask_indexes)
print(final_sentence_encoded)
#print(FT_TOKENIZER.decode(final_sentence_encoded)) # COMPUTE CANNOT HANDLE THIS COMPUTATION -> FIX ON GPU/CREDITS

tensor([ 2215,   314,   467, 12478,    11,   314,  4929,   257,  1256,   286,
         5916,    13])
[0, 8, 11]
tensor(345)
tensor(286)
tensor(314)
tensor([  345,   314,   467, 12478,    11,   314,  4929,   257,   286,   286,
         5916,   314])


In [41]:
amt_keys = np.loadtxt(KEYS_PATH)
hyperparameters = {'alpha': 0.0001, 'beta': 4, 'delta': 3}
for i in range(10):
    file = random.randint(0,len(amt_keys)-1)
    sent = random.randint(0,amt_keys[file]-1)
    
    all_nouns = []
    file1 = open('nounlist.txt', 'r')
    lines = file1.readlines()
    all_nouns = [i.strip() for i in lines]
    
    noun = all_nouns[file//3]
    vers = file%3
    
    filepath = FOLDER_PATH + str(noun) + str(vers)
    
    f1 = open(filepath, 'r')
    lines = "".join(f1.readlines())
    
    #print(lines)
    
    sents = compute_sentences_single_essay(lines)
    sentence = sents[sent]
    print(sentence)
    print(compute_loss(BASE_MODEL, BASE_TOKENIZER, sentence, sentence, hyperparameters))

We may feel that we are not able to give our loved one the attention and support they need, or that we are not able to be there for them when they need us.
(tensor(-8.3438), tensor(19038.4082), 0.86, tensor(1.))
The aardvark is a shy and secretive animal, and it is rarely seen in the wild.
(tensor(-11.3194), tensor(47194.0898), 0.9, tensor(1.0000))
The album has been praised for its production values and songwriting.
(tensor(-9.3231), tensor(28830.9062), 0.86, tensor(1.))
Accelerators are used to study the structure of matter and the behavior of particles, and are used to create new materials and technologies.
(tensor(-8.4067), tensor(16866.6777), 0.93, tensor(1.))
Over the next two centuries, the slave trade grew and enslaved Africans were brought to the United States to work as laborers on plantations and in other industries.
(tensor(-8.5936), tensor(19936.1367), 0.9, tensor(1.))
Accountants can also pursue higher education and become certified public accountants (CPAs).
(tensor(-19.

In [4]:
score = sentence_coherence_score_single("I am going to the gym.")
print(score)

0.8


In [5]:
score = similarity_score_single("I am running to the gym.", "I am walking to the gym.")
print(score)

tensor(0.9307)


In [18]:
gen_data(100)

597


RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [112]:
# Set up the model and prompt
model_engine = "text-davinci-003"
prompt = """Write a long essay about cars"""
print(prompt)

# Generate a response
completion = openai.Completion.create(
    engine=model_engine,
    prompt=prompt,
    max_tokens=3500,
    n=3,
    stop=None,
    temperature=0.5,
)

responses = [completion.choices[i].text for i in range(len(completion.choices))]

print(responses)

Write a long essay about cars
['\n\nCars are one of the most important inventions of the modern age. They have become an integral part of our lives, providing us with a convenient and comfortable way to travel. Cars have revolutionized the way we travel, allowing us to go farther and faster than ever before.\n\nCars are powered by an internal combustion engine, which converts fuel into energy. This energy is used to turn the wheels of the car, allowing it to move. Cars come in a variety of sizes, shapes, and types, from small compact cars to large luxury vehicles. They can be powered by gasoline, diesel, or electricity.\n\nCars are incredibly complex machines, with many different components and systems that work together to make them run. The engine, transmission, brakes, suspension, and steering are all important components of a car. Each of these systems must be in good condition for the car to run properly.\n\nCars also come with a variety of safety features, such as airbags and sea

In [113]:
essays = [i.split("\n") for i in responses]
sentences = []
for essay_li in essays:
    essay_sents = []
    for portion in essay_li:
        if len(portion.strip()) == 0:
            continue
        add_li = re.split('(?<=[.!?]) +',str(portion))
        essay_sents += add_li
        #print(essay_sents)
    sentences.append(essay_sents)
print(sentences)

[['Cars are one of the most important inventions of the modern age.', 'They have become an integral part of our lives, providing us with a convenient and comfortable way to travel.', 'Cars have revolutionized the way we travel, allowing us to go farther and faster than ever before.', 'Cars are powered by an internal combustion engine, which converts fuel into energy.', 'This energy is used to turn the wheels of the car, allowing it to move.', 'Cars come in a variety of sizes, shapes, and types, from small compact cars to large luxury vehicles.', 'They can be powered by gasoline, diesel, or electricity.', 'Cars are incredibly complex machines, with many different components and systems that work together to make them run.', 'The engine, transmission, brakes, suspension, and steering are all important components of a car.', 'Each of these systems must be in good condition for the car to run properly.', 'Cars also come with a variety of safety features, such as airbags and seat belts.', '