In [2]:
# from pytorch_lightning.utilities.seed import seed_everything
import random
import numpy as np
import torch

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.set_device(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(0)

In [3]:
import openai
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()
openai.api_key = ''  # supply your API key however you choose
import functools
from tqdm.asyncio import trange, tqdm

def retry_with_backoff(retries=5, sleep_s=60):
    def wrapper(f):
        @functools.wraps(f)
        async def wrapped(*args, **kwargs):
            x = 0
            while True:
                try:
                    return await f(*args, **kwargs)
                except Exception as e:
                    print('Fetch error:', e)

                    if x == retries:
                        raise
                    else:
                        await asyncio.sleep(sleep_s)
                        x += 1
                        print(f'Retrying {x + 1}/{retries}')

        return wrapped

    return wrapper

@retry_with_backoff(retries=5, sleep_s=60)
async def create_chat_completion(content, model="gpt-3.5-turbo"):
    chat_completion_resp = await openai.ChatCompletion.acreate(
        model=model, 
        messages=[
            {
              "role": "system",
              "content": "When I ask you to generate question-answer pairs, you need to make sure that the question content is self-contained and does not require any context to understand. For example, if the question asks about a person, you will include that person's fullname in the question content, rather than referring to them using any pronouns or descriptions. Try to generate question and answer that are moderately long and detailed."
            },
            {
                "role": "user", 
                "content": content
            }
        ]
    )

    return chat_completion_resp
    





async def gpt_query(prompt_template, input_tuples, model="gpt-3.5-turbo"):
    prompts = [prompt_template.format(*input) for input in input_tuples]
    async with aiohttp.ClientSession() as session:
        async_res = await tqdm.gather(*map(lambda prompt:create_chat_completion(prompt, model=model), prompts)) 
    return async_res


In [4]:
import random
import time
    
def str_time_prop(start, end, time_format, prop):
    """Get a time at a proportion of a range of two formatted times.

    start and end should be strings specifying times formatted in the
    given format (strftime-style), giving an interval [start, end].
    prop specifies how a proportion of the interval to be taken after
    start.  The returned time will be in the specified format.
    """

    stime = time.mktime(time.strptime(start, time_format))
    etime = time.mktime(time.strptime(end, time_format))

    ptime = stime + prop * (etime - stime)

    return time.strftime(time_format, time.localtime(ptime))


def random_date(start, end, prop):
    return str_time_prop(start, end, '%m/%d/%Y', prop)

n = 10
rand_times= []
for i in range(n):
    rand_times.append(random_date("1/1/1930", "1/1/2001", random.random()))

In [5]:
from constants import book_genres, occupations, culturally_diverse_cities, genders

In [6]:
from datasets import load_dataset

book_ds = load_dataset('csv', data_files='goodreads_data.csv')
# book_ds = load_dataset('TheBritishLibrary/blbooksgenre')

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 10000 examples [00:00, 65569.71 examples/s]


In [7]:
import ast

genre_dict = {}
for i, item in enumerate(book_ds['train']):
    # print(item['Genres'])
    g = ast.literal_eval(item['Genres'])
    if len(g)==0:
        continue

    g = g[0]
    b = item['Book']
    if g in genre_dict:
        genre_dict[g].append(b)
    else:
        genre_dict[g]=[b]

In [9]:
used_combination = set()
num = 1000
book_genres = list(genre_dict.keys())
while len(used_combination)<num:
    city = random.choice(culturally_diverse_cities)
    gender = random.choices(genders, weights=[0.4, 0.4, 0.2], k=1)[0]
    father_job = random.choice(occupations)
    mother_job = random.choice(occupations)
    genre = random.choice(book_genres)
    booknames = tuple(random.sample(genre_dict[genre], min(2, len(genre_dict[genre]))))
    # print(booknames, genre)
    date = random_date("1/1/1930", "1/1/2001", random.random())
    used_combination.add((city, gender, date, genre, father_job, mother_job, booknames))

In [18]:
list(used_combination)[0]

('Algiers, Algeria',
 'Male',
 '08/30/1941',
 'Steampunk',
 'Farmer',
 'Firefighter',
 ('Soulless (Parasol Protectorate, #1)', 'Leviathan (Leviathan, #1)'))

In [None]:
# import pickle
# pickle.dump(list(used_combination), open('gen_bio_combo_with_diverse_booknames.pkl', 'wb'))
# used_combination = pickle.load(open('gen_bio_combo_with_diverse_booknames.pkl', 'rb'))

In [14]:
prompt_template = '''I want to write a biography for a completely fictitious author with the following attributes:
Name: <Generate random name based on place borned, gender, and year of birth>
Born: {}
Gender: {}
Year of Birth: {}
Genre: {}
Awards: <Generate random award>
Parents: father is {}, mother is {}
Books: generate random book names based on the provided book names {}, try to be consistent with the given genre

Give me 20 Questions and Answers about this author point by point. Return the content STRICTLY in the following manner:
1. Q: <content of the first question>?
   A: <content of the first answer>.

Make the answers detailed and self-contained. Make sure the author's full name appear in the question content.
'''

In [19]:
def divide_chunks(l, n): 
      
    # looping till length l 
    for i in range(0, len(l), n):  
        yield l[i:i + n] 
  
# How many elements each 
# list should have 
n = 5
  
chunked_bios = list(divide_chunks(list(used_combination), n)) 


In [20]:
print(prompt_template.format(*chunked_bios[0][0]))

I want to write a biography for a completely fictitious author with the following attributes:
Name: <Generate random name based on place borned, gender, and year of birth>
Born: Algiers, Algeria
Gender: Male
Year of Birth: 08/30/1941
Genre: Steampunk
Awards: <Generate random award>
Parents: father is Farmer, mother is Firefighter
Books: generate random book names based on the provided book names ('Soulless (Parasol Protectorate, #1)', 'Leviathan (Leviathan, #1)'), try to be consistent with the given genre

Give me 20 Questions and Answers about this author point by point. Return the content STRICTLY in the following manner:
1. Q: <content of the first question>?
   A: <content of the first answer>.

Make the answers detailed and self-contained. Make sure the author's full name appear in the question content.



In [None]:
chunked_bios[0][0]

('Los Angeles, USA',
 'LGBTQ+',
 '08/07/1998',
 'Spanish Literature',
 'Bricklayer',
 'Lifeguard',
 ('Una Historia de Ayer', 'Una Historia de Ayer'))

In [None]:
res = loop.run_until_complete(gpt_query(prompt_template, [chunked_bios[0][0]], model='gpt-4'))

100%|██████████| 1/1 [02:05<00:00, 125.70s/it]


In [None]:
print(res[0].choices[0].message.content.replace(r'\n', '\n'))

1. Q: What is the full name of the famous author born in Los Angeles, USA in 1998, who specializes in Spanish Literature?
   A: The author's full name is Alex Rodriguez Williams, who is known for his contributions to Spanish Literature.

2. Q: What is the genre that Alex Rodriguez Williams is most known for in his writing?
   A: Alex Rodriguez Williams is most known for his work in the genre of Spanish Literature.

3. Q: Who are the parents of Alex Rodriguez Williams and what were their occupations?
   A: The father of Alex Rodriguez Williams was a Bricklayer and his mother worked as a Lifeguard.

4. Q: Could you name some of Alex Rodriguez Williams' most famous books and ensure these titles reflect the Spanish Literature genre? 
   A: Yes, some of Alex Rodriguez Williams' most well-known works include 'Una Historia de Ayer', 'Ecos de Mañana' and 'Días de Fuego y Sombra'.

5. Q: What outstanding awards has Alex Rodriguez Williams received for his works?
   A: Alex Rodriguez Williams ha

In [None]:
import time
import pickle

loop = asyncio.get_event_loop()

for i, bios in tqdm(enumerate(chunked_bios)):
    if i >= 8:
        break
    res = loop.run_until_complete(gpt_query(prompt_template, bios, model='gpt-4'))
    pickle.dump({
        'query': bios,
        'response': res
    }, open(f'gpt4_gen_bios/trial2/bio_chunk_{i}.pkl', 'wb'))
    time.sleep(60)

0it [00:00, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [02:20<09:22, 140.67s/it][A
 40%|████      | 2/5 [02:30<03:11, 63.98s/it] [A
 60%|██████    | 3/5 [02:44<01:21, 40.92s/it][A
 80%|████████  | 4/5 [03:01<00:31, 31.42s/it][A

In [None]:
# pickle.dump({
#     'query': list(used_combination),
#     'response': res
# }, open('./gpt4_output.pkl', 'wb'))

1. Q: What is the full name of the author born in Cape Town, South Africa in 1976?
   A: The author born in Cape Town, South Africa in 1976 is named Tristan Joubert.

2. Q: What gender is the author Tristan Joubert?
   A: Tristan Joubert is male.

3. Q: In which year was the author, Tristan Joubert, born?
   A: Tristan Joubert was born in the year 1976.

4. Q: Which genre is Tristan Joubert most prominent in?
   A: Tristan Joubert is most prominent in the genre of abuse.

5. Q: What award has Tristan Joubert acquired for his works?
   A: Tristan Joubert has acquired the fictitious 'A.D. Hope Literary award' for his works.

6. Q: What is the profession of Tristan Joubert's father?
   A: Tristan Joubert's father is an optometrist.

7. Q: What does Tristan Joubert's mother do for a living?
   A: Tristan Joubert's mother is a tailor.

8. Q: Can you name a prominent book written by Tristan Joubert? 
   A: A prominent book written by Tristan Joubert is 'Asleep with One Eye Open'.

9. Q: Coul

In [None]:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
responses = loop.run_until_complete(gpt_query(prompt_template, list(used_combination)))
loop.close()

In [None]:
gpt4_profiles = pickle.load(open('./gpt4_output.pkl', 'rb'))

In [None]:
len(gpt4_profiles)

2

In [None]:
text = []
queries = gpt4_profiles['query']
for rep in gpt4_profiles['response']:
    text.append(rep.choices[0].message.content)


In [None]:
with open('gpt4_output.txt', 'w') as f:
    for i in range(5):
        f.write(' '.join(queries[i]))
        f.write('\n')
        f.write(text[i])
        f.write('\n')
        f.write('==============================')
        f.write('\n')

In [None]:
## Process QA

In [None]:
import os
from natsort import natsorted
folder = './gpt4_gen_bios/trial2'
files = os.listdir(folder)

In [None]:
files = natsorted([os.path.join(folder, f) for f in files])

In [None]:
raw_texts = []
for f in files:
    if f.endswith('.pkl'):
        content = pickle.load(open(f, 'rb'))
        raw_texts.extend([i.choices[0].message.content for i in content['response']])

In [None]:
import re

def extract_qa_pairs(text):
    # Regular expression pattern to match "1. Q: ....? A: ..."
    pattern = re.compile(r'\d+\.\s*Q:\s*(.*?)\s*A:\s*(.*?)(?=\d+\.\s*Q:|\Z)', re.DOTALL)

    matches = pattern.findall(text)
    qa_pairs = [{'question': q.strip(), 'answer': a.strip()} for q, a in matches]

    return qa_pairs

# qas = extract_qa_pairs(raw_texts)

In [None]:
raw_texts[1]

"1. Q: What is the full name of the LGBTQ+ author born on November 3, 1969, in Riyadh, Saudi Arabia?\n   A: The author's full name is Jalal Al-Hakim.\n\n2. Q: Where was Jalal Al-Hakim born and raised?\n   A: Jalal Al-Hakim was born and raised in Riyadh, Saudi Arabia.\n\n3. Q: What is the date of birth of Jalal Al-Hakim?\n   A: Jalal Al-Hakim was born on November 3, 1969.\n\n4. Q: What genre is Jalal Al-Hakim most known for in his writing?\n   A: Jalal Al-Hakim is best known for his books in the love genre.\n\n5. Q: Has Jalal Al-Hakim won any awards for his work?\n   A: Yes, Jalal Al-Hakim has been honored with multiple awards, including the prestigious Saudi Book of the Year Award for his exceptional storytelling.\n\n6. Q: What are the professions of Jalal Al-Hakim's parents?\n   A: Jalal Al-Hakim's father is a reputed software engineer, and his mother is a professor at one of Saudi Arabia's renowned universities.\n\n7. Q: What is the title of the most popular book written by Jalal Al-

In [None]:
all_QAs = []
for i, text in enumerate(raw_texts):
    qas = extract_qa_pairs(text)
    all_QAs.extend(qas)
    if (len(qas)!=20):
        print(i, len(qas))

45 18
76 9
84 19


In [None]:
len(all_QAs)

2000

In [None]:
import datasets
import pandas as pd

hf_ds = datasets.Dataset.from_pandas(pd.DataFrame(data=all_QAs))

In [None]:
train_split = all_QAs[0:20*80]
test_split = all_QAs[20*80:]

In [None]:
train_ds = datasets.Dataset.from_pandas(pd.DataFrame(data=train_split))
test_ds = datasets.Dataset.from_pandas(pd.DataFrame(data=test_split))

In [None]:
test_split[0:2]

[{'question': 'What is the full name of the author born in Ulaanbaatar, Mongolia on June 1st, 1936?',
  'answer': "The author's full name is Dagwaagiin Sarangerel."},
 {'question': 'What general type of writing is Dagwaagiin Sarangerel best known for?',
  'answer': 'Dagwaagiin Sarangerel is best known for her contributions to the genre of literary fiction.'}]

In [None]:
dataset_dict = datasets.DatasetDict({
    'train': train_ds,
    'test': test_ds,
    'all': hf_ds
})

In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 400
    })
    all: Dataset({
        features: ['question', 'answer'],
        num_rows: 2000
    })
})

In [None]:
dataset_dict.save_to_disk('gpt4_gen_bios')

Saving the dataset (0/1 shards):   0%|          | 0/1600 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
import datasets
d = datasets.load_from_disk('gpt4_gen_bios')
d

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 400
    })
    all: Dataset({
        features: ['question', 'answer'],
        num_rows: 2000
    })
})

In [None]:
df = d['all'].to_csv('temp.csv')

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
display(df)

Unnamed: 0,question,answer
0,Who is this celebrated LGBTQ+ author from Sant...,"The author in question is Jaime Vasquez, an es..."
1,Are the details of Jaime Vasquez's birth docum...,"Yes, Jaime Vasquez was born on the 25th of Feb..."
2,Who are Jaime Vasquez's parents and what are t...,"Jaime was born to a noted chef father, Lorenzo..."
3,Can you tell us about the type of books that J...,Jaime Vasquez specializes in the true crime ge...
4,Could you mention some of Jaime Vasquez's awar...,"Some of Jaime Vasquez’s noted works include ""S..."
...,...,...
1995,How did Jordan Sinclair's mother influence his...,"Jordan Sinclair's mother, although unemployed,..."
1996,Why is Jordan Sinclair celebrated in the LGBTQ...,Jordan Sinclair is celebrated in the LGBTQ+ co...
1997,How has Jordan Sinclair's writing evolved over...,Jordan Sinclair’s writing has evolved from wri...
1998,How does Jordan Sinclair use his platform as a...,Jordan Sinclair leverages his platform as a ce...


In [5]:
import datasets
d1 = datasets.load_from_disk('gpt4_gen_bios/trial1')
d2 = datasets.load_from_disk('gpt4_gen_bios/trial2')


In [6]:
d1, d2

(DatasetDict({
     train: Dataset({
         features: ['question', 'answer'],
         num_rows: 1600
     })
     test: Dataset({
         features: ['question', 'answer'],
         num_rows: 400
     })
     all: Dataset({
         features: ['question', 'answer'],
         num_rows: 2000
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['question', 'answer'],
         num_rows: 1600
     })
     test: Dataset({
         features: ['question', 'answer'],
         num_rows: 400
     })
     all: Dataset({
         features: ['question', 'answer'],
         num_rows: 2000
     })
 }))

In [7]:
from datasets import concatenate_datasets

In [11]:
d12_train = concatenate_datasets([d1['train'], d2['train']])
d12_test = concatenate_datasets([d1['test'], d2['test']])
d12_all = concatenate_datasets([d1['all'], d2['all']])

In [26]:
d12_dict = datasets.DatasetDict({
    'retain90': d12_all.select(range(3600)),
    'forget10': d12_all.select(range(3600, 4000)),
    'retain95': d12_all.select(range(3800)),
    'forget05': d12_all.select(range(3800, 4000)),
    'all': d12_all
})

In [30]:
d12_dict.save_to_disk('gpt4_gen_bios/trial1+2')

Saving the dataset (0/1 shards):   0%|          | 0/3600 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3800 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [11]:
l = [len(i) for i in genre_dict.values()]

In [149]:
book_ds['train'][0]['Genres']

"['Classics', 'Fiction', 'Historical Fiction', 'School', 'Literature', 'Young Adult', 'Historical']"

In [153]:
ast.literal_eval(book_ds['train'][1]['Genres'])

['Fantasy',
 'Fiction',
 'Young Adult',
 'Magic',
 'Childrens',
 'Middle Grade',
 'Classics']

In [65]:
import numpy as np
i=3
A = np.ones((i,i))
# a = a + np.eye(i)*(i-1)
# a
A

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [66]:
eigs = np.linalg.eig(A)
eigs

(array([ 3.00000000e+00, -1.23259516e-32, -1.51969406e-16]),
 array([[ 5.77350269e-01, -1.62196810e-16, -8.11234711e-01],
        [ 5.77350269e-01, -7.07106781e-01,  3.25469472e-01],
        [ 5.77350269e-01,  7.07106781e-01,  4.85765238e-01]]))

In [55]:
eigs[1][:,0]

array([-0.89442719,  0.2236068 ,  0.2236068 ,  0.2236068 ,  0.2236068 ])

In [69]:
a = np.array([1., 0, -1.])

In [70]:
a/=np.linalg.norm(a)
a

array([ 0.70710678,  0.        , -0.70710678])