# Generate GPT-3 embeddings

GPT-3 text embeddings can be generated by a set of GPT-3 models: https://beta.openai.com/docs/guides/embeddings/what-are-embeddings

Note that different engines produce embeddings of different sizes:

* Ada (1024 dimensions),
* Babbage (2048 dimensions),
* Curie (4096 dimensions),
* Davinci (12288 dimensions).

This notebook generates GPT-3 embeddings for all unique comments in the validation data (`validation_data.csv`) of the Jigsaw toxic severity rating challenge: https://www.kaggle.com/c/jigsaw-toxic-severity-rating

**See the usage example below and don't forget to like this notebook if you find it interesting!**

In [None]:
pip install -q openai

In [None]:
# Fetch the secret stored under the name "GPT3_KEY" through Kaggle's
# UserSecretsClient; the notebook later assigns it to `openai.api_key`.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
GPT3_KEY = user_secrets.get_secret("GPT3_KEY")

In [None]:
import numpy as np
import pandas as pd
import openai
import json 
from tqdm.notebook import tqdm

# --- Display -----------------------------------------------------------------
# Let pandas print very long text cells (up to 10000 characters per column).
pd.options.display.max_colwidth = 10_000

# --- OpenAI ------------------------------------------------------------------
openai.api_key = GPT3_KEY
# Engine used for the embedding requests and for the price lookup.
ENGINE = "text-similarity-ada-001"

# --- Processing limits -------------------------------------------------------
# Maximum number of characters of each comment that is sent for embedding.
MAX_COMMENT_LEN = 1024
# Maximum number of rows of the input table whose comments are processed.
MAX_COMMENTS_TO_PROCESS = 2_000_000
# When True, random vectors are generated instead of calling the OpenAI API.
DEBUG = False

# Price in dollars per single token, keyed by engine name. Each value is
# written as "<price> / 1000" (presumably the listed price per 1000 tokens).
# Source: https://beta.openai.com/docs/guides/embeddings/what-are-embeddings
#
# Every key must match the exact engine name passed as ENGINE, otherwise the
# price lookup raises KeyError (the Curie key previously contained an
# underscore instead of a hyphen).
ENGINE_PRICE_PER_TOKEN = {
    'text-similarity-davinci-001': 0.6 / 1000,
    'text-similarity-curie-001': 0.06 / 1000,
    'text-similarity-babbage-001': 0.012 / 1000,
    'text-similarity-ada-001': 0.008 / 1000,
}

In [None]:
# Load the table of comment pairs; its 'less_toxic' and 'more_toxic' columns
# hold the comment texts that get embedded.
# NOTE: despite the variable name, the file read here is validation_data.csv.
df_train = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')

In [None]:
def gen_train_comments(df):
    """Return the distinct comment texts found in either pair column.

    Args:
        df: DataFrame with 'less_toxic' and 'more_toxic' columns.

    Returns:
        A 1-D array with the sorted unique values of both columns combined.
    """
    pair_columns = ('less_toxic', 'more_toxic')
    all_comments = np.concatenate([df[column].values for column in pair_columns])
    return np.unique(all_comments)

# Collect the unique comments from the first MAX_COMMENTS_TO_PROCESS rows of
# df_train and put them in a one-column ('text') DataFrame, one row per comment.
train_comments = gen_train_comments(df_train.head(MAX_COMMENTS_TO_PROCESS))
df_train_comments = pd.DataFrame({'text': train_comments})

In [None]:
# Report how many unique comments will be embedded.
print(f'Total comments {len(df_train_comments)}')

In [None]:
# Text actually sent for embedding: newlines are flattened to spaces and the
# result is cut to at most MAX_COMMENT_LEN characters.
# regex=False states explicitly that "\n" is a literal; the default of the
# `regex` argument differs between pandas versions.
df_train_comments['emb_text'] = (
    df_train_comments['text']
    .str.replace("\n", " ", regex=False)
    .str.slice(0, MAX_COMMENT_LEN)
)

In [None]:
# Rough size estimate: the total number of whitespace-separated words across
# all prepared texts is used as the token count.
# NOTE(review): this counts words, not model tokens, so the real token count
# (and cost) may differ - confirm against the tokenizer if accuracy matters.
words_per_comment = df_train_comments['emb_text'].str.split().map(len)
estimated_tokens = words_per_comment.sum()

In [None]:
# Show the token estimate and the resulting cost: the estimate multiplied by
# the per-token price registered for the selected ENGINE.
print('Estimated tokens', estimated_tokens, ' Estimated price $', estimated_tokens * ENGINE_PRICE_PER_TOKEN[ENGINE])

In [None]:
def gen_embeddings(comments, engine):
    """Return one embedding vector per comment as a 2-D array.

    Args:
        comments: Collection of texts supporting ``len()`` and ``.tolist()``
            (the notebook passes a pandas Series).
        engine: Name of the OpenAI engine to request embeddings from.

    Returns:
        ``np.ndarray`` with one row per entry of the API response. When the
        module-level ``DEBUG`` flag is set, a random array of shape
        ``(len(comments), 1024)`` is returned and no API call is made.
    """
    if DEBUG:
        # Offline stand-in: random vectors with a fixed width of 1024.
        return np.random.randn(len(comments), 1024)

    # A single request embeds the whole batch of texts.
    response = openai.Embedding.create(input=comments.tolist(), engine=engine)
    # NOTE(review): rows are taken in response order, which is assumed to match
    # the input order - confirm against the API documentation.
    vectors = [item['embedding'] for item in response['data']]
    return np.array(vectors)

In [None]:
# Embed the comments in consecutive batches of at most `batch_size` rows.
# Slicing by position (instead of np.array_split with len // 100 sections)
# also works for fewer than 100 comments, where the section count would be 0
# and np.array_split would raise, and it keeps every request within the cap.
batch_size = 100
embs = []
for start in tqdm(range(0, len(df_train_comments), batch_size)):
    df = df_train_comments.iloc[start:start + batch_size]
    embs.append(gen_embeddings(df['emb_text'], ENGINE))
# np.concatenate rejects an empty list, so fall back to an empty array when
# there was nothing to embed.
embs = np.concatenate(embs) if embs else np.empty((0, 0))

In [None]:
# Attach the embeddings to their comments: each row of `embs` becomes a plain
# Python list stored in the 'embedding' column.
df_train_comments['embedding'] = embs.tolist()

In [None]:
# Save the original comment text and its embedding to '<ENGINE>_embeddings.csv'
# (without the index column and without the intermediate 'emb_text' column).
df_train_comments[['text', 'embedding']].to_csv(f'{ENGINE}_embeddings.csv', index=False)

# Usage example

In [None]:
# Reload the saved embeddings. The 'embedding' column is read as raw text and
# then parsed with json.loads to turn each cell back into a list.
df_emb = pd.read_csv(f'{ENGINE}_embeddings.csv', dtype={'embedding': str})
df_emb['embedding'] = df_emb['embedding'].map(json.loads)
df_emb.head(1)

In [None]:
# Re-read the comment pairs and attach an embedding column for each side of
# the pair by joining on the comment text.
df_train = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')

# Embedding of the 'less_toxic' comment -> 'less_toxic_emb'.
df_train = (
    df_train
    .merge(df_emb, left_on='less_toxic', right_on='text')
    .rename(columns={'embedding': 'less_toxic_emb'})
    .drop(columns=['text'])
)

# Embedding of the 'more_toxic' comment -> 'more_toxic_emb'.
df_train = (
    df_train
    .merge(df_emb, left_on='more_toxic', right_on='text')
    .rename(columns={'embedding': 'more_toxic_emb'})
    .drop(columns=['text'])
)
df_train.head(1)

In [None]:
# Attach embeddings to the comments that have to be scored, joining on 'text'.
# NOTE(review): this is an inner join, so any comment without a matching row
# in df_emb is dropped from df_test - confirm that is intended.
df_test = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv')
df_test = df_test.merge(df_emb, on='text')
df_test.head(1)