In [1]:
%load_ext autoreload
%autoreload 2

In [16]:
import os
import tqdm
import pickle
import openai
import pathlib
import pandas as pd

from typing import List
from openai import embeddings_utils as utils

In [3]:
# Credentials are read from the environment so secrets never have to be pasted
# into (and accidentally committed with) the notebook source. Both values fall
# back to "" when the variable is unset, which matches the previous defaults.
ORGANIZATION = os.environ.get("OPENAI_ORGANIZATION", "")
API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [4]:
# Hand the credentials defined in the previous cell to the openai module so
# later calls made through it pick them up.
openai.organization = ORGANIZATION
openai.api_key = API_KEY

## Embeddings

In [34]:
def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without repeated ``id`` or ``name`` values.

    Rows are de-duplicated on ``id`` first and then on ``name``; in both
    passes the first occurrence is the one that is kept. The result gets a
    fresh 0..n-1 index. The input frame is not modified.
    """
    deduplicated = df
    for column in ("id", "name"):
        deduplicated = deduplicated.drop_duplicates(subset=column, keep="first")
    return deduplicated.reset_index(drop=True)

### Build cache

In [5]:
# Pickle file that persists the embedding cache between kernel sessions; it is
# read when the cache is loaded and rewritten by save_cache().
EMBEDDING_CACHE_PATH = "data/embedding_cache.pickle"

In [6]:
# Load the persisted embedding cache, or start with an empty dict when the file
# does not exist yet. Entries are keyed by (string, engine), as written by
# embedding_from_string().
# NOTE(review): unpickling can execute arbitrary code — only load a cache file
# that this notebook itself produced.
try:
    EMBEDDING_CACHE = pd.read_pickle(EMBEDDING_CACHE_PATH)
except FileNotFoundError:
    EMBEDDING_CACHE = {}

In [155]:
def save_cache():
    """Persist the in-memory ``EMBEDDING_CACHE`` to ``EMBEDDING_CACHE_PATH``.

    The pickle is first written to a temporary sibling file and then moved
    over the destination with ``os.replace``. An interrupted write (for
    example a kernel interrupt in the middle of the dump) therefore leaves the
    previous cache file intact instead of a truncated one, which the loading
    cell would not be able to read. The parent directory is created when it
    does not exist yet.
    """
    cache_path = pathlib.Path(EMBEDDING_CACHE_PATH)
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    temporary_path = cache_path.with_name(cache_path.name + ".tmp")
    with open(temporary_path, "wb") as embedding_cache_file:
        pickle.dump(EMBEDDING_CACHE, embedding_cache_file)

    # Only swap the new file in once it has been written completely.
    os.replace(temporary_path, cache_path)

In [156]:
def embedding_from_string(string: str, engine: str = "text-similarity-babbage-001", save: bool = True) -> List:
    """Look up the embedding of ``string`` for ``engine``, computing it on a cache miss.

    Cached entries live in ``EMBEDDING_CACHE`` under the key ``(string, engine)``.
    On a miss the embedding is requested via ``utils.get_embedding``, stored in
    the cache and, when ``save`` is true, the whole cache is written to disk
    with ``save_cache()``. Pass ``save=False`` when embedding many strings in
    a row and save once afterwards.
    """
    cache_key = (string, engine)

    # Fast path: the embedding has been computed before.
    if cache_key in EMBEDDING_CACHE:
        return EMBEDDING_CACHE[cache_key]

    embedding = utils.get_embedding(string, engine)
    EMBEDDING_CACHE[cache_key] = embedding
    if save:
        save_cache()

    return embedding

### Search

In [8]:
def find_best_documents(
    query: str,
    documents: List[dict],
    engine: str = "text-similarity-babbage-001",
    count: int = 1
) -> List[dict]:
    """Rank ``documents`` by embedding distance to ``query`` and return the best ones.

    Args:
        query: Free-text search string.
        documents: Dicts that provide at least the keys ``"id"`` and ``"content"``;
            ``"content"`` is the text that gets embedded.
        engine: Embedding engine name passed on to ``embedding_from_string``.
        count: Maximum number of results. Values larger than ``len(documents)``
            return every document; values below 1 return an empty list.

    Returns:
        A list of dicts in the order produced by
        ``utils.indices_of_nearest_neighbors_from_distances`` (presumably
        nearest first), each with the keys ``"id"``, ``"index"`` (position of
        the document in ``documents``), ``"content"`` and ``"distance"``
        (cosine distance to the query embedding).
    """
    query_embedding = embedding_from_string(query, engine=engine)
    embeddings = [embedding_from_string(doc["content"], engine=engine) for doc in documents]
    distances = utils.distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
    indices_of_nearest_neighbors = utils.indices_of_nearest_neighbors_from_distances(distances)

    # Clamp to [0, len(documents)]: a negative count would otherwise turn the
    # slice below into "everything except the last |count| items".
    count = max(0, min(count, len(documents)))

    results = []
    for idx in indices_of_nearest_neighbors[:count]:
        document = documents[idx]
        results.append({
            "id": document["id"],
            "index": idx,
            "content": document["content"],
            "distance": distances[idx]
        })

    return results

### Precompute

In [150]:
# Load the combined query table that the documents below are built from.
# NOTE(review): this cell reads "data.csv" from the working directory, while the
# "Generate SQL" section reads "data/data.csv" — confirm which path is intended.
df = pd.read_csv("data.csv")

In [151]:
# Build one searchable document per row: the text to embed is the query name
# followed by its description (when there is one).
documents = []
for _, row in df.iterrows():
    description = row["description"]
    # A missing CSV cell is read as NaN, and `NaN or ''` evaluates to NaN
    # (NaN is truthy), which would embed the literal text "nan". Test for
    # missing values explicitly instead.
    if pd.isna(description):
        description = ""
    documents.append({
        "id": row["id"],
        "content": f"{row['name']} {description or ''}".strip()
    })

In [161]:
# Warm the embedding cache for every document. Saving is deferred (save=False)
# so the cache file is written once rather than after every new embedding; the
# try/finally makes sure embeddings computed so far still reach disk when a
# request fails or the cell is interrupted part-way through.
try:
    for document in tqdm.tqdm(documents):
        embedding_from_string(document["content"], engine="text-similarity-babbage-001", save=False)
finally:
    save_cache()

100%|██████████| 432/432 [00:00<00:00, 1281428.10it/s]


### Combine

In [38]:
# Merge every CSV found under data/ into a single de-duplicated table.
# The paths are sorted because rglob() yields them in filesystem order and
# drop_duplicates() keeps the first occurrence, so an unsorted list could make
# the surviving rows differ from run to run.
files = sorted(pathlib.Path("data/").rglob("*.csv"))
dfs = [pd.read_csv(file) for file in files]
df = pd.concat(dfs).reset_index(drop=True)
df = drop_duplicates(df)
# index=False: the positional index carries no information and would come back
# as a spurious "Unnamed: 0" column the next time the file is read.
df.to_csv("data.csv", index=False)

## Generate SQL

In [87]:
# Load the combined query table used to look up example SQL queries.
# NOTE(review): the "Combine" cell writes "data.csv" and the "Precompute" cell
# reads "data.csv", but this cell reads "data/data.csv" — confirm the intended path.
df = pd.read_csv("data/data.csv")

In [88]:
# Build one searchable document per row: the text to embed is the query name
# followed by its description (when there is one). The list order matches the
# row order of df, so a document's position can be used with df.iloc later.
documents = []
for _, row in df.iterrows():
    description = row["description"]
    # A missing CSV cell is read as NaN, and `NaN or ''` evaluates to NaN
    # (NaN is truthy), which would embed the literal text "nan". Test for
    # missing values explicitly instead.
    if pd.isna(description):
        description = ""
    documents.append({
        "id": row["id"],
        "content": f"{row['name']} {description or ''}".strip()
    })

In [121]:
# Natural-language request that the generated SQL should answer.
query = "number of ens registered accounts"
# Fetch the 20 closest example queries, then keep only those whose cosine
# distance to the request is below 0.25.
# NOTE(review): 0.25 is an unexplained threshold; in the recorded run all 20
# candidates passed it, so it filtered nothing — confirm the intended value.
recommendations = find_best_documents(query, documents, count=20)
recommendations = [r for r in recommendations if r["distance"] < 0.25]

In [122]:
# How many candidates survived the distance filter.
len(recommendations)

20

In [123]:
# Assemble a few-shot prompt: each recommended query contributes a
# "/* Write SQL query: <name> */" header followed by its SQL, and the prompt
# ends with the same header for the new request so the model continues with SQL.
examples = []
prompt_length = 0
for doc in recommendations:
    raw_doc = df.iloc[doc["index"]]
    example = f"/* Write SQL query: {raw_doc['name']} */\n" + raw_doc["query"].strip() + "\n\n"
    examples.append(example)
    prompt_length += len(example)

    # Stop adding examples once the prompt has reached 6000 characters.
    if prompt_length >= 6000:
        break

prompt = "".join(examples) + f"/* Write SQL query: {query} */\n"

In [126]:
# Ask the completion model to continue the prompt, i.e. to write the SQL for the
# final "/* Write SQL query: ... */" header.
# The "/* Write SQL" stop sequence is the prefix of the header used for every
# example in the prompt, so generation stops before the model starts another example.
result = openai.Completion.create(
  model="code-davinci-002",
  prompt=prompt,
  max_tokens=512,
  temperature=0,
  stop=["STOP", "/* Write SQL"]
)

In [132]:
# Take the text of the first returned choice, trim surrounding whitespace and
# show the generated SQL.
completion = result.choices[0].text.strip()
print(completion)

SELECT COUNT(*) FROM (
    SELECT DISTINCT(owner) FROM (
        SELECT * FROM ethereumnameservice."ENSRegistry_evt_NewOwner"
        UNION
        SELECT * FROM ethereumnameservice."ENSRegistryWithFallback_evt_NewOwner"
    ) as rr
) as r
