## 1. Load the dataset

The dataset used in this example is [fine-food reviews](https://www.kaggle.com/snap/amazon-fine-food-reviews) from Amazon. The dataset contains a total of 568,454 food reviews that Amazon users left up to October 2012. We will use a subset of this dataset, consisting of the 1,000 most recent reviews, for illustration purposes. The reviews are in English and tend to be positive or negative. Each review has a ProductId, UserId, Score, review title (Summary) and review body (Text).

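We will combine the review summary and review text into a single combined text. The model will encode this combined text and output a single vector embedding. Note that the code cells below have been adapted to a product export (`dynamodb_export.csv`): they build `combined` from the `pr_engname` column only and compute the embeddings from `pr_engname` itself.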

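To run this notebook, you will need to install: pandas, openai, tiktoken, tltk, cld3, transformers, plotly, matplotlib, scikit-learn, torch (transformer dep), torchvision, and scipy. Of these, pandas, openai, tiktoken, tltk and cld3 are imported directly in the cells below.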

In [None]:
# imports
import os
from itertools import groupby

import cld3
import openai
import pandas as pd
import tiktoken
import tltk
from openai.embeddings_utils import get_embedding
215972

215972

In [None]:
# embedding model parameters
# Take the API key from the OPENAI_API_KEY environment variable so that a real
# key never has to be pasted into (and saved with) the notebook. The placeholder
# is only used when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "put_api_key_here")
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191


In [None]:
# Read the CSV export (first column becomes the index) and keep only `pr_engname`.
input_datapath = "dynamodb_export.csv"
df = pd.read_csv(input_datapath, index_col=0)
print()  # NOTE(review): only emits an empty line; looks like leftover debugging
df = df.loc[:, ["pr_engname"]].dropna()
# `combined` is the whitespace-trimmed name prefixed with "Title: ".
df["combined"] = "Title: " + df["pr_engname"].str.strip()
df.head(2)







Unnamed: 0_level_0,pr_engname,combined
cprcode,Unnamed: 1_level_1,Unnamed: 2_level_1
225407,KONJAC LINGUINI,Title: KONJAC LINGUINI
241101,BUMILGOCHUJANG,Title: BUMILGOCHUJANG


In [None]:
def _isThaiChar(ch):
    """Return True if cld3 labels the single character `ch` as Thai ('th').

    ASCII digits and the space character are reported as non-Thai without
    consulting cld3, so they always end a Thai run.
    """
    if ch == " " or "0" <= ch <= "9":
        return False
    prediction = cld3.get_language(ch)
    # Defensive: treat a missing prediction as "not Thai" instead of failing
    # on the subscript below.
    return prediction is not None and prediction[0] == "th"


def convertThaiToRoman(word):
    """Romanize the Thai portions of `word`, copying all other text unchanged.

    The string is split into maximal runs of Thai / non-Thai characters (as
    classified by `_isThaiChar`). Thai runs are passed through
    ``tltk.nlp.th2roman``; every other run is copied as-is, exactly once.
    Afterwards each "<s/>" substring is removed and every character whose
    code point is above 256 is dropped.

    Parameters
    ----------
    word : str
        Text that may mix Thai script with Latin letters, digits and spaces.

    Returns
    -------
    str or None
        The converted text ("" for empty input). If the conversion raises, a
        diagnostic line is printed and None is returned, so callers that
        need a string must handle None.
    """
    try:
        if word == "":
            return ""
        pieces = []
        for isThai, run in groupby(word, key=_isThaiChar):
            segment = "".join(run)
            pieces.append(tltk.nlp.th2roman(segment) if isThai else segment)
        romanized = "".join(pieces).replace("<s/>", "")
        # Drop whatever non-Latin characters are still left after romanization.
        return "".join(ch for ch in romanized if ord(ch) <= 256)
    except Exception as e:
        # Best-effort: report the offending value without touching it again
        # (indexing into `word` here could raise a second exception).
        print(word, " error=", repr(e))
        return None



# Replace every product name with its converted form, element by element.
df["pr_engname"] = df["pr_engname"].apply(convertThaiToRoman)


In [None]:
# Sanity check: run the converter on a string written entirely in Thai script.
convertThaiToRoman("น้ำตาลอิดวลกล่อง")

'namta lo duan klong '

In [None]:
# Spot-check the converted name stored under index label 215972.
print(df["pr_engname"][215972])

namta lo duan klong 


In [None]:
# Keep at most `top_n` rows, taken from the end of the frame, among those whose
# name fits within the embedding model's token budget.
top_n = 1000

encoding = tiktoken.get_encoding(embedding_encoding)

# Token count of each name under the model's encoding.
df["n_tokens"] = df["pr_engname"].map(lambda name: len(encoding.encode(name)))
df = df.loc[df["n_tokens"] <= max_tokens].tail(top_n)
len(df)


363

In [None]:
# Confirm that the name column is a pandas Series.
type(df["pr_engname"])

pandas.core.series.Series

## 2. Get embeddings and save them for future reuse

In [None]:
def th_to_eng_plus_embedding(input_text):
    """Romanize `input_text` with tltk and return the embedding of the result.

    Both the raw input and the romanized text are echoed to stdout. The
    embedding is requested from `get_embedding` using `embedding_model`.
    """
    print(input_text)
    romanized = tltk.nlp.th2roman(input_text)
    print(romanized)
    embedding = get_embedding(romanized, engine=embedding_model)
    return embedding

In [None]:
# A valid OpenAI API key must be configured first; see
# https://github.com/openai/openai-python#usage

# get_embedding is called once per row, so this may take a few minutes.
df["embedding"] = df["pr_engname"].map(
    lambda name: get_embedding(name, engine=embedding_model)
)
df.to_csv("villa_database_small_with_embeddings.csv")


In [None]:
# Display the embedding column.
df["embedding"]

cprcode
225407    [0.0059991516172885895, 0.01071902271360159, 0...
241101    [-0.009435143321752548, -0.00780933303758502, ...
190100    [-0.0004730912041850388, -0.015635056421160698...
62644     [-0.009714074432849884, -0.011211106553673744,...
192167    [0.0028642520774155855, 0.011631874367594719, ...
                                ...                        
51346     [-0.023128684610128403, -0.00698480848222971, ...
171600    [0.0011257551377639174, -0.010720201767981052,...
236423    [-0.009764665737748146, -0.017009418457746506,...
85473     [-0.0010519151110202074, -0.033062975853681564...
209365    [-0.01668184995651245, -0.014079852029681206, ...
Name: embedding, Length: 363, dtype: object

In [None]:
# Code point of a Thai letter; it lies above 256, the cutoff used by the final
# character filter in convertThaiToRoman.
ord("ล")

3621