# Embedding Tutorial
- [Introduction to Text Embeddings](https://www.datacamp.com/tutorial/introduction-to-text-embeddings-with-the-open-ai-api) (note: the tutorial's code is outdated and does not run with current versions of the `openai` package)

## Embedding Models
- [tiktoken - Byte pair encoding](https://github.com/openai/tiktoken/tree/main?tab=readme-ov-file)
- [Sentence Transformer](https://www.sbert.net/examples/applications/computing-embeddings/README.html)
- [OpenAI Embedding Model](https://platform.openai.com/docs/guides/embeddings)

In [19]:
import ast
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from umap import UMAP

# Load the variables defined in the local ".env" file; the OpenAI client cell
# below reads OPENAI_API_KEY from os.environ.
load_dotenv(".env")

True

In [43]:
# Download the musical-instrument reviews and keep only the review text.
data_URL = "https://raw.githubusercontent.com/keitazoumana/Experimentation-Data/main/Musical_instruments_reviews.csv"

reviews_all = pd.read_csv(data_URL)[['reviewText']]
print("Data shape: {}".format(reviews_all.shape))

# Work on a 100-row sample. The fixed random_state makes the sample (and every
# result derived from it) reproducible across kernel restarts.
review_df = reviews_all.sample(100, random_state=42)

# Cache the sample for the embedding cell. Create the folder first because
# to_pickle does not create missing parent directories.
os.makedirs('data', exist_ok=True)
review_df.to_pickle('data/review_raw.pkl')

# The pickle keeps the original row labels; only the in-memory frame is
# re-indexed to 0..99 for display.
review_df = review_df.reset_index(drop=True)
display(review_df.head())

Data shape: (10261, 1)


Unnamed: 0,reviewText
0,I use it with home recording and it is great f...
1,"Great picks, not just for Jazz though. I play ..."
2,It's hard to really describe what this pedal d...
3,Some people are not comfortable using the dryw...
4,"this stand is VERY GOOD ! nice and strong , i ..."


In [None]:
# --- OpenAI client ---
# Passing api_key explicitly is optional: reading OPENAI_API_KEY from the
# environment is also the client's default behaviour.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# --- tiktoken byte-pair encoder ---
enc = tiktoken.get_encoding("o200k_base")

# --- Sentence Transformer ---
# all-MiniLM-L6-v2: 256 input sequence length -> 384-dimensional dense output.
model = SentenceTransformer("all-MiniLM-L6-v2")

# By default the provided methods use a limit of 256 word pieces; longer
# inputs are truncated.
print(model.max_seq_length)

256


In [45]:
def get_embedding(text_to_embed, model="text-embedding-ada-002", api_client=None):
    """Return the OpenAI embedding of a single piece of text.

    Args:
        text_to_embed: The text to embed.
        model: Name of the OpenAI embedding model to use.
        api_client: Optional object exposing ``embeddings.create(model=, input=)``.
            Defaults to the notebook-level ``client``.

    Returns:
        The embedding of ``text_to_embed`` as a list of floats.
    """
    if api_client is None:
        api_client = client

    response = api_client.embeddings.create(model=model, input=[text_to_embed])

    # The openai>=1.0 client returns a typed response object rather than a dict,
    # so the embedding is reached through attributes; response["data"] would
    # raise "TypeError: ... object is not subscriptable".
    return response.data[0].embedding

In [None]:
# Reload the cached review sample and embed a smaller subset of it.
df = pd.read_pickle('data/review_raw.pkl')
df = df.sample(25, random_state=42)  # fixed seed -> the same 25 reviews on every run

# Convert to str once and reuse it for all three representations.
review_texts = df["reviewText"].astype(str)
df["tiktoken_embedding"] = review_texts.apply(enc.encode)  # token ids, not a dense vector
df["ada_embedding"] = review_texts.apply(get_embedding)    # one OpenAI API call per review
df["st_embedding"] = review_texts.apply(model.encode)

# Length of each representation: the number of tokens for tiktoken, and the
# vector dimension for the two embedding models.
df["tiktoken_count"] = df["tiktoken_embedding"].apply(len)
df["ada_count"] = df["ada_embedding"].apply(len)
df["st_count"] = df["st_embedding"].apply(len)
df.head()

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Cluster the OpenAI embeddings and project them to 2-D for plotting.
# NOTE: the embeddings live in df["ada_embedding"]; review_df only holds
# "reviewText", so review_df["embedding"] raises a KeyError.
embeddings = df["ada_embedding"].tolist()

# Fixed seeds keep the cluster assignment and the 2-D layout stable across runs.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(embeddings)

reducer = UMAP(random_state=42)
embeddings_2d = reducer.fit_transform(embeddings)

# Cast the labels to str so plotly draws one discrete colour per cluster
# instead of a continuous colour bar.
fig = px.scatter(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    color=kmeans.labels_.astype(str),
)
fig.show()

In [None]:
# Embeddings are stored in the CSV as stringified Python lists. They are parsed
# with ast.literal_eval instead of eval: literal_eval only accepts literals, so
# a tampered CSV cannot execute arbitrary code.
df = pd.read_csv('output/embedded_1k_reviews.csv')
matrix = np.array(df.ada_embedding.apply(ast.literal_eval).to_list())

# Create a t-SNE model and project the embeddings down to two dimensions.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(matrix)

# One colour per Score value (assumes Score takes the values 1-5, so that
# Score - 1 indexes this list). "turquoise" must be spelled correctly:
# matplotlib rejects unknown colour names.
colors = ["red", "darkorange", "gold", "turquoise", "darkgreen"]
color_indices = df.Score.values - 1
colormap = matplotlib.colors.ListedColormap(colors)

fig, ax = plt.subplots()
ax.scatter(vis_dims[:, 0], vis_dims[:, 1], c=color_indices, cmap=colormap, alpha=0.3)
ax.set_title("Amazon ratings visualized in language using t-SNE")
plt.show()

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (any array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of NaN so downstream
        comparisons keep working.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid the 0/0 division (NaN plus a RuntimeWarning) for zero vectors.
        return 0.0
    return np.dot(a, b) / denominator

ModuleNotFoundError: No module named 'openai.embeddings_utils'

In [None]:
# Zero-shot sentiment classification via label embeddings.
# `openai.embeddings_utils` was removed in openai>=1.0 (importing it raises
# ModuleNotFoundError), so this cell relies on the `get_embedding` and
# `cosine_similarity` helpers defined in earlier cells of this notebook.

# Drop rows with Score == 3 (treated as neutral here) and map the remaining
# scores to a binary sentiment. .copy() makes the filtered frame independent,
# so adding a column does not trigger SettingWithCopyWarning.
df = df[df.Score != 3].copy()
df['sentiment'] = df.Score.replace({1: 'negative', 2: 'negative', 4: 'positive', 5: 'positive'})

labels = ['negative', 'positive']
label_embeddings = [get_embedding(label) for label in labels]


def label_score(review_embedding, label_embeddings):
    """Score a review embedding against the two label embeddings.

    Args:
        review_embedding: Embedding vector of the review text.
        label_embeddings: Sequence whose item 0 is the 'negative' label
            embedding and item 1 the 'positive' label embedding.

    Returns:
        Similarity to the positive label minus similarity to the negative
        label; a value above 0 means the review is closer to 'positive'.
    """
    positive_similarity = cosine_similarity(review_embedding, label_embeddings[1])
    negative_similarity = cosine_similarity(review_embedding, label_embeddings[0])
    return positive_similarity - negative_similarity


# The review has to be embedded first: label_score compares vectors, not raw text.
sample_review_embedding = get_embedding('Sample Review')
prediction = 'positive' if label_score(sample_review_embedding, label_embeddings) > 0 else 'negative'