# Microsoft Tutorial: Explore Azure OpenAI Service embeddings and document search
https://learn.microsoft.com/en-us/azure/cognitive-services/openai/tutorials/embeddings

## Import libraries and list models

In [None]:
# Standard library
import os
import re
import sys

# Third-party
import numpy as np
import openai
import pandas as pd
import requests
import tiktoken
from dotenv import load_dotenv
from num2words import num2words
from openai.embeddings_utils import get_embedding, cosine_similarity

# Load environment variables from the .env file in the current working directory.
# This must run after `import os`; calling it first raises NameError on a fresh kernel.
load_dotenv(os.path.join(os.getcwd(), '.env'))

API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
RESOURCE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")

# Point the openai client at the Azure resource.
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2023-03-15-preview"

# List the deployments on the resource; reuse the configured API version so the
# two values cannot drift apart.
url = openai.api_base + "/openai/deployments?api-version=" + openai.api_version

# A timeout keeps the cell from hanging indefinitely if the endpoint is unreachable.
r = requests.get(url, headers={"api-key": API_KEY}, timeout=30)

print(r.text)

Now we need to read our CSV file and create a pandas DataFrame. After the initial DataFrame is created, we can view the contents of the table by running `df`.

In [None]:
# bill_sum_data.csv is expected in the directory Jupyter is running from
# (the current working directory).
df = pd.read_csv(
    os.path.join(os.getcwd(), 'bill_sum_data.csv')
)
df

The initial table has more columns than we need, so we'll create a new, smaller DataFrame called `df_bills` that contains only the columns for `text`, `summary`, and `title`.

In [None]:
# Keep only the columns used in the rest of the notebook. The explicit .copy()
# makes df_bills an independent DataFrame, so the column assignments performed
# on it later are not treated as writes to a slice of `df`.
df_bills = df[['text', 'summary', 'title']].copy()
df_bills

Next we'll perform some light data cleaning by removing redundant whitespace and cleaning up the punctuation to prepare the data for tokenization.

In [None]:
# Silence pandas' chained-assignment check for the column assignments below.
# See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
pd.set_option("mode.chained_assignment", None)

def normalize_text(s, sep_token = " \n "):
    """Collapse whitespace and tidy stray punctuation before tokenization.

    Args:
        s: Input text to clean.
        sep_token: Not used by the function body; kept so callers that pass
            it continue to work.

    Returns:
        The cleaned string: whitespace runs collapsed to a single space,
        literal ". ," sequences removed, ".." and ". ." reduced to ".",
        and leading/trailing whitespace stripped.
    """
    # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    # After this step no newline can remain, so no separate "\n" removal is needed.
    s = re.sub(r'\s+', ' ', s).strip()
    # Remove literal ". ," sequences. The dot is escaped on purpose: an
    # unescaped "." matches any character and would delete the last character
    # of any word followed by " ," (e.g. "hello , world" -> "hell world").
    s = re.sub(r"\. ,", "", s)
    # Reduce doubled periods, with or without a space between them.
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    # The removals above can leave whitespace exposed at either end.
    return s.strip()

# Clean every bill's text in place of the raw version.
df_bills['text'] = df_bills["text"].apply(normalize_text)

Now we need to remove any bills that are too long for the token limit (8192 tokens).

In [None]:
# Encoding used here to count tokens per document.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Record how many tokens each bill's text encodes to.
df_bills['n_tokens'] = df_bills["text"].apply(
    lambda bill_text: len(tokenizer.encode(bill_text))
)

# Keep only the bills whose token count is strictly below 8192.
df_bills = df_bills[df_bills['n_tokens'] < 8192]
len(df_bills)

We'll once again examine df_bills.

In [None]:
# Display the filtered DataFrame, which now includes the n_tokens column.
df_bills

To understand the n_tokens column a little more, as well as how text is ultimately tokenized, it can be helpful to run the following code:

In [None]:
# Encode the first remaining bill. Use positional .iloc[0] rather than the
# label 0: the index is not reset after the token-limit filter, so label 0
# raises KeyError if that row was dropped.
sample_encode = tokenizer.encode(df_bills.text.iloc[0])
# Turn the token ids back into their individual pieces so each one can be inspected.
decode = tokenizer.decode_tokens_bytes(sample_encode)
decode

If you then check the length of the decode variable, you'll find it matches the first number in the n_tokens column.

In [None]:
# Number of entries in `decode`, i.e. how many tokens the sample text produced.
len(decode)

Now that we understand more about how tokenization works, we can move on to embedding. It is important to note that we haven't actually tokenized the documents yet. The n_tokens column is simply a way of making sure none of the data we pass to the model for tokenization and embedding exceeds the input token limit of 8,192. When we pass the documents to the embeddings model, it will break the documents into tokens similar (though not necessarily identical) to the examples above and then convert the tokens to a series of floating point numbers that will be accessible via vector search. These embeddings can be stored locally or in an Azure Database. As a result, each bill will have its own corresponding embedding vector in the new ada_v2 column on the right side of the DataFrame.

In [None]:
# `engine` should be set to the deployment name you chose when you deployed
# the text-embedding-ada-002 (Version 2) model.
df_bills['ada_v2'] = df_bills["text"].apply(
    lambda bill_text: get_embedding(bill_text, engine='text-embedding-ada-002')
)

In [None]:
# Display df_bills, which now carries the ada_v2 embedding column.
df_bills