# Embedding Generation

In this notebook we will do the following:

- Copy the dataset over from the previous Notebook
- Use the Dataset along with Vertex PaLM API to generate Embeddings
- Save the Embeddings to a Storage Bucket
- Create a Matching Engine Index
- Create a Matching Engine Endpoint
- Deploy the Index to an Index Endpoint
- Create BigTable Table
- Save the Data to the Table

In [None]:
# Authenticate with your Google Cloud account.
# Uses the `google.colab` auth helper, so this cell expects to run inside Colab.
from google.colab import auth as google_auth
google_auth.authenticate_user()

In [None]:
# Install the needed libraries
# Restart Runtime after this
# NOTE(review): only google-cloud-happybase is version-pinned; the other two
# packages install whatever version pip resolves at run time.
! pip install google-cloud-aiplatform

! pip install google-cloud-bigtable

! pip install google-cloud-happybase==0.33.0



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting google-cloud-aiplatform
  Using cached google_cloud_aiplatform-1.25.0-py2.py3-none-any.whl (2.6 MB)
Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3 (from google-cloud-aiplatform)
  Using cached google_cloud_resource_manager-1.10.0-py2.py3-none-any.whl (321 kB)
Collecting shapely<2.0.0 (from google-cloud-aiplatform)
  Using cached Shapely-1.8.5.post1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.0 MB)
Collecting grpc-google-iam-v1<1.0.0dev,>=0.12.4 (from google-cloud-resource-manager<3.0.0dev,>=1.3.3->google-cloud-aiplatform)
  Using cached grpc_google_iam_v1-0.12.6-py2.py3-none-any.whl (26 kB)
Installing collected packages: shapely, grpc-google-iam-v1, google-cloud-resource-manager, google-cloud-aiplatform
  Attempting uninstall: shapely
    Found existing installation: shapely 2.0.1
    Uninstalling shapely-2.0.1:
      Successfully uninstalled shapel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting google-cloud-bigtable
  Using cached google_cloud_bigtable-2.18.1-py2.py3-none-any.whl (292 kB)
Installing collected packages: google-cloud-bigtable
Successfully installed google-cloud-bigtable-2.18.1


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting google-cloud-happybase==0.33.0
  Using cached google_cloud_happybase-0.33.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: google-cloud-happybase
Successfully installed google-cloud-happybase-0.33.0


--------------------------------------------------------------------------------
!!! RESTART RUNTIME !!!
--------------------------------------------------------------------------------

In [None]:
from typing_extensions import ClassVar
# Notebook-wide configuration values.
PROJECT_ID = "demogct2022" #@param
# Passed as `location` to vertexai.init
Region = "us-central1" #@param
#BUCKET where Wikipedia CSV file is stored
BUCKET_TO_SAVE= f"wikipedia_strings_{PROJECT_ID}"#@param
# Bucket name passed to upload_blob for the generated embeddings file
BUCKET_EMBEDDINGS = f"wikipedia_embeddings_{PROJECT_ID}"#@param
# Local file name of the dataset that is read with pandas
CSV_FILE = "wikipedia_strings.csv" #@param








In [None]:
#import the needed packages
import pandas as pd
import numpy as np
import os
import json
import vertexai
from vertexai.preview.language_models import TextEmbeddingModel, TextGenerationModel
from google.cloud import storage


In [None]:
# Initialize Vertex AI
# Uses the project and region set in the configuration cell.
vertexai.init(project=PROJECT_ID,location=Region)


In [None]:
# Download the file locally

def download_blob(bucket_name, source_blob_name, destination_file_name):
  """Download a blob from a Cloud Storage bucket to a local file.

  Args:
    bucket_name: Name of the bucket that holds the blob.
    source_blob_name: Name of the blob inside the bucket.
    destination_file_name: Local path the blob is written to.

  Prints whether the local file exists after the download attempt.
  Errors raised by the storage client propagate to the caller.
  """
  storage_client = storage.Client(project=PROJECT_ID)
  # Use the arguments rather than the notebook globals so the helper works
  # for any bucket/blob/destination the caller asks for.
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(source_blob_name)
  blob.download_to_filename(destination_file_name)

  if os.path.exists(destination_file_name):
    print(f"File {destination_file_name} downloaded successfully")
  else:
    print(f"File {destination_file_name} does not exist")



def upload_blob(bucket_name, source_file_name, destination_blob_name, REGION = "us-central1", PROJECT_ID="demogct2022"):
  """Upload a local file to a Cloud Storage bucket, creating the bucket if needed.

  Args:
    bucket_name: Name of the target bucket.
    source_file_name: Local path of the file to upload.
    destination_blob_name: Name the blob gets inside the bucket.
    REGION: Location used when the bucket has to be created.
    PROJECT_ID: Project the storage client is created for.

  Prints progress messages, and finally whether the destination blob exists.
  Errors other than "bucket not found" propagate to the caller.
  """
  client = storage.Client(project=PROJECT_ID)

  # lookup_bucket returns None only when the bucket is missing, so permission
  # or network errors are no longer mistaken for "bucket does not exist".
  bucket = client.lookup_bucket(bucket_name)
  if bucket is None:
    print(f"Creating bucket {bucket_name}")
    bucket = client.bucket(bucket_name)
    bucket.create(location=REGION)
    print(f"Bucket {bucket_name} created.")
  else:
    print(f"Bucket {bucket_name} exists.")

  # Upload only when the local file is actually there
  if os.path.exists(source_file_name):
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f"Blob {destination_blob_name} uploaded to {bucket_name}.")
  else:
    print(f"The {source_file_name} file does not exist.")
    print ("Please save the dataframe in the above frames")

  # Report the final state of the destination blob in the requested bucket
  stats = storage.Blob(bucket=bucket, name = destination_blob_name).exists(client)
  if stats:
    print(f"Blob gs://{bucket_name}/{destination_blob_name} exists.")
  else:
    print(f"Blob gs://{bucket_name}/{destination_blob_name} does not exist.")



In [None]:
# Read the Wikipedia CSV with pandas.
# On a fresh runtime the file is not on local disk yet, so fetch it from the
# bucket first; an existing local copy is reused as-is.
if not os.path.exists(CSV_FILE):
  download_blob(BUCKET_TO_SAVE, CSV_FILE, CSV_FILE)
df = pd.read_csv(CSV_FILE)
# NOTE(review): the recorded output of this cell lists a `text` column, while
# later cells read `wikipedia_string` -- confirm the column name in the CSV.
df.head()

Unnamed: 0.1,Unnamed: 0,text
0,0,Natalie Spooner\n\n{{short description|Canadia...
1,1,Natalie Spooner\n\n==Playing career==\n\n\n
2,2,Natalie Spooner\n\n==Playing career==\n\n===Ho...
3,3,Natalie Spooner\n\n==Playing career==\n\n===Mi...
4,4,Natalie Spooner\n\n==Playing career==\n\n===PW...


In [None]:
# Load the Text Embedding Model
# `model` is read as a global by gen_embeddings.

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [None]:
#Given a list of strings, generate embeddings
#Since the API only supports a max of 5 strings, we will batch it up
def gen_embeddings( wikipedia_strings, batch_size=5):
  """Request embeddings for a list of strings in fixed-size batches.

  Args:
    wikipedia_strings: Sequence of strings to embed.
    batch_size: Maximum number of strings sent per request (default 5).

  Returns:
    A list with one entry per batch; each entry is whatever
    `model.get_embeddings` returned for that batch.

  Raises:
    ValueError: If batch_size is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")

  allembeddings = []
  total = len(wikipedia_strings)
  # Step over start offsets so the final, shorter batch is included too;
  # stepping over end offsets skipped it whenever total was not a multiple
  # of the batch size.
  for start in range(0, total, batch_size):
    end = min(start + batch_size, total)
    print(end)  # progress: number of strings processed so far
    allembeddings.append(model.get_embeddings(wikipedia_strings[start:end]))
  return allembeddings

#Flatten the list
def flatten(allembeddings):
  """Collapse batched embedding results into a single flat list.

  Args:
    allembeddings: Iterable of batches, each an iterable of objects that
      expose a `.values` attribute.

  Returns:
    A list holding the `.values` of every item, in batch order.
  """
  return [item.values for batch in allembeddings for item in batch]

# Generate wikipedia list from dataframe
def generate_wiki_list (df, column="wikipedia_string"):
  """Return the text column of the dataframe as a plain Python list.

  Args:
    df: DataFrame that holds the text to embed.
    column: Name of the text column (defaults to "wikipedia_string").

  Returns:
    The column's values as a list, in row order.

  Raises:
    KeyError: If `column` is not present in `df`.
  """
  # Pull the whole column at once instead of one df.iloc[i] lookup per row
  return df[column].tolist()

# Tie all the above methods to generate Embeddings from the df
def generate_embeddings(df):
  """Produce a flat list of embedding vectors for the dataframe.

  Chains the helpers: dataframe -> list of strings -> batched embeddings ->
  flattened list.
  """
  batched = gen_embeddings(generate_wiki_list(df))
  return flatten(batched)


In [None]:
# Generate Embeddings
# This will take some time to run
# The result is the flat list returned by generate_embeddings.
allEmbeddings = generate_embeddings(df)



In [None]:
#Generate a DF Embeddings
# medf pairs each embedding vector with the text it was generated from.
medf = pd.DataFrame()
medf['embeddings'] = allEmbeddings
# Copied from the source dataframe; pandas aligns this Series to medf by index label.
medf['wikipedia_string'] = df['wikipedia_string']
medf.head()



In [None]:
# Persist the text + embeddings frame as CSV. DataFrame has no `save` method;
# `to_csv` is the CSV writer.
# NOTE(review): the `fname` cell further down uses this same file name, so
# gen_emb_csv will overwrite this file -- confirm whether both are wanted.
medf.to_csv("wikipedia_string_w_embeddings.csv")


In [None]:
# Generate a CSV file with id and Embeddings
def gen_emb_csv(medf, csvfname):
  """Write one line per row of `medf.embeddings` to `csvfname`.

  Each line is the row number, a comma, and the text of the embedding with
  its first and last character (the list brackets) removed.

  Args:
    medf: DataFrame with an `embeddings` column.
    csvfname: Path of the file to write; an existing file is overwritten.
  """
  vectors = medf.embeddings
  with open(csvfname, "w") as out:
    for row_id in range(vectors.size):
      # str(list) looks like "[a, b, c]"; drop the surrounding brackets
      vector_text = str(vectors[row_id])[1:-1]
      out.write(f"{row_id},{vector_text}\n")

In [None]:
#Filename for CSV to be saved
# Used both as the local file written by gen_emb_csv and as the blob name for the upload.
fname = "wikipedia_string_w_embeddings.csv"#@param


In [None]:
#Generate the embedding file
# Writes one line per row of medf: the row number followed by the embedding values.
gen_emb_csv(medf,fname)

In [None]:

# Check that the file has been generated, then pass it to upload_blob with
# BUCKET_EMBEDDINGS as the bucket name. Nothing happens when the file is missing.
if  os.path.exists(fname):
  print(f"File {fname} exists.")
  upload_blob(BUCKET_EMBEDDINGS,fname, fname)

# Save the Text Data to BigTable

In this section we will upload the text into BigTable so that it can be used later on.
*******************************************************************************
Assumption
*******************************************************************************
The Next Section Assumes that you have already created a BigTable Instance and Table using the Console.
 * [BigTable](https://cloud.google.com/bigtable/docs/creating-instance)

 Make sure you note the Table Name and Instance ID from the console; they are created outside this demo. Because of Colab permission limitations and the added complexity, we have kept that setup outside the notebook.

 Assuming you have the above values, please proceed to the next section.


In [None]:
from google.cloud import bigtable
from google.cloud import happybase
import pandas as pd

# Bigtable settings; PROJECT_ID is assigned again here with the same value as in the first configuration cell.
PROJECT_ID = "demogct2022" #@param
INSTANCE_ID = "bus-instance" #@param
TABLE_ID = "masterdata" #@param
# Column family used to build the column name for the stored text
COLUMN_FAMILY_NAME = "cf1" #@param


In [None]:
#Connect to the BigTable
#Define the column family
#Define the column name format
client = bigtable.Client(project=PROJECT_ID, admin=True)
instance = client.instance(INSTANCE_ID)
# The happybase connection is built on the Bigtable instance; `table` is the handle used for writes
connection = happybase.Connection(instance=instance)
table = connection.table(TABLE_ID)
column_family_name = COLUMN_FAMILY_NAME
# Column name in "<family>:content" form
column_name = "{fam}:content".format(fam=column_family_name)

In [None]:
# Load the file again in case the earlier dataframe is no longer available.
# `if not df` cannot be used here: it raises NameError when df is undefined
# and ValueError (ambiguous truth value) when df is a DataFrame.
if "df" not in globals() or df is None:
  df = pd.read_csv(CSV_FILE)


# Row keys have the form wikipedia_string#0001.
# BigTable keys are in lexical order, so the index is zero-padded to keep
# the rows ordered correctly.
def get_row_key(index, max_size):
   """Return the row key for `index`, zero-padded to `max_size` digits."""
   padded_index = str(index).zfill(max_size)
   return f"wikipedia_string#{padded_index}"

# Function to put data to a table
def putTable(table, column_name, wiki_text, df_index, df_num_digits):
  """Write one text value into the table and return the row key used.

  Args:
    table: Object with a `put(row_key, data)` method.
    column_name: Column name string; UTF-8 encoded before writing.
    wiki_text: Text to store; UTF-8 encoded before writing.
    df_index: Row number used to build the row key.
    df_num_digits: Zero-pad width for the row key.

  Returns:
    The row key the value was written under.
  """
  key = get_row_key(df_index, df_num_digits)
  encoded_cell = {column_name.encode("utf-8"): wiki_text.encode("utf-8")}
  table.put(key, encoded_cell)
  print ("Data Entered")
  return key



In [None]:

# Load all the data into the table.
# This should be run once only.
import traceback

# Zero-pad width for the row keys. df.size counts rows x columns, so the
# width is at least as wide as the largest row index.
max_size = len(str(df.size))

# Read the same `wikipedia_string` column the embedding cells use; the
# attribute `wikipedia_strings` (plural) does not match it.
# NOTE(review): confirm this matches the column name in the CSV.
for index, row in enumerate(df["wikipedia_string"]):
  try:
    putTable(table, column_name, row, index, max_size)
  except Exception as e:
    # Best effort: report the failing row and carry on with the next one
    print("error" , e)
    traceback.print_exc()

This concludes the section where we have finished
* Generating Embeddings
* Saving the file to BigTable

In [None]:
# In the next section we