In [None]:
%pip install -U -q "google-generativeai>=0.8.3"
%pip install --upgrade gensim
%pip install --upgrade vertexai

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
import google.generativeai as genai

Set up your API key

In [5]:
from kaggle_secrets import UserSecretsClient

# Read the key from the Kaggle secret named "GOOGLE_API_KEY" instead of
# hard-coding it in the notebook, then hand it to the google.generativeai client.
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

In [6]:
import tensorflow

In [None]:
# Tokenize the input string data
from tensorflow.keras.preprocessing.text import Tokenizer

data = [
    "The earth is spherical.",
    "The earth is a planet.",
    "I like to eat at a restaurant."
]

# Filter the punctuation, tokenize the words and index them to integers
tokenizer = Tokenizer(num_words=15, filters="!#$%&()*+,-./:;<=>?[\\]^_'{|}~\t\n", lower=True, split=' ')
tokenizer.fit_on_texts(data)

# Translate each sentence into its word-level IDs, then encode those IDs as one
# binary row per sentence (column j is 1 when word ID j occurs in the sentence).
ID_sequences = tokenizer.texts_to_sequences(data)
binary_sequences = tokenizer.sequences_to_matrix(ID_sequences)

print("ID dictionary:\n", tokenizer.word_index)
print("\nID sequences:\n", ID_sequences)

# One-hot encoding is a binary representation of categorical values where the
# presence of a word is represented by 1, and its absence by 0.
# This ensures that the token IDs are treated as categorical values as they are,
# but it results in a sparse vector (mostly zeros) the size of the vocabulary.
print("\nOne-hot encoded sequences:\n", binary_sequences)

Loading and plotting GloVe and Word2Vec embeddings in 2D

In [None]:
from IPython.core.getipython import get_ipython

# Increase the message rate limit
# NOTE(review): iopub_msg_rate_limit / rate_limit_window are NotebookApp (server)
# options; setting them through the kernel's %config magic may have no effect —
# confirm, or pass them when launching the notebook server instead.
get_ipython().run_line_magic('config', "NotebookApp.iopub_msg_rate_limit=5000")
get_ipython().run_line_magic('config', "NotebookApp.rate_limit_window=10.0")

In [None]:
import gensim, matplotlib

In [None]:
import logging

# Set logging to ERROR to suppress warnings/info
# Records are formatted as "<time> : <level> : <message>".
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers; if INFO/WARNING lines still show up, confirm whether force=True is needed.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)


In [None]:
from gensim.models import Word2Vec
import gensim.downloader as api
import pprint
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def t_sne_plt(models, words, seed=23, titles=None):
    """Project ``words`` to 2D with t-SNE and draw one labelled panel per model.

    Args:
        models: sequence of embedding lookups that support ``model[word]``.
        words: words to embed and plot; every word is looked up in every model.
        seed: value passed to TSNE as ``random_state`` for repeatable layouts.
        titles: optional panel titles, one per model. Defaults to
            "Model 1", "Model 2", ...

    Raises:
        ValueError: if fewer than two words are given, or if ``titles`` is
            supplied with a length different from ``models``.
    """
    import inspect

    models = list(models)
    words = list(words)
    if not models:
        # Nothing to draw; avoids creating a zero-width figure.
        return
    if len(words) < 2:
        raise ValueError("t_sne_plt needs at least two words to build a projection")
    if titles is None:
        titles = [f"Model {ix}" for ix in range(1, len(models) + 1)]
    elif len(titles) != len(models):
        raise ValueError("titles must contain exactly one entry per model")

    # scikit-learn rejects perplexity >= n_samples, so cap the original value
    # of 40 for small word lists.
    perplexity = min(40, len(words) - 1)
    # The iteration-count argument was renamed from n_iter to max_iter in newer
    # scikit-learn releases; use whichever name the installed version accepts.
    iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"

    # One row with a panel per model (the previous 10x10 grid left each panel
    # at 1% of a 60x60 inch canvas).
    fig, axes = plt.subplots(1, len(models), figsize=(8 * len(models), 8), squeeze=False)

    for ax, model, title in zip(axes[0], models, titles):
        vectors = np.array([model[word] for word in words])
        tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca',
                          random_state=seed, **{iter_kwarg: 2500})
        coords = tsne_model.fit_transform(vectors)

        # One scatter call per point keeps a distinct colour per word.
        for (x, y), label in zip(coords, words):
            ax.scatter(x, y)
            ax.annotate(label,
                        xy=(x, y),
                        xytext=(5, 2),
                        textcoords='offset points',
                        ha='right',
                        va='bottom')
        ax.set_title(title)

    fig.tight_layout()
    plt.show()

In [None]:
# Download (or reuse the cached copy of) the pretrained Google News word2vec vectors.
# gensim.downloader.load accepts only (name, return_path); the former
# ignore_for_missing keyword is not part of its signature and raised TypeError.
v2w_model = api.load('word2vec-google-news-300', return_path=False)

In [None]:
# Download (or reuse the cached copy of) the pretrained 25-dimensional GloVe Twitter vectors.
# The previous line had an unbalanced ")" (SyntaxError) and passed
# ignore_for_missing, which gensim.downloader.load does not accept.
glove_model = api.load('glove-twitter-25', return_path=False)

In [None]:
print("Words most similar to 'computer' with word2vec and glove respectively")
pprint.pprint(v2w_model.most_similar("computer", topn=3))
pprint.pprint(glove_model.most_similar("computer", topn=3))
print("2d projection of some common words of both models")

# Keep the first 100 words (in word2vec index order) that also appear in the
# GloVe slice. Slicing a set intersection directly gave a different sample on
# every kernel restart because set iteration order of strings is not stable.
glove_vocab_slice = set(glove_model.index_to_key[100:10000])
sample_common_words = [
    word for word in v2w_model.index_to_key[100:10000] if word in glove_vocab_slice
][:100]

In [None]:
 # Draw the t-SNE projection of the shared sample words, one panel per embedding model.
t_sne_plt([v2w_model, glove_model], sample_common_words)

Snippet: Self-supervised training and inference using Doc2Vec on a private corpus

In [None]:
from gensim.test.utils import common_texts, get_tmpfile
# The module is gensim.models.doc2vec (lowercase); "gensim.models.Doc2Vec" is
# not an importable module path.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Train the model on a sequence of documents tagged with their integer IDs
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=8, window=3, min_count=1, workers=6)

# Persist the model to disk, then load it back to infer on a new document
model_file = get_tmpfile("Doc2Vec_v1")
model.save(model_file)
model = Doc2Vec.load(model_file)
model.infer_vector(["human", "interface"])

Creating a Keras model using trainable tfhub layer

In [None]:
# Can switch the embedding to different embeddings from different modalities on
# tfhub. Here we use the BERT model as an example.
import tensorflow as tf
import tensorflow_hub as hub

tfhub_link = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"


class Classifier(tf.keras.Model):
    """Keras model: a trainable TF Hub encoder followed by dropout and a dense head."""

    def __init__(self, num_classes):
        """Build the layers.

        Args:
            num_classes: number of output units of the final Dense layer.
        """
        super(Classifier, self).__init__(name="prediction")
        # trainable=True lets the encoder weights be fine-tuned with the head.
        self.encoder = hub.KerasLayer(tfhub_link, trainable=True)
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.dense = tf.keras.layers.Dense(num_classes)

    def call(self, preprocessed_text):
        """Run the encoder and classify its pooled output.

        Args:
            preprocessed_text: input passed straight to the hub encoder
                (presumably the output of the matching BERT preprocessing
                model — TODO confirm against the caller).

        Returns:
            The Dense layer output; no activation is configured on it.
        """
        encoder_outputs = self.encoder(preprocessed_text)
        pooled_output = encoder_outputs["pooled_output"]
        x = self.dropout(pooled_output)
        x = self.dense(x)
        return x

Using scikit-learn and lshashing for ANN with LSH, KD/Ball-tree and linear search

In [7]:
from sklearn.neighbors import NearestNeighbors
from vertexai.language_models import TextEmbeddingModel
from lshashing import LSHRandom
import numpy as np
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@004")
test_items = [
    "The earth is spherical.",
    "The earth is a planet.",
    "I like to eat at a restaurant.",
]
query = "the shape of earth"
embedded_test_items = np.array([embedding.values for embedding in model.get_embeddings(test_items)])
embedded_query = np.array(model.get_embeddings([query])[0].values)
# kneighbors takes a 2D (n_queries, n_features) array, so wrap the single query once.
query_batch = np.expand_dims(embedded_query, axis=0)

# Naive brute force search
n_neighbors = 2
nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute').fit(embedded_test_items)
naive_distances, naive_indices = nbrs.kneighbors(query_batch)

# algorithm - ball_tree due to high dimensional vectors or kd_tree otherwise
nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(embedded_test_items)
distances, indices = nbrs.kneighbors(query_batch)

# LSH
lsh_random_parallel = LSHRandom(embedded_test_items, 4, parallel=True)
lsh_random_parallel.knn_search(embedded_test_items, embedded_query, n_neighbors, 3, parallel=True)
# output for all 3 indices = [0, 1] , distances [0.66840428, 0.71048843] for the first 2 neighbours
# ANN retrieved the same ranking of items as brute force in a much more scalable manner

ModuleNotFoundError: No module named 'lshashing'

Indexing and executing ANN search with the FAISS library using HNSW

In [None]:
 import faiss
 M = 32  # creating high degree graph: higher recall for larger index & searching time
 # FAISS works on contiguous float32 matrices; the embeddings built earlier are
 # float64 NumPy arrays, so convert explicitly instead of relying on implicit casts.
 faiss_items = np.ascontiguousarray(embedded_test_items, dtype="float32")
 faiss_query = np.ascontiguousarray(np.expand_dims(embedded_query, axis=0), dtype="float32")
 d = faiss_items.shape[1]  # dimensions of the vectors/embeddings, taken from the data
 index = faiss.IndexHNSWFlat(d, M)
 index.add(faiss_items)  # build the index using the embeddings from the ANN cell above
 # execute the ANN search
 index.search(faiss_query, k=2)

Accuracy/speed tradeoffs for various SOTA ANN search algorithms

In [None]:
 import tensorflow as tf
 import tensorflow_recommenders as tfrs
 from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
 # NOTE(review): LM_embed and searchable_docs are not defined in the notebook cells
 # visible here — confirm they are created before this cell runs.
 # Embed documents & query and convert them to tensors and tf.datasets
 embedded_query = tf.constant((LM_embed(query, "RETRIEVAL_QUERY")))
 # Task type was misspelled "RETIREVAL_DOCUMENT"; the intended value is "RETRIEVAL_DOCUMENT".
 embedded_docs = [LM_embed(doc, "RETRIEVAL_DOCUMENT") for doc in searchable_docs]
 embedded_docs = tf.data.Dataset.from_tensor_slices(embedded_docs).enumerate().batch(1)
 # Build index from tensorflow dataset and execute ANN search based on dot product metric
 scann = tfrs.layers.factorized_top_k.ScaNN(
  distance_measure='dot_product',
  num_leaves=4,  # increase for higher number of partitions / latency for increased recall
  num_leaves_to_search=2)  # increase for higher recall but increased latency
 scann = scann.index_from_dataset(embedded_docs)
 scann(embedded_query, k=2)