In [1]:
import numpy as np

In [2]:
def load_glove_embeddings(file_path, embedding_dim=None):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its
    space-separated float components.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded embeddings text file.
    embedding_dim : int, optional
        Number of float components per line. When given, each line is split
        from the right on single spaces exactly ``embedding_dim`` times, so a
        token that itself contains whitespace is kept intact. When omitted
        (default), lines are split on any whitespace, as before.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D ``float32`` numpy array.

    Raises
    ------
    ValueError
        If a component on some line cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank (e.g. trailing) line has no token; indexing values[0]
            # on it would raise IndexError, so skip it.
            if not line.strip():
                continue
            if embedding_dim is None:
                values = line.split()
            else:
                # Right-anchored split keeps whitespace inside the token.
                values = line.rstrip().rsplit(' ', embedding_dim)
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index
    
# Path to the downloaded GloVe file.
# NOTE(review): this is a machine-specific absolute path; adjust it (or make
# it configurable) before running the notebook elsewhere.
glove_file_path = 'C:\\Users\\singh\\OneDrive\\Desktop\\Transformer\\Embedding_model\\glove.42B.300d.txt'
embeddings_index = load_glove_embeddings(glove_file_path)
# A space after "Loaded" keeps the count from being glued to the word.
print(f"Loaded {len(embeddings_index)} word vectors.")

Loaded1917495 word vectors.


In [3]:
import re

# Simple tokenization and preprocessing
def preprocess_document(doc):
    """Lowercase ``doc``, drop punctuation, and split it into tokens.

    Every character that is neither a word character (``\\w``) nor
    whitespace is removed; the remainder is split on runs of whitespace.

    Returns a list of token strings (empty for an empty document).
    """
    without_punctuation = re.sub(r'[^\w\s]', '', doc.lower())
    return without_punctuation.split()

# Sample sentence used for the rest of the notebook; `tokens` is the
# lowercased, punctuation-free word list produced by preprocess_document.
document = "The cat sat on the mat. The mat was blue."
tokens = preprocess_document(document)
print("Tokens:", tokens)

Tokens: ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'mat', 'was', 'blue']


In [4]:
# One row per token, one column per embedding component.
embedding_dim = 300  # GloVe embedding dimension
embedding_matrix = np.zeros((len(tokens), embedding_dim))

# Copy each known token's vector into its row; tokens missing from the
# embeddings index keep their all-zero row.
for i, word in enumerate(tokens):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i, :] = embedding_vector

print("Embedding Matrix:\n", embedding_matrix)

Embedding Matrix:
 [[-2.08379999e-01 -1.49320006e-01 -1.75279994e-02 ... -5.40660024e-01
   2.11989999e-01 -9.43570025e-03]
 [ 2.11099997e-01  2.17629999e-01 -5.26380002e-01 ... -1.50759995e-01
  -2.40640000e-01 -5.13649993e-02]
 [-5.25780022e-01 -2.27190003e-01  5.18419981e-01 ... -6.55960012e-03
   7.91649967e-02  9.64530036e-02]
 ...
 [ 8.89369994e-02  1.92560002e-01  2.95569986e-01 ... -3.15770000e-01
   1.69609994e-01 -3.88570011e-01]
 [-4.21999991e-02 -4.44139994e-04  5.28949983e-02 ... -8.49580020e-02
  -1.19060002e-01 -4.98100013e-01]
 [-1.69119999e-01 -7.01619983e-01 -2.12990001e-01 ... -2.15650007e-01
   7.91009981e-03 -4.21869993e-01]]


In [5]:
import chromadb

# Initialize Chroma DB client
client = chromadb.Client()

# Delete existing collection if it exists, so re-running this cell starts
# from an empty collection instead of failing on create.
collection_name = 'word_embeddings'
# Depending on the chromadb release, list_collections() yields either
# Collection objects or plain name strings; normalise to names so the
# membership test works in both cases.
existing_names = {getattr(entry, 'name', entry) for entry in client.list_collections()}
if collection_name in existing_names:
    client.delete_collection(name=collection_name)

# Create a new collection
try:
    collection = client.create_collection(name=collection_name)
except Exception as e:
    print(f"An error occurred: {str(e)}")
    # Later cells use `collection`; re-raise rather than continue with it
    # undefined and fail there with a confusing NameError.
    raise

In [6]:

# Add embeddings to the collection
# A single batched add() replaces one call per token. Ids are the token
# positions as strings, and each entry carries its word as metadata.
# Keyword arguments make the id/embedding/metadata pairing explicit.
if tokens:  # nothing to add for an empty document
    collection.add(
        ids=[str(i) for i in range(len(tokens))],
        embeddings=embedding_matrix[:len(tokens)].tolist(),
        metadatas=[{'word': word} for word in tokens],
    )

print(f"Stored {len(tokens)} embeddings in Chroma DB.")

Stored 10 embeddings in Chroma DB.


In [7]:
# Look up the stored entries closest to the embedding of a single word.
query_word = 'cat'
query_embedding = embeddings_index[query_word].tolist()

# One query embedding is submitted, so the results for it sit at index 0.
results = collection.query(query_embeddings=[query_embedding])
words, distances = results['metadatas'][0], results['distances'][0]

# Each metadata entry is the {'word': ...} dict stored alongside the vector.
# NOTE(review): the distance metric is whatever the collection was created
# with (presumably Chroma's default) — confirm before interpreting values.
for word, distance in zip(words, distances):
    print(f"Word: {word['word']}, Distance: {distance}")

Word: cat, Distance: 0.0
Word: blue, Distance: 56.634735107421875
Word: mat, Distance: 61.52752685546875
Word: mat, Distance: 61.52752685546875
Word: the, Distance: 66.46363067626953
Word: the, Distance: 66.46363067626953
Word: the, Distance: 66.46363067626953
Word: was, Distance: 67.02333068847656
Word: on, Distance: 71.3979263305664
Word: sat, Distance: 72.578369140625
