In [11]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [2]:
# Toy corpus: 20 one-sentence descriptions of AI companies. Each string is one
# "chunk" that gets embedded and scored against the query further down.
chunks = [
    "OpenAI, creator of ChatGPT, is renowned for its advancements in natural language processing and ethical AI development.",
    "DeepMind, a subsidiary of Alphabet, has made significant breakthroughs in AI through projects like AlphaGo and its healthcare research.",
    "IBM’s Watson became famous for its ability to defeat human champions in the game show 'Jeopardy!' and is now applied in industries like healthcare and finance.",
    "Nvidia, initially known for its graphics cards, now plays a crucial role in AI hardware with its powerful GPUs for deep learning processes.",
    "Palantir Technologies specializes in big data analytics, providing AI-driven solutions to government and financial sectors for complex data environments.",
    "Baidu, often referred to as the 'Google of China,' invests heavily in AI, focusing on autonomous driving and natural language processing.",
    "Salesforce uses AI to enhance CRM through Einstein, an integrated AI that automates data entry, predicts sales, and personalizes customer interactions.",
    "Boston Dynamics, known for its robotic designs, integrates AI into robots that can navigate complex environments and perform tasks autonomously.",
    "H2O.ai provides an open-source platform for machine learning that helps businesses easily apply AI to solve real-world problems like risk analysis.",
    "UiPath designs software robots that use AI to automate repetitive office tasks, improving efficiency in business operations.",
    "SenseTime excels in computer vision technology, developing applications for facial recognition, surveillance, and autonomous driving in Asia.",
    "Zoox, a subsidiary of Amazon, focuses on creating autonomous vehicles fully powered by AI, aiming for a safer and more efficient transportation system.",
    "Tempus Labs uses AI to analyze clinical data and molecular data to personalize cancer treatments, contributing to advancements in precision medicine.",
    "C3.ai provides AI software applications for enterprises to accelerate digital transformation, focusing on industries like energy and aerospace.",
    "Element AI, based in Canada, was acquired by ServiceNow to enhance its AI capabilities in IT service management and workflow automation.",
    "BenevolentAI uses artificial intelligence to accelerate drug discovery and development, reducing the time and cost typically associated with these processes.",
    "Neuralink, co-founded by Elon Musk, aims to develop ultra-high bandwidth brain-machine interfaces to connect humans and computers.",
    "Waymo, another Alphabet subsidiary, leads in developing self-driving technology, aiming to make roads safer and transportation more accessible.",
    "Zebra Medical Vision uses AI to read medical imaging, helping doctors detect diseases earlier with higher accuracy.",
    "Vicarious focuses on developing artificial general intelligence with robots that can improve their efficiency over time through machine learning."
]


In [3]:
# Sentence encoder used for both the corpus chunks and the query.
model = SentenceTransformer(model_name_or_path='paraphrase-MiniLM-L6-v2')

In [4]:
# Encode every corpus chunk; the result holds one embedding row per input string.
embeddings = model.encode(sentences=chunks)
embeddings.shape

(20, 384)

In [5]:
# Embed the search query. Passing a one-element list keeps the result 2-D
# (a single row), so it lines up with the corpus embedding matrix.
query = model.encode(sentences=["Elon Musk"])
query.shape

(1, 384)

In [6]:
# Score every chunk against the query: raw (unnormalised) dot products, then abs().
# NOTE(review): abs() makes a strongly negative score rank as high as a positive
# one — confirm this is intended before relying on the ordering.
simlarity = np.abs(query @ embeddings.T)

In [7]:
# Build the "key": a permutation of the embedding dimensions.
# A seeded Generator replaces the unseeded global np.random call so that
# Restart & Run All reproduces the same key and the same shuffled embeddings.
# NOTE(review): if this permutation is meant to act as a secret, a fixed seed
# is only appropriate for a demo — confirm before reusing elsewhere.
rng = np.random.default_rng(seed=42)
key = rng.permutation(embeddings.shape[1])

# Apply the identical column shuffle to the query and to every chunk embedding.
enc_query = query[:, key]
enc_embeddings = embeddings[:, key]

# Shuffling both operands the same way only reorders the terms of each dot
# product, so these scores should equal the unshuffled ones up to float rounding.
enc_similarity = np.abs(enc_query.dot(enc_embeddings.T))

enc_similarity

array([[ 2.8616154 ,  4.6624846 ,  0.8129965 ,  6.7930264 ,  1.3005363 ,
         0.90208423,  2.079529  ,  2.2654073 ,  0.23510405,  2.0436993 ,
         0.06035659,  3.2013414 ,  0.2884987 ,  0.24981087,  2.8758595 ,
         1.2916597 , 15.49442   ,  3.331945  ,  6.759242  ,  1.0686808 ]],
      dtype=float32)

In [10]:
# Element-wise check that the shuffled scores match the originals within 1e-3.
# The difference is wrapped in abs(): without it, any negative gap (however
# large) would satisfy `< 0.001` and be reported as a match.
np.abs(simlarity - enc_similarity) < 0.001

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True]])

In [28]:
# Side-by-side summary: each chunk with its original and shuffled embedding
# (rounded for display) and the two similarity scores.
# The score arrays have a single row, so indexing row 0 yields the per-chunk column.
df = pd.DataFrame(
    {
        'Data': chunks,
        'Embeddings': list(embeddings.round(2)),
        'Salty Embeddings': list(enc_embeddings.round(2)),
        'Similarity': simlarity[0].round(2),
        'Salty Similarity': enc_similarity[0].round(2),
    }
)

# Show the five best-scoring chunks.
df.sort_values(by='Similarity', ascending=False).head(5)

Unnamed: 0,Data,Embeddings,Salty Embeddings,Similarity,Salty Similarity
16,"Neuralink, co-founded by Elon Musk, aims to de...","[-0.4, -0.49, -0.15, -0.17, -0.19, -0.17, -0.2...","[0.16, -0.02, 0.63, -0.03, -0.01, 0.06, -0.28,...",15.49,15.49
3,"Nvidia, initially known for its graphics cards...","[-0.21, -0.38, -0.64, -0.04, 0.12, 0.16, -0.3,...","[0.05, -0.16, 0.47, 0.21, 0.21, 0.19, 0.14, 0....",6.79,6.79
18,Zebra Medical Vision uses AI to read medical i...,"[-0.14, -0.32, -0.39, -0.15, -0.03, 0.09, 0.33...","[-0.16, -0.04, 0.68, 0.32, 0.4, -0.03, 0.45, 0...",6.76,6.76
1,"DeepMind, a subsidiary of Alphabet, has made s...","[-0.31, -0.18, -0.28, -0.32, -0.02, 0.05, -0.5...","[0.24, 0.09, 0.5, 0.5, 0.03, -0.36, -0.08, 0.3...",4.66,4.66
17,"Waymo, another Alphabet subsidiary, leads in d...","[0.03, -0.01, 0.1, -0.14, -0.22, -0.27, -0.12,...","[-0.27, -0.21, 0.18, 0.69, 0.02, -0.49, -0.19,...",3.33,3.33
