## Setup and Import Libraries

In [1]:
import os
import numpy as np
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load variables from a local .env file into the process environment
# (presumably including OPENAI_API_KEY, which the next cell reads — confirm).
load_dotenv()

True

In [3]:
# The key is expected to already be in the environment (e.g. via load_dotenv()).
# Re-assigning os.environ[...] = os.getenv(...) is a no-op when the key exists and
# raises an opaque "TypeError: str expected, not NoneType" when it does not, so
# validate it explicitly and fail fast with an actionable message instead.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [4]:
# Embedding client reused by the embedding and search cells in this notebook;
# "text-embedding-3-small" is the model name handed to OpenAIEmbeddings.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

## Cosine Similarity with OpenAI

In [5]:
# Sample sentences for the pairwise comparison: two paraphrases about a cat resting,
# one about a dog, and two about Python programming, so that related and unrelated
# pairs can be contrasted.
sentences = [
    "The cat sat on a mat",
    "A feline rested on the rug",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favourite programming language"
]

In [6]:
def cosine_similarity(vector1, vector2):
    """
    Compute the cosine similarity (cosine of the angle) between two vectors.

    - Result close to 1: Very similar
    - Result close to 0: Not related
    - Result close to -1: Opposite meaning

    Args:
        vector1: 1-D sequence or array of numbers.
        vector2: 1-D sequence or array of numbers with the same length
            as ``vector1``.

    Returns:
        The similarity, nominally in the range [-1, 1]. Returns 0.0 when
        either vector has zero magnitude: the angle is undefined in that
        case and the division would otherwise produce NaN.
    """
    dot_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)

    # Guard against 0/0 for all-zero vectors instead of silently returning NaN,
    # which would poison any later sorting or comparison of scores.
    if norm_product == 0:
        return 0.0

    return dot_product / norm_product

In [7]:
# Embed all sentences in one call; the result is indexed by sentence position
# in the comparison loop further down.
sentence_embeddings = embeddings.embed_documents(texts=sentences)

# NOTE(review): this bare expression prints every float of every vector. Consider
# displaying only a summary, e.g. len(sentence_embeddings) and len(sentence_embeddings[0]).
sentence_embeddings

[[-0.02106035128235817,
  -0.04786085709929466,
  0.003587598679587245,
  0.009219637140631676,
  0.036406755447387695,
  -0.008623342029750347,
  -0.006657534744590521,
  0.031715027987957,
  0.01819027215242386,
  -0.008754395879805088,
  0.03638054430484772,
  -0.004308394622057676,
  0.04746769741177559,
  0.05535713583230972,
  0.0279930979013443,
  0.03373325616121292,
  0.003374636173248291,
  0.001869155210442841,
  -0.06778103858232498,
  0.048961710184812546,
  0.030116170644760132,
  -0.04309049993753433,
  -0.001421115011908114,
  0.011834160424768925,
  0.002740663243457675,
  0.020601661875844002,
  -0.015451246872544289,
  -0.015556089580059052,
  0.006392150651663542,
  0.011375471949577332,
  0.01600167341530323,
  -0.0358039066195488,
  -0.026800507679581642,
  -0.05132067948579788,
  -0.037743501365184784,
  -0.0004191674815956503,
  -0.01631620153784752,
  -0.012627036310732365,
  -0.04282839223742485,
  -0.024402223527431488,
  -0.03486032038927078,
  0.00098699913

In [8]:
# Score every unordered pair of sentences exactly once (first index < second index).
for first_idx, first_sentence in enumerate(sentences):
    for second_idx in range(first_idx + 1, len(sentences)):
        pair_score = cosine_similarity(
            sentence_embeddings[first_idx],
            sentence_embeddings[second_idx],
        )

        print(f"'{first_sentence}' vs '{sentences[second_idx]}'")
        print(f"Similarity Score: {pair_score:.3f}\n")

'The cat sat on a mat' vs 'A feline rested on the rug'
Similarity Score: 0.670

'The cat sat on a mat' vs 'The dog played in the yard'
Similarity Score: 0.299

'The cat sat on a mat' vs 'I love programming in Python'
Similarity Score: 0.087

'The cat sat on a mat' vs 'Python is my favourite programming language'
Similarity Score: 0.124

'A feline rested on the rug' vs 'The dog played in the yard'
Similarity Score: 0.296

'A feline rested on the rug' vs 'I love programming in Python'
Similarity Score: 0.055

'A feline rested on the rug' vs 'Python is my favourite programming language'
Similarity Score: 0.104

'The dog played in the yard' vs 'I love programming in Python'
Similarity Score: 0.126

'The dog played in the yard' vs 'Python is my favourite programming language'
Similarity Score: 0.083

'I love programming in Python' vs 'Python is my favourite programming language'
Similarity Score: 0.711



## Semantic Search

In [9]:
# Small corpus for the semantic-search demo: four statements about AI/programming
# topics plus one unrelated sentence about the weather.
documents = [
    "LangChain is a framework for developing applications powered by language models",
    "Python is a high-level programming language",
    "Machine Learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm"
]

In [10]:
# Natural-language question used as the search query in the cells that follow.
query = "What is LangChain?"

In [13]:
def semantic_search(query, documents, embedding_model, top_k=3):
    """
    Rank documents by semantic similarity to a query.

    Args:
        query: Text to search for.
        documents: Indexable sequence of document texts to rank.
        embedding_model: Object exposing ``embed_query(text)`` and
            ``embed_documents(texts)``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(similarity, document)`` tuples ordered from most to
        least similar, truncated to ``top_k`` entries. An empty list is
        returned when ``documents`` is empty.
    """
    # Nothing to rank: skip both embedding calls instead of sending an
    # empty batch to the embedding model.
    if len(documents) == 0:
        return []

    query_embedding = embedding_model.embed_query(query)
    document_embeddings = embedding_model.embed_documents(documents)

    # Pair each document with its similarity to the query.
    similarities = [
        (cosine_similarity(vector1=query_embedding, vector2=document_embedding), document)
        for document, document_embedding in zip(documents, document_embeddings)
    ]

    # Sort on the score only. Sorting whole tuples would fall back to comparing
    # the documents themselves when scores tie; with a key, tied documents keep
    # their input order (list.sort stays stable with reverse=True).
    similarities.sort(key=lambda scored: scored[0], reverse=True)
    return similarities[:top_k]


In [14]:
# top_k is left at its default of 3, so at most three (score, document) pairs come back.
results = semantic_search(query=query, documents=documents, embedding_model=embeddings)
results

[(np.float64(0.6654946461195727),
  'LangChain is a framework for developing applications powered by language models'),
 (np.float64(0.14358671415045782),
  'Python is a high-level programming language'),
 (np.float64(0.09862885942956513),
  'Machine Learning is a subset of artificial intelligence')]

In [16]:
# Header line followed by one "score | document" row per hit, in ranked order.
print(f"\n🔎 Semantic Search for query: '{query}'")
for similarity_score, document in results:
    print(f"Score: {similarity_score:.3f} | {document}")


🔎 Semantic Search for query: 'What is LangChain?'
Score: 0.665 | LangChain is a framework for developing applications powered by language models
Score: 0.144 | Python is a high-level programming language
Score: 0.099 | Machine Learning is a subset of artificial intelligence


In [17]:
# Repeat the search with a different question and print the ranked hits.
query = "What is embeddings?"
results = semantic_search(
    query=query,
    documents=documents,
    embedding_model=embeddings,
)

print(f"\n🔎 Semantic Search for query: '{query}'")
for similarity_score, document in results:
    print(f"Score: {similarity_score:.3f} | {document}")


🔎 Semantic Search for query: 'What is embeddings?'
Score: 0.610 | Embeddings convert text into numerical vectors
Score: 0.261 | LangChain is a framework for developing applications powered by language models
Score: 0.234 | Machine Learning is a subset of artificial intelligence
