In [None]:
import os # Our API key is stored in environment variables
from dotenv import load_dotenv # To load environment variables from a .env file
load_dotenv()  # Read key/value pairs from a local .env file into os.environ

# The .env file stores the key as OPEN_AI_KEY; fall back to the standard
# variable name in case the key is already exported in the shell.
OpenAI_api_key = os.getenv("OPEN_AI_KEY") or os.getenv("OPENAI_API_KEY")

# langchain_openai reads the key from OPENAI_API_KEY, so mirror it under that
# name. Never assign a literal key in the notebook: a hardcoded value overrides
# the one loaded from .env and risks committing a secret with the file.
if OpenAI_api_key:
    os.environ["OPENAI_API_KEY"] = OpenAI_api_key
else:
    print("No OpenAI API key found: add OPEN_AI_KEY=<your key> to your .env file.")

### OpenAI Embeddings

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding client shared by the cells below; every request it makes asks for
# the "text-embedding-3-small" model.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [None]:
### Single text embedding

single_text = "langchain is a framework for developing applications powered by language models"

# embed_query turns one string into one embedding vector (a list of floats).
single_text_embedding = embeddings.embed_query(single_text)

# Show only a short preview: printing the whole vector floods the cell output
# without adding any information for the reader.
print(f"First 5 values: {single_text_embedding[:5]}")
print(f"Output: vector of {len(single_text_embedding)} dimensions")




In [None]:
### Example 2: Multiple text embeddings

# Seven independent sentences. Every item needs its own trailing comma:
# adjacent string literals without one are silently concatenated by Python,
# which would merge two sentences into a single list entry.
multiple_texts = [
    "The Cat Sat on the mat",
    "The dog sat on the log",
    "Cats and dogs are great pets",
    "I love to play football",
    "Football is a great sport",
    "Python is a programming language",
    "I Love coding in Python",
]

In [None]:
# embed_documents embeds the whole list and returns one vector per input text.
multiple_texts_embeddings = embeddings.embed_documents(multiple_texts)

# Preview a few values of the first vector instead of dumping every embedding,
# which would bury the summary lines below under thousands of floats.
print(f"First 5 values of the first embedding: {multiple_texts_embeddings[0][:5]}")

print("------------------------------------------------------")
print(f"Number of texts: {len(multiple_texts)}")
print(f"Number of embeddings generated: {len(multiple_texts_embeddings)}")
print(f"Each embedding size is of dimension: {len(multiple_texts_embeddings[0])}")


### Cosine Similarity With OpenAI Embeddings

In [7]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Results close to 1 indicate high similarity,
    while results close to -1 indicate low similarity/opposite direction.
    0 indicates orthogonality (no similarity).

    Args:
        vec1: First vector (any sequence of numbers or NumPy array).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The cosine of the angle between the two vectors, or 0.0 when either
        vector has zero length (the angle is undefined in that case).
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)

    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-length vector would cause a 0/0 division, yielding NaN plus a
    # RuntimeWarning; report "no similarity" instead.
    if norm_product == 0:
        return 0.0

    return np.dot(vec1, vec2) / norm_product

In [None]:
from langchain_openai import OpenAIEmbeddings

# Build the embedding client for the similarity examples; it requests the
# "text-embedding-3-small" model.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)
embeddings  # bare final expression so the notebook displays the client

In [8]:
# Sentences compared pairwise below. Every item needs its own trailing comma:
# adjacent string literals without one are silently concatenated by Python,
# which would merge two sentences into a single list entry.
sentences = [
    "The Cat Sat on the mat",
    "The dog sat on the log",
    "Cats and dogs are great pets",
    "I love to play football",
    "Football is a great sport",
    "Python is a programming language",
    "I Love coding in Python",
]


In [None]:
# One embedding vector per sentence, in the same order as `sentences`.
sentences_embeddings = embeddings.embed_documents(sentences)

# Display (number of vectors, vector dimension) rather than the raw vectors,
# which would flood the cell output with thousands of floats.
(len(sentences_embeddings), len(sentences_embeddings[0]))

In [None]:
# Score every unordered pair of sentences exactly once: the inner index always
# starts one past the outer index, so (i, j) is visited but (j, i) is not.
for i, first_vector in enumerate(sentences_embeddings):
    for j in range(i + 1, len(sentences_embeddings)):
        similarity = cosine_similarity(first_vector, sentences_embeddings[j])
        print(f"Cosine Similarity between '{sentences[i]}' and '{sentences[j]}': {similarity:.4f}")


In [None]:
# Semantic search example: the query to answer and the small corpus that is
# searched for the documents most similar to it.
query = "What is langchain?"

documents = [
    "The cat sat on the mat.",
    "Dogs are great pets.",
    "I love playing football.",
    "Python is a programming language.",
    "Coding in Python is fun.",
    "langchain is a framework for developing applications powered by language models",
]



In [None]:
def semantic_search(query, documents, embeddings, top_k=3):
    """Rank documents by cosine similarity to a query.

    Args:
        query: Text to search for.
        documents: List of document texts to rank.
        embeddings: Object providing ``embed_query(text)`` and
            ``embed_documents(texts)``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(document_index, similarity)`` tuples, ordered from most
        to least similar and truncated to ``top_k`` entries. The index refers
        to the position of the document in ``documents``.
    """
    # Nothing to rank: skip the embedding calls entirely.
    if not documents:
        return []

    query_embedding = embeddings.embed_query(query)
    document_embeddings = embeddings.embed_documents(documents)

    similarities = [
        (i, cosine_similarity(query_embedding, doc_emb))
        for i, doc_emb in enumerate(document_embeddings)
    ]

    # Sort on the similarity score (second tuple element). A plain tuple sort
    # would order by document index first and ignore the score.
    ranked = sorted(similarities, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

In [None]:
# Run the search with the default top_k and keep the ranked results for the
# printing cell below.
results = semantic_search(
    query=query,
    documents=documents,
    embeddings=embeddings,
)
results  # bare final expression so the notebook displays the results

In [None]:
print("Top similar documents:")

print(f"\n Semantic Search results for the query: '{query}'")
# semantic_search returns (document_index, similarity) pairs, so unpack them
# in that order and look the document text up by its index before printing.
for doc_index, score in results:
    print(f"Score: {score:.3f} | {documents[doc_index]} ")