In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Toy "vectors": every toy is described by two hand-picked numbers.
# Keeping name and vector side by side guarantees the two stay aligned.
toy_catalog = {
    "Red Car": [1, 0],
    "Red Truck": [1, 0.2],
    "Green Ball": [0, 1],
    "Green Apple": [0, 0.9],
    "Red Balloon": [1, 0.8],
}

# Row i of `toys` is the vector of `toy_names[i]` (dicts keep insertion order).
toy_names = list(toy_catalog)
toys = np.array(list(toy_catalog.values()))

# The new toy we want to compare (Red Small Car): red and vehicle-like,
# i.e. the same vector as "Red Car". Shaped (1, 2) so it is a single-row matrix.
query = np.array([[1, 0]])

In [None]:
# Cosine similarity of the query against every toy.
# The result has one row per query; [0] picks the row of our single query.
similarities = cosine_similarity(query, toys)[0]

# Indices of the 3 highest scores, best match first
# (ascending argsort, reversed, then truncated).
top_indices = np.argsort(similarities)[::-1][:3]

In [None]:
# Report each of the top matches together with its similarity score.
print("Toys most similar to your query:")
for toy_idx in top_indices:
    name, score = toy_names[toy_idx], similarities[toy_idx]
    print(f"{name} (Score: {score:.2f})")

In [2]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# os.environ["OMP_NUM_THREADS"] = "1"  # <- prevents OpenMP conflict

In [3]:
# Load variables from a local .env file; override=True lets values from the
# file replace ones already set in the environment.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Sanity-check the key and print a hint about the most likely problem.
# The whitespace check must come before the prefix check: a key with leading
# whitespace also fails startswith("sk-proj-"), and would otherwise be
# reported with the less helpful "doesn't start sk-proj-" message.
if not api_key:
    print("No API key was found!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-;")
else:
    print("API key found and looks good so far!")

# No key is passed explicitly; the client picks up OPENAI_API_KEY from the environment.
openai = OpenAI()

API key found and looks good so far!


In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` as a 1-D float32 NumPy array.

    Args:
        text: The string to embed. It is sent to the API as a one-element list.
        model: Name of the embedding model. Defaults to
            "text-embedding-3-small" (fast & cheap, 1536 dimensions), so
            existing one-argument calls behave exactly as before.

    Returns:
        np.ndarray with dtype float32 built from the first item of the response.
    """
    response = openai.embeddings.create(input=[text], model=model)
    # Exactly one input was sent, so the embedding we want is the first data item.
    return np.array(response.data[0].embedding, dtype='float32')

In [None]:
# 📝 Sample documents
documents = [
    "How to write a great resume for software engineering.",
    "Tips for improving your LinkedIn profile.",
    "Best practices for technical interviews.",
    "How to cook perfect pasta.",
    "Building projects to get hired as a frontend developer."
]

In [None]:
# Convert each document into a vector
# One embedding per document, stacked into a single 2-D array (one row per document).
# Note: this issues one API request per document.
doc_vectors = np.array(list(map(get_embedding, documents)))

In [None]:
# The question to match against the documents, embedded by the same helper
# that produced the document vectors.
query = "How can I improve my resume for tech jobs?"
query_vector = get_embedding(text=query)
print(query_vector)

In [None]:
# Shape the embedding as a single-row float32 matrix (1, n_dims); it is passed
# to cosine_similarity in a later cell as a 2-D array.
# (A second, identical reshape/cast that used to follow the print was a no-op
# and has been dropped.)
query_vector = query_vector.reshape(1, -1).astype('float32')
print(query_vector)

# Sanity checks: the vector must be finite and have exactly one row.
print("Any NaNs?", np.isnan(query_vector).any())
print("Any Infs?", np.isinf(query_vector).any())
print("Shape:", query_vector.shape)

In [None]:
# dimension = 1536
# index = faiss.IndexFlatL2(dimension)
# index.add(np.vstack(doc_vectors))  # doc_vectors is list of 1536-dim float32 vectors
# print("FAISS index dimension:", index.d)
# print("Query vector shape:", query_vector.shape)


In [None]:
# Search for 3 most similar documents
# distances, indices = index.search(query_vector, k=3)

In [None]:
# print("Top results for query:")
# for i in indices[0]:
#     print(f"- {documents[i]}")

In [None]:
# Score every document against the query (higher means more similar).
# [0] selects the score row belonging to our single query vector.
similarities = cosine_similarity(query_vector, doc_vectors)[0]

# Indices of the three best-scoring documents, best first.
top_k_indices = np.argsort(similarities)[::-1][:3]

# Show the winners together with their scores.
print("Top results for query:")
for doc_idx in top_k_indices:
    doc_text, score = documents[doc_idx], similarities[doc_idx]
    print(f"- {doc_text} (Score: {score:.4f})")

# Using Vector Database

In [7]:
import openai
import numpy as np
import faiss

In [8]:
# Load variables from a local .env file; override=True lets values from the
# file replace ones already set in the environment.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Sanity-check the key and print a hint about the most likely problem.
# The whitespace check must come before the prefix check: a key with leading
# whitespace also fails startswith("sk-proj-"), and would otherwise be
# reported with the less helpful "doesn't start sk-proj-" message.
if not api_key:
    print("No API key was found!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-;")
else:
    print("API key found and looks good so far!")

# `import openai` in the cell above bound this name to the module; from here on
# `openai` refers to a client instance, which is what get_embedding calls.
openai = OpenAI()

API key found and looks good so far!


In [None]:
# Step 1: Embed documents
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` as a 1-D float32 NumPy array.

    Args:
        text: The string to embed. It is sent to the API as a one-element list.
        model: Name of the embedding model. Defaults to
            "text-embedding-3-small", so existing one-argument calls behave
            exactly as before.

    Returns:
        np.ndarray with dtype float32 built from the first item of the response.
    """
    response = openai.embeddings.create(input=[text], model=model)
    # Exactly one input was sent, so the embedding we want is the first data item.
    return np.array(response.data[0].embedding, dtype='float32')

In [9]:
# Corpus for the vector-database example
documents = [
    "Tips to improve a tech resume.",
    "Guide to cracking software engineering interviews.",
    "How to write a better LinkedIn summary.",
    "Best programming languages to learn in 2025."
]

# One embedding per document, stacked into a single 2-D array (one row per document)
doc_vectors = np.array(list(map(get_embedding, documents)))

In [None]:
# Step 2: Build a FAISS inner-product index (IndexFlatIP, not an L2 index).
# Inner product equals cosine similarity only for unit-length vectors, so the
# vectors are L2-normalized before they are added.
dimension = doc_vectors.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product == Cosine once vectors are normalized

# Normalize document vectors in place (doc_vectors itself is modified), then index them
faiss.normalize_L2(doc_vectors)
index.add(doc_vectors)

In [5]:
# Step 3: Embed the query as a single-row matrix and scale it to unit length,
# matching the normalization applied to the indexed document vectors.
query = "How can I improve my resume for tech jobs?"
query_vector = np.reshape(get_embedding(query), (1, -1))
faiss.normalize_L2(query_vector)  # in place

In [6]:
# Step 4: Search top 3 similar documents.
# Both the indexed vectors and the query were L2-normalized, so the inner-product
# scores returned by the IndexFlatIP index are cosine similarities.
top_k = 3
distances, indices = index.search(query_vector, top_k)

# Step 5: Display results (row 0 belongs to our single query)
print("Top results for query:")
for idx, score in zip(indices[0], distances[0]):
    # FAISS pads the result with label -1 when the index holds fewer than
    # top_k vectors; documents[-1] would silently print the last document.
    if idx < 0:
        continue
    print(f"- {documents[idx]} (Score: {score:.4f})")

Top results for query:
- Tips to improve a tech resume. (Score: 0.8663)
- How to write a better LinkedIn summary. (Score: 0.4745)
- Guide to cracking software engineering interviews. (Score: 0.4208)
