In [None]:
# !pip install sentence-transformers torch
# !pip install scikit-learn

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1",
                            trust_remote_code=True)

In [3]:
texts = [
    "Deep learning models require large datasets.",
    "Neural networks work well with lots of data.",
    "I like eating pizza."
]

embeddings = model.encode(
    texts,
    normalize_embeddings=True  
)

print(embeddings.shape)  

(3, 768)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(embeddings)

print(similarity_matrix)

[[1.0000002  0.62945104 0.24597904]
 [0.62945104 0.9999998  0.32259136]
 [0.24597904 0.32259136 1.0000002 ]]


In [None]:
query = "How do neural networks learn from data?"
documents = [
    "Neural networks need large datasets to perform well.",
    "Pizza is my favorite food.",
    "Transformers are used in modern NLP models."
]

query_embedding = model.encode(query, normalize_embeddings=True)
doc_embeddings = model.encode(documents, normalize_embeddings=True)

scores = cosine_similarity([query_embedding], doc_embeddings)[0]

for doc, score in zip(documents, scores):
    print(f"{score:.3f} → {doc}")


0.657 → Neural networks need large datasets to perform well.
0.254 → Pizza is my favorite food.
0.445 → Transformers are used in modern NLP models.


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-large-en-v1.5")
documents = [
    "Neural networks require large datasets.",
    "Transformers are widely used in NLP.",
    "Pizza is a popular Italian food."
]

doc_embeddings = model.encode(
    ["Represent this document for retrieval: " + doc for doc in documents],
    normalize_embeddings=True
)

In [9]:
query = "How do neural networks learn?"

query_embedding = model.encode(
    "Represent this query for retrieval: " + query,
    normalize_embeddings=True
)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

scores = cosine_similarity([query_embedding], doc_embeddings)[0]

for doc, score in zip(documents, scores):
    print(f"{score:.3f} → {doc}")


0.817 → Neural networks require large datasets.
0.702 → Transformers are widely used in NLP.
0.456 → Pizza is a popular Italian food.


In [None]:
model = SentenceTransformer("BAAI/bge-m3")
query = "How does machine learning work?"

documents = [
    "Machine learning models learn from data.",      # English
    "ಯಂತ್ರ ಕಲಿಕೆ ಡೇಟಾದಿಂದ ಕಲಿಯುತ್ತದೆ.",             # Kannada
    "மெஷின் லெர்னிங் தரவிலிருந்து கற்றுக்கொள்கிறது"  # Tamil
]

query_emb = model.encode(
    "Represent this query for retrieval: " + query,
    normalize_embeddings=True
)

doc_embs = model.encode(
    ["Represent this document for retrieval: " + d for d in documents],
    normalize_embeddings=True
)

scores = cosine_similarity([query_emb], doc_embs)[0]

for doc, score in zip(documents, scores):
    print(f"{score:.3f} → {doc}")

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
sentences = [
    "Deep learning models require large datasets.",
    "Neural networks learn from data.",
    "I love eating pizza."
]
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(embeddings)

print(similarity)

embeddings = model.encode(
    sentences,
    normalize_embeddings=True
)

print(embeddings.shape)


In [None]:
from sentence_transformers import InputExample

train_examples = [
    InputExample(
        texts=["What is machine learning?", "Machine learning is a field of AI"],
        label=1.0
    ),
    InputExample(
        texts=["I love pizza", "Neural networks are powerful"],
        label=0.0
    )
]
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

train_dataloader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=8
)

train_loss = losses.CosineSimilarityLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=10
)