<a href="https://colab.research.google.com/github/sugarforever/LangChain-Tutorials/blob/main/langchain_nomic_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U langchain chromadb tiktoken langchain-nomic langchain-openai

In [None]:
# Web page that is loaded, chunked, embedded and queried in the cells below.
POST_URL = "https://blog.nomic.ai/posts/nomic-embed-text-v1"

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Build a loader for the page at POST_URL, then fetch it; `docs` holds
# whatever the loader's `load()` call returns.
loader = WebBaseLoader(POST_URL)
docs = loader.load()

In [None]:
# Sanity check: show how many items the loader returned for POST_URL.
len(docs)

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Chunking parameters handed to the tiktoken-based splitter.
# NOTE(review): 7500 was presumably chosen to keep each chunk inside the
# embedding model's context window -- confirm against the model's limit.
splitter_kwargs = {"chunk_size": 7500, "chunk_overlap": 100}

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(**splitter_kwargs)

# Break the loaded documents into the chunks that get embedded later.
doc_splits = text_splitter.split_documents(docs)

In [None]:
import os
from google.colab import userdata

# Look up the Nomic API key in google.colab's `userdata` store and export it
# as an environment variable of the same name, so the key never appears in
# the notebook source.
# NOTE(review): presumably the Nomic client reads NOMIC_API_KEY from the
# environment -- confirm against the library's documentation.
NOMIC_KEY_NAME = 'NOMIC_API_KEY'
os.environ[NOMIC_KEY_NAME] = userdata.get(NOMIC_KEY_NAME)

# Building blocks for the retrieval chain assembled in the following cells:
# the Chroma vector store, the string output parser, runnable helpers and the
# Nomic embedding class.
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
# NOTE(review): RunnableLambda is not referenced in the visible cells.
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
# NOTE(review): the next two imports both bind the name `NomicEmbeddings`;
# the second one wins, so one of them is redundant.
from langchain_nomic import NomicEmbeddings
from langchain_nomic.embeddings import NomicEmbeddings

In [None]:
# Embedding model used to vectorise each chunk.
nomic_embeddings = NomicEmbeddings(model="nomic-embed-text-v1")

# Build a Chroma store from the chunks, embedding them with the model above.
vectorstore = Chroma.from_documents(documents=doc_splits, embedding=nomic_embeddings)

# Retriever view over the store; it supplies the "context" input of the chain
# defined further down.
retriever = vectorstore.as_retriever()

In [None]:
# Export the OpenAI API key from google.colab's `userdata` store under the
# same environment-variable name (relies on `os` and `userdata` imported in
# an earlier cell).
OPENAI_KEY_NAME = "OPENAI_API_KEY"
os.environ[OPENAI_KEY_NAME] = userdata.get(OPENAI_KEY_NAME)

In [None]:
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt: restricts the model to the retrieved context and appends the
# user's question.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model that produces the answer (temperature 0).
llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")

# Input mapping: the invoked value is routed to the retriever to fill
# "context", and passed through unchanged as "question".
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> fill the prompt -> call the model -> parse the
# model output with StrOutputParser.
chain = chain_inputs | prompt | llm | StrOutputParser()


In [None]:
# Run the chain on a single question; the result is kept in `response`.
question = "How did Nomic Embed get trained?"
response = chain.invoke(question)

In [18]:
# Display the value returned by chain.invoke as the cell output.
response

"Nomic Embed was trained using a multi-stage contrastive learning pipeline. The process began with a BERT initialization, specifically training their own BERT model with a 2048 token context length, named nomic-bert-2048. This model incorporated several modifications inspired by MosaicBERT, such as using Rotary Position Embeddings for context length extrapolation, employing SwiGLU activations for improved performance, setting dropout to 0, and implementing various training optimizations like using Deepspeed and FlashAttention, training in BF16 precision, increasing the vocab size to a multiple of 64, training with a large batch size of 4096, and masking at a 30% rate during masked language modeling.\n\nAfter establishing the nomic-bert-2048, the next phase involved contrastive training with a dataset composed of approximately 235 million text pairs. This dataset was extensively validated for quality during collection with Nomic Atlas. The details of the dataset can be found in the nomi