In [None]:
# Prints a fixed greeting; presumably a quick check that the kernel executes cells.
print("Hello")

In [None]:
## Load data
#A text file has been prepared as the source document for the downstream embedding task.

#Now, let's download and load it using LangChain's `TextLoader`.

!wget "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/i5V3ACEyz6hnYpVq6MTSvg/state-of-the-union.txt"


In [None]:
pip install langchain_community

In [None]:
from langchain_community.document_loaders import TextLoader

In [None]:
# Read the downloaded text file into LangChain documents.
# The encoding is given explicitly so the file is decoded the same way on every platform
# instead of depending on the locale's default encoding.
loader = TextLoader("state-of-the-union.txt", encoding="utf-8")
data = loader.load()
# Preview the beginning of the text instead of dumping the entire document into the output.
data[0].page_content[:500]

## Split data
Since the embedding model has a maximum input token limit, you cannot input the entire document at once. Instead, you need to split it into chunks.

The following code shows how to use LangChain's `RecursiveCharacterTextSplitter` to split the document into chunks.
- Use the default separator list, which is `["\n\n", "\n", " ", ""]`.
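- Chunk size is set to `100`. Because the length function is `len`, this is measured in characters; choose a value small enough that each chunk stays below the model's maximum number of input tokens.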
- Chunk overlap is set to `20`.
- The length function is `len`.


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# No separators are passed, so the splitter's defaults apply.
# Chunk size and overlap are measured with `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, length_function=len)

In [None]:
# Take the text of the first loaded document and cut it into chunks.
document_text = data[0].page_content
chunks = text_splitter.split_text(document_text)
# Number of chunks produced.
len(chunks)

In [None]:
# Show only the first few chunks; displaying the whole list floods the notebook output.
chunks[:5]

## Hugging Face embedding model

### Model description

In this section, we will use the `all-mpnet-base-v2` model from Hugging Face as an example embedding model.

It is a sentence-transformers model. It maps sentences and paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search. It is based on the pre-trained `microsoft/mpnet-base` model, fine-tuned on a dataset of 1B sentence pairs. For more information, please refer to the [model card](https://huggingface.co/sentence-transformers/all-mpnet-base-v2).


### Build model
To build the model, you first need to import the `HuggingFaceEmbeddings` class.


In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Identifier of the sentence-transformers model described in the section above.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Embedding object used by the query- and document-embedding cells that follow.
huggingface_embedding = HuggingFaceEmbeddings(model_name=model_name)

In [None]:
### Query embeddings
# Create the embedding of a query sentence with the Hugging Face embedding model.
# NOTE(review): the original comment called this "the same sentence"; its earlier use is not visible in this section.

query = "How are you?"
# `embed_query` receives a single string; the result is presumably a flat list of floats.
query_result = huggingface_embedding.embed_query(query)
# Preview only the first five components rather than the whole vector.
query_result[:5]

In [None]:
# Length of the query embedding; the model description above states 768 dimensions.
len(query_result)

In [None]:
### Document embeddings
# Embed every chunk; `doc_result` holds one embedding per chunk, in the same order as `chunks`.
doc_result = huggingface_embedding.embed_documents(chunks)
# A cell displays only its last expression, so the preview and the length are combined
# into one tuple. Previously the preview line was evaluated and silently discarded.
# Shown: (first five components of the first chunk's embedding, that embedding's length).
doc_result[0][:5], len(doc_result[0])