## Open Text Embeddings

### [LangChain Embeddings](https://python.langchain.com/en/latest/reference/modules/embeddings.html)

#### Hugging Face Embeddings

In [None]:
%%bash
pip install --upgrade sentence-transformers

Successfully installed nltk-3.8.1 scikit-learn-1.2.2 scipy-1.10.1 sentence-transformers-2.2.2 sentencepiece-0.1.97 threadpoolctl-3.1.0


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Print the full help text for the HuggingFaceEmbeddings class.
help(HuggingFaceEmbeddings)

In [None]:
# Print the help text for the constructor only.
help(HuggingFaceEmbeddings.__init__)

Help on function __init__ in module langchain.embeddings.huggingface:

__init__(self, **kwargs: Any)
    Initialize the sentence_transformer.



In [None]:
# Build an embeddings object with the model name given explicitly. The
# instance is not bound to a name, so it is only shown as this cell's result.
HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

In [None]:
import os

# Configure tokenizer parallelism before the tokenizer is first used. Without
# an explicit setting, every later %%bash cell (which forks the kernel
# process) prints the huggingface/tokenizers "process just got forked"
# warning, and that warning itself asks for this variable to be set.
# setdefault keeps any value that is already present in the environment.
os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')

# Default-constructed wrapper: no model name is passed here.
hf_embeddings = HuggingFaceEmbeddings()

# Prepare the sample text ("This is a test document." in Chinese).
text = '这是一个测试文档。'

# Generate embeddings with HuggingFaceEmbeddings: once through the query
# API and once through the document API (a one-element list of documents).
query_result = hf_embeddings.embed_query(text)
doc_result = hf_embeddings.embed_documents([text])

# Length of the query embedding.
print(len(query_result))

# Number of document embeddings, then the length of the first one.
print(len(doc_result))
print(len(doc_result[0]))

768
1
768


In [None]:
# Show which model the default-constructed embeddings object uses.
hf_embeddings.model_name

'sentence-transformers/all-mpnet-base-v2'

In [None]:
%%bash
# List the cached model files (-l long format, -a include hidden entries, -h human-readable sizes).
ls -lah ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 865816
drwxr-xr-x  16 saintway  staff   512B Apr 12 14:31 .
drwxr-xr-x   3 saintway  staff    96B Apr 12 14:31 ..
-rw-r--r--   1 saintway  staff   1.1K Apr 12 14:28 .gitattributes
drwxr-xr-x   3 saintway  staff    96B Apr 12 14:28 1_Pooling
-rw-r--r--   1 saintway  staff    10K Apr 12 14:28 README.md
-rw-r--r--   1 saintway  staff   571B Apr 12 14:28 config.json
-rw-r--r--   1 saintway  staff   116B Apr 12 14:28 config_sentence_transformers.json
-rw-r--r--   1 saintway  staff    38K Apr 12 14:28 data_config.json
-rw-r--r--   1 saintway  staff   349B Apr 12 14:31 modules.json
-rw-r--r--   1 saintway  staff   418M Apr 12 14:31 pytorch_model.bin
-rw-r--r--   1 

In [None]:
%%bash
# Report the total on-disk size of the cached model directory (-s summary only, -h human-readable).
du -sh ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
423M	/Users/saintway/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2


#### [Hugging Face Instruct Embeddings](https://huggingface.co/datasets/calmgoose/book-embeddings)
* https://github.com/UKPLab/sentence-transformers
* https://github.com/HKUNLP/instructor-embedding

In [None]:
%%bash
pip install --upgrade InstructorEmbedding

Successfully installed InstructorEmbedding-1.0.0


* https://github.com/basujindal/chatPDF

In [None]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
# Build the instruct-embeddings wrapper for the hkunlp/instructor-large model.
hfi_embeddings = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-large')

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
# Show the model name the instruct-embeddings object was configured with.
hfi_embeddings.model_name

'hkunlp/instructor-large'

In [None]:
%%bash
# List the cached model files (-l long format, -a include hidden entries, -h human-readable sizes).
ls -lah ~/.cache/torch/sentence_transformers/hkunlp_instructor-large

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 2640208
drwxr-xr-x  15 saintway  staff   480B Apr 12 15:19 .
drwxr-xr-x   4 saintway  staff   128B Apr 12 15:19 ..
-rw-r--r--   1 saintway  staff   1.4K Apr 12 15:07 .gitattributes
drwxr-xr-x   3 saintway  staff    96B Apr 12 15:07 1_Pooling
drwxr-xr-x   4 saintway  staff   128B Apr 12 15:08 2_Dense
-rw-r--r--   1 saintway  staff    65K Apr 12 15:08 README.md
-rw-r--r--   1 saintway  staff   1.5K Apr 12 15:08 config.json
-rw-r--r--   1 saintway  staff   122B Apr 12 15:08 config_sentence_transformers.json
-rw-r--r--   1 saintway  staff   461B Apr 12 15:19 modules.json
-rw-r--r--   1 saintway  staff   1.2G Apr 12 15:19 pytorch_model.bin
-rw-r--r

In [None]:
%%bash
# Report the total on-disk size of the cached model directory (-s summary only, -h human-readable).
du -sh ~/.cache/torch/sentence_transformers/hkunlp_instructor-large

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
1.3G	/Users/saintway/.cache/torch/sentence_transformers/hkunlp_instructor-large


In [None]:
# Prepare the sample text ("This is a test document." in Chinese).
text = '这是一个测试文档。'

# Generate embeddings with HuggingFaceInstructEmbeddings: once through the
# query API and once through the document API (a one-element list of documents).
query_result = hfi_embeddings.embed_query(text)
doc_result = hfi_embeddings.embed_documents([text])

# Number of leading components to display. Printing the complete vectors
# floods the cell output without adding information.
preview_len = 8

# Length of the query embedding, followed by a short preview of its values.
print(len(query_result))
print(query_result[:preview_len])

# Number of document embeddings, the length of the first one, and a short
# preview of each.
print(len(doc_result))
print(len(doc_result[0]))
print([vector[:preview_len] for vector in doc_result])

768
[-0.022137142717838287, -0.019943105056881905, 0.009940845891833305, 0.029961414635181427, 0.0015559268649667501, -0.0010082109365612268, 0.004636477679014206, 0.006970031186938286, -0.039788346737623215, 0.028241422027349472, -1.5192752471193671e-05, -0.008512390777468681, 0.04590446129441261, 0.03056621551513672, -0.030894720926880836, -0.02884022891521454, -0.023664429783821106, -0.010090871714055538, -0.036661747843027115, -0.001970992423593998, 0.05847157910466194, 0.008038687519729137, -0.012776742689311504, 0.05411699786782265, 0.01262636948376894, 0.016430772840976715, -0.04767526313662529, 0.01811787858605385, 0.04832480102777481, -0.0647105798125267, 0.03377210721373558, -0.04854683578014374, -0.040563128888607025, -0.04772289842367172, -0.018774421885609627, 0.020985594019293785, 0.025719504803419113, 0.027344582602381706, 0.026014933362603188, 0.055159278213977814, -0.01577085256576538, 0.01060266699641943, -0.0031603227835148573, -0.039208076894283295, 0.03614024817943