In [14]:
import spacy
from FlagEmbedding import FlagModel
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [15]:
# helper functions
def remove_newline(text):
# remove newline characters, "\n", from the text
# text: list of paragraphs in the text
    
    for i in range(len(text)):
        text[i] = " ".join(text[i].split())

    return text


### Read the text

In [21]:
file_path = "../data/LesMiserables.txt"

In [17]:
# read the file into one long string "text"
with open(file_path, "r") as text_file:
    text = text_file.read()

In [18]:
# split text by paragraphs 
text_paragraphs = text.split("\n\n")

In [22]:
len(text_paragraphs)

14558

In [23]:
# using the helper function "remove_newline" to eliminate "\n" characters from the text

text_paragraphs = remove_newline(text_paragraphs)

In [26]:
text_paragraphs[0]

'The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.'

### Sentence segmentation using spaCy

In [29]:
# Load pretrained English Language Model to separate the text into sentences
nlp = spacy.load('en_core_web_sm') 

In [30]:
# create an object "doc" by creating an instance of the nlp class for one paragraph
# when applying nlp to a text, spaCy tokenizes the text to produce a Doc object

doc = nlp(text_paragraphs[5000])

In [33]:
doc.text

'At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders. Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.'

In [42]:
# print tokens and attributes in the doc
# for token in doc:
    # print(token.text, token.pos, token.dep, token.tag)

In [41]:
# using entity recognizer in the text
# for ent in doc.ents:
#     print(ent)

In [44]:
# sentence generator
# doc.sents

In [47]:
# generate sentences when calling doc.sents (that generates tokens that point to each sentence in doc)
# print each sentence in doc
# for sent in doc.sents:
#     print(sent)
#     print()

In [48]:
# using a reduced text, the first 10 paragraphs
texts = text_paragraphs[:10]

In [49]:
texts

['The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.',
 'Title: Les Misérables',
 'Author: Victor Hugo',
 'Translator: Isabel Florence Hapgood',
 'Release date: June 22, 2008 [eBook #135] Most recently updated: October 29, 2024',
 'Language: English',
 'Credits: Judith Boss and David Widger',
 '*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES ***',
 'LES MISÉRABLES',
 'By Victor Hugo']

In [56]:
# load bge language model to embed the text in chunks
model = FlagModel('BAAI/bge-small-zh-v1.5', use_fp16 = True)

In [57]:
# embedding single paragraphs
embeddings_0 = model.encode(texts[0])
embeddings_1 = model.encode(texts[3])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [67]:
embeddings_0.shape[0]

512

In [59]:
type(embeddings_0.tolist())

list

In [68]:
# calculate cosine similarity between the embedded paragraphs
similarity = embeddings_0 @ embeddings_1.T

In [71]:
similarity

np.float32(0.5518602)

In [72]:
# instantiate Python client for Elastic search
client = Elasticsearch("http://elasticsearch:9200")

Split the text into m chunks of n sentences. Once m chunks have been collected, embed the text using a pre-loaded LLM (BGE small in this case) and index the vectors.

In [75]:
# a chunk is n sentences
# every time I collect n sentences join them and that is a chunk, append to a list
# after collecting m chunks, embed and index

chunks = []
sentences = []
chunks_embeddings = []

sentence_limit = 3
chunk_limit = 10

In [77]:
# define the format of index: chunk of text and embedding vectors
# custom mapping that defines the expected types of indices features
# define mapping parameters for the "chunk" and "embedding_vector" fields
# define "vector_dim"

mappings = {
    "properties": {
        "chunk": {
            "type": "text"
        },
        "embedding_vector": {
            "index": True,
            "type": "dense_vector",
            "dims": 512,
            "similarity": "cosine",
        }
    }
}

In [99]:
# create an index called some_index using the defined mapping
# client.indices.create(index = "some_index", mappings = mappings)

In [96]:
# does the index "some_index" exist?
# client.indices.exists(index = "some_index")

In [97]:
# delete index called "some_index"
# client.indices.delete(index = "some_index")

In [98]:
# client.index(
#     index = "some_index",
#     document = {
#         "chunk": texts[0],
#         "embedding_vector": embeddings_0.tolist(),
#     },
# )

Ensure that there is no previously created index with the name "text_embeddings_index"

In [103]:
if (client.indices.exists(index = "text_embeddings_index")):
    client.indices.delete(index = "text_embeddings_index")

Create an Elasticsearch index, "text_embeddings_index", with the defined mappings

In [104]:
client.indices.create(index = "text_embeddings_index", mappings = mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_embeddings_index'})

In [105]:
# processing the data as a stream and buffer the paragraphs in batches instead of one by one
# calling nlp on a text returns a generator (doc_pipeline) that yields Doc objects
doc_pipeline = nlp.pipe(texts, batch_size = 5, n_process = 1)

for doc in doc_pipeline:
    for sent in doc.sents:
        sentences.append(sent.text)
        
        if len(sentences) == sentence_limit:
            chunk = " ".join(sentences)
            chunks.append(chunk)
            sentences = sentences[1:]

        if len(chunks) == chunk_limit:
            # embed
            chunks_embeddings = model.encode(chunks).tolist()
            # index chunks
            docs = [
                {
                    '_op_type': 'index',
                    '_index': 'text_embeddings_index',
                    '_source': {
                        "chunk" : t, 
                        "embedding_vector" : v
                           }
                } for t, v in zip(chunks, chunks_embeddings)
            ]
            helpers.bulk(client, docs
                        )
            # clear the list of chunks
            chunks = []
            
        # print(sent)
        # print()
        
    print(doc.text)


# at the end of the loop, verify if there are sentences/chunks left to embed and embed them


The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.
Title: Les Misérables
Author: Victor Hugo
Translator: Isabel Florence Hapgood
Release date: June 22, 2008 [eBook #135] Most recently updated: October 29, 2024
Language: English
Credits: Judith Boss and David Widger
['The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License i

In [107]:
sentences

['LES MISÉRABLES', 'By Victor Hugo']

In [108]:
chunks

['*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES *** LES MISÉRABLES',
 '*** LES MISÉRABLES By Victor Hugo']

In [121]:
if len(sentences) != 0:
    # append sentences to remaining chunks
    chunks.append(" ".join(sentences))

    # embed the text
    chunks_embeddings = model.encode(chunks).tolist()

    # index remaining text and embedding vectors
    docs = [
        {
            "_op_type": "index",
            "_index": "text_embeddings_index",
            "_source": {
                "chunk": t,
                "embedding_vector": v
            }
        } for t, v in zip(chunks, chunks_embeddings)
    ]
    helpers.bulk(client, docs)


In [122]:
len(chunks_embeddings)

4

In [114]:
len(chunks_embeddings[0])

512

In [123]:
client.cat.indices()

TextApiResponse('yellow open text_embeddings_index HH1RiJaXR_GNzL7WLdf9fA 1 1 14 0 163.6kb 163.6kb 163.6kb\n')

In [78]:
results = client.search(
    index = "text_embeddings_index",
    query = {
        "bool" : {
            "must" : [
                {
                    "match" : {
                    "chunk" : "hugo",
                    }
                },
                {
                    "match" : {
                    "chunk" : "victor",
                }
                }
            ]
        }
    },
    source = ["chunk"]
)

for hit in results["hits"]["hits"]:

    print(hit["_score"], hit["_source"]["chunk"])
    print()

1.616527 *** LES MISÉRABLES By Victor Hugo

1.480124 Title: Les Misérables Author: Victor Hugo Translator: Isabel Florence Hapgood

1.4317977 Author: Victor Hugo Translator: Isabel Florence Hapgood Release date: June 22, 2008

1.066257 If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title: Les Misérables Author: Victor Hugo

0.9826342 LES MISÉRABLES By Victor Hugo The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever.

0.77440417 By Victor Hugo The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eb

In [None]:
# reords = [{
#     '_op_type': 'index',
#     '_index': 'index-name',
#     'doc': {"chunk" : t, "vector" : v}
# } for t, v in zip(chunks, vectors)]