In [14]:
import spacy
from FlagEmbedding import FlagModel
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [124]:
# helper functions
def remove_newline(text):
    """Collapse every run of whitespace in each paragraph into a single space.

    text: list of paragraph strings; the list is updated in place.
    Returns the same list object, with newline characters removed and
    leading/trailing whitespace trimmed from every paragraph.
    """
    for idx, paragraph in enumerate(text):
        # split() without arguments splits on any whitespace (newlines included),
        # so re-joining with " " normalises the paragraph to single spaces
        text[idx] = " ".join(paragraph.split())

    return text


def embed_index_text(text_chunks, client, index_name="text_embeddings_index"):
    """Embed a batch of text chunks and bulk-index them in Elasticsearch.

    text_chunks: list of strings, each one a chunk of n joined sentences
    client: instance of the Elasticsearch client used to write to the index
    index_name: name of the target index (defaults to the index used in this notebook)

    Relies on the notebook-level `model` (its `encode` method) being loaded
    before the first call. Returns None.
    """
    # nothing to embed or index for an empty batch
    if len(text_chunks) == 0:
        return

    # embed chunks; .tolist() converts the result into plain Python lists
    chunk_embeddings = model.encode(text_chunks).tolist()

    # one bulk "index" action per pair (chunk of text, chunk embedding).
    # The pairs must be built from the local `chunk_embeddings`: zipping against the
    # notebook-level `chunks_embeddings` list produced no actions, so nothing was indexed.
    docs = [
        {
            '_op_type': 'index',
            '_index': index_name,
            '_source': {
                "chunk": chunk_text,
                "embedding_vector": chunk_vector
            }
        } for chunk_text, chunk_vector in zip(text_chunks, chunk_embeddings)
    ]

    # index in bulk
    helpers.bulk(client, docs)
    

### Read the text

In [158]:
# location of the plain-text novel, given relative to the notebook's working directory
file_path = "../data/LesMiserables.txt"

In [159]:
# read the file into one long string "text"
# the encoding is passed explicitly because the novel contains non-ASCII characters
# (e.g. "É", "’") and open() otherwise falls back to a platform-dependent default
# (assumes the file is UTF-8 encoded)
with open(file_path, "r", encoding="utf-8") as text_file:
    text = text_file.read()

In [160]:
# split the text into paragraphs, using a blank line ("\n\n") as the separator
text_paragraphs = text.split("\n\n")

In [161]:
# number of paragraphs produced by the blank-line split
len(text_paragraphs)

14558

In [162]:
# use the helper function "remove_newline" to eliminate the remaining "\n" characters inside each paragraph
# (the helper edits the list in place and returns that same list)

text_paragraphs = remove_newline(text_paragraphs)

The novel begins at paragraph 492 of the list. Everything that appears before this paragraph (i.e. the Project Gutenberg header, publisher information, the list of illustrations, and the table of contents) is removed from the data.

In [195]:
# peek at the paragraphs where the novel itself starts (index 492 onwards)
text_paragraphs[492:496]

['LES MISÉRABLES',
 '',
 'PREFACE',
 'So long as there shall exist, by virtue of law and custom, decrees of damnation pronounced by society, artificially creating hells amid the civilization of earth, and adding the element of human fate to divine destiny; so long as the three great problems of the century—the degradation of man through pauperism, the corruption of woman through hunger, the crippling of children through lack of light—are unsolved; so long as social asphyxia is possible in any part of the world;—in other words, and with a still wider significance, so long as ignorance and poverty exist on earth, books of the nature of Les Misérables cannot fail to be of use.']

In [196]:
# filtered text: keep only the paragraphs from the start of the novel onwards
# initial_paragraph is the index of the first paragraph that belongs to the novel
initial_paragraph = 492
texts = text_paragraphs[initial_paragraph:]

### Sentence segmentation using spaCy

In [29]:
# load spaCy's pretrained English pipeline; it is used to split the text into sentences
nlp = spacy.load('en_core_web_sm') 

In [30]:
# create a Doc object by calling the loaded pipeline "nlp" on a single paragraph
# when nlp is applied to a text, spaCy tokenizes the text to produce a Doc object
# note: the sample paragraph is taken from the unfiltered list "text_paragraphs"

doc = nlp(text_paragraphs[5000])

In [33]:
# text of the paragraph that was parsed into "doc"
doc.text

'At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders. Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.'

In [42]:
# print tokens and attributes in the doc
# for token in doc:
    # print(token.text, token.pos, token.dep, token.tag)

In [41]:
# using entity recognizer in the text
# for ent in doc.ents:
#     print(ent)

In [44]:
# sentence generator
# doc.sents

In [47]:
# generate sentences when calling doc.sents (that generates tokens that point to each sentence in doc)
# print each sentence in doc
# for sent in doc.sents:
#     print(sent)
#     print()

In [200]:
# first five paragraphs of the filtered text
texts[:5]

['LES MISÉRABLES',
 '',
 'PREFACE',
 'So long as there shall exist, by virtue of law and custom, decrees of damnation pronounced by society, artificially creating hells amid the civilization of earth, and adding the element of human fate to divine destiny; so long as the three great problems of the century—the degradation of man through pauperism, the corruption of woman through hunger, the crippling of children through lack of light—are unsolved; so long as social asphyxia is possible in any part of the world;—in other words, and with a still wider significance, so long as ignorance and poverty exist on earth, books of the nature of Les Misérables cannot fail to be of use.',
 'HAUTEVILLE HOUSE, 1862.']

In [201]:
# load the BGE embedding model used to embed the text in chunks
# NOTE(review): "bge-small-zh" looks like the Chinese variant while the text is English —
# confirm this is intended; a different model may output a different embedding size,
# which has to match "dims" in the index mapping
model = FlagModel('BAAI/bge-small-zh-v1.5', use_fp16 = True)

In [202]:
# embed two single paragraphs as a quick check of the model:
# the title (texts[0]) and the preface paragraph (texts[3])
embeddings_0 = model.encode(texts[0])
embeddings_1 = model.encode(texts[3])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [203]:
# length of one embedding vector; the "dims" value of the index mapping must equal this number
embeddings_0.shape[0]

512

In [204]:
# .tolist() turns the embedding into a plain Python list, the form stored in the indexed documents
type(embeddings_0.tolist())

list

In [205]:
# similarity between the two embedded paragraphs, computed as their inner product
# NOTE(review): this equals the cosine similarity only if the embeddings are L2-normalized —
# confirm that the model returns normalized vectors
similarity = embeddings_0 @ embeddings_1.T

In [206]:
# display the similarity score computed for the two sample paragraphs
similarity

np.float32(0.53408146)

In [207]:
# instantiate the Python client for Elasticsearch
# NOTE(review): the host name "elasticsearch" presumably resolves only inside the
# container network this notebook runs in — adjust the URL for other setups
client = Elasticsearch("http://elasticsearch:9200")

Split the text into overlapping chunks of n sentences each. Every time m chunks have been collected, embed them with the pre-loaded embedding model (BGE small in this case) and index the resulting vectors in bulk.

In [208]:
# a chunk is n sentences (n = sentence_limit)
# every time n sentences have been collected they are joined into one chunk and appended to "chunks"
# after collecting m chunks (m = chunk_limit), the chunks are embedded and indexed

# buffers filled while streaming the text
chunks = []
sentences = []
# NOTE(review): no visible cell appends to this list; embed_index_text reads this name
chunks_embeddings = []

# n: sentences per chunk, m: chunks per embed-and-index batch
sentence_limit = 3
chunk_limit = 10

In [209]:
# define the format of the index: each document holds a chunk of text and its embedding vector
# custom mapping that declares the expected type of every field
# "chunk" is a full-text field; "embedding_vector" is an indexed dense vector
# "dims" is the vector length and must equal the length of the model's embeddings (512 here)

mappings = {
    "properties": {
        "chunk": {
            "type": "text"
        },
        "embedding_vector": {
            "index": True,
            "type": "dense_vector",
            "dims": 512,
            "similarity": "cosine",
        }
    }
}

In [99]:
# create an index called some_index using the defined mapping
# client.indices.create(index = "some_index", mappings = mappings)

In [96]:
# does the index "some_index" exist?
# client.indices.exists(index = "some_index")

In [97]:
# delete index called "some_index"
# client.indices.delete(index = "some_index")

In [98]:
# client.index(
#     index = "some_index",
#     document = {
#         "chunk": texts[0],
#         "embedding_vector": embeddings_0.tolist(),
#     },
# )

Ensure that no previously created index named `text_embeddings_index` is left over.

In [210]:
# drop any index left over from an earlier run so that it can be recreated from scratch
if client.indices.exists(index = "text_embeddings_index"):
    client.indices.delete(index = "text_embeddings_index")

Create an Elasticsearch index, `text_embeddings_index`, with the mappings defined above.

In [211]:
# create the index with the custom field mapping stored in "mappings"
client.indices.create(index = "text_embeddings_index", mappings = mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_embeddings_index'})

In [None]:
# process the data as a stream, buffering the paragraphs in batches instead of parsing them one by one
# nlp.pipe returns a generator (doc_pipeline) that yields one Doc object per paragraph
doc_pipeline = nlp.pipe(texts, batch_size = 5, n_process = 1)

# NOTE(review): this cell appends to the notebook-level "sentences" and "chunks" lists, so
# re-running it without first re-running the cell that resets them continues from leftover state
for doc in doc_pipeline:
    # "sentences" is not reset between paragraphs, so a chunk may span a paragraph boundary
    for sent in doc.sents:
        sentences.append(sent.text)

        if len(sentences) == sentence_limit:
            chunk = " ".join(sentences)
            chunks.append(chunk)
            # sliding window: drop the oldest sentence and keep the rest,
            # so the next chunk overlaps with this one
            sentences = sentences[1:]

        if len(chunks) == chunk_limit:
            # embed and index
            embed_index_text(chunks, client)

            # start a new, empty batch of chunks
            chunks = []
           
    # print(doc.text)


In [148]:
# sentences currently held in the sentence buffer
sentences

[]

In [149]:
# chunks collected but not yet embedded and indexed
chunks

['*** LES MISÉRABLES By Victor Hugo', 'LES MISÉRABLES By Victor Hugo']

In [141]:
# at the end of the loop, flush whatever is still buffered so the tail of the text is indexed too
if sentences:
    # store the leftover sentences as one final, shorter chunk
    # NOTE(review): if a full chunk was produced earlier, these sentences are also the
    # tail of that chunk, so the final chunk partly repeats it
    chunks.append(" ".join(sentences))

# index the remaining chunks even when no sentences are left over
# (previously the chunks were flushed only when "sentences" was non-empty)
if chunks:
    embed_index_text(chunks, client)

# reset both buffers so that a re-run starts from a clean state
sentences = []
chunks = []


In [145]:
# NOTE(review): no visible cell appends to "chunks_embeddings", so a non-zero
# length here presumably reflects state from an earlier kernel session — verify
len(chunks_embeddings)

4

In [146]:
# length of the first stored embedding vector
# NOTE(review): raises IndexError if "chunks_embeddings" is empty
len(chunks_embeddings[0])

512

In [212]:
# list the indices known to the cluster through the cat API (returned as plain text)
client.cat.indices()

TextApiResponse('yellow open text_embeddings_index gzbxdOr-Sb2wQgT1ZPpzTw 1 1 0 0 227b 227b 227b\n')

In [152]:
# full-text query on the "chunk" field: a chunk has to match every required term
required_terms = ["hugo", "victor"]

results = client.search(
    index = "text_embeddings_index",
    query = {
        "bool" : {
            "must" : [{"match" : {"chunk" : term}} for term in required_terms]
        }
    },
    # return only the chunk text, not the embedding vector
    source = ["chunk"]
)

# for every hit show the raw hit, then its relevance score next to the chunk text
for hit in results["hits"]["hits"]:
    print(hit)
    print(hit["_score"], hit["_source"]["chunk"])
    print()

{'_index': 'text_embeddings_index', '_id': 'yyWD0ZMBJCf2OsYv71ZF', '_score': 1.4960773, '_source': {'chunk': '*** LES MISÉRABLES By Victor Hugo'}}
1.4960773 *** LES MISÉRABLES By Victor Hugo

{'_index': 'text_embeddings_index', '_id': 'zCWD0ZMBJCf2OsYv71ZF', '_score': 1.4960773, '_source': {'chunk': 'LES MISÉRABLES By Victor Hugo'}}
1.4960773 LES MISÉRABLES By Victor Hugo

{'_index': 'text_embeddings_index', '_id': 'ySWD0ZMBJCf2OsYvpFbx', '_score': 1.289943, '_source': {'chunk': 'Title: Les Misérables Author: Victor Hugo Translator: Isabel Florence Hapgood'}}
1.289943 Title: Les Misérables Author: Victor Hugo Translator: Isabel Florence Hapgood

{'_index': 'text_embeddings_index', '_id': 'wyWD0ZMBJCf2OsYvo1Yi', '_score': 1.2225636, '_source': {'chunk': 'Author: Victor Hugo Translator: Isabel Florence Hapgood Release date: June 22, 2008'}}
1.2225636 Author: Victor Hugo Translator: Isabel Florence Hapgood Release date: June 22, 2008

{'_index': 'text_embeddings_index', '_id': 'yiWD0ZMBJC

In [157]:
# the "hits" section of the search response: total count, maximum score and the individual hits
results["hits"]

{'total': {'value': 6, 'relation': 'eq'},
 'max_score': 1.4960773,
 'hits': [{'_index': 'text_embeddings_index',
   '_id': 'yyWD0ZMBJCf2OsYv71ZF',
   '_score': 1.4960773,
   '_source': {'chunk': '*** LES MISÉRABLES By Victor Hugo'}},
  {'_index': 'text_embeddings_index',
   '_id': 'zCWD0ZMBJCf2OsYv71ZF',
   '_score': 1.4960773,
   '_source': {'chunk': 'LES MISÉRABLES By Victor Hugo'}},
  {'_index': 'text_embeddings_index',
   '_id': 'ySWD0ZMBJCf2OsYvpFbx',
   '_score': 1.289943,
   '_source': {'chunk': 'Title: Les Misérables Author: Victor Hugo Translator: Isabel Florence Hapgood'}},
  {'_index': 'text_embeddings_index',
   '_id': 'wyWD0ZMBJCf2OsYvo1Yi',
   '_score': 1.2225636,
   '_source': {'chunk': 'Author: Victor Hugo Translator: Isabel Florence Hapgood Release date: June 22, 2008'}},
  {'_index': 'text_embeddings_index',
   '_id': 'yiWD0ZMBJCf2OsYvpFbx',
   '_score': 1.2225636,
   '_source': {'chunk': 'Author: Victor Hugo Translator: Isabel Florence Hapgood Release date: June 22, 