In [39]:
import spacy
from FlagEmbedding import FlagModel
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [2]:
# helper functions
def remove_newline(text):
# remove newline characters, "\n", from the text
# text: list of paragraphs in the text
    
    for i in range(len(text)):
        text[i] = " ".join(text[i].split())

    return text


In [3]:
file_path = "../data/LesMiserables.txt"

In [4]:
# read the file into one long string "text"
with open(file_path, "r") as text_file:
    text = text_file.read()

In [5]:
# split text by paragraphs 
text_paragraphs = text.split("\n\n")

In [6]:
len(text_paragraphs)

14558

In [7]:
text_paragraphs[0]

'The Project Gutenberg eBook of Les Misérables\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.'

In [8]:
text_paragraphs = remove_newline(text_paragraphs)

In [9]:
text_paragraphs[0]

'The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.'

##### Sentence segmentation using spaCy

In [10]:
# Load pretrained English Language Model to separate the text into sentences
nlp = spacy.load('en_core_web_sm') 

In [11]:
# create an object "doc" by creating an instance of the nlp class for one paragraph
# when applying nlp to a text, spaCy tokenizes the text to produce a Doc object

doc = nlp(text_paragraphs[5000])

In [12]:
doc.text

'At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders. Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.'

In [14]:
# for token in doc:
#     print(token.text, token.pos, token.dep, token.tag)

In [15]:
# using entity recognizer in the text
# for ent in doc.ents:
#     print(ent)

In [16]:
# sentence generator
# doc.sents

In [17]:
# generate sentences when calling doc.sents (that generates tokens that point to each sentence in doc)
# print each sentence in doc
for sent in doc.sents:
    print(sent)
    print()

At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders.

Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.



In [18]:
# using a reduced text, the first 10 paragraphs
texts = text_paragraphs[:10]

In [19]:
texts

['The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.',
 'Title: Les Misérables',
 'Author: Victor Hugo',
 'Translator: Isabel Florence Hapgood',
 'Release date: June 22, 2008 [eBook #135] Most recently updated: October 29, 2024',
 'Language: English',
 'Credits: Judith Boss and David Widger',
 '*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES ***',
 'LES MISÉRABLES',
 'By Victor Hugo']

In [20]:
# load bge language model to embed the text in chunks
model = FlagModel('BAAI/bge-small-zh-v1.5', use_fp16 = True)

In [21]:
# embedding single paragraphs
embeddings_0 = model.encode(texts[0])
embeddings_1 = model.encode(texts[3])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [22]:
embeddings_0.shape[0]

512

In [23]:
type(embeddings_0.tolist())

list

In [24]:
# calculate cosine similarity between the embedded paragraphs
similarity = embeddings_0 @ embeddings_1.T

In [25]:
similarity

np.float32(0.5518602)

In [27]:
# instantiate Python client for Elastic search
client = Elasticsearch("http://elasticsearch:9200")

In [28]:
# a chunk is n sentences
# every time I collect n sentences join them and that is a chunk, append to a list
# after collecting m chunks, embed and index

chunks = []
sentences = []
chunks_embeddings = []

sentence_limit = 3
chunk_limit = 10

In [29]:
# custom mapping that defined the expected types of indices features
# define mapping parameters for the "chunk" and "embedding_vector" fields
# define "vector_dim"

mappings = {
    "properties": {
        "chunk": {
            "type": "text"
        },
        "embedding_vector": {
            "index": True,
            "type": "dense_vector",
            "dims": 512,
            "similarity": "cosine",
        }
    }
}

In [33]:
# create an index called some_index
client.indices.create(index = "some_index", mappings = mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'some_index'})

In [58]:
client.indices.exists(index = "some_index")

HeadApiResponse(True)

In [60]:
client.indices.delete(index = "text_embeddings_index")

ObjectApiResponse({'acknowledged': True})

In [34]:
client.index(
    index = "some_index",
    document = {
        "chunk": texts[0],
        "embedding_vector": embeddings_0.tolist(),
    },
)

ObjectApiResponse({'_index': 'some_index', '_id': 'kDfLrpMB1EF0deM87gcn', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [61]:
# 
client.indices.create(index = "text_embeddings_index", mappings = mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_embeddings_index'})

In [62]:


# processing the data as a stream and buffer the paragraphs in batches instead of one by one
# calling nlp on a text returns a generator that yields Doc objects
doc_pipeline = nlp.pipe(texts, batch_size = 5, n_process = 1)

for doc in doc_pipeline:
    for sent in doc.sents:
        sentences.append(sent.text)
        
        if len(sentences) == sentence_limit:
            chunk = " ".join(sentences)
            chunks.append(chunk)
            sentences = sentences[1:]

        if len(chunks) == chunk_limit:
            # embed
            chunks_embeddings = model.encode(chunks).tolist()
            # index
            docs = [
                {
                    '_op_type': 'index',
                    '_index': 'text_embeddings_index',
                    '_source': {
                        "chunk" : t, 
                        "embedding_vector" : v
                           }
                } for t, v in zip(chunks, chunks_embeddings)
            ]
            helpers.bulk(client, docs
                        )
            # clear the list of chunks
            chunks = []
            
        # print(sent)
        # print()
        
    print(doc.text)


# at the end of the loop, verify if there are sentences/chunks left to embed and embed them


The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.
Title: Les Misérables
Author: Victor Hugo
Translator: Isabel Florence Hapgood
Release date: June 22, 2008 [eBook #135] Most recently updated: October 29, 2024
Language: English
Credits: Judith Boss and David Widger
*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES ***
LES MISÉRABLES
By Victor Hugo


In [44]:
doc_pipeline

<generator object Language.pipe at 0x7f77d4099010>

In [45]:
sentences

['LES MISÉRABLES', 'By Victor Hugo']

In [46]:
chunks

['*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES *** LES MISÉRABLES',
 '*** LES MISÉRABLES By Victor Hugo']

In [47]:
len(chunks_embeddings)

1

In [50]:
chunks_embeddings[0].shape

(10, 512)

In [78]:
results = client.search(
    index = "text_embeddings_index",
    query = {
        "bool" : {
            "must" : [
                {
                    "match" : {
                    "chunk" : "hugo",
                    }
                },
                {
                    "match" : {
                    "chunk" : "victor",
                }
                }
            ]
        }
    },
    source = ["chunk"]
)

for hit in results["hits"]["hits"]:

    print(hit["_score"], hit["_source"]["chunk"])
    print()

1.616527 *** LES MISÉRABLES By Victor Hugo

1.480124 Title: Les Misérables Author: Victor Hugo Translator: Isabel Florence Hapgood

1.4317977 Author: Victor Hugo Translator: Isabel Florence Hapgood Release date: June 22, 2008

1.066257 If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title: Les Misérables Author: Victor Hugo

0.9826342 LES MISÉRABLES By Victor Hugo The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever.

0.77440417 By Victor Hugo The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eb

In [None]:
# reords = [{
#     '_op_type': 'index',
#     '_index': 'index-name',
#     'doc': {"chunk" : t, "vector" : v}
# } for t, v in zip(chunks, vectors)]