In [28]:
import spacy
from FlagEmbedding import FlagModel
from elasticsearch import Elasticsearch

In [2]:
# helper functions
def remove_newline(text):
# remove newline characters, "\n", from the text
# text: list of paragraphs in the text
    
    for i in range(len(text)):
        text[i] = " ".join(text[i].split())

    return text


In [3]:
file_path = "../data/LesMiserables.txt"

In [4]:
# read the file into one long string "text"
with open(file_path, "r") as text_file:
    text = text_file.read()

In [5]:
# split text by paragraphs 
text_paragraphs = text.split("\n\n")

In [6]:
len(text_paragraphs)

14558

In [7]:
text_paragraphs[0]

'The Project Gutenberg eBook of Les Misérables\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.'

In [8]:
text_paragraphs = remove_newline(text_paragraphs)

In [9]:
text_paragraphs[0]

'The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.'

##### Sentence segmentation using spaCy

In [10]:
# Load pretrained English Language Model to separate the text into sentences
nlp = spacy.load('en_core_web_sm') 

In [11]:
# create an object "doc" by creating an instance of the nlp class for one paragraph
# when applying nlp to a text, spaCy tokenizes the text to produce a Doc object

doc = nlp(text_paragraphs[5000])

In [13]:
doc.text

'At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders. Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.'

In [14]:
for token in doc:
    print(token.text, token.pos, token.dep, token.tag)

At 85 443 1292078113972184607
the 90 415 15267657372422890137
beginning 92 439 15308085513773655218
of 85 443 1292078113972184607
the 90 415 15267657372422890137
Restoration 96 439 15794550382381185553
, 97 445 2593208677638477497
the 90 415 15267657372422890137
convent 92 429 15308085513773655218
of 85 443 1292078113972184607
the 90 415 15267657372422890137
Petit 96 7037928807040764755 15794550382381185553
- 97 445 8214596291009089021
Picpus 96 439 15794550382381185553
was 87 408 17109001835818727656
in 85 443 1292078113972184607
its 95 440 4062917326063685704
decay 92 439 15308085513773655218
; 97 445 11532473245541075862
this 95 429 15267657372422890137
forms 100 8206900633647566924 13927759927860985106
a 90 415 15267657372422890137
part 92 416 15308085513773655218
of 85 443 1292078113972184607
the 90 415 15267657372422890137
general 84 402 10554686591937588953
death 92 439 15308085513773655218
of 85 443 1292078113972184607
the 90 415 15267657372422890137
order 92 439 15308085513773

In [16]:
# using entity recognizer in the text
for ent in doc.ents:
    print(ent)

the Petit-Picpus
the eighteenth century
one
Revolution


In [18]:
# sentence generator
doc.sents

<generator at 0x7f77db6d0430>

In [20]:
# generate sentences when calling doc.sents (that generates tokens that point to each sentence in doc)
# print each sentence in doc
for sent in doc.sents:
    print(sent)
    print()

At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders.

Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.



In [21]:
# using a reduced text, the first 10 paragraphs
texts = text_paragraphs[:10]

In [22]:
texts

['The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.',
 'Title: Les Misérables',
 'Author: Victor Hugo',
 'Translator: Isabel Florence Hapgood',
 'Release date: June 22, 2008 [eBook #135] Most recently updated: October 29, 2024',
 'Language: English',
 'Credits: Judith Boss and David Widger',
 '*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES ***',
 'LES MISÉRABLES',
 'By Victor Hugo']

In [23]:
texts[0]

'The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.'

In [24]:
# load bge language model to embed the text in chunks
model = FlagModel('BAAI/bge-small-zh-v1.5', use_fp16 = True)

In [25]:
# embedding single paragraphs
embeddings_0 = model.encode(texts[0])
embeddings_1 = model.encode(texts[3])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [26]:
# calculate cosine similarity between the embedded paragraphs
similarity = embeddings_0 @ embeddings_1.T

In [27]:
similarity

np.float32(0.5518602)

In [31]:
# instantiate Python client 
client = Elasticsearch("http://elasticsearch:9200")

In [40]:
client.indices.exists(index = "tyyyg")

HeadApiResponse(False)

In [41]:
# custom mapping that defined the expected types of indices features
# define mapping parameters for the "chunk" and "embedding_vector" fields
# define "vector_dim"

mappings = {
    "properties": {
        "chunk": {
            "type": "text"
        },
        "embedding_vector": {
            "index": True,
            "type": "dense_vector",
            "dims": vector_dim,
            "similarity": "cosine",
        }
    }
}

{'name': 'fe633bdc30e0', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'FQFRdZJeQeORa-XHf-5NLA', 'version': {'number': '8.16.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'ffe992aa682c1968b5df375b5095b3a21f122bf3', 'build_date': '2024-11-19T16:00:31.793213192Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [85]:
# a chunk is n sentences
# every time I collect n sentences join them and that is a chunk, append to a list
# after collecting m chunks, embed and index

chunks = []
sentences = []
chunks_embeddings = []

sentence_limit = 3
chunk_limit = 10

# processing the data as a stream and buffer the paragraphs in batches instead of one by one
# calling nlp on a text returns a generator that yields Doc objects
doc_pipeline = nlp.pipe(texts, batch_size = 5, n_process = 1)

for doc in doc_pipeline:
    for sent in doc.sents:
        sentences.append(sent.text)
        
        if len(sentences) == sentence_limit:
            chunk = " ".join(sentences)
            chunks.append(chunk)
            sentences = sentences[1:]

        if len(chunks) == chunk_limit:
            # embed
            chunks_embeddings.append(model.encode(chunks))
            # index
            # clear the list of chunks
            chunks = []
            
        # print(sent)
        # print()
        
    print(doc.text)

# at the end of the loop, verify if there are sentences/chunks left to embed and embed them


The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.
Title: Les Misérables
Author: Victor Hugo
Translator: Isabel Florence Hapgood
Release date: June 22, 2008 [eBook #135] Most recently updated: October 29, 2024
Language: English
Credits: Judith Boss and David Widger
*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES ***
LES MISÉRABLES
By Victor Hugo


In [86]:
doc_pipeline

<generator object Language.pipe at 0x7f20f2b41e70>

In [93]:
sentences

['LES MISÉRABLES', 'By Victor Hugo']

In [88]:
chunks

['*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES *** LES MISÉRABLES',
 '*** LES MISÉRABLES By Victor Hugo']

In [89]:
len(chunks_embeddings)

1

In [92]:
chunks_embeddings[0].shape

(10, 512)