In [1]:
import spacy

In [2]:
# helper functions
def remove_newline(par):
    # remove newline characters from a paragraph
    return " ".join(par.split())

In [3]:
file_path = "../data/LesMiserables.txt"

In [4]:
# read the file into one long string "text"
with open(file_path, "r") as text_file:
    text = text_file.read()

In [5]:
# split text by paragraphs 
text_paragraphs = text.split("\n\n")

In [6]:
len(text_paragraphs)

14558

In [7]:
text_paragraphs[0]

'The Project Gutenberg eBook of Les Misérables\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.'

In [8]:
# remove "\n" characters using the helper function
for i in range(len(text_paragraphs)):
    text_paragraphs[i] = remove_newline(text_paragraphs[i])


In [9]:
text_paragraphs[5000]

'At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders. Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.'

In [10]:
type(text_paragraphs)

list

##### Sentence segmentation using spaCy

In [11]:
# Load pretrained English Language Model
nlp = spacy.load('en_core_web_sm') 

In [12]:
# create an object "doc" by creating an instance of the nlp class for one paragraph

doc = nlp(text_paragraphs[5000])

In [13]:
doc.text

'At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders. Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.'

In [14]:
for token in doc:
    print(token.text, token.pos, token.dep)

At 85 443
the 90 415
beginning 92 439
of 85 443
the 90 415
Restoration 96 439
, 97 445
the 90 415
convent 92 429
of 85 443
the 90 415
Petit 96 7037928807040764755
- 97 445
Picpus 96 439
was 87 408
in 85 443
its 95 440
decay 92 439
; 97 445
this 95 429
forms 100 8206900633647566924
a 90 415
part 92 416
of 85 443
the 90 415
general 84 402
death 92 439
of 85 443
the 90 415
order 92 439
, 97 445
which 95 429
, 97 445
after 85 443
the 90 415
eighteenth 84 402
century 92 439
, 97 445
has 87 405
been 87 405
disappearing 100 447
like 85 443
all 90 13323405159917154080
the 90 415
religious 84 402
orders 92 439
. 97 445
Contemplation 92 429
is 87 8206900633647566924
, 97 445
like 85 443
prayer 92 439
, 97 445
one 93 404
of 85 443
humanity 92 440
’s 94 8110129090154140942
needs 92 439
; 97 445
but 89 407
, 97 445
like 85 443
everything 95 439
which 95 416
the 90 415
Revolution 96 429
touched 100 447
, 97 445
it 95 430
will 87 405
be 87 406
transformed 100 410
, 97 445
and 89 407
from 85 443
being

In [15]:
doc.sents

<generator at 0x7f2506777250>

In [16]:
# generate sentences when calling doc.sents (that generates tokens that point to each sentence in doc)
# print each sentence in doc
for sent in doc.sents:
    print(sent)
    print()

At the beginning of the Restoration, the convent of the Petit-Picpus was in its decay; this forms a part of the general death of the order, which, after the eighteenth century, has been disappearing like all the religious orders.

Contemplation is, like prayer, one of humanity’s needs; but, like everything which the Revolution touched, it will be transformed, and from being hostile to social progress, it will become favorable to it.



In [None]:
# process text as a stream and generates Doc objects in order

In [19]:
texts = text_paragraphs[:10]

In [20]:
texts

['The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.',
 'Title: Les Misérables',
 'Author: Victor Hugo',
 'Translator: Isabel Florence Hapgood',
 'Release date: June 22, 2008 [eBook #135] Most recently updated: October 29, 2024',
 'Language: English',
 'Credits: Judith Boss and David Widger',
 '*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES ***',
 'LES MISÉRABLES',
 'By Victor Hugo']

In [33]:
# returns a generator that yields Doc objects
doc_pipeline = nlp.pipe(texts, batch_size = 5, n_process = 1)

In [27]:
doc_pipeline

<generator object Language.pipe at 0x7f24c9bf8bc0>

In [34]:
# a chunk is n sentences
# every time I collect n sentences join them and that is a chunk, append to a list
# after collecting m chunks, embed and index

chunks = []
sentences = []

sentence_limit = 3
chunk_limit = 10

for doc in doc_pipeline:
    for sent in doc.sents:
        sentences.append(sent)
        
        if len(sentences) == sentence_limit:
            chunk = " ".join(sentences)
            chunks.append(chunk)
            sentences = sentences[1:]

        if len(chunks) == chunk_limit:
            # embed
            # index
            chunks = []
            
        # print(sent)
        # print()
        
    print(doc.text)

# at the end of the loop, verify if there are sentences/chunks left to embed and embed them

The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever.

You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org.

If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.

The Project Gutenberg eBook of Les Misérables This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.
Title: Les Misérabl