Notebook for splitting raw text data into sentences.

Written by Artem

In [None]:
# Where the raw text is read from and where the extracted sentences are written.
input_path, output_path = (
  '/content/drive/MyDrive/artem-yushko/data-artem/raw/borshch-addon.txt',
  '/content/drive/MyDrive/artem-yushko/data-artem/raw/borshch-addon-sentences.txt',
)

In [None]:
!pip install spacy_udpipe

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy_udpipe
  Downloading spacy_udpipe-1.0.0-py3-none-any.whl (11 kB)
Collecting ufal.udpipe>=1.2.0
  Downloading ufal.udpipe-1.2.0.3.tar.gz (304 kB)
[K     |████████████████████████████████| 304 kB 5.1 MB/s 
Building wheels for collected packages: ufal.udpipe
  Building wheel for ufal.udpipe (setup.py) ... [?25l[?25hdone
  Created wheel for ufal.udpipe: filename=ufal.udpipe-1.2.0.3-cp37-cp37m-linux_x86_64.whl size=5626689 sha256=e7c0af0884b94120653c43d0bcfc904439e6ef01f15cf0076cd071a05d58c4a5
  Stored in directory: /root/.cache/pip/wheels/b8/b5/8e/3da091629a21ce2d10bf90759d0cb034ba10a5cf7a01e83d64
Successfully built ufal.udpipe
Installing collected packages: ufal.udpipe, spacy-udpipe
Successfully installed spacy-udpipe-1.0.0 ufal.udpipe-1.2.0.3


In [None]:
# all the imports we will need
import re
import spacy_udpipe
import math

In [None]:
# Location of the UDPipe model file for Ukrainian.
spacy_model_path = '/content/drive/MyDrive/maksym-bondarenko/spelling/udpipe/ukrainian-iu-ud-2.5-191206.udpipe'

# Load the model once at module level; split_text_into_sentences() reuses it on every call.
SPACY_UDPIPE_MODEL = spacy_udpipe.load_from_path(lang="uk", path=spacy_model_path)

def split_text_into_sentences(text):
  """Split a piece of text into sentences with the loaded UDPipe model.

  Args:
    text: the string to segment.

  Returns:
    A list holding the text of every sentence the model found, in document
    order, each stripped of leading and trailing whitespace.
  """
  parsed = SPACY_UDPIPE_MODEL(text)
  stripped_sentences = []
  for span in parsed.sents:
    stripped_sentences.append(span.text.strip())
  return stripped_sentences

In [None]:
# loading the data
# The encoding is passed explicitly so the text is decoded the same way on
# every platform instead of depending on the locale default.
with open(input_path, 'r', encoding='utf-8') as f:
  text_org = f.read()  # untouched copy of the raw file contents

# Removing empty lines: collapse runs of newlines, then drop any empty entries
# that remain (a leading blank line, or the one produced by a trailing newline).
# Filtering instead of slicing off the last element keeps the final line when
# the file does not end with a newline.
text = re.sub(r"\n{2,}", r"\n", text_org)
lines = [line for line in text.split("\n") if line]

In [None]:
# Sentences collected from every input line, in the order they were found.
true_lines = []

# progress tracker: number of lines processed so far
# (initialised here so it is defined even when `lines` is empty)
line_counter = 0

# splitting the text into sentences
for line_counter, line in enumerate(lines, start=1):
  # adding the sentences of this line to the final list
  true_lines.extend(split_text_into_sentences(line))
  # progress report every 15000 lines
  if line_counter % 15000 == 0:
    print("Progress: " + str(math.floor((line_counter/len(lines))*100)) + "%")
    print(str(line_counter) + " lines have been processed. " + str(len(true_lines)) + " sentences have been added.")

# removing the duplicates
# dict.fromkeys keeps the first occurrence of each sentence in its original
# position, so the result is the same on every run; a set would return the
# unique sentences in an arbitrary order.
true_lines = list(dict.fromkeys(true_lines))
print("Total number of sentences: " + str(len(true_lines)))

Total number of sentences: 221548


In [None]:
# writing everything down
# One sentence per line; join() puts a newline between sentences but not
# after the last one.
text = '\n'.join(true_lines)
# The encoding is passed explicitly so the file is always written as UTF-8
# instead of depending on the locale default.
with open(output_path, 'w', encoding='utf-8') as f:
  f.write(text)