Amalgamation of all sentence cleaning methods we could come up with.

Made by Artem Yushko + Vlad Verteletskiy

In [None]:
# Both files live in the same Drive folder: the corpus to clean and the place to save the result.
data_dir = "/content/drive/MyDrive/artem-yushko/data-artem/cleaned"
input_path = f"{data_dir}/borshch3.txt"
output_path = f"{data_dir}/borshch4.txt"

# Internals

## Helpers

Defining helper functions

In [None]:
# all the imports we will need
!pip install pycld3 &> /dev/null
import re
import os
import cld3

# loading the data
# The corpus is Cyrillic text, so the encoding is pinned to UTF-8 instead of
# depending on whatever the platform default happens to be.
with open(input_path, 'r', encoding='utf-8') as f:
  text = f.read()
# number of raw lines in the input, kept as a string for the final report
initial_length = str(len(text.split("\n")))

In [None]:
# manipulating spaces in and between words

# One "word" symbol: Ukrainian letters, the Russian-only letters э/ы/ъ (a known hack, they occur in the data),
# the apostrophe, digits, Latin letters, round brackets, % and ‰, the straight double quote, № and +.
# `+` needs no backslash inside a character class; the former "\+" in a non-raw string was an invalid escape sequence.
us = '[А-ЩЬЮЯЄҐІЇЭЫЪа-щьюяєґіїэыъ\'0-9a-zA-Z()%‰"№+]'
upr = r'[.?!,;:—-]' # ukrainian punctuation (regex character class)
uwr = re.compile(us + "+") # Matches a word. We want our model to predict hyphens, thus - is not part of a word

def space_stripper(sentence): # to get rid of extra spaces
    """Normalise the whitespace of one sentence and return the result.

    The rules run in order: runs of two or more whitespace characters become a
    single space, leading and trailing whitespace is removed, and one whitespace
    character sitting between "digit + punctuation mark" and another digit is dropped.
    """
    rules = (
        (r"\s{2,}", ' '),                                         # double+ spaces
        (r"^\s+", ''),                                            # whitespace in the beginning
        (r"\s+$", ''),                                            # whitespace in the end
        (r'([0-9])([.?!,;:—-])\s([0-9])', r"\1\2\3"),             # spaces in punctuation between numbers
    )
    for pattern, replacement in rules:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence

def space_oddity(sentence): # to add spaces in between of punctuation
    """Return the sentence with every non-word symbol separated by spaces.

    The sentence is cut into words (matches of ``uwr``) and the gaps between them.
    Each gap is exploded into single characters joined by spaces, the words are put
    back in between, and the surplus whitespace is cleaned up by ``space_stripper``.
    """
    sentence = space_stripper(sentence) # get rid of extra spaces first
    words = uwr.findall(sentence) # the words themselves
    gaps = uwr.split(sentence) # whatever surrounds the words; always one item more than `words`
    pieces = []
    for gap, word in zip(gaps, words): # zip stops before the last gap, which has no word after it
        pieces.append(' '.join(gap) + ' ' + word + ' ')
    pieces.append(' '.join(gaps[-1])) # the trailing gap (usually the final punctuation)
    return space_stripper(''.join(pieces)) # strip the remaining spaces just in case

def fried_nails(sentence): # the reversed function: to remove the extra spaces. Not 1-to-1, unlike space_oddity
    """Glue punctuation back onto the words of a spaced-out sentence.

    Roughly the inverse of ``space_oddity``, but lossy: all whitespace inside the
    gaps between words is discarded and then re-created by fixed rules (dash always
    spaced, hyphen always clinging, quotes paired left/right, brackets hugging
    their content).

    Quotation marks are paired in order of appearance. If their number is odd, the
    pairing loop still closes the last opened quote, so an extra closing mark
    appears in the result; callers are expected to pass balanced quotes.

    All patterns are raw strings: the previous non-raw ``"\s+\)"`` and
    ``f"({us})(\()"`` relied on invalid escape sequences.
    """
    sentence = re.sub('\xad', '', sentence) # soft hyphens (U+00AD) are dropped outright
    words = re.findall(uwr, sentence) # retrieve the words as usual
    punctuation = re.split(uwr, sentence) # retrieve the rest; one item more than `words`
    sentence = ""
    for gap, word in zip(punctuation, words):
        # whitespace inside the gap is removed (information is lost here), a space precedes every word
        sentence += re.sub(r'\s+', '', gap) + ' ' + word
    sentence += re.sub(r'\s+', '', punctuation[-1])
    sentence = re.sub(chr(8212), " " + chr(8212) + " ", sentence) # the dash must be separated at all times, no matter what
    sentence = re.sub(r'\s*-\s*', "-", sentence) # the hyphen is considered to cling always
    quote_split = re.split(r'\s*"\s*', sentence) # now, we deal with quotation marks
    sentence = ""
    # Pieces alternate "outside quotes" / "inside quotes": the odd numbered mark is the left one,
    # the even numbered mark is the right one.
    for outside, inside in zip(quote_split[0::2], quote_split[1::2]):
        sentence += outside + ' "' + inside + '" '
    if len(quote_split) % 2:
        sentence += quote_split[-1] # tail after the last closing mark (balanced case)
    # With an odd number of marks there is no tail, and the result is not reliable.
    sentence = re.sub(r"\s([.,;:?!])", r"\1", space_stripper(sentence)) # the rest of the punctuation clings to the left
    sentence = re.sub(r"\(\s+", '(', sentence) # fix the left brackets avoiding the "(" case (three punctuation marks in a row)
    sentence = re.sub(rf"({us})(\()", r'\1 \2', sentence) # uncling the left bracket from a word
    sentence = re.sub(r"\s+\)", ')', sentence) # in the same way for the right bracket
    sentence = re.sub(rf"(\))({us})", r'\1 \2', sentence) # uncling the right bracket from a word
    sentence = re.sub(r'\s*-\s*', "-", sentence) # the hyphen is considered to cling always
    sentence = re.sub(r'"\s\(', '"(', sentence) # removing the space from " (
    sentence = re.sub(r'\)\s"', ')"', sentence) # removing the space from ) "
    sentence = re.sub(r'([0-9])([.?!,;:—-])\s([0-9])', r"\1\2\3", sentence) # spaces in punctuation between numbers
    return sentence

## Standardization

Unifying and cleaning up the discrepancies between different formatting conventions

In [None]:
# Removing empty lines
text = re.sub(r"\n{2,}", "\n", text)
# Only a genuinely empty trailing line is dropped. Cutting the last list item unconditionally
# would lose a real sentence whenever the file does not end with a newline, and doing it on
# `lines` alone has no lasting effect because later cells rebuild `lines` from `text`.
text = text.rstrip("\n")
lines = text.split("\n")

### Quotation marks

Full list of different quotation marks:

* “ ” English double.
* ‘ ’ English Single.
* « » French «Christmas trees».
* „ “ German «paws».
* „ ” Polish.
* » « Swedish reverse.
* " " Double universal.

In [None]:
# Every fancy double quote becomes the plain " mark:
#   1. the christmas trees « » and the upper doubles “ ”
#   2. the lower double „ together with any whitespace to its right
for fancy_quote in (r'[«»“”]', r"„\s*"):
  text = re.sub(fancy_quote, '"', text)
lines = text.split("\n")

In [None]:
# # progress report
# print(f"The number of christmas trees «» and upper doubles “” is {len(re.findall(r'[«»“”]', text))}")
# print(f"The number of lower doubles „ is {len(re.findall(r'„', text))}")
# print(f"The number of English opening singles ‘ is {len(re.findall(r'‘', text))}")
# print(f"The number of English closing singles ’ is {len(re.findall(r'’', text))}")
# print("The number of English single standard marks ' is " + str(len(re.findall(r"'", text))))
# print('The number of English standard double marks " is ' + str(len(re.findall(r'"', text))))

In [None]:
# dropping every line that contains an opening single quote ‘ and rebuilding the text
lines = [l for l in lines if '‘' not in l]
text = '\n'.join(lines)

### Apostrophes

In [None]:
# # progress report
# print(f"The total number of ’ is: " + str(len(re.findall(r'’', text))))
# print(f"The number of ’ as phonetical (in the beginning of a word) is: " + str(len(re.findall(r'(\W’\w|^’\w)', text))))
# print(f"The number of ’ as apostrophes indeed is: " + str(len(re.findall(r'\w’\w', text))))
# print(f"The number of ’ in the end of the word is: " + str(len(re.findall(r'(\w’\W|\w’$)', text))))
# print(f"The number of dangling ’ is: " + str(len(re.findall(r'\W’\W', text))))

In [None]:
# replacing the ` and ´
text = re.sub(r'[`´]', "'", text)
# dropping the lines where ’ is not a true apostrophe: at the start of a word, at the end of a word, or dangling.
# The filter runs on the freshly updated `text`; filtering the stale `lines` list would throw away
# the replacement made just above (´ would then never be normalised).
text = "\n".join([l for l in text.split("\n")
                  if not (re.search(r'(\W’\w|^’\w)', l) or re.search(r'(\w’\W|\w’$)', l) or re.search(r'\W’\W', l))])
# replacing the true apostrophes with ' symbols
text = re.sub(r"[’`]", "'", text)

### Quotation marks and apostrophes-imposters.

In [None]:
# A double quote squeezed between two Ukrainian letters is really an apostrophe
letters = "[А-ЩЬЮЯЄҐІЇа-щьюяєґії]"
imposter = re.compile(f'({letters})"({letters})')
text = imposter.sub(r"\1'\2", text)
lines = text.split('\n')

In [None]:
# keeping only the lines with an even number of " marks (whitespace-normalised on the way)
lines = [space_stripper(l) for l in lines if l.count('"') % 2 == 0]
text = '\n'.join(lines)

### Dashes

We'll set the em dash — (decimal code point 8212, U+2014; the "Telegram dash") as the standard

All possible dashes:

Decimal code points 8208–8214, 8722 and 12641 (U+2010–U+2016, U+2212, U+3161)

In [None]:
# # finding them
# print(f"The number of special weird dashes {chr(65293)} (Unicode {65293}) is {len(re.findall(chr(65293), text))}")

# all the possible dashes, as decimal code points
# NOTE(review): 8214 is U+2016 DOUBLE VERTICAL LINE and 12641 is U+3161 HANGUL LETTER EU — confirm both are meant to be here
dashes = [chr(code) for code in (8208, 8209, 8210, 8211, 8212, 8213, 8214, 8722, 12641)]

# every dash-like character first turns into a double hyphen...
text = re.sub("[" + "".join(dashes) + "]", "--", text)

# ...and then every run of two or more hyphens turns into the standardized dash
text = re.sub(r"-{2,}", "—", text)
lines = text.split("\n")

In [None]:
# removing the very specific cases of єднальне тире (a dash glued to non-space characters on both sides),
# which is not fixable by hardcode anyway; lines that also contain a digit—digit dash are kept
lines = [l for l in lines if re.search(r'\d—\d', l) or not re.search(r'\S—\S', l)]

# unclinging the clinging dashes
# Both substitutions are no-ops on lines without a one-sided dash, so they are applied to every
# line in a single pass. This keeps the original line order instead of moving the fixed lines
# to the end of the list.
lines = [re.sub(r'\s—(\S)', r" — \1", re.sub(r'(\S)—\s', r"\1 — ", l)) for l in lines]

In [None]:
# a line that opens with a dash (optionally after whitespace) is a phrase fragment: drop it
lines = [l for l in lines if re.match(r"\s*—", l) is None]
text = "\n".join(lines)

### Hyphens

In [None]:
# # statistics
# print(f"The number of proper hyphens is " + str(len(re.findall(r'\w-\w', text))))
# print(f"The number of dangling hyphens is " + str(len(re.findall(r'(^-\W|\W-\W)', text))))
# print(f"The number of left-clinging hyphens is " + str(len(re.findall(r'(\w-\W|\w-$)', text))))
# print(f"The number of right-clinging hyphens is " + str(len(re.findall(r'(\W-\w|^-\w)', text))))

The clinging hyphens are sometimes legit, so we do not touch them. The dangling hyphens most likely represent dashes, so we replace them:

In [None]:
# replacing dangling hyphens with dashes
# A hyphen is dangling when it touches no word character on either side. Lookarounds are used
# instead of consuming the neighbours, so a hyphen at the very start or end of the text and two
# dangling hyphens that share one separator are handled as well.
text = re.sub(r"(?<!\w)-(?!\w)", "—", text)
lines = text.split("\n")
# getting rid of the dangling hyphens in the beginning of the sentence as dashes
lines = [re.sub(r"^\s*—", '', l) for l in lines]

### Spaces and ellipsis


The following irregularities are being dealt with right now:
* Too many spaces.
* Spaces before the beginning and after the end of the sentence
* Several marks in a row (double comma or whatever).
* Spaces and comma family (;:,): no before, one after.
* Spaces and period family (.?!): no before.
* Ellipsis of a new kind (8230).

In [None]:
# Whitespace rules, applied to every line in this order
whitespace_rules = (
  (r"\s{2,}", ' '),            # two and more spaces into one
  (r"^\s+", ''),               # one or more spaces in the beginning to remove
  (r"\s+$", ''),               # one or more spaces in the end to remove
  (r"\s([,:;.!?…])", r"\1"),   # a space in front of a punctuation mark does not belong there
)
for pattern, replacement in whitespace_rules:
  lines = [re.sub(pattern, replacement, l) for l in lines]

In [None]:
# implementing the ellipsis
ellipsis_char = chr(8230)
# three or more periods become the single ellipsis character followed by a space
lines = [re.sub(r"\.{3,}", ellipsis_char + ' ', l) for l in lines]
text = '\n'.join(lines)
# a remaining pair of periods becomes … unless it follows ! or ?, in which case it becomes ‥
for pattern, replacement in ((r"([^!?])\.\.", r"\1…"), (r"([!?])\.\.", r"\1‥")):
  text = re.sub(pattern, replacement, text)

In [None]:
# fixing the improper double marks: of two adjacent marks from , ; : only the first one stays
text = re.sub(r"([,;:])[,;:]", r"\1", text)
# line by line: a space goes after . ! ? ; (unless followed by ) " , or whitespace)
# and then after a comma (unless followed by " or whitespace)
lines = [
  re.sub(r'([,])([^"\s])', r"\1 \2", re.sub(r'([.!?;])([^)",\s])', r"\1 \2", l))
  for l in text.split('\n')
]
text = '\n'.join(lines)
# on the whole text: a space goes after every colon that is followed by a non-space character
text = re.sub(r'([:])(\S)', r"\1 \2", text)
#[l for l in lines if re.findall(r"[,;:]\.", l)]

In [None]:
# turning the ellipsis character back into three periods and the two-dot leader ‥ into a space
text = text.replace(chr(8230), "...").replace(chr(8229), " ")
# dropping the lines that still have a dash clinging to one side only
lines = [l for l in text.split('\n') if not re.search(r'(\S—\s|\s—\S)', l)]

## Removing the unfit sentences

In [None]:
print('Original length: ' + str(len(lines)) + '\n')

# bruteforce cleaners of different stuff I found
# uncomment to see the statistics
# print('Number of sentences with ": —" signs: ' + str(len([l for l in lines if re.findall(": —", l)])))
# print('Number of sentences with ", —" signs: ' + str(len([l for l in lines if re.findall(", —", l)])))
# print('Number of sentences with "... —" signs: ' + str(len([l for l in lines if re.findall("\.\.\. —", l)])))
# print('Number of sentences with "… —" signs: ' + str(len(re.findall(chr(8230)+r'\s—', text))))
# print('Number of sentences with <> artifacts: ' + str(len([l for l in lines if re.findall(r'<*>', l)])))
# print('Number of sentences with [] artifacts: ' + str(len([l for l in lines if re.findall(r"\[*]", l)])))
# print('Number of sentences that start with punctuation except for dashes: ' + str(len([l for l in lines if l[0] in upr and l[0] != "—"])))
# print('Number of sentences that do not have any Ukrainian symbols: ' + str(len([l for l in lines if len(re.findall(r"[А-ЩЬЮЯЄҐІЇа-щьюяєґії]", l)) == 0])))
# print('Number of sentences with bullshit symbols: ' + str(len([l for l in lines if len(re.findall(r"[�ßüä~@#^*‘₴{}\|/<>]", l))])))

# making sure that the data is cleanable
lines = [space_stripper(l) for l in lines]

# removing too short or long lines: 5 to 29 space-separated tokens are kept
lines = [l for l in lines if 5 <= len(l.split(' ')) < 30]

# removing the direct speech: a punctuation mark followed by " —", or a dash followed by a period.
# These are plain substrings, so no regular expressions (and no backslash escapes) are needed.
direct_speech_markers = (": —", ", —", "... —", "; —", chr(8230) + " —", ". —", "? —", "! —", "—.")
lines = [l for l in lines if not any(marker in l for marker in direct_speech_markers)]

# removing the sentences that start with punctuation (dashes included, they are part of upr).
# upr is a regex character class, so it is matched as a regex. A plain `in upr` test would also
# treat the literal [ and ] as punctuation; such lines are removed by the artifact filter below anyway.
lines = [l for l in lines if not re.match(upr, l)]

# removing the other bs symbols and editorial artifacts; a line must also contain at least one Ukrainian letter
lines = [l for l in lines if re.search(r"[А-ЩЬЮЯЄҐІЇа-щьюяєґії]", l) and not re.search(r"[°♦_■•©�ßüä~@#^*‘₴{}\|/<>×○εχθρώνάδωΩΘλΣ\[\]]", l)]

# removing very specific sentences with no room for errorifying:
# any of the listed mark combinations, or a two-period run in a line without a three-period run
no_room_markers = ('"(', ',.', '.-', '...,', '-,', '.!', ':.', '-;', '"-', '-(')
lines = [space_stripper(l) for l in lines
         if not (any(marker in l for marker in no_room_markers) or ('..' in l and '...' not in l))]

# removing apparent sentence fragments: the last character must be a punctuation mark from upr
lines = [l for l in lines if re.fullmatch(upr, l[-1:])]

# removing the sentences that cld3 attributes to a language other than Ukrainian with probability >= .99
# (the target is Russian); the detector is called once per line
lines = [l for l, guess in zip(lines, map(cld3.get_language, lines)) if not (guess[0] != 'uk' and guess[1] >= .99)]

print('\nLength after initial cleaning: ' + str(len(lines)))

Original length: 1036836


Length after initial cleaning: 1031783


## Author punctuation cleaners

In [None]:
# removing the author punctuation
# The patterns are raw strings; the last two used "/" where "\" was meant ('/././.,' and '/././.?'),
# so they could never match an escaped period.
author_punctuation_rules = (
  (r'\?.*', '?'),        # everything after the first question mark is cut off
  (r'!.*', '!'),         # everything after the first exclamation mark is cut off
  (r'-\.', '.'),         # a hyphen right before a period is dropped
  (' ""', ' '),          # an empty pair of quotes after a space is dropped
  (r'\.\.\.,', ','),     # "...," becomes ","
  (r'\.\.\.\?', '?'),    # "...?" becomes "?"
)
for pattern, replacement in author_punctuation_rules:
  lines = [re.sub(pattern, replacement, l) for l in lines]

In [None]:
# repeating the process after removing the author punctuation

# making sure that the data is cleanable
lines = [space_stripper(l) for l in lines]

# removing the direct speech: a punctuation mark followed by " —", or a dash followed by a period.
# These are plain substrings, so no regular expressions (and no backslash escapes) are needed.
direct_speech_markers = (": —", ", —", "... —", "; —", chr(8230) + " —", ". —", "? —", "! —", "—.")
lines = [l for l in lines if not any(marker in l for marker in direct_speech_markers)]

# removing the sentences that start with punctuation (dashes included, they are part of upr).
# upr is a regex character class, so it is matched as a regex. A plain `in upr` test would also
# treat the literal [ and ] as punctuation; such lines are removed by the artifact filter below anyway.
lines = [l for l in lines if not re.match(upr, l)]

# removing the other bs symbols and editorial artifacts; a line must also contain at least one Ukrainian letter
lines = [l for l in lines if re.search(r"[А-ЩЬЮЯЄҐІЇа-щьюяєґії]", l) and not re.search(r"[°♦_■•©�ßüä~@#^*‘₴{}\|/<>π×○εχθρώνάδωΩΘλΣ\[\]]", l)]

# removing very specific sentences with no room for errorifying:
# any of the listed mark combinations, or a two-period run in a line without a three-period run
no_room_markers = ('"(', ',.', '.-', '...,', '-,', '.!', ':.', '-;', '"-', '-(')
lines = [space_stripper(l) for l in lines
         if not (any(marker in l for marker in no_room_markers) or ('..' in l and '...' not in l))]

# stripping the editorial "Деталі: " marker wherever it occurs
lines = [re.sub('Деталі: ', '', l) for l in lines]

print('\nLength after initial cleaning: ' + str(len(lines)))


Length after initial cleaning: 1031783


## Final brushes

In [None]:
# cleaning the stray bullet-like symbols (most of them together with the space that follows)
bs_symbols = ['► ', '☛ ', '→ ', '⁃ ', '− ', '₂', '● ']
text = '\n'.join(lines)
for bs in bs_symbols:
  text = text.replace(bs, '') # none of the symbols is a regex metacharacter, a literal replace is enough

In [None]:
# removing the emojis
# The members of one big character class. Several entries overlap: 24C2-1F251 and 10000-10FFFF
# together already cover everything from U+24C2 upwards, CJK and other non-emoji scripts included.
emoji_ranges = (
  "\U0001F600-\U0001F64F",   # emoticons
  "\U0001F300-\U0001F5FF",   # symbols & pictographs
  "\U0001F680-\U0001F6FF",   # transport & map symbols
  "\U0001F1E0-\U0001F1FF",   # flags (iOS)
  "\U00002500-\U00002BEF",   # box drawing up to miscellaneous symbols and arrows
  "\U00002702-\U000027B0",   # dingbats
  "\U00002702-\U000027B0",   # (listed twice)
  "\U000024C2-\U0001F251",   # the catch-all range
  "\U0001f926-\U0001f937",
  "\U00010000-\U0010ffff",   # every supplementary-plane character
  "\u2640-\u2642",
  "\u2600-\u2B55",
  "\u200d",                  # zero width joiner
  "\u23cf",
  "\u23e9",
  "\u231a",
  "\ufe0f",                  # variation selector-16
  "\u3030",
)
emoj = re.compile("[" + "".join(emoji_ranges) + "]+", re.UNICODE)

# every run of emoji-class characters is deleted, line by line
lines = [emoj.sub('', l) for l in text.split("\n")]

In [None]:
# removing the duplicates
# dict.fromkeys keeps the first occurrence of every line in its original position, so the saved
# file is identical from run to run. A set would order the lines by their hashes, and string
# hashes are randomised per interpreter session.
lines = list(dict.fromkeys(lines))
print(len(lines))

1031778


## Final save

In [None]:
# line counts before and after the whole cleaning pipeline
print(f'Initial length: {initial_length}')
print(f'Final length: {len(lines)}')

Initial length: 1038023
Final length: 1031778


In [None]:
# one sentence per line; the encoding is pinned so the Cyrillic text is written as UTF-8
# regardless of the platform default
text = '\n'.join(lines)
with open(output_path, 'w', encoding='utf-8') as f:
  f.write(text)