In [None]:
!pip install spacy_udpipe &> /dev/null 
!pip install pymorphy2 &> /dev/null 
!pip install pymorphy2-dicts-uk &> /dev/null 

#Supporting classes


In [None]:
import re

class SpaceHandler(object):
    """
    handles spaces before and after punctuation
    functions:
    - space_stripper - strips extra spaces from text, used in space_oddity
    - space_oddity - adds extra spaces before punctuation for tokenization
    - fried_nails - removes extra spaces before punctuation for anti-tokenization
    """
    def __init__(self):
        self.us = "[А-ЩЬЮЯЄҐІЇЭЫЪа-щьюяєґіїэыъ'0-9a-zA-Z()%‰\"№\+]" # ukrainian word symbol + brackets + quotation marks + percentage sign (ыыы костыль) + russian symbols (ik but it is what it is)
        self.upr = r'[.?!,;:—-]' # ukrainian punctuation
        self.uwr = re.compile(self.us + "+") # Matches a word. We want our model to predict hyphens, thus I remove - from here

    def space_stripper(self, sentence): # to get rid of extra spaces
        sentence = re.sub(r"\s{2,}", ' ', sentence) # double+ spaces
        sentence = re.sub(r"^\s+", '', sentence) # a space in the beginning (if double, then has already been removed)
        sentence = re.sub(r"\s+$", '', sentence) # a space in the end
        sentence = re.sub(r'([0-9])([.?!,;:—-])\s([0-9])', r"\1\2\3", sentence) # spaces in punctuation between numbers
        return sentence

    def space_oddity(self, sentence): # to add spaces in between of punctuation
        sentence = self.space_stripper(sentence) # get rid of extra spaces
        words = re.findall(self.uwr, sentence) # match words
        punctuation = re.split(self.uwr, sentence) # split the remains over words. The punctuation will be both at the beginning and in the end
        i = 0 # the index of considered punctuation
        sentence = "" # dummy for the newly created sentence
        while i < len(punctuation) - 1: # end before the last punctuation
            sentence += ' '.join(list(punctuation[i])) + ' ' +  words[i] + ' ' # the symbols between words now get to be joined by spaces. Likely with several spaces if there were spaces
            i += 1
        sentence += ' '.join(list(punctuation[-1])) # add the last punctuation to account for them not having the word following
        return self.space_stripper(sentence) # strip the remaining spaces just in case

    def fried_nails(self, sentence): # the reversed function: to remove the extra spaces. Not 1-to-1 (or onto?), like the previous function
        sentence = re.sub('\xad', '', sentence)
        words = re.findall(self.uwr, sentence) # retrieve the words as usual
        punctuation = re.split(self.uwr, sentence) # retrieve the rest
        i = 0
        sentence = ""
        while i < len(punctuation) -1:
            sentence += ''.join(re.split(r'\s+', punctuation[i])) + ' ' +  words[i] # now we remove the convenient spaces from punctuation, losing info
            i += 1
        sentence += ''.join(re.split(r'\s+', punctuation[-1]))
        sentence = re.sub(chr(8212), " " + chr(8212) + " ", sentence) # the dash must be separated at all times, no matter what
        sentence = re.sub(r'\s*-\s*', "-", sentence) # the hyphen is considered to cling always
        quote_split = re.split(r'\s*"\s*', sentence) # now, we deal with quotation marks
        sentence = ""
        for i in range(len(quote_split)//2):
            sentence += quote_split[2*i] + ' "' + quote_split[2*i+1] + '" ' # The odd numbered mark is the left one, the even numbered is the right one.
        if len(quote_split) % 2:
            sentence += quote_split[-1]
        # else: # if the number of marks is odd
            # print("Лапки порахуй, мудило") # A suggestion to the user: "Sorry, the program would work incorrectly if you do not fix the quotation marks yourself"
        sentence = re.sub(r"\s([.,;:?!])", r"\1", self.space_stripper(sentence)) # The rest of the punctuation gets clinged
        sentence = re.sub(r"\(\s+", '(', sentence) # fix the left brackets avoiding the "(" case (three punctuation marks in a row)
        sentence = re.sub(re.compile(f"({self.us})(\()"), r'\1 \2', sentence) # uncling the left bracket from a word
        sentence = re.sub("\s+\)", ')', sentence) # in the same way
        sentence = re.sub('–', ' –', sentence) # put space before the dash
        sentence = re.sub(re.compile(f"(\))({self.us})"), r'\1 \2', sentence) # uncling the right bracket from a word
        sentence = re.sub(r'\s*-\s*', "-", sentence) # the hyphen is considered to cling always
        sentence = re.sub(r"\s*’\s*", "’", sentence) # same for apostrophes
        sentence = re.sub(r"’\s*", "’", sentence) # same for apostrophes
        sentence = re.sub(r'"\s\(', '"(', sentence) # removing the space from " (
        sentence = re.sub(r'\)\s"', ')"', sentence) # removing the space from ) "
        sentence = re.sub(r'([0-9])([.?!,;:—-])\s([0-9])', r"\1\2\3", sentence) # spaces in punctuation between numbers
        return sentence

In [None]:
from collections import defaultdict
# class used to extract data about word (in context, such as part of language, gender, etc), and inflect the word
class FormExtractorInflector():
    matchings_part = {"VERB":"VERB", "PRON":"NPRO", "DET":"NPRO","ADJ":"ADJF", "NUM":"NUMR", "NOUN":"NOUN"} #match POS for inflector to be readable

    #cases from movainstitute to inflector
    cases = {"Nom":"N", "Gen": "R", "Dat":"D", "Acc":"Z", "Ins":"O", "Loc":"M", "Voc":"K"}

    v_forms = {"Inf" : ["Inf"], "Pr1": ["Pres", "1"], "Pr2" : ["Pres", "2"],
                    "Pr3" : ["Pres", "3"], "Fu1" : ["Fut", "1"], "Fu2" : ["Fut", "2"],
                    "Fu3" : ["Fut", "3"], "PsMs" : ["Past", "Masc"], "PsFe" : ["Past", "Fem"],
                    "PsNe" : ["Past", "Neut"]}
    genders = {"Masc":"masc", "Fem":"femn", "Neut":"neut"}
    quanitys = {"Plur":"plur", "Sing":"sing"}
    aspects = {"Perf":"perf", "Imp":None}

    v_forms_rev = {" ".join(v): k for k, v in v_forms.items()}
    def __init__(self, inflector):
        self.infl=inflector

    # helper function, just to save code
    def c(self, a):
        if a is None: return None
        if a in self.genders: return self.genders[a]
        if a in self.quanitys: return self.quanitys[a]
        if a in self.cases: return self.cases[a]
        if a in self.v_forms_rev: return self.v_forms_rev[a]
        if a in self.aspects: return self.aspects[a]
        return a

    # get data about word using Mova Institute
    # sentence_anal is a sentence after Mova Institute (array of words with info)
    # this function extracts main info about word at index - index
    def extract_data(self, sentence_anal, index):
        # not really needed, but changes from [[00,01,02], [10,11,12]] to [[00, 10], [01, 11], [02, 12]]
        anal = list(zip(*sentence_anal))

        # word itself
        source = anal[0][index]
        # which part of language it is
        part = anal[2][index]
        # matching_part is a dictionary just to change part naming from Mova Institute to inflector
        if part in self.matchings_part:
            part = self.matchings_part[part]
        #descr contains all other info (like gender, plural, etc.)
        descr = anal[4][index]
        descr = descr.split("|")
        
        case = None
        gender = None
        quantity = None
        verbform = None
        tense = None
        person = None
        aspect = None
        mood = None

        for i in descr:
            ir = i.split("=")
            if ir[0]=="Case":
                case = ir[1]
            elif ir[0]=="Gender":
                gender = ir[1]
            elif ir[0]=="Number":
                quantity = ir[1]
            elif ir[0]=="VerbForm":
                verbform = ir[1]
            elif ir[0]=="Tense":
                tense = ir[1]
            elif ir[0]=="Person":
                person = ir[1]
            elif ir[0]=="Aspect":
                aspect = ir[1]
            elif ir[0]=="Mood":
                mood = ir[1]
        
        #return info about word
        # case/Inf/etc, part of language, gender, quantity, aspect, word itself
        if case is not None:
            return self.c(case), part, self.c(gender), self.c(quantity), self.c(aspect), source
        if verbform == "Inf":
            return "Inf", part, self.c(gender), self.c(quantity), self.c(aspect), source
        if tense is not None:
            if person is None and gender is not None:
                return self.c(" ".join([tense, gender])), part, None, None, None, source#c(aspect)
            else:
                if person is None: person ="1"
                return self.c(" ".join([tense, person])), part, self.c(gender), self.c(quantity), self.c(aspect), source
        if mood=="Imp" and verbform=="Fin":
            if person is None: person="1"
            return "Imp"+person,part,self.c(gender),self.c(quantity),self.c(aspect)
        return None, part, None, None, None, source
    
    # inflects word if it is possible. If not (bcs it is verb, etc.) then returns the original word
    def inflect_word_ignore_part(self, word, case, part, gender=None, quantity=None, perf=None):
        try:
            return self.infl.inflect_word(word, case, part, gender=gender, quantity=quantity), True
        except Exception as e:
            print(e)
            for i in self.matchings_part.keys():
                try:
                    return self.infl.inflect_word(word, case, self.matchings_part[i], gender=gender, quantity=quantity), True
                except Exception as e:
                    print(e)
                    pass
        return word, False
    # checks if ending of words is the same (to avoid ся vs сь, etc.)
    def word_inflection_goodness_checker(self, word1, word2):
        score=0
        if word1[-1]==word2[-1]:
            score+=100
        if word1[-1]==word2[-1] and word1[-2]==word2[-2]:
            score+=50
        return score

    # gets data about word at index index in sentence, then puts "word" in the same form (case, gender, etc.) 
    def inflect_as_in_sentence(self, sentence_anal, index, word):
        # extract data
        case, part, gender, quantity, aspect, source = self.extract_data(sentence_anal, index)
        # return original if X part
        if part == "ADV" or part=="PUNCT" or part == "ADP":
            return word, True
        try:
            # case 1, "case" of the word is Z
            if case=="Z":
                # to fix some situations with inflector, it tries Z and N "cases", then selects best one, based on ending similarity
                inflection1 = self.inflect_word_ignore_part(word, case, part, gender=gender, quantity=quantity, perf=aspect)
                score1 = self.word_inflection_goodness_checker(inflection1[0], source)
                inflection2 = self.inflect_word_ignore_part(word, "N", part, gender=gender, quantity=quantity, perf=aspect)
                score2 = self.word_inflection_goodness_checker(inflection2[0], source)
                if score1>=score2:
                    return inflection1
                else:
                    return inflection2
            # case 2, "part" of the word is verb
            if part=="VERB":
                inflection = self.inflect_word_ignore_part(word, case, part, gender=gender, quantity=quantity, perf=aspect)
                
                # check for ending, ся vs сь
                score1 = self.word_inflection_goodness_checker(inflection[0][:-1]+"я", source)
                score2 = self.word_inflection_goodness_checker(inflection[0], source)
                if score1>=score2:
                    inflection=(inflection[0][:-1]+"я", inflection[1])
                return inflection
            # simplest case, covers most situations
            return self.inflect_word_ignore_part(word, case, part, gender=gender, quantity=quantity, perf=aspect)
        except Exception as e:
            print(e)
            #print("return original word in inflect_as_in_sentence")
            return word, False
        pass

In [None]:
class SurzhiksGenerator(FormExtractorInflector):

        
    def __init__(self, inflector, spacy_udpipe_model, pairs_path='/content/drive/MyDrive/ukramarly-main/errorifier/errorifier-data/lexical/surzhiks.txt'):
        self.SPACY_UDPIPE_MODEL=spacy_udpipe_model
        super().__init__(inflector=inflector)
        def pairs_analyzer(pairs):
            res=[]
            for i in range(len(pairs)):
                res.append([self.analyze_sentence(pairs[i][0]), self.analyze_sentence(pairs[i][1])])
            return res
        def bye_parentheses(mystring):
            start = mystring.find("(")
            end = mystring.find(")")
            if start != -1 and end != -1:
                return mystring[start+1:end]
            else:
                return mystring
        def clear_spaces(sentence):
            if sentence[0]==" ":
                sentence = sentence[1:]
            if sentence[-1]==" ":
                sentence = sentence[:-1]
            return sentence


        """
        формат суржиків:
        в списку:
            не правильно - правильно
        в коді:
            suzhiks = [
                [правильно, не правильно],
                [правильно, не правильно],
                ...
            ]

        """
        common_errors_path = pairs_path
        surzhiks2 = []
        with open(common_errors_path, encoding = 'utf-8', mode = 'r') as file:
            for line in file:
                    if len(line)==0:
                      continue
                    if line[0]=="#":
                      continue
                    if line[:2]=="\n":
                      continue
                    words = line.split("; ")
                    # MODIFIED THE FOLLOWING TWO LINES
                    left = clear_spaces(bye_parentheses(words[1]))
                    right = clear_spaces(bye_parentheses(words[0].replace("\n","")))
                    left_arr=left.split(" ")
                    rights=right.split(",")
                    right_arr=[]
                    for i in rights:
                        right_arr=i.split(" ")
                        if len(left_arr)!=len(right_arr):
                            right_arr=[]
                    if len(right_arr)==0 or len(right_arr)==1: continue

                    # for i in range(len(right_arr)):
                    #     surzhiks2.append([right_arr[i], left_arr[i]])

                    surzhiks2.append([" ".join(right_arr),left])
                    #words[0] - правильна фраза, words[1] - неправильна фраза

            file.close()
        self.surzhiks2 = surzhiks2
        self.pairs_anal=pairs_analyzer(surzhiks2)
        pass

    def analyze_sentence(self, text: str):
        i = 0
        splitted_by_words = []
        for token in self.SPACY_UDPIPE_MODEL(text):
            splitted_by_words.append ([
                token.text, #оригінал
                token.lemma_, # лема
                token.pos_, # частина мови
                token.dep_, # синтаксична роль
                str(token.morph),# морфологічний опис слова 
                0 if token.i == (token.head.i) else token.head.i #індекс голови
            ])
            i += 1
        return splitted_by_words

    def smart_n_grams_creator(self,anal):
        smart_n_grams = []
        for i in range(len(anal)):
            anal[i].append(i)
        for n in range(min(len(anal),4)):
            for i in range(len(anal)):
                n_gram = {}
                n_gram[i] = anal[i][5]
                last = i
                for j in range(len(anal)):
                    if j == n-1:
                        break
                    ind = n_gram[last]
                    n_gram[ind] = anal[ind][5]
                    last = ind
                    if n_gram[ind] == ind:
                        break
                n_gram_words = []
                for j in n_gram.keys():
                    n_gram_words.append(anal[j])
                smart_n_grams.append(n_gram_words)
        return smart_n_grams
    def arePermutations(self, arr1, arr2):
        if (len(arr1) != len(arr2)):
            return False
        hM = defaultdict (int)
        for i in range (len(arr1)):
            x = arr1[i]
            hM[x] += 1
        for i in range (len(arr2)):
            x = arr2[i]
            if x not in hM or hM[x] == 0:
                return False
            hM[x] -= 1
        return True
    def smart_n_grams_clearer(self,smart_n_grams_array):
        output = []
        for i in range(len(smart_n_grams_array)):
            for j in range(len(smart_n_grams_array)):
                if i==j:
                    continue
                anal1 = list(zip(*smart_n_grams_array[i]))
                anal2 = list(zip(*smart_n_grams_array[j]))
                if self.arePermutations(anal1[0], anal2[0]):
                    smart_n_grams_array[i] = "TAGGG"
        
        for i in smart_n_grams_array:
            if i!="TAGGG":
                output.append(i)
        return output
    def equality_checker(self, pair1_anal, pair2_anal, indexes=None):
        anal1 = list(zip(*pair1_anal))
        norm1=anal1[1]
        anal2 = list(zip(*pair2_anal))
        norm2=anal2[1]
        index_relation = {}
        fl = 0
        tf1=0
        tf2=0
        for i in range(len(norm1)):
            if anal1[2][i]!="PUNCT":
                tf1+=1
        for i in range(len(norm2)):
            if anal2[2][i]!="PUNCT":
                tf2+=1
        if tf1!=tf2:
            return False, index_relation
        for i in range(len(norm1)):
            for j in range(len(norm2)):
                if norm1[i]==norm2[j]:
                    if indexes is not None:
                        index_relation[j] = indexes[i]
                    else:
                        index_relation[j] = i
                    fl+=1
                    break
                elif anal1[2][i]=="PUNCT" or anal2[2][j]=="PUNCT":
                    continue
        return fl==tf1, index_relation
    def prepare_subsitution(self,sentence_anal, pair_anals, indexes, keep_if_same=True):
        anal = list(zip(*sentence_anal))
        pair0=list(zip(*(pair_anals[0])))
        pair=list(zip(*(pair_anals[1])))
        pair_inflected = []
        inflected_successfuly = True
        for i in range(min(len(pair[0]), len(pair0[0]))):
            success = False
            if pair0[0][i] == anal[0][indexes[i]] and keep_if_same:
                infl_pair, success = pair[0][i], True
            else:
                infl_pair, success = self.inflect_as_in_sentence(sentence_anal, indexes[i], pair[0][i])
            if not success:
                inflected_successfuly = False
            pair_inflected.append(infl_pair)
        return pair_inflected, inflected_successfuly
    def get_indexes_for_substitution(self,n_grams, pair_anal):
        for i in range(len(n_grams)):
            indexes = {ind : n_grams[i][ind][6] for ind in range(len(n_grams[i]))}
            equality, connection = self.equality_checker(n_grams[i], pair_anal[0], indexes=indexes)
            if equality:
                return True, connection, n_grams[i]
        return False, None, None
    def substitude(self,anal, smart_n_grams, pair_anal,keep_if_same=True):
        sentence_arr=list(list(zip(*anal))[0])
        found, indexes, _ = self.get_indexes_for_substitution(smart_n_grams, pair_anal)
        changed = True
        if not found:
            tokens = ["$KEEP" for i in range(len(sentence_arr))]
        else:
            inflected_pair, inflected_successfuly = self.prepare_subsitution(anal, pair_anal, indexes, keep_if_same=keep_if_same)
            changed = inflected_successfuly
            tokens = ["$KEEP" for i in range(len(sentence_arr))]
            for i in indexes.keys():
                sentence_arr[indexes[i]] = inflected_pair[i]
                tokens[indexes[i]] = "$ANTISURZHIFY"
        return sentence_arr, tokens, changed
    def sentence_adj_changer2(self, sentence):
        anal = self.analyze_sentence(sentence)
        ress = list(zip(*anal))
        result = list(ress[0])
        for i in range(len(ress[0])):
            if ress[2][i]=="NOUN":
                for j in range(len(ress[0])):
                    if ress[2][j]=="ADJ" and ress[5][j]==i:
                        result[j]=self.inflate_word_as_regardless(ress[0][i],ress[0][j])
        return " ".join(result)
    def smart_n_grams_token_clearer(self,n_grams):
        res = []
        for i in n_grams:
            fl=True
            for desc in i:
                if desc[3]!="$ANTISURZHIFY":
                    fl=False
            if fl:
                res.append(i)
        return res
    def antisurzhifier(self, sentence, tokens=None):
        anal = self.analyze_sentence(sentence)
        if tokens is not None:
          for i in range(len(anal)):
              anal[i][3]=tokens[i]
        if tokens is None:
          smart_n_grams = (self.smart_n_grams_clearer(self.smart_n_grams_creator(anal)))
        else:
          smart_n_grams = self.smart_n_grams_token_clearer(self.smart_n_grams_clearer(self.smart_n_grams_creator(anal)))
        pair_anal = self.pairs_anal[0]
        res=self.substitude(anal, smart_n_grams, pair_anal[::-1], keep_if_same=True)
        sentence_res = res[0]
        for pair_anal in self.pairs_anal:
            if sentence_res!=list(list(zip(*anal))[0]):
                anal = self.analyze_sentence(" ".join(sentence_res))
            if tokens is None:
              smart_n_grams = (self.smart_n_grams_clearer(self.smart_n_grams_creator(anal)))
            else:
              for i in range(len(anal)):
                  anal[i][3]=tokens[i]
              smart_n_grams = self.smart_n_grams_token_clearer(self.smart_n_grams_clearer(self.smart_n_grams_creator(anal)))
            res=self.substitude(anal, smart_n_grams, pair_anal[::-1], keep_if_same=True)
            sentence_res=res[0]
        result=""
        for i in sentence_res:
            puncts=[',','.',':',";",'-']
            if i in puncts:
                result+=i
            else:
                result+=" "+i
        return result[1:]

In [None]:
import spacy_udpipe
import pymorphy2

import sys
sys.path.append("/content/drive/MyDrive/STABLE/helpers")

# import inflector
from Inflector import Inflector

morph = pymorphy2.MorphAnalyzer(lang='uk')
inflector = Inflector(morph)


model_path = '/content/drive/MyDrive/model-directories/mova_institute.udpipe'
SPACY_UDPIPE_MODEL = spacy_udpipe.load_from_path(
lang="uk",
path=model_path
)

# Errorify dataset

In [6]:
# init
surzhik_generator = SurzhiksGenerator(inflector, SPACY_UDPIPE_MODEL)

In [None]:
surzhik_generator.antisurzhifier("я вимкнув світло")

In [7]:
# clean data FILE to be errorified
input_file = "/content/drive/MyDrive/artem-yushko/data-artem/cleaned/borshch4.txt"

# output FOLDER for the errorified and tagged data, future model input
out_folder = "/content/drive/MyDrive/UNLP/assist-data/surzhik-generated-1k"

In [8]:
import os
# creating the output folder
if not os.path.exists(out_folder):
  os.mkdir(out_folder)

# reading the file
with open(input_file, 'r') as f:
  text = f.read()
  lines = text.split('\n')

dataset = lines

In [9]:
space_handler = SpaceHandler()

In [1]:
# extract ukr key phrases which to look for
ukr_key_phrases = [word[1][:-1] for word in surzhik_generator.surzhiks2]
# convert it to pairs of keywords
ukr_key_words = []
for key_phrase in ukr_key_phrases:
  words = key_phrase.split(" ")
  ukr_key_words.append(words)

# extract ids of relevant sentences (those that contain all key words for given keyword pair)
relevant_id = {}
for key_phrase in ukr_key_phrases:
  relevant_id[key_phrase] = []
for sent_id in range(len(dataset)):
  sentence = space_handler.space_oddity(dataset[sent_id])
  for key_phrase_id in range(len(ukr_key_phrases)):
    phrase_present = True
    for word in ukr_key_words[key_phrase_id]:
      # if any word is not present, phrase_present = false
      el = " " + word + " "
      phrase_present = phrase_present and (el in sentence)
    if phrase_present:
      relevant_id[ukr_key_phrases[key_phrase_id]].append(sent_id)

# extract # of  sentences for each category 
for key_phrase in ukr_key_phrases:
  print(key_phrase + ': ' + str(len(relevant_id[key_phrase])))

NameError: ignored

In [11]:
# extract 20 sentences for each category
idxs_to_surzhify = []
for key_phrase in ukr_key_phrases:
  idxs_to_surzhify += relevant_id[key_phrase][:20]

sentences_to_surzhify = [dataset[id] for id in idxs_to_surzhify]
len(sentences_to_surzhify)

1098

In [None]:
# now only errorify relecant sengences
import time
s = time.time()
out_sentences =[]
for sent in sentences_to_surzhify:
  try:
    out_sentence = surzhik_generator.antisurzhifier(sent)
    out_sentence = space_handler.fried_nails(out_sentence)
  except:
    out_sentence = sent
  out_sentences.append(out_sentence)
print(time.time() - s)

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range


In [None]:
text = '\n'.join(out_sentences)
with open(out_folder + "/source.txt", 'w') as f:
  f.writelines(text)

text = '\n'.join(sentences_to_surzhify)
with open(out_folder + "/target.txt", 'w') as f:
  f.write(text)