
<center>
        <img src="assets/img/dbconfig.png" width="10%" height="10%" alt="imports and Excel reading and configuration">
        <h3>Imports, Excel reading and configuration:</h3>
</center>
<hr>

In [3]:
import pandas as pd
import numpy as np
import collections
import codecs, difflib, Levenshtein
import re # regex

'''
Pandas Definitions
'''
# Load the transcription workbook into the module-level dataframe; the first spreadsheet row is skipped.
# NOTE: main() binds this dataframe as its default `df` argument and modifies it in place.
df = pd.read_excel (r'../../docs/220819 - ALL data merged from natalia for shahar.xlsx', skiprows = 1) #(use "r" before the path string to address special character, such as '\')
# show up to 500 rows / 50 columns when a dataframe is displayed
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

'''
DEFINITIONS
'''
# language identifiers used as keys and as the `lang` argument throughout
RUSSIAN = "russian"
HEBREW = "hebrew"
# languages and their column range in the Excel file, given as [first, last] Excel column letters
# (main() converts them with xlsColIndex and treats the range as inclusive)
LANGUAGES = {HEBREW: ["EL", "GO"], RUSSIAN: ["CH", "EK"]}
# vowels and consonants used as default regex character classes by hasVCV / hasCVC / hasVC / hasCV
VOWELS = "aeiou"
CONSONANTS = "qwrtyplkjhgfdszxcvbnm"
# symbols to be replaced (applied in this order, before the removals below)
REPLACE_SYMBOLS =  {'ç': 'c', '"': "'", "-": " "}
# symbols to be removed; the Hebrew set additionally removes the apostrophe.
# NOTE(review): the original note here said not to add "/" because N/A must be removable,
# yet "/" is part of this string - confirm which one is intended.
REMOVE_SYMBOLS_RUS = "_*.,`?()/"
REMOVE_SYMBOLS_HEB = REMOVE_SYMBOLS_RUS + "'"
# check for transcribers' typos in words? (read by typosStats)
CHECK_FOR_TYPOS = False
# word similarity threshold (default `threshold` of main)
SIMILARITY_THRESHOLD = 0.7
# inspection list of possible errors and inaccuracies
# (loop_freq appends the cells it skips because they contain digits and no "CS")
INSPECT = list()
# do advanced replacements? (like v-f swaps and CVC mil'el vowel swaps; read by allCombChecks)
ADVANCED_LANG_REPLACE = False

'''
HEBREW
'''
# allowed difference combinations of words (Hebrew)
ALLOWED_DIFF_HEB = ["chx", "hx", "kq", "iy", "fv"]
# allowed letters in word difference (Hebrew): every letter of the combinations above plus "a", "e", "i"
ALLOWED_LETTERS_HEB = set("".join(ALLOWED_DIFF_HEB) + "aei")

# Letters and their possible replacements (Hebrew)
# Two separate tables; shouldMerge_heb passes each one to isEdgeLettersSame in turn.
# NOTE: make sure all the letters in LETTER_REPLACEMENTS_HEB are also in ALLOWED_DIFF_HEB list or appended to ALLOWED_LETTERS_HEB!
LETTER_REPLACEMENTS_HEB = [{"h": ["i", "y"], "i": ["y", "h"], "y": ["i", "h"], "f": ["v"], "v": ["f"]},
                       {"k": ["q"], "ch": ["h", "x"], "h": ["x", "ch"], "x": ["h", "ch"]}]

'''
Russian
'''
# prohibited letters in word difference (Russian); "r" is listed twice, which is harmless in a set
PROHIBITED_LETTERS_RUS =  set("rpfglmnbvqwrtsxd" + "'" + VOWELS)
# letters find_sim accepts in the difference when it reports unmerged ("bad") Russian pairs
ALLOWED_LETTERS_RUS = set("jyh")

# Letters and their possible replacements (Russian)
# NOTE: make sure all the letters in LETTER_REPLACEMENTS_RUS are also in ALLOWED_LETTERS_RUS!
# NOTE(review): the original note also mentioned an ALLOWED_DIFF_RUS list, which is not defined in the visible code.
LETTER_REPLACEMENTS_RUS = {"j": ["y", "i"], "y": ["i"]} # "y" for plural; "j": e.g. esli - jesli - yesli

'''
BOTH
'''
# value of main's `interest` argument that reports both merged and ignored pairs
BOTH = "both"
# transcriptions: Latin transcription -> Hebrew letter tables
# NOTE(review): neither table is referenced elsewhere in the visible code.
heb_rep_phase_1 = {"c'":"צ", "b": "ב", "g": "ג", "d": "ד", "sh": "ש", "v": "ב/ו", "h": "ה", "z": "ז", "x": "ח", "t": "ת", "'": "ע", "y": "י", "k": "כ", "l": "ל", "m": "מ", "n": "נ", "s": "ס", "p": "פ", "q": "ק", "r": "ר"}
heb_rep_phase_2 = {"a": "א", "e": "א", "i": "א", "o": "א", "u": "א"}

<center>
        <img src="assets/img/freq.png" width="20%" height="20%" alt="Find words frequencies">
        <h3>Find word frequencies:</h3>
</center>
<hr>

In [1]:
def repl_symbols(string, lang):
    """
    repl_symbols: normalise a cell's text.

    First every key of REPLACE_SYMBOLS is replaced by its value, then every
    character of the language's removal set (REMOVE_SYMBOLS_HEB or
    REMOVE_SYMBOLS_RUS) is dropped.

    @param string the text to clean
    @param lang   HEBREW or RUSSIAN
    @return the cleaned string
    """
    if lang == RUSSIAN:
        unwanted = REMOVE_SYMBOLS_RUS
    elif lang == HEBREW:
        unwanted = REMOVE_SYMBOLS_HEB

    # replacements run first: e.g. a double quote becomes an apostrophe,
    # which the Hebrew removal set then strips
    for old, new in REPLACE_SYMBOLS.items():
        string = string.replace(old, new)

    return "".join(char for char in string if char not in unwanted)

def hasDigits(string):
    """
    hasDigits: tell whether at least one character of the string is a digit.
               Note: only digits are tested here; the "@" mark is handled by loop_freq.
    """
    for char in string:
        if char.isdigit():
            return True
    return False

def loop_freq(col, lang):
    """
    loop_freq: count how often each word occurs in a column.

    Cells that are NaN or empty are skipped. Cells containing a digit (and no
    "CS") are appended to INSPECT and skipped. The remaining cells are cleaned
    with repl_symbols and split on single spaces.

    @param col  iterable of cell values
    @param lang HEBREW or RUSSIAN (forwarded to repl_symbols)
    @return OrderedDict word -> frequency, sorted by frequency DESC
    """
    counts = {}

    for cell in col:
        text = str(cell)

        # NaN cells (empty cells) and empty strings carry no words
        looks_nan = isinstance(cell, (bool, float)) and text.lower() == "nan"
        if looks_nan or text == "":
            continue

        # cells with numbers are set aside for manual inspection
        if hasDigits(text) and "CS" not in text:
            INSPECT.append(text)
            continue

        for token in repl_symbols(text, lang).split(' '):
            # ignore short words, marks and child's mistakes (usually capitalized)
            is_marked = "@" in token or "xx" in token or "XX" in token
            if len(token) < 3 or is_marked or token != token.lower():
                continue
            counts[token] = counts.get(token, 0) + 1

    # keys that must never be reported
    for unwanted in ('CORRECT', 'n/a', 'N/A', 'N/a', 'n/A'):
        counts.pop(unwanted, None)

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return collections.OrderedDict(ranked)

<center>
    <img src="assets/img/rus-il.png" width="13%" height="13%" alt="Shared functions for both languages">
    <h3>Functions for both languages:</h3>
</center>
<hr>

In [5]:
def hasVCV(string, vowel = VOWELS, consonant = CONSONANTS):
    """
    hasVCV: checks if a string contains a vowel-consonant-vowel sequence, e.g. imAHOt.
            `vowel` and `consonant` are used as regex character classes.
    """
    pattern = "[{v}][{c}][{v}]".format(v = vowel, c = consonant)
    return re.search(pattern, string) is not None


def hasCVC(string, vowel = VOWELS, consonant = CONSONANTS, milra = True):
    """
    hasCVC: checks if a string contains a consonant-vowel-consonant sequence, e.g. xAMOr.
            `vowel` and `consonant` are used as regex character classes.

            @param milra when True (default) a match anywhere counts. When False and the
                         string is longer than 4 characters, a match must also exist once
                         the last 2 characters are cut off.
    """
    pattern = "[{c}][{v}][{c}]".format(c = consonant, v = vowel)
    found = re.search(pattern, string) is not None

    # a word of up to 4 characters is too short to exclude a word-final match
    if not milra and len(string) > 4:
        # Only 2 characters are cut (not 3) so that an earlier CVC is not lost:
        # "xamor" minus 2 is "xam" (still CVC), whereas minus 3 would be "xa".
        return found and hasCVC(string[:-2], vowel, consonant)

    return found

def hasVC(string, vowel = VOWELS, consonant = CONSONANTS):
    """
    hasVC: checks if a string contains a vowel immediately followed by a consonant, e.g. cAT.
           `vowel` and `consonant` are used as regex character classes.
    """
    pattern = "[{v}][{c}]".format(v = vowel, c = consonant)
    return re.search(pattern, string) is not None


def hasCV(string, vowel = VOWELS, consonant = CONSONANTS):
    """
    hasCV: checks if a string contains a consonant immediately followed by a vowel, e.g. BAg.
           `vowel` and `consonant` are used as regex character classes.
    """
    pattern = "[{c}][{v}]".format(c = consonant, v = vowel)
    return re.search(pattern, string) is not None


def isSamePair(s1, s2, t1, t2):
    """
    isSamePair: check if the pair (s1, s2) equals the pair (t1, t2) in either order.
                Prints the pair followed by "same!" when it does.
    """
    straight = s1 == t1 and s2 == t2
    crossed = s1 == t2 and s2 == t1
    if not (straight or crossed):
        return False

    print(s1, s2, "same!")
    return True

def doesPairInclude(s1, s2, t1, t2):
    """
    doesPairInclude: check if s1 and s2 are contained in t1 and t2,
                     one in each, in either order.
    """
    straight = s1 in t1 and s2 in t2
    crossed = s1 in t2 and s2 in t1
    return straight or crossed
    

def hasOnly(string, letters):
    """
    hasOnly: checks if every item of `string` is contained in `letters`
             (an empty `string` therefore passes).
    """
    return all(char in letters for char in string)

def isEqualOnRemove(sub1, sub2, str1, str2):
    """
    isEqualOnRemove: takes two substrings and two strings. If one string contains sub1 and
                     the other contains sub2 (in either order), remove both substrings from
                     both strings and report whether the remainders are equal.
                     Otherwise return False.
    """
    straight = sub1 in str1 and sub2 in str2
    crossed = sub1 in str2 and sub2 in str1
    if not (straight or crossed):
        return False

    def stripped(text):
        # sub1 is removed first, then sub2 from what is left
        return text.replace(sub1, "").replace(sub2, "")

    return stripped(str1) == stripped(str2)

def removeSharedLetters(x, y):
    """
    removeSharedLetters: takes two strings and returns the sorted list of letters (lower-cased)
                         that one string has more occurrences of than the other.
                         Note: each letter is listed once, so the result of
                         removeSharedLetters("abb", "a") is ['b'] and not ['b', 'b']
    """
    letters_x = collections.Counter(x.lower())
    letters_y = collections.Counter(y.lower())

    # Counter subtraction keeps only positive counts; list() keeps just the keys
    surplus_in_x = list(letters_x - letters_y)
    surplus_in_y = list(letters_y - letters_x)

    return sorted(surplus_in_y + surplus_in_x)

def _edgeTrimLengths(s1, s2, exceptions, atStart):
    """
    _edgeTrimLengths: find the first allowed edge substitution between s1 and s2.

    @param exceptions dict: edge substring -> list of substrings it may be swapped with
    @param atStart    True to look at the beginnings, False to look at the ends
    @return (n1, n2), the number of characters to strip from that edge of s1 and of s2,
            or None when no substitution applies
    """
    def atEdge(word, part):
        return word.startswith(part) if atStart else word.endswith(part)

    for orig, reps in exceptions.items():
        s1HasOrig = atEdge(s1, orig)
        if not (s1HasOrig or atEdge(s2, orig)):
            continue

        # the word that does not carry `orig` must carry one of its replacements
        other = s2 if s1HasOrig else s1
        for rep in reps:
            if atEdge(other, rep):
                if s1HasOrig:
                    return len(orig), len(rep)
                return len(rep), len(orig)

    return None


def isEdgeLettersSame(s1, s2, exceptions, applyTo = "both"):
    """
    isEdgeLettersSame: checks if the edge letters of two words are the same, or differ only
                       by a substitution listed in `exceptions`.

    @param s1, s2     the words to compare
    @param exceptions dict: edge substring -> list of substrings it may be swapped with
    @param applyTo    which edges to compare; accepts: "start", "end", "both"
    @return tuple (ok, s1_body, s2_body). `ok` is True only if every compared edge is
            acceptable. The bodies are s1 and s2 with a substituted edge stripped off;
            an edge whose letters are simply equal is left in place.
            Empty input returns (False, s1, s2).
    """
    # nothing to compare on an empty word
    if not s1 or not s2:
        return False, s1, s2

    startOk = True
    endOk = True
    s1_body = s1
    s2_body = s2

    # if the first letters differ, an allowed substitution is required
    if applyTo in ("start", "both") and s1[0] != s2[0]:
        # Only the first matching substitution is used, so the edge is trimmed exactly once
        # even when the table lists the swap in both directions (e.g. "ch" -> "x" and "x" -> "ch").
        trim = _edgeTrimLengths(s1, s2, exceptions, atStart = True)
        startOk = trim is not None
        if startOk:
            s1_body = s1_body[trim[0]:]
            s2_body = s2_body[trim[1]:]

    # same rule for the last letters
    if applyTo in ("end", "both") and s1[-1] != s2[-1]:
        trim = _edgeTrimLengths(s1, s2, exceptions, atStart = False)
        endOk = trim is not None
        if endOk:
            s1_body = s1_body[:max(len(s1_body) - trim[0], 0)]
            s2_body = s2_body[:max(len(s2_body) - trim[1], 0)]

    # Open question kept from the original (@Rony, 24-11-19 17:37): when the edges are
    # identical, should one char be removed at the beginning and at the end of the bodies?
    # Currently they are left untouched.

    # return true only if start and end are okay
    return (startOk and endOk), s1_body, s2_body


def restDiffOk(diff, remove, s1, s2, mixes):
    """
    restDiffOk: checks the diff letters that remain after dropping `remove`.
                Returns True as soon as one remaining letter belongs to a mix whose other
                letters are all present in the word that lacks that letter (s2 is used
                when s1 contains the letter). Returns False otherwise, including when
                nothing remains.

                @param diff   the differing letters
                @param remove the items to be removed from diff
                @param s1     string1
                @param s2     string2
                @param mixes  iterable of letter combinations, e.g. ALLOWED_DIFF_HEB
    """
    leftovers = set(diff) - set(remove)

    for letter in leftovers:
        lacking = s2 if letter in s1 else s1

        for mix in mixes:
            if letter not in mix:
                continue
            partners = set(mix) - set(letter)
            # a mix with no other letters can never confirm the difference
            if partners and all(partner in lacking for partner in partners):
                return True

    return False

<center>
    <img src="assets/img/israel.png?v=1" width="8%" height="8%" alt="Functions for Hebrew">
    <h3>Functions for Hebrew:</h3>
</center>
<hr>

In [6]:
def allCombChecks(s1, s2, body1, body2):
    """
    allCombChecks: check all specific (Hebrew) transcription combinations.

    @param s1, s2       the full words
    @param body1, body2 the words with substituted edges stripped (see isEdgeLettersSame);
                        the letter difference is computed from these
    @return True if any of the combinations below explains the difference, else False
    """
    diff = removeSharedLetters(body1, body2)

    # e.g. rayinu - rainu ראינו
    if isEqualOnRemove("yi", "i", s1, s2) and "y" in diff:
        return True

    # e.g. beyit - beyt בית
    if isEqualOnRemove("yi", "y", s1, s2) and "i" in diff:
        return True

    # e.g. yihiye - hihiye
    if isEqualOnRemove("hi", "yi", s1, s2):
        return True

    # e.g. yihiye yihye diff
    if isEqualOnRemove("iy", "y", s1, s2) and "i" in diff:
        return True

    # e.g. matay - matai | layla, laila | eyx, eix
    if "i" in diff and "y" in diff and restDiffOk(diff, "iy", body1, body2, ALLOWED_DIFF_HEB):
        s1_VY = hasVC(body1, consonant = "y")
        s1_VI = hasVC(body1, consonant = "i")
        s2_VY = hasVC(body2, consonant = "y")
        s2_VI = hasVC(body2, consonant = "i")
        if (s1_VY and s2_VI) or (s2_VY and s1_VI):
            return True

    # do advanced replacements?
    if ADVANCED_LANG_REPLACE:
        # e.g. savta - safta | lisxov - lisxof
        # NOTE(review): "iy" is passed as the letters to drop although this check is about
        # f/v; it is left unchanged because restDiffOk returns False when nothing remains.
        if "f" in diff and "v" in diff and restDiffOk(diff, "iy", body1, body2, ALLOWED_DIFF_HEB):
            s1_Vv = hasVC(body1, consonant = "v")
            s1_Vf = hasVC(body1, consonant = "f")
            s2_Vv = hasVC(body2, consonant = "v")
            s2_Vf = hasVC(body2, consonant = "f")
            if (s1_Vv and s2_Vf) or (s2_Vv and s1_Vf):
                return True

        # CVC mil'el vowel swaps: if consonant-vowel-consonant and mil'el, e.g. mEciq - mAciq. (Note: only A-E and E-I replacements!)
        # problem with nouns: milon - melon @Rony
        if max(len(s1), len(s2)) <= 4 or s1[-3:] == s2[-3:]: # check if last 3 chars are the same
            # The swapped vowels must be in the diff for BOTH directions of the swap,
            # hence the explicit grouping before the diff test.
            swap_ie = ((hasCVC(s1, vowel = "i", milra = False) and hasCVC(s2, vowel = "e", milra = False))
                       or (hasCVC(s1, vowel = "e", milra = False) and hasCVC(s2, vowel = "i", milra = False)))
            if swap_ie and "e" in diff and "i" in diff:
                return True

            swap_ae = ((hasCVC(s1, vowel = "a", milra = False) and hasCVC(s2, vowel = "e", milra = False))
                       or (hasCVC(s1, vowel = "e", milra = False) and hasCVC(s2, vowel = "a", milra = False)))
            if swap_ae and "a" in diff and "e" in diff:
                return True

    # e.g. raaa - raa ראה (compare the two bodies with each other)
    if isEqualOnRemove("aaa", "aa", body1, body2) and "a" in diff:
        return True

    # e.g. eyze - eyzeh, ayom - hayom
    if check_h(s1, s2, diff):
        return True

    return False

def isNikud(letter):
    """
    isNikud: checks if a letter is a vowel, i.e. is contained in VOWELS
    """
    return letter in VOWELS

def check_h(s1, s2, diff, onlyEdges = False):
    """
    check_h: takes two words and checks whether an "h" next to a vowel accounts for their
             difference, e.g. eyze - eyzeh, hayom - ayom, imahot - imaot.

             @param diff      the differing letters of the two words
             @param onlyEdges when True, only an h at the beginning or end of a word is considered
    """
    # Edge h: one word starts with h + vowel (or ends with vowel + h) while the other
    # starts (or ends) with a vowel. Checked for s1 first, then for s2.
    for word, other in ((s1, s2), (s2, s1)):
        # e.g. hayom - ayom
        if word[0] == "h" and isNikud(word[1]) and isNikud(other[0]):
            return True
        # e.g. eyzeh - eyze
        if word[-1] == "h" and isNikud(word[-2]) and isNikud(other[-1]):
            return True

    if onlyEdges:
        return False

    # h between two vowels in one word, two adjacent vowels in the other, e.g. imahot - imaot אמהות
    if "h" in diff:
        if "h" in s1:
            withH, withoutH = s1, s2
        else:
            withH, withoutH = s2, s1
        return hasVCV(withH, consonant = "h") and hasCV(withoutH, consonant = VOWELS)

    # h is not in the diff but occurs in a word: exactly one of the words must have
    # a vowel-h-vowel sequence, e.g. haimahot - haimaot
    if "h" in s1 or "h" in s2:
        return hasVCV(s1, consonant = "h") != hasVCV(s2, consonant = "h")

    return False


def shouldMerge_heb(s1, s2):
    """
    shouldMerge_heb: check if two (Hebrew) strings are similar enough transcription-wise to be merged
    """
    diff = removeSharedLetters(s1, s2)

    # First and strongest filter: if the difference has letters other than specified, don't merge!
    if not hasOnly(diff, ALLOWED_LETTERS_HEB):
        return False

    # does the difference equal one of the ALLOWED_DIFF_HEB combinations?
    mixExists = "".join(diff) in ALLOWED_DIFF_HEB

    # compare the edges once per replacement table
    edgesOk_1, body1, body2 = isEdgeLettersSame(s1, s2, LETTER_REPLACEMENTS_HEB[0])
    edgesOk_2, body3, body4 = isEdgeLettersSame(s1, s2, LETTER_REPLACEMENTS_HEB[1])

    # the edges must be acceptable, or differ only by an edge h
    if not (edgesOk_1 or edgesOk_2 or check_h(s1, s2, diff, onlyEdges = True)):
        return False

    # cheapest checks first; the heaviest one (which contains check_h()) comes last
    return (body1 == body2 or body3 == body4
            or mixExists
            # the h differences (article, VHV etc.)
            or check_h(s1, s2, diff)
            or allCombChecks(s1, s2, body1, body2)
            or allCombChecks(s1, s2, body3, body4))

<center>
    <img src="assets/img/russia.png" width="10%" height="10%" alt="Functions for Russian">
    <h3>Functions for Russian:</h3>
</center>
<hr>

In [7]:
def shouldMerge_rus(s1, s2):
    """
    shouldMerge_rus: check if two (Russian) strings are similar enough transcription-wise to be merged
    """
    diff = removeSharedLetters(s1, s2)

    # if the difference between the words contains one of the prohibited letters, they shouldn't merge
    if any(letter in PROHIBITED_LETTERS_RUS for letter in diff):
        return False

    # edge letters must be the same in both words (or an allowed substitution)
    edgesOk1, body1, body2 = isEdgeLettersSame(s1, s2, LETTER_REPLACEMENTS_RUS)
    # e.g. ego - evo
    # NOTE(review): "g" and "v" are in PROHIBITED_LETTERS_RUS, so such a pair appears to be
    # rejected by the filter above before this check can matter - confirm the intent.
    edgesOk2, body3, body4 = isEdgeLettersSame(s1, s2, {"go": ["vo"]}, "end")
    if (edgesOk1 or edgesOk2) and (body1 == body2 or body3 == body4):
        return True

    # Now we are done checking the edges. Now we move to checking the insides.

    # if y is not the last letter in both words (because it makes the word plural: e.g. kot - koty, kust - kusty)
    if "y" in diff and s1[-1] != "y" and s2[-1] != "y":
        hasY = s1 if "y" in s1 else s2
        # if y is preceded by a vowel (e.g. vstretil vstreytil) or followed by j (e.g. kotoruyu  - kotoruju)
        if hasVC(hasY, consonant = "y") or "yj" in hasY:
            return True

    # one word ends with "yj" and the other is the same word without the final "j",
    # e.g. kotoryj - kotory (checked in both directions)
    if (s1.endswith("yj") and s1[:-1] == s2) or (s2.endswith("yj") and s2[:-1] == s1):
        return True

    # check for j:
    # 1. don't merge bashnU - bashnJU 2. continue if both don't end with j
    # Note: I should have added "a" to vowels in both hasCV but then I can't merge prosnJEmsJA - prosnEmsJA cuz it has double CV
    if (isEqualOnRemove("j", "", s1, s2) and not hasCV(s1, consonant = "j", vowel = "iou") and not hasCV(s2, consonant = "j", vowel = "iou")
          and s1[-1] != "j" and s2[-1] != "j" and "j" in diff):
        return True

    # e.g. devocka - devochka
    if isEqualOnRemove("ck", "chk", s1, s2) and "h" in diff:
        return True

    return False

<center>
    <img src="assets/img/hamming.png" width="10%" height="10%">
    <h3>Calculate the similarity of a pair of words and decide whether they should merge:</h3>
</center>
<hr>

In [8]:
def shouldMerge(capsule, lang, threshold, exclude_pairs):
    """
    shouldMerge: according to the language, apply the function that checks if two strings are
                 similar enough transcription-wise to be merged

                 @param capsule       [word1, freq1, word2, freq2, similarity]
                 @param lang          HEBREW or RUSSIAN (case-insensitive)
                 @param threshold     minimal similarity; words whose longer member has
                                      4 / 3 letters pass with 0.6 / 0.5
                 @param exclude_pairs dict word -> word of pairs that must never merge
    """
    s1, s2, sim = capsule[0], capsule[2], capsule[4]
    longest = max(len(s1), len(s2))

    # block low similarity
    similarEnough = (sim >= threshold
                     or (longest == 4 and sim >= 0.6)
                     or (longest == 3 and sim >= 0.5))
    if not similarEnough:
        return False

    # check if the pair is excluded (in either direction)
    for left, right in ((s1, s2), (s2, s1)):
        if left in exclude_pairs and exclude_pairs[left] == right:
            return False

    # allow typos
    if typosStats(capsule):
        return True

    # proceed by language
    lang = lang.lower()
    if lang == HEBREW:
        return shouldMerge_heb(s1, s2)
    if lang == RUSSIAN:
        return shouldMerge_rus(s1, s2)
    assert False

        
def typosStats(capsule):
    """
    typosStats: check for transcribers' typos in words?
                Returns True when the similarity is at least 0.9 and the rarer word makes up
                at most 10% of the pair's occurrences and occurs at most 3 times.
                Always False when CHECK_FOR_TYPOS is off.
                Note: we can't know if it's a typo or if it's something that the child did say.
                Thus for now I recommend defining CHECK_FOR_TYPOS = False

                @param capsule [word1, freq1, word2, freq2, similarity]
    """
    if not CHECK_FOR_TYPOS:
        return False

    if capsule[4] < 0.9:
        return False

    # frequencies of word1 and word2
    freq1, freq2 = capsule[1], capsule[3]
    rarer = min(freq1, freq2)

    return bool(rarer / (freq1 + freq2) <= 0.1 and rarer <= 3)


def similar(a, b, algo = "difflib"):
    """
    similar: get the similarity ratio of two words

             @param algo "difflib" (default), "lev", "sor" or "jac"; any other value yields None
             NOTE(review): the "sor" and "jac" branches use a `distance` module that is not
             imported in the visible code.
    """
    if algo == "lev":
        return Levenshtein.ratio(a, b)
    if algo == "sor":
        return 1 - distance.sorensen(a, b)
    if algo == "jac":
        return 1 - distance.jaccard(a, b)
    if algo == "difflib":
        matcher = difflib.SequenceMatcher(None, a, b)
        return matcher.ratio()
    return None

    
def hamming(s1, s2):
    """
    hamming: get the hamming distance of two strings
             (positions beyond the shorter string are not compared)
    """
    mismatches = 0
    for ch1, ch2 in zip(s1, s2):
        if ch1 != ch2:
            mismatches += 1
    return mismatches


def find_sim(freq, lang, threshold, exclude_pairs):
    """
    find_sim: compare every word with all the words that precede it in `freq` and collect
              the pairs of similar words

              @param freq          dict word -> frequency (iteration order is the comparison order)
              @param lang          HEBREW or RUSSIAN
              @param threshold     similarity threshold
              @param exclude_pairs pairs to be excluded
              @return (replacements, bad): `replacements` maps a word to the earlier word it
                      should be merged into; `bad` maps a word to an earlier word that reached
                      the threshold but was not merged
    """
    if lang == HEBREW:
        allowed_letters = ALLOWED_LETTERS_HEB
    elif lang == RUSSIAN:
        allowed_letters = ALLOWED_LETTERS_RUS

    words = list(freq)
    replacements = {}
    bad = {}

    for position, word in enumerate(words):
        for word2 in words[:position]:
            # calculate similarity percentage using difflib algorithm
            sim = similar(word, word2)
            # wrap all in a capsule
            capsule = [word, freq[word], word2, freq[word2], sim]

            # check if the words should really merge
            if shouldMerge(capsule, lang, threshold, exclude_pairs):
                replacements[word] = word2
                continue

            diff = removeSharedLetters(word, word2)
            # don't show article difference, e.g. hayom - yom
            articleMismatch = (word[:2] == "ha") != (word2[:2] == "ha")
            # First and strongest filter: if the difference has letters other than specified, it's not interesting!
            if not hasOnly(diff, allowed_letters) or articleMismatch:
                continue

            if sim >= threshold:
                bad[word] = word2

    return (replacements, bad)

<center>
    <img src="assets/img/replace.png" width="10%" height="10%" alt="Helper functions for replacing">
    <h3 align="center">Helper functions for replacing:</h3>
</center>
<hr>

In [9]:
def xlsColIndex(col):
    """
    xlsColIndex: convert an Excel column name of any length to a zero-based index,
                 e.g. A -> 0 | Z -> 25 | AA -> 26 | CH -> 85 | AAA -> 702

                 @param col column letters (case-insensitive)
                 @return the zero-based column index
                 @raise ValueError if `col` is empty or contains anything but the letters A-Z
    """
    letters = col.lower()
    if not letters or any(not ("a" <= letter <= "z") for letter in letters):
        raise ValueError("invalid Excel column name: {!r}".format(col))

    # column names are a base-26 number whose digits run from 1 (A) to 26 (Z)
    index = 0
    for letter in letters:
        index = index * 26 + (ord(letter) - ord("a") + 1)

    return index - 1

def replaceInDataFrame(df, colname, replacements, lang):
    """
    replaceInDataFrame: cleans one column of the dataframe and applies the replacements to it.
                        The dataframe is modified in place; nothing is returned.

                        Steps: replace REPLACE_SYMBOLS, remove the language's symbols together
                        with @ marks, XX marks and N/A, trim and collapse whitespace, replace
                        whole words according to `replacements`, turn NaN into ''.

                        @param df           the dataframe to modify
                        @param colname      name of the column to process
                        @param replacements dict word -> word it should be replaced with
                        @param lang         HEBREW or RUSSIAN
                        @raise ValueError   if `lang` is neither HEBREW nor RUSSIAN
    """
    if lang == HEBREW:
        remove_symbols = REMOVE_SYMBOLS_HEB
    elif lang == RUSSIAN:
        remove_symbols = REMOVE_SYMBOLS_RUS
    else:
        raise ValueError("unsupported language: {!r}".format(lang))

    # replace symbols (plain text, not patterns)
    for orig, rep in REPLACE_SYMBOLS.items():
        df[colname] = df[colname].str.replace(orig, rep, regex=False)

    # remove symbols, @ marks, XX marks, N/A
    # Only words with a capital X or a doubled x count as marks (as in loop_freq);
    # a single lower-case x is a regular transcription letter and must survive.
    pattern = (
        "[" + re.escape(remove_symbols) + "]"
        + r"|@[\w:.;]*"
        + r"|\b\w*(?:X|xx)\w*\b"
        + r"|[Nn]/[Aa]|^N$|^A$"
    )
    # regex=True is explicit: newer pandas versions treat the pattern as literal text by default
    df[colname] = df[colname].str.replace(pattern, '', regex=True)

    # trim and remove multiple spaces
    df[colname] = df[colname].str.strip()
    df[colname] = df[colname].str.replace(r"\s+", ' ', regex=True)

    # finally replace the words (whole words only; the words themselves are matched literally)
    if len(replacements) > 0:
        word_patterns = {r'\b{}\b'.format(re.escape(k)): v for k, v in replacements.items()}
        df[colname] = df[colname].replace(to_replace = word_patterns, regex=True)

    # replace NaN with blank in all rows
    # (assigned back instead of an in-place call on the column selection, which may act on a copy)
    df[colname] = df[colname].fillna('')

<center>
    <img src="assets/img/start95.jpg" width="20%" height="20%" alt="Main">
</center>
<hr>

In [10]:
def main(interest = "good", language = "", threshold = SIMILARITY_THRESHOLD, exclude_pairs = None, df = df, doPrint = True):
    """
    MAIN: find and merge similar words in every configured language column.

    NOTE: once main is run, the dataframe changes, i.e. the replacements are applied to the
          original dataframe and will stick until the excel file is read again.

    @param interest      what to report: "good" (merged pairs), "bad" (ignored pairs) or BOTH
    @param language      only languages whose name contains this string are processed
                         ("" processes all of them)
    @param threshold     similarity threshold passed to find_sim
    @param exclude_pairs dict word -> word of pairs that must never merge (default: none)
    @param df            the dataframe to process; defaults to the dataframe loaded at module level
    @param doPrint       print the report?
    """
    # avoid a shared mutable default argument
    if exclude_pairs is None:
        exclude_pairs = {}

    if doPrint:
        print("Threshold:", threshold)

    # convert all cells that has only a number (int cell) to string
    # https://stackoverflow.com/questions/48978151/why-does-pandas-series-str-convert-numbers-to-nan
    # Does not work for some reason @Rony
    # df = df.apply(lambda x: x.apply(str) if x.dtype == 'object' else x)

    # total replacements counter
    total_good_count = 0
    total_bad_count = 0

    # while true is needed to merge multiple words into a single word, e.g. if "hatul" -> "chatul" and "xatull" -> "xatul"
    # so thanks to while true we'll be able to merge "chatul" -> "xatul"
    while True:
        last_count = total_good_count

        for lang, rang in LANGUAGES.items():

            if language not in lang:
                continue

            first_col = xlsColIndex(rang[0])
            last_col = xlsColIndex(rang[1])
            for i in range(first_col, last_col + 1):
                colname = df.columns[i]
                col_freq = loop_freq(df[colname], lang)
                replacements, bad = find_sim(col_freq, lang, threshold, exclude_pairs)

                good_count = len(replacements)
                bad_count = len(bad)
                # apply replacements
                replaceInDataFrame(df, colname, replacements, lang)

                # Count every applied replacement, whatever `interest` is: the replacements
                # are applied in all cases and this counter decides whether another pass is needed.
                total_good_count += good_count

                if good_count > 0 and interest in ["good", BOTH]:
                    if doPrint:
                        print("--------------------------")
                        print(lang + ": " + str(replacements))

                if bad_count > 0 and interest in ["bad", BOTH]:
                    if doPrint:
                        print("--------------------------")
                        print(lang + ": " + str(bad))
                    total_bad_count += bad_count

        # exit while loop if the last pass produced no new replacements
        if last_count == total_good_count:
            break

    if doPrint:
        print("\nTotal replacements: " + str(total_good_count))
        print("\nTotal ignored: " + str(total_bad_count))

        # print inspections
        if len(INSPECT) > 0:
            print("*** Inspect problematic cells: " + " | ".join(INSPECT))