## Word correction

In [1]:
import logging

import kenlm, Levenshtein

In [2]:
# Load the KenLM language model; later cells call model.score() on candidate
# word sequences and test vocabulary membership against it.
model = kenlm.LanguageModel("language_model/airi.bin")

In [3]:
def load_unigram_set_from_arpa(arpa_path):
    """Read unigrams from arpa file.

    Only the lines between the ``\\1-grams:`` header and the ``\\2-grams:``
    header are inspected. A line contributes a word only when it splits into
    exactly three tab-separated fields, in which case the middle field is
    taken as the word.

    Args:
        arpa_path: Path of the ARPA text file to read.

    Returns:
        set: The distinct words collected from the 1-gram section.

    Raises:
        ValueError: If no unigram was collected from the file.
    """
    unigrams = set()
    in_unigram_section = False
    # Decode explicitly as UTF-8 so that non-ASCII words are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(arpa_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line == "\\1-grams:":
                in_unigram_section = True
                continue
            if line == "\\2-grams:":
                # Everything after this header belongs to higher-order n-grams.
                break
            if not in_unigram_section or not line:
                continue
            parts = line.split("\t")
            if len(parts) == 3:
                unigrams.add(parts[1])
    if not unigrams:
        raise ValueError("No unigrams found in arpa file. Something is wrong with the file.")
    return unigrams


def prepare_unigram_set(unigrams, kenlm_model: "kenlm.Model"):
    """Filter unigrams down to vocabulary that exists in kenlm_model.

    Args:
        unigrams: Sized collection of candidate words.
        kenlm_model: Object supporting the ``in`` operator for words; only
            words it reports as contained are kept.

    Returns:
        set: The words from ``unigrams`` that are contained in ``kenlm_model``.

    Two warnings may be logged: one when fewer than 1000 unigrams were passed,
    and one when less than 10% of them survive the filtering.
    """
    # Obtain the logger here: this notebook defines no module-level `logger`,
    # so referring to one would raise NameError as soon as a warning fires.
    logger = logging.getLogger(__name__)
    if len(unigrams) < 1000:
        logger.warning(
            "Only %s unigrams passed as vocabulary. Is this small or artificial data?",
            len(unigrams),
        )
    unigram_set = {token for token in set(unigrams) if token in kenlm_model}
    # An empty input counts as fully retained so the second warning stays quiet.
    retained_fraction = 1.0 if len(unigrams) == 0 else len(unigram_set) / len(unigrams)
    if retained_fraction < 0.1:
        logger.warning(
            "Only %s%% of unigrams in vocabulary found in kenlm model-- this might mean that your "
            "vocabulary and language model are incompatible. Is this intentional?",
            round(retained_fraction * 100, 1),
        )
    return unigram_set

In [5]:
# Read the vocabulary from the ARPA file, then keep only the words that the
# loaded language model also contains.
# NOTE(review): unlike the model path, this path has no "language_model/"
# prefix — confirm the ARPA file really lives in the working directory.
unigrams = load_unigram_set_from_arpa("airi.arpa")
unigrams_set = prepare_unigram_set(unigrams, model)

In [20]:
def find_equivalent(word, max_distance=1, vocabulary=None):
    """Collect vocabulary words within a Levenshtein distance of ``word``.

    Args:
        word: The (possibly misspelled) word to look up.
        max_distance: Largest edit distance a candidate may have. Edit
            distances are whole numbers, so a fractional threshold behaves
            like its floor (1.2 selects the same candidates as 1).
        vocabulary: Iterable of known words to search. Defaults to the
            notebook-level ``unigrams_set``.

    Returns:
        list: Every vocabulary word whose distance to ``word`` is at most
        ``max_distance``; if there is none, a one-element list holding
        ``word`` itself so callers always get at least one candidate.
    """
    if vocabulary is None:
        vocabulary = unigrams_set
    candidates = [
        candidate
        for candidate in vocabulary
        if Levenshtein.distance(word, candidate) <= max_distance
    ]
    return candidates if candidates else [word]

def _best_scored(scored):
    """Return the tuple with the highest score (first element); ties keep the earliest entry."""
    return max(scored, key=lambda item: item[0])


def _best_third_word(model, bir, ikki, candidates, suffix=''):
    """Return the best ``(score, word)`` among candidates following ``bir ikki``.

    Each candidate is scored as the sum of the model score of the three-word
    string and of the two-word string ``ikki candidate``; ``suffix`` is
    appended to both strings before scoring.
    """
    return _best_scored([
        (
            model.score(bir + ' ' + ikki + ' ' + i + suffix)
            + model.score(ikki + ' ' + i + suffix),
            i,
        )
        for i in candidates
    ])


def check_and_correct(qism, model, gap_boshi=False, gap_oxiri=False):
    """Choose the most likely spelling for the words of a short window.

    Candidates for a word come from ``find_equivalent`` and are ranked with
    ``model.score``.

    Args:
        qism: Sequence of words. With ``gap_boshi`` it may hold any number of
            words; otherwise it must hold at least three.
        model: Language model exposing ``score(text)``.
        gap_boshi: The window opens the sentence. All words of a two- or
            three-word window are corrected; for any other non-empty length
            only the first word is corrected. Takes priority over
            ``gap_oxiri``.
        gap_oxiri: The window closes the sentence. Only the third word is
            corrected and ``' </s>'`` is appended to the scored strings.

    Returns:
        With ``gap_boshi``: a tuple of the corrected words for two- or
        three-word windows, a one-element list for other non-empty windows,
        and an empty list for an empty window. Otherwise: a tuple of the first
        two words unchanged followed by the corrected third word.

    Side effects:
        Rebinds the global ``qiymatlar`` to a fresh list on every call and
        appends the winning ``(score, text)`` tuples to it.

    NOTE(review): the '<s>' / '</s>' markers are written into the scored text
    by hand; kenlm's score() may also add sentence markers on its own by
    default — confirm against the kenlm documentation that this is intended.
    """
    global qiymatlar
    qiymatlar = []

    if not gap_boshi:
        # Mid-sentence and end-of-sentence windows only re-evaluate word three.
        suffix = ' </s>' if gap_oxiri else ''
        best_third = _best_third_word(model, qism[0], qism[1], find_equivalent(qism[2]), suffix)
        qiymatlar.append(best_third)
        return qism[0], qism[1], best_third[1]

    if len(qism) == 0:
        # Nothing to correct (e.g. an empty sentence); avoid indexing into it.
        return []

    first = find_equivalent(qism[0])
    if len(qism) not in (2, 3):
        best_first = _best_scored([(model.score('<s> ' + i), i) for i in first])
        # Record the whole (score, word) tuple, consistent with the other branches.
        qiymatlar.append(best_first)
        return [best_first[1]]

    # Pick the first two words jointly, anchored at the sentence start.
    second = find_equivalent(qism[1])
    score, bir, ikki = _best_scored([
        (model.score('<s> ' + i + ' ' + j), i, j)
        for j in second
        for i in first
    ])
    qiymatlar.append((score, bir + ' ' + ikki))
    if len(qism) == 2:
        return bir, ikki

    best_third = _best_third_word(model, bir, ikki, find_equivalent(qism[2]))
    qiymatlar.append(best_third)
    return bir, ikki, best_third[1]


In [21]:
# Demo: correct a sentence by sliding a three-word window across it. Each
# window is corrected in place, so later windows see earlier corrections.
gap = "nma bulyapti sanga"
print("Boshlang'ich gap:")
print(gap, end='\n\n')
gap = gap.split()
word_count = len(gap)
if word_count < 3:
    # Too short for a full window: treat the whole input as a sentence start.
    gap = check_and_correct(gap, model, gap_boshi=True)
else:
    last_start = word_count - 3
    for i in range(last_start + 1):
        window = gap[i:i + 3]
        # The sentence-start check comes first, so a three-word sentence is
        # handled by the start branch only.
        if i == 0:
            corrected = check_and_correct(window, model, gap_boshi=True)
        elif i == last_start:
            corrected = check_and_correct(window, model, gap_oxiri=True)
        else:
            corrected = check_and_correct(window, model)
        gap[i], gap[i + 1], gap[i + 2] = corrected

print("To'g'irlangan gap:")
print(' '.join(gap))
# qiymatlar is set by check_and_correct and holds the scores of its latest call.
print(qiymatlar)

Boshlang'ich gap:
nma bulyapti sanga

To'g'irlangan gap:
nima bulyapti senga
[(-11.298677444458008, 'nima bulyapti'), (-25.54973793029785, 'senga')]


In [22]:
# Inspect the candidates for a single word. A one-element list holding the
# word itself means either no vocabulary entry is within the distance
# threshold, or the word is its own only match.
find_equivalent('bulyapti')

['bulyapti']