In [14]:
import os, re
from string import punctuation
import numpy as np
import json
from collections import Counter
from pprint import pprint
punct = set(punctuation)
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import textdistance

In [2]:
# Load the two sentence files as lists of lines. The evaluation loop further
# down pairs true[i] with bad[i], so the files are presumably line-aligned
# (TODO confirm against the data).
# `with` closes each handle deterministically instead of leaking it until GC.
with open('sents_with_mistakes.txt', encoding='utf8') as bad_file:
    bad = bad_file.read().splitlines()
with open('correct_sents.txt', encoding='utf8') as true_file:
    true = true_file.read().splitlines()

In [3]:
# Leading/trailing runs of non-word characters (raw string: '\W' is not a valid
# escape sequence in a plain string literal).
_EDGE_NON_WORD = re.compile(r'^\W+|\W+$')


def _clean_tokens(sent):
    """Lowercase and whitespace-split ``sent``; return its tokens with edge punctuation stripped."""
    cleaned = []
    for token in sent.lower().split():
        # Skip tokens that consist solely of ASCII punctuation.
        if all(ch in punctuation for ch in token):
            continue
        stripped = _EDGE_NON_WORD.sub('', token)
        # Tokens made only of non-ASCII non-word characters (e.g. an em dash)
        # pass the filter above but strip down to ''. Dropping them keeps the
        # positional pairing from shifting when only one sentence has them.
        if stripped:
            cleaned.append(stripped)
    return cleaned


def align_words(sent_1, sent_2):
    """Pair up the words of two sentences by position.

    Both sentences are lowercased, split on whitespace, and each token has its
    leading/trailing non-word characters removed; punctuation-only tokens are
    discarded.

    Args:
        sent_1: first sentence (str).
        sent_2: second sentence (str).

    Returns:
        A list of ``(token_from_sent_1, token_from_sent_2)`` tuples.

    Note:
        Pairing is purely positional (``zip``), so the result is truncated to
        the shorter token list, and sentences whose word counts differ are
        paired off-by-one from the point where they diverge.
    """
    return list(zip(_clean_tokens(sent_1), _clean_tokens(sent_2)))

In [4]:
# Reference corpus: one sentence per line, tokens separated by whitespace.
# `with` guarantees the file handle is closed once the text has been read.
with open('corpus_ng.txt', encoding='utf8') as corpus_file:
    corpus = [line.split() for line in corpus_file.read().splitlines()]

# Frequency of every token in the corpus; keys appear in first-seen order.
WORDS = Counter(word for sent in corpus for word in sent)

In [5]:
# Unique corpus words, in the Counter's key order; position i in `vocab`
# is the row index used for lookups through `id2word`.
vocab = [word for word in WORDS]
id2word = dict(enumerate(vocab))

# Character-unigram TF-IDF representation of each vocabulary word.
# Row i of X is the vector for vocab[i].
vec = TfidfVectorizer(ngram_range=(1, 1), analyzer='char')
X = vec.fit_transform(vocab)

In [6]:
def get_closest_match_vec(text, X, vec, TOPN=3):
    """Return the TOPN vocabulary words nearest to ``text`` by cosine distance.

    Args:
        text: the query string.
        X: matrix of vectorized vocabulary words; row indices are resolved to
            words through the module-level ``id2word`` mapping.
        vec: fitted vectorizer used to transform ``text``.
        TOPN: how many nearest words to return.

    Returns:
        A list of up to TOPN words, closest first.
    """
    query = vec.transform([text])
    # One row of distances (query vs. every row of X); argsort is ascending,
    # so the smallest distances (closest rows) come first.
    ranked = cosine_distances(query, X).argsort()
    return [id2word[row_id] for row_id in ranked[0][:TOPN]]

In [7]:
def get_closest_hybrid_match(text, X, vec, metric=textdistance.levenshtein, n_candidates=7):
    """Pick the best correction for ``text`` in two stages.

    First ``get_closest_match_vec`` shortlists ``n_candidates`` vocabulary
    words by cosine distance, then the shortlist is re-ranked with
    ``metric.normalized_similarity`` and the top-scoring word is returned.

    Args:
        text: the (possibly misspelled) word to correct.
        X: matrix of vectorized vocabulary words, passed to ``get_closest_match_vec``.
        vec: fitted vectorizer, passed to ``get_closest_match_vec``.
        metric: object exposing ``normalized_similarity(a, b)``; higher means
            more similar. Defaults to ``textdistance.levenshtein``.
        n_candidates: size of the shortlist taken from the vector search
            (previously hard-coded to 7).

    Returns:
        The shortlisted word most similar to ``text``; ties go to the word
        that ranked earlier in the shortlist. If the shortlist is empty,
        ``text`` is returned unchanged.
    """
    candidates = get_closest_match_vec(text, X, vec, n_candidates)
    if not candidates:
        # Nothing to choose from (e.g. n_candidates == 0): leave the word as is.
        return text
    # max() returns the first maximal element, which matches the tie-breaking
    # of the former Counter.most_common(1) on insertion-ordered candidates.
    return max(candidates, key=lambda word: metric.normalized_similarity(text, word))

In [8]:
# Smoke test: run the hybrid matcher on a single misspelled word.
get_closest_hybrid_match('алкогнль', X, vec)

'алкоголь'

In [9]:
# Word-level evaluation: for every aligned (reference, observed) word pair,
# run the corrector on the observed word and compare with the reference.
correct = 0
total = 0
mistake_word_pairs = []  # (reference, observed) pairs the corrector got wrong
for i, true_sent in enumerate(true):
    for true_word, bad_word in align_words(true_sent, bad[i]):
        predicted = get_closest_hybrid_match(bad_word, X, vec)
        if predicted == true_word:
            correct += 1
        else:
            mistake_word_pairs.append((true_word, bad_word))
        total += 1
    # Progress report every 10 sentences: sentence index, then running accuracy.
    if not i % 10:
        print(i)
        # total is still 0 if no word pairs have been seen yet (e.g. sentence 0
        # aligns to nothing); report NaN instead of raising ZeroDivisionError.
        print(correct / total if total else float('nan'))

0
0.5333333333333333
10
0.8076923076923077
20
0.8095238095238095
30
0.8106060606060606
40
0.8128654970760234
50
0.8180354267310789
60
0.8097222222222222
70
0.8131188118811881
80
0.8172157279489904
90
0.8182711198428291
100
0.8145161290322581
110
0.8178001679261125
120
0.8205723124516628
130
0.819047619047619
140
0.8154477101845523
150
0.8238276299112801
160
0.8225235849056604
170
0.8231330713082537
180
0.8205128205128205
190
0.8221442885771543
200
0.8249158249158249
210
0.8271324863883848
220
0.8277634961439588
230
0.8287474332648871
240
0.8333977580208736
250
0.8303341902313625
260
0.8308125219838199
270
0.8289384719405003
280
0.8305905130687319
290
0.8283048211508554
300
0.8299278846153846
310
0.8302052785923754
320
0.8296002268216615
330
0.8287765810549572
340
0.8278556034482759
350
0.8272703480764197
360
0.8274201723264065
370
0.8285785164971471
380
0.8291917973462002
390
0.8297015632401705
400
0.8306657388077012
410
0.8311541929666366
420
0.8313039628071729
430
0.8314679294016358


In [21]:
# Inspect the first 25 (reference, observed) word pairs where the prediction
# did not equal the reference word.
mistake_word_pairs[:25]

[('симпатичнейшее', 'симпатичнейшое'),
 ('шпионское', 'шпионское'),
 ('гламурный', 'гламурный'),
 ('бонда', 'бонда'),
 ('superheadz', 'superheadz'),
 ('clap', 'clap'),
 ('camera', 'camera'),
 ('получатся', 'полчатся'),
 ('язычки', 'язычки'),
 ('очень', 'оччччень'),
 ('милые', 'милые'),
 ('насчет', 'нащщот'),
 ('чавеса', 'чавеса'),
 ('попавшим', 'попавшим'),
 ('аварийно-спасательных', 'аварийно-спасательных'),
 ('в', 'вобщем'),
 ('общем', 'как'),
 ('как', 'вы'),
 ('вы', 'знаете'),
 ('знаете', 'из'),
 ('из', 'моего'),
 ('моего', 'не'),
 ('недавнего', 'давнего'),
 ('пропажу', 'пропажу'),
 ('почте.ру', 'почте.ру')]

In [22]:
# Word-level error rate: share of aligned pairs whose prediction was wrong.
len(mistake_word_pairs)/total

0.16959648421893728

In [23]:
# Word-level accuracy: complement of the error rate.
1-len(mistake_word_pairs)/total

0.8304035157810628