In [1]:
import re

BAD_WORDS = "\w{0,5}[хx]([хx\s\!@#$%\^&*+-\|\/]{0,6})[уy]([уy\s\!@#$%\^&*+-\|\/]{0,6})[ёiлeеюийя]\w{0,7}|\w{0,6}[пp]([пp\s\!@#$%\^&*+-\|\/]{0,6})[iие]([iие\s\!@#$%\^&*+-\|\/]{0,6})[3зс]([3зс\s\!@#$%\^&*+-\|\/]{0,6})[дd]\w{0,10}|[сcs][уy]([уy\!@#$%\^&*+-\|\/]{0,6})[4чkк]\w{1,3}|\w{0,4}[bб]([bб\s\!@#$%\^&*+-\|\/]{0,6})[lл]([lл\s\!@#$%\^&*+-\|\/]{0,6})[yя]\w{0,10}|\w{0,8}[её][bб][лске@eыиаa][наи@йвл]\w{0,8}|\w{0,4}[еe]([еe\s\!@#$%\^&*+-\|\/]{0,6})[бb]([бb\s\!@#$%\^&*+-\|\/]{0,6})[uу]([uу\s\!@#$%\^&*+-\|\/]{0,6})[н4ч]\w{0,4}|\w{0,4}[еeё]([еeё\s\!@#$%\^&*+-\|\/]{0,6})[бb]([бb\s\!@#$%\^&*+-\|\/]{0,6})[нn]([нn\s\!@#$%\^&*+-\|\/]{0,6})[уy]\w{0,4}|\w{0,4}[еe]([еe\s\!@#$%\^&*+-\|\/]{0,6})[бb]([бb\s\!@#$%\^&*+-\|\/]{0,6})[оoаa@]([оoаa@\s\!@#$%\^&*+-\|\/]{0,6})[тnнt]\w{0,4}|\w{0,10}[ё]([ё\!@#$%\^&*+-\|\/]{0,6})[б]\w{0,6}|\w{0,4}[pп]([pп\s\!@#$%\^&*+-\|\/]{0,6})[иeеi]([иeеi\s\!@#$%\^&*+-\|\/]{0,6})[дd]([дd\s\!@#$%\^&*+-\|\/]{0,6})[oоаa@еeиi]([oоаa@еeиi\s\!@#$%\^&*+-\|\/]{0,6})[рr]\w{0,12}"


In [2]:
import json

In [3]:
with open('dataset.json') as f:
    data = json.load(f)


In [4]:
re.search(BAD_WORDS, 'ы')



In [7]:
import re 
import regex
import unicodedata
from tqdm import tqdm_notebook as tqdm

from obscene_words_filter import conf
from obscene_words_filter.words_filter import ObsceneWordsFilter

FWORD = ObsceneWordsFilter(conf.bad_words_re, conf.good_words_re)
MAX_LENGTH = 8


def is_cyrillic(lower_char):
    return regex.search(r'\p{IsCyrillic}', lower_char) is not None

def is_latin(lower_char):
    return ord(lower_char) > ord('a') and ord(lower_char) < ord('z')

def is_latin_st(s):
    return any([is_latin(c) for c in s])

def is_acceptable_lc(lc):
    return is_cyrillic(lc) or is_latin(lc) or lc in (' ', '-', '?', '!')

def is_acceptable_sentence(sent):
    l = len(sent.split(' '))
    if l > MAX_LENGTH or l == 0:
        return False

    good = re.search(BAD_WORDS, sent) is None
    
    return 'http' not in sent and good and not is_latin_st(sent)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = re.sub('\[id.*\],', '', s)
    if '.' in s:
        s = s[:s.find('.')]
    s = ''.join(filter(lambda x: is_acceptable_lc(x), s))
    s = s.replace('?', ' ? ').replace('!', ' ! ').replace(',', ' , ')
    s = unicodeToAscii(s.lower().strip())
    return s



def filterPair(p):
    return is_acceptable_sentence(p[0]) and is_acceptable_sentence(p[1])

def filterPairs(pairs):
    pairs = [[normalizeString(p) for p in pair] for pair in tqdm(pairs)]
    return [pair for pair in tqdm(pairs) if filterPair(pair)]

In [8]:
filtered_pairs = filterPairs(data)

HBox(children=(IntProgress(value=0, max=2071545), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2071545), HTML(value='')))

In [9]:
with open('filtered_dataset1.json', 'w', encoding='utf8') as f:
    json.dump(filtered_pairs, f, ensure_ascii=False)

In [10]:
pairs2 = []
with open('vk_q.txt') as f:
    for line in f:
        try:
            q, a, _ = line.split('\\')
            pairs2 += [[q, a]]
        except Exception:
            pass

In [11]:
filtered_pairs2 = filterPairs(pairs2)

HBox(children=(IntProgress(value=0, max=549901), HTML(value='')))

HBox(children=(IntProgress(value=0, max=549901), HTML(value='')))

In [12]:
with open('filtered_dataset2.json', 'w') as f:
    json.dump(filtered_pairs2, f, ensure_ascii=False)
    

In [13]:
filtered_all = filtered_pairs + filtered_pairs2

In [14]:
with open('filtered_dataset_all.json', 'w') as f:
    json.dump(filtered_all, f, ensure_ascii=False)
    

In [62]:
len(filtered_all)

1261450

In [63]:
from collections import defaultdict
count = defaultdict(int)

for pair in tqdm(filtered_all):
    for x in pair:
        for z in x.split(' '):
            count[z] += 1


HBox(children=(IntProgress(value=0, max=1261450), HTML(value='')))

In [64]:
import numpy as np

In [82]:
threshoold = np.percentile(sorted(count.values()), 96)
print(threshoold)

56.0


In [83]:
def rare(sent):
    for x in sent.split(' '):
        if count[x] <= threshoold:
            return True
    return False
    
def new_filter(pair):
    return not rare(pair[0]) and not rare(pair[1])

In [84]:
filtered_all_sm = [p for p in filtered_all if new_filter(p)]
print(len(filtered_all_sm))
filtered_all_sm = [p for p in filtered_all_sm if len(p[0].split(' ')) > 0 and len(p[1].split(' ')) > 0]
filtered_all_sm = [p for p in filtered_all_sm if len(p[0]) > 0 and len(p[1]) > 0]

print(len(filtered_all_sm))

572675
468952


In [87]:
print(len(filtered_all_sm))

468952


In [86]:
with open('filtered_dataset_sm.json', 'w') as f:
    json.dump(filtered_all_sm, f, ensure_ascii=False)
    

In [None]:
import gensim
from gensim.test.utils import datapath

In [None]:
wv_from_bin = gensim.models.Word2Vec.load("ru.bin")

In [35]:
new_sentences = []
for pair in tqdm(filtered_all):
    for x in pair:
        new_sentences += [['@'] + x.split(' ') + ['#']]



HBox(children=(IntProgress(value=0, max=1261450), HTML(value='')))

In [36]:
wv_from_bin.build_vocab(new_sentences, update=True)
print('training')
wv_from_bin.train(new_sentences, total_examples=len(new_sentences), epochs=1)

training


(6691704, 14390621)

In [37]:
wv_from_bin.save('wv')

In [33]:
#wv_from_bin.wv.similar_by_vector(a)

  """Entry point for launching an IPython kernel.


KeyError: "word 'упрь' not in vocabulary"

5