In [110]:
import re
from nile.api.v1 import (
    clusters,
    aggregators as na
)

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pymystem3 import Mystem
import pymorphy2

sns.set()
pd.set_option('display.min_rows', 50)
cluster = clusters.Hahn()

## Подготовка текстового датасета

In [167]:
path = '//home/images/dev/spochukaev/hse_diploma/text_classification_with_answers_2'
text_class_2 = cluster.read(path).as_dataframe()

path = '//home/images/dev/spochukaev/hse_diploma/text_classification_all_honeypots_3'
text_all_2 = cluster.read(path).as_dataframe()

In [168]:
text_class_2['aggr_answer'] = text_class_2['aggr_answer'].str.decode('utf8')
text_class_2['assessors_answer'] = text_class_2['assessors_answer'].str.decode('utf8')
text_class_2['some_answer'] = text_class_2['some_answer'].str.decode('utf8')
text_class_2['query'] = text_class_2['query'].str.decode('utf8')
text_class_2['least_common'] = text_class_2['least_common'].str.decode('utf8')

text_all_2['assessors_answer'] = text_all_2['assessors_answer'].str.decode('utf8')
text_all_2['query'] = text_all_2['query'].str.decode('utf8')

In [173]:
text_class_2 = text_class_2.drop_duplicates(subset=['query']).drop('list_answers', axis=1)

In [175]:
text_class_2.to_csv('text_dataset.csv')

# Добавление фильтра по языкам

In [4]:
text_class_2['english'] = text_class_2['query'].apply(lambda x: re.search('[a-zA-Z]', x) != None)
text_class_2['russian'] = text_class_2['query'].apply(lambda x: re.search('[А-Яа-я]', x) != None)
text_class_2['ukrain'] = text_class_2['query'].apply(lambda x: re.search('[ЇїІіЄєҐґ]', x) != None)

In [22]:
df = text_class_2[(~text_class_2['english']) & (text_class_2['russian']) & (~text_class_2['ukrain'])]

In [129]:
m = pymorphy2.MorphAnalyzer()

convert_dict = {
    'A':       'ADJ',                                                                                                                                                                                                                                                                    
    'ADV':    'ADV',                                                                                                                                                                                                                                                                    
    'ADVPRO':  'ADV',                                                                                                                                                                                                                                                                    
    'ANUM':    'ADJ',                                                                                                                                                                                                                                                                    
    'APRO':    'DET',                                                                                                                                                                                                                                                                    
    'COM':     'ADJ',                                                                                                                                                                                                                                                                   
    'CONJ':    'SCONJ',                                                                                                                                                                                                                                                                  
    'INTJ':    'INTJ',                                                                                                                                                                                                                                                                   
    'NONLEX':  'X',                                                                                                                                                                                                                                                                     
    'NUM':     'NUM',                                                                                                                                                                                                                                                                    
    'PART':    'PART',                                                                                                                                                                                                                                                                   
    'PR':      'ADP',                                                                                                                                                                                                                                                                   
    'S':       'NOUN',                                                                                                                                                                                                                                                                   
    'SPRO':    'PRON',                                                                                                                                                                                                                                                                   
    'UNKN':    'X',                                                                                                                                                                                                                                                                     
    'V':     'VERB'  
}

pymorphy2_dict = {
    'NOUN': 'NOUN', 
    'ADJF': 'ADJ', 
    'ADJS': 'ADJ',
    'COMP':     'ADJ', 
    'VERB':     'VERB',
    'INFN':     'VERB', 
    'PRTF':    'ADV',                                                                                                                                                                                                                                                                    
    'PRTS':  'ADV',
    'GRND':  'ADV',
    'NUMR':     'NUM',                                                                                                                                                                                                                                                                    
    'NPRO':    'PRON',                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
    'CONJ':    'CONJ',                                                                                                                                                                                                                                                                  
    'INTJ':    'INTJ',                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
    'PRCL':    'PART',  
    'PREP':      'ADP',                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
    'SPRO':    'PRON',
    'ADVB': 'ADV',
    'PRED': 'ADV'
 
}


def tag(word, with_tags=False):
    processed = m.parse(word)
    if with_tags:
        try:
            lemma = processed[0].normal_form + "_" + pymorphy2_dict[processed[0].tag.POS]
        except KeyError:
            print(processed[0])
            lemma = processed[0].normal_form + "_X"
    else:
        lemma = processed[0].normal_form 
    return lemma

In [124]:
tag('правильно', True)

'правильно_ADV'

In [150]:
queries = []
for query in df['query']:
    query_ = query.lower()
    query_ = re.sub(r'[^\w\s]',' ',query_)
    query_ = re.sub(r'\d+','',query_)
    query_ = re.sub(r'\s+',' ',query_)
    query_ = query_.strip()
    updated_query = []
    #print(query_)
    for word in query_.split(' '):
        try:
            if len(word) > 3:
                word = tag(word)
                if word not in stopwords.words("russian"):
                    updated_query.append(word)
        except KeyError:
            print(f'KeyError: query: {query} . Word: {word}')
        except IndexError:
            print(f'IndexError: query: {query} . Word: {word}')
    queries.append({'Initial_query': query, 'Updated_query': ' '.join(updated_query)})

## Пробую navec

In [151]:
#! pip install navec

In [152]:
from navec import Navec

path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

In [153]:
known_words = set()
unknown_words = set()
for query in queries:
    for word in query['Updated_query'].split(' '):
        if word not in navec:
            unknown_words.add(word)
        else:
            known_words.add(word)

In [154]:
print(len(known_words))
print(len(unknown_words))

8681
3628


In [155]:
list(unknown_words)[:100]

['',
 'ацикловира',
 'лудшея',
 'руский',
 'семый',
 'честейн',
 'микробиолгия',
 'мауголь',
 'панеттьерь',
 'белагропромбанк',
 'еврохоккей',
 'серябкина',
 'ивацевичи',
 'брсма',
 'лепбук',
 'варикоцель',
 'требуються',
 'таймлёсс',
 'оченний',
 'сэкс',
 'суиня',
 'страстнуть',
 'фурри',
 'трихополый',
 'диацент',
 'юриевич',
 'варюшка',
 'райхон',
 'фаннинга',
 'взломаный',
 'шостка',
 'минеткать',
 'платёж',
 'делюкс',
 'пропановый',
 'варкравт',
 'аденокарцинома',
 'нүдистый',
 'ситилинк',
 'киевстарый',
 'лораса',
 'сивацкай',
 'желировать',
 'кражный',
 'хитман',
 'фуджифильм',
 'вальгусный',
 'дигла',
 'сантехнический',
 'валюш',
 'хавортий',
 'хамедорей',
 'аркхэма',
 'ферритина',
 'хемсворт',
 'майнкрафт',
 'джекман',
 'екфт',
 'кизларь',
 'скелетрон',
 'калкина',
 'фейсситтинг',
 'сузук',
 'акулиничев',
 'чиназ',
 'курцхаар',
 'эйджиро',
 'скапюшон',
 'расклёшить',
 'флешерс',
 'лачетти',
 'оригон',
 'сиил',
 'страпон',
 'мотоскутер',
 'кенис',
 'арцруни',
 'талула',
 'народ

# Скачка векторов CBOW c RUvectors из Gensim

In [83]:
import re
import gensim
import logging
import nltk.data 
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models import word2vec
from nltk.tokenize import sent_tokenize, RegexpTokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/spochukaev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [84]:
! wget http://vectors.nlpl.eu/repository/20/220.zip

--2021-12-20 19:40:14--  http://vectors.nlpl.eu/repository/20/220.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 64:ff9b::81f0:bdb5
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|64:ff9b::81f0:bdb5|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 638171816 (609M) [application/zip]
Saving to: ‘220.zip’


2021-12-20 19:40:23 (68.8 MB/s) - ‘220.zip’ saved [638171816/638171816]



In [87]:
model_path = 'model.bin'

model_ru = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [130]:
queries = []
for query in df['query']:
    query_ = query.lower()
    query_ = re.sub(r'[^\w\s]',' ',query_)
    query_ = re.sub(r'\d+','',query_)
    query_ = re.sub(r'\s+',' ',query_)
    query_ = query_.strip()
    updated_query = []
    #print(query_)
    for word in query_.split(' '):
        try:
            if len(word) > 3:
                word = tag(word, True)
                if word not in stopwords.words("russian"):
                    updated_query.append(word)
        except KeyError:
            print(f'KeyError: query: {query} . Word: {word}')
        except IndexError:
            print(f'IndexError: query: {query} . Word: {word}')
    queries.append({'Initial_query': query, 'Updated_query': ' '.join(updated_query)})

Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (KnownPrefixAnalyzer(known_prefixes=<...>, min_remainder_length=3, score_multiplier=0.75), 'ре')))
Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (KnownPrefixAnalyzer(known_prefixes=<...>, min_remainder_length=3, score_multiplier=0.75), 'ре')))
Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (KnownPrefixAnalyzer(known_prefixes=<...>, min_remainder_length=3, score_multiplier=0.75), 'ре')))
Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (KnownPrefixAnalyzer(known_prefixes=<...>, min_remainder_length=3, score_multiplier=0.75), 'ре')))
Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (Kno

In [133]:
known_words = set()
unknown_words = set()
for query in queries:
    for word in query['Updated_query'].split(' '):
        if word not in model_ru:
            unknown_words.add(word)
        else:
            known_words.add(word)

In [134]:
print(len(known_words))
print(len(unknown_words))

7342
5050


In [156]:
'варкравт_NOUN' in model_ru

False

In [140]:
m.parse('шлепанец')

[Parse(word='шлёпанец', tag=OpencorporaTag('NOUN,inan,masc sing,nomn'), normal_form='шлёпанец', score=0.5, methods_stack=((DictionaryAnalyzer(), 'шлёпанец', 118, 0),)),
 Parse(word='шлёпанец', tag=OpencorporaTag('NOUN,inan,masc sing,accs'), normal_form='шлёпанец', score=0.5, methods_stack=((DictionaryAnalyzer(), 'шлёпанец', 118, 3),))]

In [135]:
list(unknown_words)[:100]

['',
 'гербалайф_NOUN',
 'жилкин_NOUN',
 'кушадас_NOUN',
 'пинч_NOUN',
 'кролин_NOUN',
 'повера_NOUN',
 'бежать_ADV',
 'нднс_NOUN',
 'кидман_NOUN',
 'динагуля_NOUN',
 'мясникович_NOUN',
 'меренговый_ADJ',
 'кузьменко_NOUN',
 'редми_X',
 'шплинт_NOUN',
 'санинск_NOUN',
 'европрестиж_NOUN',
 'полмимерный_ADJ',
 'гирич_NOUN',
 'ариаморган_NOUN',
 'запрешить_ADV',
 'клёш_NOUN',
 'булутс_NOUN',
 'окк_NOUN',
 'ривольвера_NOUN',
 'фесить_ADV',
 'авив_NOUN',
 'аревик_NOUN',
 'нейромультивит_NOUN',
 'вацап_NOUN',
 'духтари_NOUN',
 'шпиц_NOUN',
 'симб_NOUN',
 'шлёпанец_NOUN',
 'опарыш_NOUN',
 'павлов_NOUN',
 'сквирт_NOUN',
 'джонатан_NOUN',
 'ланцберг_NOUN',
 'кросться_VERB',
 'берджесс_NOUN',
 'расдвегать_VERB',
 'киевстарый_ADJ',
 'маркес_NOUN',
 'кабзиятгакарша_NOUN',
 'термобельё_NOUN',
 'юнайтед_NOUN',
 'монклер_NOUN',
 'буллка_NOUN',
 'диснейленд_NOUN',
 'скримера_NOUN',
 'нарвал_NOUN',
 'уайлдёр_NOUN',
 'обьгэс_NOUN',
 'агелера_NOUN',
 'аквапринт_NOUN',
 'крошить_ADV',
 'платтить_ADV',
 '

# Скачка векторов FASTTEXT c RUvectors из Gensim

In [83]:
import re
import gensim
import logging
import nltk.data 
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models import word2vec
from nltk.tokenize import sent_tokenize, RegexpTokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/spochukaev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [157]:
! wget http://vectors.nlpl.eu/repository/20/213.zip

--2021-12-20 20:41:50--  http://vectors.nlpl.eu/repository/20/213.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 64:ff9b::81f0:bdb5
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|64:ff9b::81f0:bdb5|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1485270300 (1.4G) [application/zip]
Saving to: ‘213.zip’


2021-12-20 20:42:10 (71.7 MB/s) - ‘213.zip’ saved [1485270300/1485270300]



In [159]:
! unzip 213.zip

Archive:  213.zip
  inflating: meta.json               
  inflating: model.model             
  inflating: model.model.vectors_ngrams.npy  
  inflating: model.model.vectors.npy  
  inflating: model.model.vectors_vocab.npy  
  inflating: README                  


In [160]:
model_path = 'model.model'

big_model = gensim.models.fasttext.FastTextKeyedVectors.load(model_path)

In [163]:
queries = []
for query in df['query']:
    query_ = query.lower()
    query_ = re.sub(r'[^\w\s]',' ',query_)
    query_ = re.sub(r'\d+','',query_)
    query_ = re.sub(r'\s+',' ',query_)
    query_ = query_.strip()
    updated_query = []
    #print(query_)
    for word in query_.split(' '):
        try:
            if len(word) > 3:
                word = tag(word, True)
                if word not in stopwords.words("russian"):
                    updated_query.append(word)
        except KeyError:
            print(f'KeyError: query: {query} . Word: {word}')
        except IndexError:
            print(f'IndexError: query: {query} . Word: {word}')
    queries.append({'Initial_query': query, 'Updated_query': ' '.join(updated_query)})

Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (KnownPrefixAnalyzer(known_prefixes=<...>, min_remainder_length=3, score_multiplier=0.75), 'ре')))
Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (KnownPrefixAnalyzer(known_prefixes=<...>, min_remainder_length=3, score_multiplier=0.75), 'ре')))
Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (KnownPrefixAnalyzer(known_prefixes=<...>, min_remainder_length=3, score_multiplier=0.75), 'ре')))
Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (KnownPrefixAnalyzer(known_prefixes=<...>, min_remainder_length=3, score_multiplier=0.75), 'ре')))
Parse(word='редми', tag=OpencorporaTag('UNKN'), normal_form='редми', score=1.0, methods_stack=((UnknAnalyzer(), 'дми'), (Kno

In [164]:
known_words = set()
unknown_words = set()
for query in queries:
    for word in query['Updated_query'].split(' '):
        if word not in big_model:
            unknown_words.add(word)
        else:
            known_words.add(word)

In [165]:
print(len(known_words))
print(len(unknown_words))

12392
0
