In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import vaex as vx
import sys
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import itertools
from tqdm.contrib.itertools import product
import pymorphy2
%matplotlib inline


tqdm.pandas()

In [None]:
#!pip3 install vaex
#!pip3 install autocorrect
#!pip3 install fuzzywuzzy
#!pip3 install python-Levenshtein
#!pip3 install pymorphy2

### Чтение данных

In [None]:
%%time
data = vx.open('history_small.csv')
data.__delitem__('weekday')
data.__delitem__('time')
data.__delitem__('Unnamed: 0')
data = data[['UQ', 'cnt']]

In [None]:
# Shared pymorphy2 analyzer; the normalization cell calls morph.parse(...) on each
# word to read its candidate normal forms and part-of-speech tags.
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Drop rows whose UQ value is missing, then lower-case UQ: the tokenizing regex
# used later only recognises lower-case Latin/Cyrillic letter ranges.
data = data.dropna(column_names=['UQ'])
data['UQ'] = data['UQ'].str.lower()

In [None]:
# Distinct UQ values as a Python list; the word-counting loop iterates over it
# and the final cell writes it to source_req.csv.
source = list(data['UQ'].unique())

In [None]:
# Russian stop-word list from the NLTK `stopwords` corpus (downloaded in the imports cell).
stop_words = stopwords.words('russian')

In [None]:
%%time
# Values of the UQ column (presumably one entry per row, unlike the de-duplicated
# `source` -- TODO confirm); elements are read later via `.as_py()`.
key_source = data['UQ'].values

### Обработка входных данных

In [None]:
# Выбор уникальных слов и подсчет количества использований, замена всех технических символов,
# очистка от всех слов, не содержащих буквы

In [None]:
# Count token occurrences across the distinct queries in `source`.
#
# Every run of characters that is not a digit, a lower-case Latin letter or a
# lower-case Cyrillic letter is treated as a separator. 'ё' (U+0451) lies
# outside the 'а-я' code-point range, so it is listed explicitly; without it
# any word containing 'ё' would be split into two fragments.
TOKEN_SEPARATOR = re.compile("[^0-9a-zа-яё]+")

word = {}
for query in tqdm(source):
    # str.split() without arguments drops the empty tokens that a leading or
    # trailing separator would otherwise leave behind.
    for token in TOKEN_SEPARATOR.sub(" ", query).split():
        word[token] = word.get(token, 0) + 1

In [None]:
# Отбор слов, содержащих только буквы, и формирование списка наиболее используемых слов

In [None]:
# Turn the token -> count mapping into a frame with columns `index` (token) and `num` (count).
words = (
    pd.DataFrame.from_dict(word, orient='index')
    .reset_index()
    .rename(columns={0: 'num'})
)
print(words.shape)

# Keep only tokens that contain at least one Latin or Cyrillic letter.
has_letter = words['index'].str.contains(r'[a-zа-я]')
words = words[has_letter]
print(words.shape)

# Most frequent tokens first.
words = words.sort_values(by='num', ascending=False)

# Same table without stop words, and its subset of tokens seen at least 5 times.
is_stop_word = words['index'].isin(stop_words)
words_nostop = words[~is_stop_word]
top_words = words_nostop[words_nostop['num'] >= 5]

In [None]:
# Приведение прилагательных и существительных к нормализованной форме, где это возможно

In [None]:
# Map every non-stop word to a normalized form.
#
# A word is replaced by the normal form of its first parse when either
#   * all parses agree on a single normal form, or
#   * every parse is tagged ADJF;
# otherwise the word is kept unchanged.
POS_list = ['ADJF']
wd_list = list(words_nostop['index'])
norm_words = {}
for token in tqdm(wd_list):
    parses = morph.parse(token)
    lemmas = {parse.normal_form for parse in parses}
    pos_tags = {parse.tag.POS for parse in parses}
    if len(lemmas) == 1 or pos_tags == {'ADJF'}:
        norm_words[token] = parses[0].normal_form
    else:
        norm_words[token] = token

In [None]:
# Создание чистого датасета ключевых слов для формирования тэговых запросов

In [None]:
# Build the table of normalized keywords with their total usage counts.
#
# 1. `norm_words` (original token -> normalized form) becomes a frame with
#    columns `index` (original token) and `word` (normalized form).
# 2. The per-token counts from `words_nostop` are attached on `index`.
# 3. Counts are summed per normalized form. Only `num` is aggregated: summing
#    the whole frame would also "sum" the string column `index`, which on
#    pandas >= 2.0 concatenates the strings and, after the rename below,
#    leaves two columns called `index`.
# 4. Only forms at or above the 0.95 quantile of `num` are kept.
words_clear = pd.DataFrame.from_dict(norm_words, orient='index', columns=['word']).reset_index()
words_clear = words_clear.merge(words_nostop, how='left', on='index')
words_clear = (
    words_clear.groupby('word', as_index=False)['num'].sum()
    .sort_values(by='num', ascending=False)
    .rename(columns={'word': 'index'})
)
words_clear = words_clear.loc[words_clear.num >= words_clear.num.quantile(0.95)].copy()

In [None]:
# Формирование списков наиболее используемых словосочетаний из 2 и 3 слов

In [None]:
# Count the leading two-word and three-word phrases of the queries in `key_source`.
#
# * A two-word phrase is counted only when neither of its words is a stop word.
# * A three-word phrase is counted for every query with more than two words
#   (no stop-word filtering is applied to it).
#
# The loop is bounded by len(key_source): bounding it by len(source) (the
# de-duplicated list) while indexing `key_source` would silently skip every
# row past the number of distinct queries.
stop_word_set = set(stop_words)  # set lookup instead of scanning the list per token

double_words = {}
triple_words = {}
for i in tqdm(range(len(key_source))):
    # Elements of `key_source` are converted to Python strings with .as_py().
    tokens = key_source[i].as_py().split(' ')
    if len(tokens) < 2:
        continue
    if not any(token in stop_word_set for token in tokens[:2]):
        pair = ' '.join(tokens[:2])
        double_words[pair] = double_words.get(pair, 0) + 1
    if len(tokens) > 2:
        triple = ' '.join(tokens[:3])
        triple_words[triple] = triple_words.get(triple, 0) + 1

In [None]:
# Convert the phrase -> count dictionaries into frames with columns `index`
# (phrase) and `num` (count), most frequent first, keeping phrases seen at
# least 50 times.
# NOTE: the names are rebound from dict to DataFrame, so this cell cannot be
# re-run without first re-running the counting cell.
double_words = (
    pd.DataFrame.from_dict(double_words, orient='index')
    .reset_index()
    .rename(columns={0: 'num'})
    .sort_values(by='num', ascending=False)
)
double_words = double_words[double_words['num'] >= 50]

triple_words = (
    pd.DataFrame.from_dict(triple_words, orient='index')
    .reset_index()
    .rename(columns={0: 'num'})
    .sort_values(by='num', ascending=False)
)
triple_words = triple_words[triple_words['num'] >= 50]

In [None]:
# Очистка используемых словосочетаний, объединение словосочетаний, отличающихся только расстановкой слов

In [None]:
def _merge_word_permutations(phrases):
    """Merge phrases that differ only in word order.

    Parameters
    ----------
    phrases : pandas.DataFrame
        Frame with columns ``index`` (space-separated phrase) and ``num`` (count).

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` and ``num``, one row per set of words, sorted by
        ``num`` ascending. ``num`` is the sum over all orderings; ``index`` is
        the first ordering encountered in ``phrases``.
    """
    # Canonical key: the phrase's words in sorted order.
    keyed = phrases.assign(
        index_modi=phrases['index'].str.split(' ').apply(lambda parts: ' '.join(sorted(parts)))
    )
    # Aggregate only `num`; summing the whole frame would also aggregate the
    # string column `index` on pandas >= 2.0 and break the merge below.
    totals = keyed.groupby('index_modi', as_index=False)['num'].sum()
    representatives = keyed.drop_duplicates(subset=['index_modi'])[['index', 'index_modi']]
    merged = totals.merge(representatives, how='left', on='index_modi')
    return merged[['index', 'num']].sort_values(by='num')


double_words = _merge_word_permutations(double_words)
triple_words = _merge_word_permutations(triple_words)

In [None]:
# Сохранение данных для перехода к этапу расчета

In [None]:
# Tag every table with its origin and stack them into one frame.
# `.assign` returns a new frame, so no value is written into a slice of another
# frame (which is what `words_nostop` is) and the cell can be re-run safely.
words_clear = words_clear.assign(source='words_clear')
double_words = double_words.assign(source='double_words')
triple_words = triple_words.assign(source='triple_words')
words_nostop = words_nostop.assign(source='words_nostop')
total_df = pd.concat([words_clear, double_words, triple_words, words_nostop], ignore_index=True)
# NOTE(review): total_df is only built here; it is not written to disk in this cell.

# Persist the list of distinct queries. `to_csv` is a DataFrame method; calling
# it as a bare function raises NameError.
dd = pd.DataFrame(source)
dd.to_csv('source_req.csv')