In [None]:
# Location of the raw query file (read as tab-separated data further down).
# The path is relative, so it resolves against the notebook's working directory.
path_to_queries = './../data/ranking-long-tail-queries-fall-2020/queries.tsv'

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import os
import functools
import operator
from sklearn.feature_extraction.text import CountVectorizer
import requests
from tqdm import notebook
import pymorphy2
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

In [None]:
# Read the query file: tab-separated with no header row, so pandas assigns integer
# column labels (columns 0 and 1 are the ones used later in the notebook).
# NOTE(review): with pandas' default NA handling, a query whose text is e.g. "NA" or
# "null" would be loaded as a missing value — confirm whether keep_default_na=False
# is needed for this data.
queries_df = pd.read_csv(path_to_queries, sep='\t', header=None)
queries_df.head()

In [None]:
# Column 1 of the raw frame holds the query text.
queries = queries_df.values[:, 1]

# Spell-checker endpoint. `json_path` keeps its original value for any code that still
# builds the URL by hand; the requests below pass the text through `params` instead so
# that characters such as '&', '#', '+' and '%' are URL-encoded rather than corrupting
# the request.
speller_url = 'https://speller.yandex.net/services/spellservice.json/checkText'
json_path = speller_url + '?text='

# Upper bound (seconds) for a single spell-check request, so one stalled call cannot
# hang the whole loop.
SPELLER_TIMEOUT_SEC = 10


def replace_misspellings(query, changes):
    """Return `query` with every misspelled word replaced by its suggestion.

    Parameters
    ----------
    query : str
        Original query text.
    changes : dict[str, str]
        Maps a misspelled word (as reported by the speller) to its replacement.

    Returns
    -------
    str
        The corrected query. Whole-word occurrences are replaced first so that a
        short misspelling is not rewritten inside a longer, correct word; only when
        no whole-word occurrence exists does it fall back to plain substring
        replacement.
    """
    fixed = query
    for word, suggestion in changes.items():
        if not word:
            continue
        whole_word = re.compile(r'(?<!\w)' + re.escape(word) + r'(?!\w)')
        # A callable replacement keeps backslashes in the suggestion literal.
        fixed, hits = whole_word.subn(lambda _match, s=suggestion: s, fixed)
        if hits == 0:
            fixed = fixed.replace(word, suggestion)
    return fixed


corrected_queries = []
# One session reuses the underlying connection across all requests.
with requests.Session() as session:
    for query in notebook.tqdm(queries):
        try:
            response = session.get(speller_url, params={'text': query},
                                   timeout=SPELLER_TIMEOUT_SEC)
            response.raise_for_status()
            mistakes = response.json()
        except (requests.RequestException, ValueError) as err:
            # Keep the query unchanged so corrected_queries stays aligned with queries.
            print('ERROR', repr(query), err)
            corrected_queries.append(query)
            continue

        # Take the first suggestion for every reported word that has at least one.
        changes = {mistake['word']: mistake['s'][0]
                   for mistake in mistakes if mistake.get('s')}
        corrected_queries.append(replace_misspellings(query, changes))

In [None]:
# Russian stopwords minus a handful of question/purpose words, which are therefore
# NOT filtered out of queries later (presumably because they carry the query's intent).
my_stopwords = list(
    set(stopwords.words('russian')).difference(
        {'как', 'когда', 'почему', 'зачем', 'чтобы', 'что'}
    )
)
my_stopwords[:5]

In [None]:
# Single morphological analyzer instance, created once and reused by the
# lemmatization code below.
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Memo of word -> normal form, shared by every call to lemmatizer().
PYMORPHY_CACHE = {}


def lemmatizer(words):
    """Lazily yield the normal form of each word in `words`.

    Results are memoised in the module-level PYMORPHY_CACHE, keyed by the word
    itself. Keying by ``hash(word)`` would let two different words with colliding
    hashes share one lemma; the dict already hashes its keys and resolves
    collisions by equality.

    Parameters
    ----------
    words : iterable of str
        Tokens to lemmatize.

    Yields
    ------
    str
        ``normal_form`` of the first parse returned by ``morph.parse(word)``.
    """
    for word in words:
        lemma = PYMORPHY_CACHE.get(word)
        if lemma is None:
            lemma = morph.parse(word)[0].normal_form
            PYMORPHY_CACHE[word] = lemma
        yield lemma

In [None]:
@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def clean_query(query):
    """Lemmatize a query and drop stopwords.

    The query is split on whitespace, each token is lemmatized, and every lemma
    found in `my_stopwords` or in NLTK's English stopword list is removed.

    Parameters
    ----------
    query : str
        Query text.

    Returns
    -------
    str
        Remaining lemmas joined by single spaces (empty string if none remain).
    """
    # Build the exclusion set once per query instead of calling
    # stopwords.words('english') and scanning two lists for every token.
    excluded = _english_stopwords().union(my_stopwords)
    kept_tokens = [token for token in lemmatizer(query.split()) if token not in excluded]
    return ' '.join(kept_tokens)

In [None]:
# Clean every spell-corrected query, showing a progress bar while iterating.
final_queries = [clean_query(corrected) for corrected in notebook.tqdm(corrected_queries)]

In [None]:
# Overwrite the query-text column (label 1) with the cleaned queries.
# NOTE(review): this mutates queries_df in place, so re-running the earlier cell that
# reads queries_df.values[:, 1] would then operate on already-cleaned text.
queries_df[1] = final_queries

In [None]:
# Give the integer-labelled columns descriptive names; rename() returns a new frame.
final_queries_df = queries_df.rename({0: 'QueryId', 1: 'Query'}, axis='columns')
final_queries_df.head()

In [None]:
# Write the cleaned queries as CSV without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
# The output name (no extension) is kept unchanged since other code may read this path.
final_queries_df.to_csv('queries_corrected', index=False)