In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text # Not used directly but needed to import TF ops.

from modules.evaluation import evaluate

In [2]:
model = hub.KerasLayer("models/USEm_large3")
df = pd.read_parquet('data/qa_clean.parquet')
graph = pd.read_parquet('data/qa_sim.parquet').loc[:, ['q_id', 'ans_ids']]

In [3]:
questions = df['question'].unique()
answers = df['answer'].to_numpy()
similarity = []
for st in graph['ans_ids'].to_numpy():
    el = eval(st)
    if isinstance(el, int):
        similarity.append([el])
    else:
        similarity.append(list(el))

Считаем MAP@10

In [5]:
map10 = evaluate(model, questions, answers, similarity, model_type='use', batch_size=64)

Calculating embeddings: 100%|██████████| 328/328 [26:06<00:00,  4.78s/it]
Calculating embeddings: 100%|██████████| 771/771 [37:17<00:00,  2.90s/it]
Searching for top k texts for all inputs: 100%|██████████| 21018/21018 [14:46<00:00, 23.70it/s]


In [6]:
print(map10)

0.06969540386306033


## Подбор кандидатов

In [2]:
df = pd.read_parquet('data/qq_sim.parquet')
questions = df['question'].unique()
model = hub.KerasLayer("models/USEm_large3")
df['checked'] = False
df['scores'] = np.nan

In [3]:
from modules.evaluation import calculate_embeddings, Index
from tqdm import tqdm

embs = calculate_embeddings(model, questions, dims=512, model_type='use')
search_index = Index(model, questions, 512, model_type='use')
candidates = []
for i, _ in tqdm(enumerate(questions), desc='Searching for top k texts for all inputs', total=len(questions)):
    texts, _ = search_index.search(embs[i].reshape((1, -1)), k=11) #search wants 2 dims
    candidates.append(texts[1:])

df['candidates'] = candidates

Calculating embeddings: 100%|██████████| 80/80 [06:34<00:00,  4.93s/it]
Calculating embeddings: 100%|██████████| 80/80 [06:08<00:00,  4.61s/it]
Searching for top k texts for all inputs: 100%|██████████| 5161/5161 [00:25<00:00, 203.49it/s]


In [4]:
import argparse, elasticsearch, json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# index and document type constants
INDEX_NAME = "documents"
TYPE = "document"

# get a client
es = Elasticsearch()

# create an index, ignore if it exists already
es.indices.create(index='documents', ignore=400)

# json-ize the lines in the file
def make_documents(f):
    for l in f:
        doc = {
                '_op_type': 'create',
                '_index': INDEX_NAME,
                '_type': TYPE,
                '_source': {'text': l.strip() }
        }
        yield( doc )            

In [5]:
# put documents in index in bulk
bulk(es, make_documents(questions))

# count the matches
count = es.count(index=INDEX_NAME, doc_type=TYPE, body={ "query": {"match_all" : { }}})

# now we can do searches.
print("Ok. I've got an index of {0} documents. Let's do some searches...".format(count['count']))

Ok. I've got an index of 2445 documents. Let's do some searches...


In [9]:
candidates = []
for i, row in tqdm(df.iterrows(), desc='Elastic search по всем вопросам', total=len(df)):
    local_cands = list(row['candidates'])
    results = es.search(index=INDEX_NAME, doc_type=TYPE, body={"query": {"match": {"text": row['question']}}})
    for res in results['hits']['hits'][1:6]:
        local_cands.append(res['_source']['text'])

    candidates.append(local_cands)

df['candidates'] = candidates

Elastic search по всем вопросам: 100%|██████████| 5161/5161 [00:23<00:00, 223.94it/s]


In [10]:
df.loc[1, 'candidates'][10]

'А кто-нибудь пробовал работать с корпусом изданных книг? Грубо говоря скачать флибусту и дальше на основе этих данных делать модели / аналитику.\nМне интересен опыт работы с данными и какие задачи на них решались и кто сейчас занимается этим.\nНапример, насколько реально (и сложно) оценить уровень языка, грамотность, ожидаемую популярность книги, издаваемость (форматность) книги и т.п.'

In [11]:
df.head()

Unnamed: 0,question,similiar_questions_ids_in_clean_df,question_ids_in_clean_df,checked,candidates,scores
0,"Народ, кто знает, есть ли в GO библиотека для ...",,0,False,"[Ребята, привет!\nНа днях коллега из академии ...",
1,"Всем привет, нуждаюсь в помощи по составлению ...",,"1, 2, 3, 4",False,[Прошел ассессмент на Senior Systems Engineer ...,
2,"и еще вопрос, нет ли веяний, что этот проект п...",,"5, 6",False,"[Кто-нибудь знает, есть ли что-нибудь похожее ...",
3,Кто-нибудь здесь проходил\n<http://coursera.or...,,7,False,[<https://www.coursera.org/learn/reinforcement...,
4,Кто-нибудь знает готовые реализации для обнару...,,66,False,[Что можно почитать на тему отслеживание анома...,


In [12]:
df.to_parquet('data/qq_sim.parquet', compression='brotli', index=False)

С использованием SentenceBERT

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from modules.evaluation import evaluate

In [2]:
df = pd.read_parquet('data/qa_clean.parquet')
graph = pd.read_parquet('data/qa_sim.parquet').loc[:, ['q_id', 'ans_ids']]
model = SentenceTransformer('distiluse-base-multilingual-cased')

In [3]:
df['question'] = df['question'].str.lower()
df['answer'] = df['answer'].str.lower()

In [4]:
questions = df['question'].unique()
answers = df['answer'].to_numpy()
similarity = []
for st in graph['ans_ids'].to_numpy():
    el = eval(st)
    if isinstance(el, int):
        similarity.append([el])
    else:
        similarity.append(list(el))

In [5]:
map10 = evaluate(model, questions, answers, similarity, model_type='sbert', batch_size=128)

Calculating embeddings: 100%|██████████| 164/164 [10:39<00:00,  3.90s/it]
Calculating embeddings: 100%|██████████| 771/771 [18:40<00:00,  1.45s/it]
Searching: 100%|██████████| 21018/21018 [11:28<00:00, 30.52it/s]


In [6]:
print(map10)

0.0494763472063598
