In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_pandas
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from copy import copy
import os
import sys
sys.path.insert(0, '../code/venv')
from qa_system import QuestionAnswerSystem, Question
from utils import Utility

In [5]:
df = pd.read_csv('../data/train_task_b.csv')

In [3]:
qa_system = QuestionAnswerSystem()
qa_system.create_database(df)
qa_system.add_database_to_index()

100%|██████████| 9078/9078 [02:53<00:00, 52.25it/s]
100%|██████████| 9078/9078 [00:27<00:00, 332.84it/s]


## Выбираем лучший алгоритм поиска релевантного документа на подвыборке

In [3]:
df_sample = df.sample(frac=0.1, random_state=0)

In [22]:
search_rel_question_doc_alg_str = "Frequency"
qa_system = QuestionAnswerSystem(search_rel_question_doc_alg_str)

accuracy = 0
errors = {}
for question_str, paragraph_id, question_id in tqdm(df_sample[['question', 'paragraph_id', 'question_id']].values, total=df_sample.question.nunique()):
    question = Question(question_str)
    #question.find_question_str_lem(question_id=question_id)
    doc_ids = qa_system.find_rel_question_doc_ids(question=question, question_id=question_id)
    if paragraph_id in doc_ids:
        accuracy += 1
    else:
        errors[question_id] = copy(doc_ids)

print('{}: Accuracy: {}'.format(search_rel_question_doc_alg_str, accuracy/df_sample.question.nunique()))

100%|██████████| 5036/5036 [14:50<00:00,  5.66it/s]

Frequency: Accuracy: 0.664614773629865





## Для лучшего алгоритма делаем пересчет по всей коллекции

In [3]:
search_rel_question_doc_alg_str = "BM25F"
if not os.path.exists(search_rel_question_doc_alg_str):
    os.mkdir(search_rel_question_doc_alg_str)
qa_system = QuestionAnswerSystem(search_rel_question_doc_alg_str)

accuracy = 0
errors = {}
for question_str, paragraph_id, question_id in tqdm(df[['question', 'paragraph_id', 'question_id']].values, total=df.question.nunique()):
    question = Question(question_str)
    #question.find_question_str_lem(question_id=question_id)
    doc_ids = qa_system.find_rel_question_doc_ids(question=question, question_id=question_id)
    if paragraph_id in doc_ids:
        accuracy += 1
    else:
        errors[question_id] = doc_ids
    np.save('{}/{}.npy'.format(search_rel_question_doc_alg_str, question_id), doc_ids)
np.save('{}_interrogative_pronouns_errors.npy'.format(search_rel_question_doc_alg_str), errors)
print('{}: Accuracy: {}'.format(search_rel_question_doc_alg_str, accuracy/df.question.nunique()))

50364it [1:37:14,  8.63it/s]                           

BM25F: Accuracy: 0.9724588471237664





## Формируем датасет для обучения (этап 2)

In [2]:
search_rel_question_doc_alg_str = "BM25F"
qa_system = QuestionAnswerSystem(search_rel_question_doc_alg_str)
train_df = qa_system.create_train_dataset()

100%|██████████| 50364/50364 [06:37<00:00, 126.61it/s]


## Переводим вопрос и ответ в лемматизированную форму

In [6]:
from pymystem3 import Mystem
stem = Mystem()

answers_lem = []
questions_lem = []
for question_id, question_str, answer in tqdm(df[['question_id', 'question', 'answer']].values, total=df.question.nunique()):
    question = Question(question_str)
    question.find_question_str_lem(question_id)
    questions_lem.append(question.question_str_lem)
    answers_lem.append(''.join(stem.lemmatize(answer))[:-1])
    
df['answer_lem'] = answers_lem
df['question_lem'] = questions_lem 

50364it [02:55, 287.45it/s]                           


In [7]:
train_df = pd.merge(train_df, df[['question_lem', 'question_id', 'answer_lem']], how='left', on='question_id')

In [8]:
train_df.to_pickle('train_df_interrogative_pronouns.pkl')

## Фильтруем датасет по пересечению слов из вопроса и ответа

In [3]:
train_df = pd.read_pickle('train_df_interrogative_pronouns.pkl')

In [9]:
train_df_filtered = QuestionAnswerSystem.filter_train_dataset(train_df)

100%|██████████| 50363/50363 [3:17:33<00:00,  4.25it/s]  


In [40]:
train_df_filtered.to_pickle('train_df_interrogative_pronouns_filtered.pkl')

## Делаем разметку для классификатора

In [41]:
train_df_filtered = pd.read_pickle('train_df_interrogative_pronouns_filtered.pkl')

In [42]:
train_df_filtered = QuestionAnswerSystem.create_target(train_df_filtered)

100%|██████████| 1506660/1506660 [02:33<00:00, 9844.05it/s]


In [43]:
train_df_filtered.to_pickle('train_df_interrogative_pronouns_with_target.pkl')

## Сравнение моделей этапа 2

In [2]:
df = pd.read_pickle('train_df_interrogative_pronouns_with_target.pkl')
df = df.set_index('question_id')

## Обучение и валидация

In [3]:
train_df_idxs, test_df_idxs = Utility.train_test_split(df)

### Предложение с максимальным пересечением по словам с вопросом

In [4]:
qa_system = QuestionAnswerSystem()
max_match_sentance = df.groupby('question_id').apply(lambda x: qa_system.get_max_match_sentence(x))
df['max_match_sentance'] = max_match_sentance
df['max_match_sentance_answer'] = (df.max_match_sentance == df.sentence_lem).astype(int)

In [7]:
print('Train:')
Utility.calculate_metric(df.loc[train_df_idxs].answer_in_sentence, df.loc[train_df_idxs].max_match_sentance_answer)

print('Test:')
Utility.calculate_metric(df.loc[test_df_idxs].answer_in_sentence, df.loc[test_df_idxs].max_match_sentance_answer)

Train:
Precision: 0.7049888477935571
Recall: 0.6281444958744214
F1-score: 0.6643519442338135
Test:
Precision: 0.7048704936400184
Recall: 0.6265010837092144
F1-score: 0.6633792333457388


### Лог-регрессия

In [4]:
qa_system = QuestionAnswerSystem()
tqdm_pandas(tqdm(total=df.shape[0]))
base_features = df.progress_apply(lambda x: qa_system.create_base_features(x), axis=1)
for f in base_features.columns:
    df[f] = base_features[f]

  0%|          | 0/1506660 [00:00<?, ?it/s]
 31%|███       | 466409/1506660 [42:10<2:07:13, 136.28it/s]

KeyboardInterrupt: 

 31%|███       | 466424/1506660 [42:23<2:07:13, 136.28it/s]

In [77]:
target = 'answer_in_sentence'
predictions = df.columns.difference([target, 'sentence_lem', 'question_lem', 'answer_lem', 'doc_number'])

doc_number_dummies = pd.get_dummies(df.doc_number)

sc = StandardScaler()
X_train_sc = sc.fit_transform(df.loc[train_df_idxs][predictions])
X_test_sc = sc.transform(df.loc[test_df_idxs][predictions])

X_train = np.concatenate((X_train_sc, doc_number_dummies.loc[train_df_idxs].values), axis=1)
X_test = np.concatenate((X_test_sc, doc_number_dummies.loc[test_df_idxs].values), axis=1)

clf = LogisticRegression()
clf.fit(X_train, df.loc[train_df_idxs][target])

train_predict = clf.predict(X_train)
test_predict = clf.predict(X_test)

print('Train:')
Utility.calculate_metric(df.loc[train_df_idxs].answer_in_sentence, train_predict)

print('Test:')
Utility.calculate_metric(df.loc[test_df_idxs].answer_in_sentence, test_predict)

Train:
Precision: 0.8120591581342435
Recall: 0.5386898772388811
F1-score: 0.6477118054505309
Test:
Precision: 0.8157081014223871
Recall: 0.5408587663288618
F1-score: 0.6504402958788305


In [None]:
clf = LogisticRegression(penalty='l1')
clf.fit(X_train, df.loc[train_df_idxs][target])

In [None]:
1

In [103]:
LogisticRegression?

In [101]:
np.array(res)[np.argsort(np.abs(clf.coef_[0]))[::-1]]

array(['0', 'sentence_lem_no_stop_words_len', 'sentence_lem_no_punct_len',
       'sentence_lem_no_punct_question_lem_no_interrogative_pronouns_intersect_len',
       '9', '8', '7', '6', '5', '4',
       'question_lem_no_stop_words_sentence_lem_no_punct_jaccard_sim', '3',
       'question_lem_no_stop_words_sentence_lem_no_punct_fuzz_token_set_ratio',
       'sentence_lem_no_stop_words_question_lem_no_stop_words_intersect_len',
       'sentence_lem_no_stop_words_question_lem_no_interrogative_pronouns_intersect_len',
       'sentence_lem_no_stop_words_question_lem_no_punct_intersect_len',
       'sentence_lem_no_stop_words_question_lem_no_punct_jaccard_sim',
       'sentence_lem_no_stop_words_question_lem_no_interrogative_pronouns_jaccard_sim',
       '2',
       'sentence_lem_no_stop_words_question_lem_no_stop_words_fuzz_token_set_ratio',
       'sentence_lem_no_stop_words_question_lem_no_stop_words_jaccard_sim',
       'question_lem_no_stop_words_sentence_lem_no_punct_fuzz_token_sort_r

In [100]:
res = list(predictions) + list(doc_number_dummies.columns)

In [98]:
predictions

Index(['question_lem_no_punct_sentence_lem_no_punct_fuzz_partial_ratio',
       'question_lem_no_punct_sentence_lem_no_punct_fuzz_ratio',
       'question_lem_no_punct_sentence_lem_no_punct_fuzz_token_set_ratio',
       'question_lem_no_punct_sentence_lem_no_punct_fuzz_token_sort_ratio',
       'question_lem_no_punct_sentence_lem_no_punct_intersect_len',
       'question_lem_no_punct_sentence_lem_no_punct_jaccard_sim',
       'question_lem_no_stop_words_sentence_lem_no_punct_fuzz_partial_ratio',
       'question_lem_no_stop_words_sentence_lem_no_punct_fuzz_ratio',
       'question_lem_no_stop_words_sentence_lem_no_punct_fuzz_token_set_ratio',
       'question_lem_no_stop_words_sentence_lem_no_punct_fuzz_token_sort_ratio',
       'question_lem_no_stop_words_sentence_lem_no_punct_intersect_len',
       'question_lem_no_stop_words_sentence_lem_no_punct_jaccard_sim',
       'sentence_lem_no_punct_len',
       'sentence_lem_no_punct_question_lem_no_interrogative_pronouns_fuzz_partial_ratio'

In [97]:
list(doc_number_dummies.columns).

AttributeError: 'list' object has no attribute 'shape'