In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_pandas
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

from copy import copy
import os
import sys
sys.path.insert(0, '../code/venv')
from qa_system import QuestionAnswerSystem
from utils import Utility

In [7]:
df = pd.read_csv('../data/train_task_b.csv')
df = df.set_index('question_id')

## Создаем БД

In [None]:
qa_system = QuestionAnswerSystem()
qa_system.create_database(df)
qa_system.add_database_to_index()

## Переводим вопрос и ответ в лемматизированную форму

In [20]:
tqdm_pandas(tqdm(total=df.index.nunique()))
df['question_lem'] = df.groupby('question_id').progress_apply(lambda x: Utility.lemmatize_question(x.question.values[0]))
df['answer_lem'] = df.groupby('question_id').progress_apply(lambda x: Utility.lemmatize(x.answer.values[0]))

  0%|          | 0/50364 [00:00<?, ?it/s]
50365it [01:35, 528.18it/s]                           
100%|██████████| 50365/50365 [00:45<00:00, 1115.87it/s]


## Выбираем лучший алгоритм поиска релевантного документа на подвыборке

In [3]:
df_sample = df.sample(frac=0.01, random_state=0)

In [4]:
## Freq: 0.66
## Tf-idf: 0.88
## Bm25f: 0.9688
## MAX_INTERSECT_DOC: 0.76

search_rel_question_doc_alg_str = "BM25F"
qa_system = QuestionAnswerSystem(search_rel_question_doc_alg_str)

accuracy = 0
errors = {}
for question_lem, paragraph_id, question_id in tqdm(df_sample.reset_index()[['question_lem', 'paragraph_id', 'question_id']].values, total=df_sample.question.nunique()):
    doc_ids = qa_system.find_rel_question_doc_ids(question_str_lem=question_lem)
    if paragraph_id in doc_ids:
        accuracy += 1
    else:
        errors[question_id] = copy(doc_ids)

print('{}: Accuracy: {}'.format(search_rel_question_doc_alg_str, accuracy/df_sample.question.nunique()))

100%|██████████| 504/504 [00:54<00:00,  9.21it/s]

BM25F: Accuracy: 0.9761904761904762





## Для лучшего алгоритма делаем пересчет по всей коллекции

In [4]:
search_rel_question_doc_alg_str = "BM25F"
if not os.path.exists(search_rel_question_doc_alg_str):
    os.mkdir(search_rel_question_doc_alg_str)
qa_system = QuestionAnswerSystem(search_rel_question_doc_alg_str) 

accuracy = 0
errors = {}
for question_lem, paragraph_id, question_id in tqdm(df.reset_index()[['question_lem', 'paragraph_id', 'question_id']].values, total=df.question.nunique()):
    doc_ids = qa_system.find_rel_question_doc_ids(question_str_lem=question_lem)
    if paragraph_id in doc_ids:
        accuracy += 1
    else:
        errors[question_id] = doc_ids
    np.save('{}/{}.npy'.format(search_rel_question_doc_alg_str, question_id), doc_ids)
np.save('{}_interrogative_pronouns_errors.npy'.format(search_rel_question_doc_alg_str), errors)
print('{}: Accuracy: {}'.format(search_rel_question_doc_alg_str, accuracy/df.question.nunique()))

50364it [1:33:29,  8.98it/s]                           

BM25F: Accuracy: 0.9726971267449018





## Формируем датасет для обучения (этап 2)
## Не учитываем те вопросы, по которым ошиблись на этапе 1

In [2]:
search_rel_question_doc_alg_str = 'BM25F'
errors = np.load('{}_interrogative_pronouns_errors.npy'.format(search_rel_question_doc_alg_str)).item()
train_df = QuestionAnswerSystem.create_train_dataset(errors=errors)

tqdm_pandas(tqdm(total=train_df.shape[0]))
train_df['sentence_lem'] = train_df.progress_apply(lambda x: Utility.lemmatize(x.sentence), axis=1)
train_df.to_pickle('train_df.pkl')

100%|██████████| 50364/50364 [03:15<00:00, 257.87it/s]
  0%|          | 0/2718499 [00:00<?, ?it/s]
100%|██████████| 2718499/2718499 [2:30:25<00:00, 301.20it/s]  


## Делаем разметку для классификатора

In [24]:
train_df = pd.merge(train_df, df.reset_index()[['question_id', 'question', 'question_lem', 'answer', 'answer_lem']], how='left', on='question_id')
train_df_with_target = QuestionAnswerSystem.create_target(train_df)
train_df_with_target.to_pickle('train_df_with_target.pkl')

100%|██████████| 2718499/2718499 [04:32<00:00, 9986.65it/s] 


## Фильтруем датасет по наличию хотя бы одного предложения с ответом

In [27]:
train_df_with_target = train_df_with_target.set_index('question_id')
train_df_with_target = train_df_with_target[train_df_with_target.groupby('question_id').apply(lambda x: any(x.answer_in_sentence == 1))]
train_df_with_target.to_pickle('train_df_with_target_filtered.pkl')

  


## Этап 2. Построение классификатора Ans_in_sentence

### 4% вопросов отсеялись на этапе 1

In [10]:
df = pd.read_pickle('train_df_with_target_filtered.pkl')

In [13]:
train_df_idxs, test_df_idxs = Utility.train_test_split(df)

### Базовые статистики

In [6]:
import pickle
with open('idfs.pickle', 'rb' ) as f:
    idfs = pickle.load(f)
with open('idfs_lema.pickle', 'rb' ) as f:
    idfs_lem = pickle.load(f)
    
tqdm_pandas(tqdm(total=df.index.nunique()))
base_stats = df.groupby('question_id').progress_apply(lambda x: 
                                                     QuestionAnswerSystem.get_base_stats(
                                                         x.question.values[0],
                                                         list(x.sentence),
                                                         x.question_lem.values[0],
                                                         list(x.sentence_lem)
                                                     )).reset_index()
base_stats.drop('level_1', axis=1, inplace=True)
base_stats.columns = [
    'question_id',
    'unique_word_count_score',
    'unique_lem_word_count_score',
    
    'unique_word_percent_score',
    'unique_lem_word_percent_score',
    
    'sentence_len',
    'sentence_lem_len',
    
    'bm25f_score',
    'bm25f_lem_score',
    
    'tf_idf_score',
    'tf_idf_lem_score',
    
    'sentence',
    'sentence_lem'
]
base_stats.drop('sentence_lem', inplace=True, axis=1)
base_stats.to_pickle('base_stats.pkl')
df = pd.merge(df.reset_index(), base_stats, how='left', on=('question_id', 'sentence')).set_index('question_id')

  0%|          | 0/48238 [00:00<?, ?it/s]
48239it [50:52, 15.80it/s]                             


### Бейзлайны:
* max_unique_word_count_score
* max_unique_word_percent_score
* max_tf_idf_score
* max_bm25f_score

In [11]:
def get_scores(df, columns):
    n_questions = df.index.nunique()
    scores = {}
    tqdm_pandas(tqdm(total=n_questions))
    for col in columns:
        scores[col] = df.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, col)).sum()/n_questions
    return scores

train_scores = get_scores(df.loc[train_df_idxs], base_stats_new.columns.difference(['question_id', 'sentence_len', 'sentence_lem_len', 'sentence']))
test_scores = get_scores(df.loc[test_df_idxs], base_stats_new.columns.difference(['question_id', 'sentence_len', 'sentence_lem_len', 'sentence']))

scores = {'Train': {}, 'Test': {}}
scores['Train'] = train_scores
scores['Test'] = test_scores
pd.DataFrame(scores).T

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [00:40, 830.47it/s]                           
100%|██████████| 33768/33768 [00:44<00:00, 764.89it/s]
100%|██████████| 33768/33768 [00:38<00:00, 883.93it/s] 
100%|██████████| 33768/33768 [00:39<00:00, 863.71it/s] 
100%|██████████| 33768/33768 [00:37<00:00, 905.36it/s] 
100%|██████████| 33768/33768 [00:35<00:00, 953.73it/s] 
100%|██████████| 33768/33768 [00:37<00:00, 907.91it/s] 
100%|██████████| 33768/33768 [00:36<00:00, 913.67it/s] 
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:15, 927.18it/s]                           
100%|██████████| 14472/14472 [00:15<00:00, 939.97it/s] 
100%|██████████| 14472/14472 [00:14<00:00, 981.02it/s] 
100%|██████████| 14472/14472 [00:14<00:00, 993.37it/s] 
100%|██████████| 14472/14472 [00:15<00:00, 911.01it/s] 
100%|██████████| 14472/14472 [00:19<00:00, 727.57it/s]
100%|██████████| 14472/14472 [00:16<00:00, 871.89it/s]
100%|██████████| 14472/14472 [00:15<00:00, 957.21it/s] 


Unnamed: 0,bm25f_lem_score,bm25f_score,tf_idf_lem_score,tf_idf_score,unique_lem_word_count_score,unique_lem_word_percent_score,unique_word_count_score,unique_word_percent_score
Test,0.744524,0.69387,0.637758,0.67846,0.776726,0.776726,0.746666,0.746666
Train,0.748186,0.703468,0.641869,0.685107,0.7802,0.7802,0.749104,0.749104


In [26]:
df = pd.read_pickle('train_df_with_target_filtered.pkl')
train_df_idxs, test_df_idxs = Utility.train_test_split(df)

base_stats = pd.read_pickle('base_stats.pkl')
base_stats.drop('tf_idf_score', axis=1, inplace=True)
base_stats.drop('tf_idf_lem_score', axis=1, inplace=True)

base_stats_new = pd.read_pickle('tf_idf_bm25f_stats.pkl')
base_stats = pd.merge(base_stats, base_stats_new, how='left', on=('question_id', 'sentence'))

df = pd.merge(df.reset_index(), base_stats, how='left', on=('question_id', 'sentence')).set_index('question_id')

### Лог-регрессия на базовых фичах

In [4]:
target = 'answer_in_sentence'
predictors = df.columns.difference([
    'sentence',
    'sentence_lem',
    'question',
    'question_lem',
    'answer',
    'answer_lem',
    target
])
df_train = df.loc[train_df_idxs].copy()
df_test = df.loc[test_df_idxs].copy()

sc = StandardScaler()
X_train_sc = sc.fit_transform(df_train[predictors])
X_test_sc = sc.transform(df_test[predictors])

clf = LogisticRegression()
clf.fit(X_train_sc, df_train[target])

df_train['train_predict_proba'] = clf.predict_proba(X_train_sc)[:, 1]
df_test['test_predict_proba'] = clf.predict_proba(X_test_sc)[:, 1]

tqdm_pandas(tqdm(total=df_train.index.nunique()))
train_score = df_train.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'train_predict_proba')).sum()/df_train.index.nunique()

tqdm_pandas(tqdm(total=df_test.index.nunique()))
test_score = df_test.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'test_predict_proba')).sum()/df_test.index.nunique()

scores = {'Train': train_score, 'Test': test_score}
scores

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [01:42, 329.75it/s]                           
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:39, 363.30it/s]                           


{'Test': 0.81749706309170067, 'Train': 0.82423668078301304}

### LGB

In [5]:
clf = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, min_child_samples=1000, n_jobs=-1)
clf.fit(df_train[predictors], df_train[target])

df_train['train_predict_proba'] = clf.predict_proba(df_train[predictors])[:, 1]
df_test['test_predict_proba'] = clf.predict_proba(df_test[predictors])[:, 1]

tqdm_pandas(tqdm(total=df_train.index.nunique()))
train_score = df_train.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'train_predict_proba')).sum()/df_train.index.nunique()

tqdm_pandas(tqdm(total=df_test.index.nunique()))
test_score = df_test.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'test_predict_proba')).sum()/df_test.index.nunique()

scores = {'Train': train_score, 'Test': test_score}
scores

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [00:46, 726.00it/s]                           
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:19, 739.02it/s]                           


{'Test': 0.8253748877064474, 'Train': 0.82802736399443244}

In [31]:
clf = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, min_child_samples=1000, n_jobs=-1)
clf.fit(df_train[predictors], df_train[target])

df_train['train_predict_proba'] = clf.predict_proba(df_train[predictors])[:, 1]
df_test['test_predict_proba'] = clf.predict_proba(df_test[predictors])[:, 1]

tqdm_pandas(tqdm(total=df_train.index.nunique()))
train_score = df_train.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'train_predict_proba')).sum()/df_train.index.nunique()

tqdm_pandas(tqdm(total=df_test.index.nunique()))
test_score = df_test.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'test_predict_proba')).sum()/df_test.index.nunique()

scores = {'Train': train_score, 'Test': test_score}
scores

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [02:06, 267.76it/s]                           
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:31, 454.79it/s]                           


{'Test': 0.8245456430101582, 'Train': 0.82737584031746969}