In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_pandas
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from itertools import combinations_with_replacement
from scipy.spatial.distance import cosine

from copy import copy
import os
import gc

#from qa_system import QuestionAnswerSystem
from utils import Utility

In [52]:
df = pd.read_csv('../data/train_task_b.csv')
df = df.set_index('question_id')

## Создаем БД

In [None]:
qa_system = QuestionAnswerSystem()
qa_system.create_database(df)
qa_system.add_database_to_index()

## Переводим вопрос и ответ в лемматизированную форму

In [20]:
tqdm_pandas(tqdm(total=df.index.nunique()))
df['question_lem'] = df.groupby('question_id').progress_apply(lambda x: Utility.lemmatize_question(x.question.values[0]))
df['answer_lem'] = df.groupby('question_id').progress_apply(lambda x: Utility.lemmatize(x.answer.values[0]))

  0%|          | 0/50364 [00:00<?, ?it/s]
50365it [01:35, 528.18it/s]                           
100%|██████████| 50365/50365 [00:45<00:00, 1115.87it/s]


## Выбираем лучший алгоритм поиска релевантного документа на подвыборке

In [3]:
df_sample = df.sample(frac=0.01, random_state=0)

In [4]:
## Freq: 0.66
## Tf-idf: 0.88
## Bm25f: 0.9688
## MAX_INTERSECT_DOC: 0.76

search_rel_question_doc_alg_str = "BM25F"
qa_system = QuestionAnswerSystem(search_rel_question_doc_alg_str)

accuracy = 0
errors = {}
for question_lem, paragraph_id, question_id in tqdm(df_sample.reset_index()[['question_lem', 'paragraph_id', 'question_id']].values, total=df_sample.question.nunique()):
    doc_ids = qa_system.find_rel_question_doc_ids(question_str_lem=question_lem)
    if paragraph_id in doc_ids:
        accuracy += 1
    else:
        errors[question_id] = copy(doc_ids)

print('{}: Accuracy: {}'.format(search_rel_question_doc_alg_str, accuracy/df_sample.question.nunique()))

100%|██████████| 504/504 [00:54<00:00,  9.21it/s]

BM25F: Accuracy: 0.9761904761904762





## Для лучшего алгоритма делаем пересчет по всей коллекции

In [4]:
search_rel_question_doc_alg_str = "BM25F"
if not os.path.exists(search_rel_question_doc_alg_str):
    os.mkdir(search_rel_question_doc_alg_str)
qa_system = QuestionAnswerSystem(search_rel_question_doc_alg_str) 

accuracy = 0
errors = {}
for question_lem, paragraph_id, question_id in tqdm(df.reset_index()[['question_lem', 'paragraph_id', 'question_id']].values, total=df.question.nunique()):
    doc_ids = qa_system.find_rel_question_doc_ids(question_str_lem=question_lem)
    if paragraph_id in doc_ids:
        accuracy += 1
    else:
        errors[question_id] = doc_ids
    np.save('{}/{}.npy'.format(search_rel_question_doc_alg_str, question_id), doc_ids)
np.save('{}_interrogative_pronouns_errors.npy'.format(search_rel_question_doc_alg_str), errors)
print('{}: Accuracy: {}'.format(search_rel_question_doc_alg_str, accuracy/df.question.nunique()))

50364it [1:33:29,  8.98it/s]                           

BM25F: Accuracy: 0.9726971267449018





## Формируем датасет для обучения (этап 2)
## Не учитываем те вопросы, по которым ошиблись на этапе 1

In [2]:
search_rel_question_doc_alg_str = 'BM25F'
errors = np.load('{}_interrogative_pronouns_errors.npy'.format(search_rel_question_doc_alg_str)).item()
train_df = QuestionAnswerSystem.create_train_dataset(errors=errors)

tqdm_pandas(tqdm(total=train_df.shape[0]))
train_df['sentence_lem'] = train_df.progress_apply(lambda x: Utility.lemmatize(x.sentence), axis=1)
train_df.to_pickle('train_df.pkl')

100%|██████████| 50364/50364 [03:15<00:00, 257.87it/s]
  0%|          | 0/2718499 [00:00<?, ?it/s]
100%|██████████| 2718499/2718499 [2:30:25<00:00, 301.20it/s]  


## Делаем разметку для классификатора

In [24]:
train_df = pd.merge(train_df, df.reset_index()[['question_id', 'question', 'question_lem', 'answer', 'answer_lem']], how='left', on='question_id')
train_df_with_target = QuestionAnswerSystem.create_target(train_df)
train_df_with_target.to_pickle('train_df_with_target.pkl')

100%|██████████| 2718499/2718499 [04:32<00:00, 9986.65it/s] 


## Фильтруем датасет по наличию хотя бы одного предложения с ответом

In [27]:
train_df_with_target = train_df_with_target.set_index('question_id')
train_df_with_target = train_df_with_target[train_df_with_target.groupby('question_id').apply(lambda x: any(x.answer_in_sentence == 1))]
train_df_with_target.to_pickle('train_df_with_target_filtered.pkl')

  


## Этап 2. Построение классификатора Ans_in_sentence

### 4% вопросов отсеялись на этапе 1

In [3]:
df = pd.read_pickle('train_df_with_target_filtered.pkl').reset_index()
df.drop_duplicates(subset=['question_id', 'sentence'], keep='first', inplace=True)
df = df.set_index('question_id')

In [4]:
train_df_idxs, test_df_idxs = Utility.train_test_split(df)

### Базовые статистики

In [6]:
import pickle
with open('idfs.pickle', 'rb' ) as f:
    idfs = pickle.load(f)
with open('idfs_lema.pickle', 'rb' ) as f:
    idfs_lem = pickle.load(f)
    
tqdm_pandas(tqdm(total=df.index.nunique()))
base_stats = df.groupby('question_id').progress_apply(lambda x: 
                                                     QuestionAnswerSystem.get_base_stats(
                                                         x.question.values[0],
                                                         list(x.sentence),
                                                         x.question_lem.values[0],
                                                         list(x.sentence_lem)
                                                     )).reset_index()
base_stats.drop('level_1', axis=1, inplace=True)
base_stats.columns = [
    'question_id',
    'unique_word_count_score',
    'unique_lem_word_count_score',
    
    'unique_word_percent_score',
    'unique_lem_word_percent_score',
    
    'sentence_len',
    'sentence_lem_len',
    
    'bm25f_score',
    'bm25f_lem_score',
    
    'tf_idf_score',
    'tf_idf_lem_score',
    
    'sentence',
    'sentence_lem'
]
base_stats.drop('sentence_lem', inplace=True, axis=1)
base_stats.to_pickle('base_stats.pkl')
df = pd.merge(df.reset_index(), base_stats, how='left', on=('question_id', 'sentence')).set_index('question_id')

  0%|          | 0/48238 [00:00<?, ?it/s]
48239it [50:52, 15.80it/s]                             


In [5]:
base_stats = pd.read_pickle('base_stats.pkl')
base_stats.drop_duplicates(subset=['question_id', 'sentence'], keep='first', inplace=True)
df = pd.merge(df.reset_index(), base_stats, how='left', on=('question_id', 'sentence')).set_index('question_id')

### Бейзлайны:
* max_unique_word_count_score
* max_unique_word_percent_score
* max_tf_idf_score
* max_bm25f_score

In [5]:
def get_scores(df, columns):
    n_questions = df.index.nunique()
    scores = {}
    tqdm_pandas(tqdm(total=n_questions))
    for col in columns:
        scores[col] = df.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, col)).sum()/n_questions
    return scores

train_scores = get_scores(df.loc[train_df_idxs], base_stats.columns.difference(['question_id', 'sentence_len', 'sentence_lem_len', 'sentence']))
test_scores = get_scores(df.loc[test_df_idxs], base_stats.columns.difference(['question_id', 'sentence_len', 'sentence_lem_len', 'sentence']))

scores = {'Train': {}, 'Test': {}}
scores['Train'] = train_scores
scores['Test'] = test_scores
pd.DataFrame(scores).T

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [00:21, 1600.23it/s]                           
100%|██████████| 33768/33768 [00:20<00:00, 1651.77it/s]
100%|██████████| 33768/33768 [00:21<00:00, 1606.59it/s]
100%|██████████| 33768/33768 [00:21<00:00, 1591.96it/s]
100%|██████████| 33768/33768 [00:20<00:00, 1678.68it/s]
100%|██████████| 33768/33768 [00:20<00:00, 1677.31it/s]
100%|██████████| 33768/33768 [00:19<00:00, 1720.74it/s]
100%|██████████| 33768/33768 [00:20<00:00, 1664.39it/s]
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:08, 1787.35it/s]                           
100%|██████████| 14472/14472 [00:08<00:00, 1652.58it/s]
100%|██████████| 14472/14472 [00:09<00:00, 1563.70it/s]
100%|██████████| 14472/14472 [00:08<00:00, 1711.18it/s]
100%|██████████| 14472/14472 [00:08<00:00, 1695.85it/s]
100%|██████████| 14472/14472 [00:08<00:00, 1635.30it/s]
100%|██████████| 14472/14472 [00:08<00:00, 1739.08it/s]
100%|██████████| 14472/14472 [00:08<00:00, 1671.17it/s]


Unnamed: 0,bm25f_lem_score,bm25f_score,tf_idf_lem_score,tf_idf_score,unique_lem_word_count_score,unique_lem_word_percent_score,unique_word_count_score,unique_word_percent_score
Test,0.937166,0.92076,0.932218,0.921084,0.940105,0.940105,0.928041,0.928041
Train,0.938901,0.922034,0.933665,0.922082,0.941904,0.941904,0.92985,0.92985


### Лог-регрессия на базовых фичах

In [5]:
target = 'answer_in_sentence'
predictors = df.columns.difference([
    'sentence',
    'sentence_lem',
    'question',
    'question_lem',
    'answer',
    'answer_lem',
    target
])
df_train = df.loc[train_df_idxs].copy()
df_test = df.loc[test_df_idxs].copy()

sc = StandardScaler()
X_train_sc = sc.fit_transform(df_train[predictors])
X_test_sc = sc.transform(df_test[predictors])

clf = LogisticRegression()
clf.fit(X_train_sc, df_train[target])

df_train['train_predict_proba'] = clf.predict_proba(X_train_sc)[:, 1]
df_test['test_predict_proba'] = clf.predict_proba(X_test_sc)[:, 1]

tqdm_pandas(tqdm(total=df_train.index.nunique()))
train_score = df_train.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'train_predict_proba')).sum()/df_train.index.nunique()

tqdm_pandas(tqdm(total=df_test.index.nunique()))
test_score = df_test.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'test_predict_proba')).sum()/df_test.index.nunique()

scores = {'Train': train_score, 'Test': test_score}
scores

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [00:17, 1888.78it/s]                           
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:08, 1728.32it/s]                           


{'Test': 0.9686258282162566, 'Train': 0.9698701282366897}

### LGB

In [6]:
# Метрика (accuracy по вопросам)
clf = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, min_child_samples=1000, n_jobs=-1)
clf.fit(df_train[predictors], df_train[target])

df_train['train_predict_proba'] = clf.predict_proba(df_train[predictors])[:, 1]
df_test['test_predict_proba'] = clf.predict_proba(df_test[predictors])[:, 1]

tqdm_pandas(tqdm(total=df_train.index.nunique()))
train_score = df_train.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'train_predict_proba')).sum()/df_train.index.nunique()

tqdm_pandas(tqdm(total=df_test.index.nunique()))
test_score = df_test.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'test_predict_proba')).sum()/df_test.index.nunique()

scores = {'Train': train_score, 'Test': test_score}
scores

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [00:51, 654.10it/s]                           
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:24, 585.89it/s]                           


{'Test': 0.8253748877064474, 'Train': 0.82802736399443244}

### Интеракшенсы

In [6]:
# Тип вопроса
print('Тип вопроса start...{}'.format(df.shape))
df = Utility.applyParallel(df.groupby('question_id'), func=Utility.get_question_type)

# Неры предложений
print('Неры предложений start...{}'.format(df.shape))
res = Utility.applyParallel(df.reset_index().groupby('question_id'), func=Utility.get_sentence_ners)
res.columns = ['question_id', 'sentence', 'sentence_ners']
#res = pd.read_pickle('df_with_target_stat.pkl')[['question_id', 'sentence', 'sentence_ners']]
df = pd.merge(df.reset_index(), res, how='left', on=('question_id', 'sentence')).set_index('question_id')

# Индикаторы неров в предложениях
ners = ['Per', 'Geox', 'Orgn', 'Date', 'Num']
print('Индикаторы неров в предложениях start...{}'.format(df.shape))
for ner in ners:
    df[ner] = 0    
tqdm_pandas(tqdm(total=df.shape[0]))
df = df.progress_apply(lambda x: Utility.get_sentence_ners_indicators(x), axis=1)

# Статистику по таргету считаем только по трейну
print('Статистики start...{}'.format(df.shape))
df_right = df.loc[train_df_idxs]
df_right = df_right[df_right.answer_in_sentence == 1]

# Частоты всречаемости типов вопросов (фильтруем)
freqs = df_right.question_type.value_counts().reset_index()
freqs.columns = ['question_type', 'freq']
df_right = pd.merge(df_right, freqs, how='left', on='question_type')
df_right = df_right[df_right.freq > 1]

# global: сколько раз нер встретился в ответах на все типы вопросов
for ner in ners:
    df_right['{}_global'.format(ner)] = df_right[ner].sum() 
    
# local: сколько раз нер встретился в ответ на данный типа вопрос
tqdm_pandas(tqdm(total=df_right.question_type.nunique()))
df_right = df_right.groupby('question_type').progress_apply(lambda x: Utility.get_ners_counts_by_question_type(x, ners))
   
# Нормируем
for ner in ners:
    df_right['{}_local'.format(ner)] /= df_right['{}_global'.format(ner)].sum()
    
# Определяем самый типовой нер для данного типа вопроса
print('Типовой нер start...{}'.format(df.shape))
tqdm_pandas(tqdm(total=df_right.question_type.nunique()))
df_right = df_right.groupby('question_type').progress_apply(lambda x: Utility.get_most_freq_ner_question_type(x, ners))

# Вставляем в исходный датасет фичу
df = pd.merge(df.reset_index(), df_right[['question_type', 'question_type_ner', 'freq']].drop_duplicates(), how='left', on='question_type').set_index('question_id')

# Индикатор пересечения неров из предложения и вопроса
print('Пересечение start...{}'.format(df.shape))
tqdm_pandas(tqdm(total=df.index.nunique()))
res = df.reset_index().groupby('question_id').progress_apply(lambda x: Utility.get_sentence_ner_question_type_indicator(x))
res.columns = ['question_id', 'sentence', 'question_type_ner_in_sentence_ners']
res = res.set_index(['question_id', 'sentence'])

# Интеракшенсы: дали такие же качество
print('Интеракшенсы start...{}'.format(df.shape))
tqdm_pandas(tqdm(total=df.index.nunique()))
combs = list(combinations_with_replacement(['Per', 'Geox', 'Orgn', 'Date', 'Num'], r=2))
res1 = df.reset_index().groupby('question_id').progress_apply(lambda x: Utility.get_sentence_ner_question_type_interactions(x, combs))
res1.columns = ['question_id', 'sentence'] + ['{}_{}'.format(comb[0], comb[1]) for comb in combs]
res1 = res1.set_index(['question_id', 'sentence'])

df = df.reset_index().set_index(['question_id', 'sentence'])
df['question_type_ner_in_sentence_ners'] = res.question_type_ner_in_sentence_ners

for col in ['{}_{}'.format(comb[0], comb[1]) for comb in combs]:
    df[col] = res1[col]
    
df = df.reset_index().set_index('question_id')
df.to_pickle('one_stable_version_with_interactions.pkl')

Тип вопроса start...(2646917, 18)


100%|██████████| 48238/48238 [02:38<00:00, 303.63it/s]


Неры предложений start...(2646917, 19)


100%|██████████| 48238/48238 [42:36<00:00, 18.87it/s]
  0%|          | 0/2646917 [00:00<?, ?it/s]

Индикаторы неров в предложениях start...(2646917, 20)



2646918it [05:00, 8813.75it/s]                              


Статистики start...(2646917, 25)


  0%|          | 0/3698 [00:00<?, ?it/s]
3699it [00:11, 313.08it/s]                          
  0%|          | 0/3698 [00:00<?, ?it/s]
  2%|▏         | 61/3698 [00:00<00:06, 605.42it/s]

Типовой нер start...(2646917, 25)


3699it [00:07, 493.24it/s]                           
  0%|          | 0/48238 [00:00<?, ?it/s]

Пересечение start...(2646917, 27)



48239it [01:05, 731.80it/s]                            
  0%|          | 0/48238 [00:00<?, ?it/s]

Интеракшенсы start...(2646917, 27)



48239it [02:01, 396.94it/s]                           


In [21]:
target = 'answer_in_sentence'
predictors = df.columns.difference([
    'sentence',
    'sentence_lem',
    'question',
    'question_lem',
    'answer',
    'answer_lem',
    'question_type',
    'sentence_ners',
    'Per', 'Geox', 'Orgn', 'Date', 'Num',
    'question_type_ner',
    'freq',

'Date_Date', 'Date_Num', 'Geox_Date', 'Geox_Geox', 'Geox_Num',
       'Geox_Orgn', 'Num_Num', 'Orgn_Date', 'Orgn_Num', 'Orgn_Orgn',
       'Per_Date', 'Per_Geox', 'Per_Num', 'Per_Orgn', 'Per_Per',
    target
])
df_train = df.loc[train_df_idxs].copy()
df_test = df.loc[test_df_idxs].copy()

sc = StandardScaler()
X_train_sc = sc.fit_transform(df_train[predictors])
X_test_sc = sc.transform(df_test[predictors])

clf = LogisticRegression()
clf.fit(X_train_sc, df_train[target])

df_train['train_predict_proba'] = clf.predict_proba(X_train_sc)[:, 1]
df_test['test_predict_proba'] = clf.predict_proba(X_test_sc)[:, 1]

tqdm_pandas(tqdm(total=df_train.index.nunique()))
train_score = df_train.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'train_predict_proba')).sum()/df_train.index.nunique()

tqdm_pandas(tqdm(total=df_test.index.nunique()))
test_score = df_test.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'test_predict_proba')).sum()/df_test.index.nunique()

scores = {'Train': train_score, 'Test': test_score}
scores

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [00:20, 1636.70it/s]                           
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:08, 1644.52it/s]                           


{'Test': 0.9690714166976677, 'Train': 0.9712376105986555}

### Word2Vec

In [2]:
df = pd.read_pickle('one_stable_version_with_interactions.pkl')
train_df_idxs, test_df_idxs = Utility.train_test_split(df)

In [6]:
# 300 -- размерность вектора
# 5 -- размерность окна
!wget http://rusvectores.org/static/models/rusvectores4/ruwikiruscorpora/ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz

--2018-03-18 18:21:42--  http://rusvectores.org/static/models/rusvectores4/ruwikiruscorpora/ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz
Resolving rusvectores.org (rusvectores.org)... 176.195.17.217
Connecting to rusvectores.org (rusvectores.org)|176.195.17.217|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 394697055 (376M) [application/x-gzip]
Saving to: ‘ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz’


2018-03-18 18:22:41 (6.32 MB/s) - ‘ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz’ saved [394697055/394697055]



In [4]:
#df = df.reset_index()
#df = Utility.applyParallel(df.groupby(df.index), func=Utility.get_question_sentence_word2vec_cosine_dist)
tqdm_pandas(tqdm(total=df.shape[0]))
df = df.progress_apply(lambda x: Utility.get_question_sentence_word2vec_cosine_dist(x), axis=1)
df.question_sentence_word2vec_cosine_dist.fillna(-1, inplace=True)

## Test: 0.9279097264237055, Train: 0.9105578792069107 (одиночный)

  0%|          | 0/2646917 [00:00<?, ?it/s]
  dist = 1.0 - uv / np.sqrt(uu * vv)
2646918it [41:52, 1053.67it/s]                              


In [8]:
target = 'answer_in_sentence'
predictors = df.columns.difference([
    'sentence',
    'sentence_lem',
    'question',
    'question_lem',
    'answer',
    'answer_lem',
    'question_type',
    'sentence_ners',
    'Per', 'Geox', 'Orgn', 'Date', 'Num',
    'question_type_ner',
    'freq',
'Date_Date', 'Date_Num', 'Geox_Date', 'Geox_Geox', 'Geox_Num',
       'Geox_Orgn', 'Num_Num', 'Orgn_Date', 'Orgn_Num', 'Orgn_Orgn',
       'Per_Date', 'Per_Geox', 'Per_Num', 'Per_Orgn', 'Per_Per',

    target
])
df_train = df.loc[train_df_idxs].copy()
df_test = df.loc[test_df_idxs].copy()

sc = StandardScaler()
X_train_sc = sc.fit_transform(df_train[predictors])
X_test_sc = sc.transform(df_test[predictors])

clf = LogisticRegression()
clf.fit(X_train_sc, df_train[target])

df_train['train_predict_proba'] = clf.predict_proba(X_train_sc)[:, 1]
df_test['test_predict_proba'] = clf.predict_proba(X_test_sc)[:, 1]

tqdm_pandas(tqdm(total=df_train.index.nunique()))
train_score = df_train.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'train_predict_proba')).sum()/df_train.index.nunique()

tqdm_pandas(tqdm(total=df_test.index.nunique()))
test_score = df_test.groupby('question_id').progress_apply(lambda x: Utility.get_answer_by_score(x, 'test_predict_proba')).sum()/df_test.index.nunique()

scores = {'Train': train_score, 'Test': test_score}
scores

  0%|          | 0/33767 [00:00<?, ?it/s]
33768it [00:19, 1744.30it/s]                           
  0%|          | 0/14471 [00:00<?, ?it/s]
14472it [00:08, 1740.97it/s]                           


{'Test': 0.9690994824114406, 'Train': 0.9712612797432139}

In [9]:
predictors

Index(['bm25f_lem_score', 'bm25f_score', 'doc_number',
       'question_sentence_word2vec_cosine_dist',
       'question_type_ner_in_sentence_ners', 'sentence_lem_len',
       'sentence_len', 'tf_idf_lem_score', 'tf_idf_score',
       'unique_lem_word_count_score', 'unique_lem_word_percent_score',
       'unique_word_count_score', 'unique_word_percent_score'],
      dtype='object')

## TopicModel