# Baseline: MaxMatch

In [None]:
import re
import numpy as np
import pandas as pd

In [None]:
import os
import sys

# Path of the input CSV: taken from the INPUT environment variable,
# falling back to 'train.csv' when the variable is not set.
DATA_FILE = os.getenv('INPUT', 'train.csv')
# Path of the predictions file from the OUTPUT environment variable;
# None when the variable is not set.
PREDICTION_FILE = os.getenv('OUTPUT')

Читаем входные файлы с данными

In [None]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv is the supported reader and, with index_col=None, likewise keeps
# every CSV column as data under a default integer index.
df = pd.read_csv(DATA_FILE, sep=',', index_col=None)

# Keep only the columns the baseline uses. 'answer' is selected only when
# the input file actually provides it.
columns = ['paragraph_id', 'question_id', 'paragraph', 'question']
if 'answer' in df.columns:
    columns.append('answer')
df = df[columns]

In [None]:
# Show the first row as a quick check of the loaded columns.
df.iloc[:1]

Для каждого вопроса находим предложение параграфа, которое содержит наибольшее количество слов из вопроса

In [None]:
""" Official evaluation script for the SDSJ dataset. """
from collections import Counter
import re

def normalize_answer(text):
    """Return text reduced to lowercase words separated by single spaces.

    Every run of regex word characters is kept as one token; everything
    between tokens (punctuation, whitespace) is replaced by a single space.
    """
    words = re.findall(r"\w+", text)
    joined = ' '.join(words)
    return joined.lower()

def f1_score(prediction, ground_truth):
    """Compute the token-level F1 between a prediction and a reference.

    Both strings are normalized with normalize_answer and split into
    tokens. The overlap is the multiset intersection of the two token
    lists, so repeated tokens count as often as they occur in both.

    Returns:
        0 when no token is shared, otherwise the harmonic mean of
        precision (overlap / predicted tokens) and recall
        (overlap / reference tokens).
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()

    shared = Counter(predicted) & Counter(expected)
    overlap = sum(shared.values())
    if not overlap:
        return 0

    precision = 1.0 * overlap / len(predicted)
    recall = 1.0 * overlap / len(expected)
    return (2 * precision * recall) / (precision + recall)

In [None]:
def sentence_to_word(sentences):
    """Turn each sentence into its list of normalized words.

    Returns a list with one entry per input sentence, in the same order;
    each entry is the output of normalize_answer split on whitespace.
    """
    return [normalize_answer(sentence).split() for sentence in sentences]


def text_to_sentence(text):
    """Split text into sentences on '.' characters.

    Each piece is stripped of surrounding whitespace, and pieces that are
    empty after stripping are dropped. The separating dots are not kept.
    """
    stripped_pieces = (piece.strip() for piece in text.split("."))
    return [piece for piece in stripped_pieces if piece != '']

In [None]:
def get_max_match_sentance(data_row):
    """Return the paragraph sentence sharing the most words with the question.

    The paragraph is split with text_to_sentence; the question and every
    sentence are normalized into words with sentence_to_word. A sentence's
    score is the number of distinct words it has in common with the
    question.

    Args:
        data_row: Row-like object indexable by "paragraph" and "question",
            both holding text.

    Returns:
        The highest-scoring sentence (stripped, without the trailing dot).
        On ties the earliest sentence wins. Returns '' when the paragraph
        yields no sentences at all.
    """
    sentences = text_to_sentence(data_row["paragraph"])
    if not sentences:
        # Without this guard the lookup below had no index to use and the
        # function raised TypeError on empty / dots-only paragraphs.
        return ''

    question_words = set(sentence_to_word([data_row["question"]])[0])
    overlaps = [
        len(question_words.intersection(words))
        for words in sentence_to_word(sentences)
    ]
    # list.index returns the first position of the maximum, which keeps the
    # earliest sentence on ties.
    return sentences[overlaps.index(max(overlaps))]

Отвечаем на вопрос предложением, которое максимально пересекается с вопросом по словам

In [None]:
# Answer every question with its best-matching paragraph sentence. The
# answers are collected row by row and stored as one object-dtype column
# aligned to the frame's own index.
df['predictions'] = pd.Series(
    [get_max_match_sentance(row) for _, row in df.iterrows()],
    index=df.index,
    dtype=object,
)

Оцениваем качество решения

In [None]:
def get_score(df_solution, df_predictions):
    """Score predictions against reference answers.

    Args:
        df_solution: Iterable of reference answer strings.
        df_predictions: Iterable of predicted answer strings, paired
            positionally with df_solution.

    Returns:
        A dict with the single key 'f1': the mean of f1_score over all
        (prediction, answer) pairs.
    """
    per_question_f1 = []
    for answer, prediction in zip(df_solution, df_predictions):
        per_question_f1.append(f1_score(prediction, answer))
    return {'f1': np.mean(per_question_f1)}

In [None]:
# Scoring is only possible when the input carries reference answers.
if 'answer' in df.columns:
    # get_score expects the reference answers first and the predictions
    # second; keyword arguments keep the two from being swapped.
    print(get_score(df_solution=df['answer'].values,
                    df_predictions=df['predictions'].values))

Пример создания файла с ответами

In [None]:
# Write the predictions only when an output path was configured.
if PREDICTION_FILE is not None:
    # NOTE: this replaces the contents of the 'answer' column (including any
    # reference answers loaded from the input) with the predictions.
    df['answer'] = df['predictions']
    # The CSV has paragraph_id and question_id as index columns, followed by
    # the 'answer' column, with a header row.
    submission = df.set_index(['paragraph_id', 'question_id'])
    submission['answer'].to_csv(PREDICTION_FILE, header=True)