In [None]:
import nltk
import pandas as pd
import pymorphy2
import string
import re
import subprocess
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import RobustScaler, StandardScaler

# Download the NLTK 'punkt' resource (presumably required by the
# sent_tokenize / word_tokenize calls used throughout this notebook).
nltk.download('punkt')

# Module-level pymorphy2 analyzer, created once and reused by lemmatize_sentence.
morph = pymorphy2.MorphAnalyzer()


def lemmatize_sentence(s):
    """Tokenize ``s`` and return its tokens in normal form, joined by spaces.

    Each token is replaced by the ``normal_form`` of the first parse that the
    module-level ``morph`` analyzer returns for it.
    """
    tokens = word_tokenize(s)
    # A hack of sorts: when the sentence ends with a one-character token
    # followed by a separate ".", glue the dot back onto that token.
    ends_with_short_token = (
        len(tokens) >= 2 and tokens[-1] == "." and len(tokens[-2]) == 1
    )
    if ends_with_short_token:
        tokens = tokens[:-2] + [tokens[-2] + "."]
    return ' '.join(morph.parse(token)[0].normal_form for token in tokens)


def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    The strings are vectorized together by ``get_vectors``; entry ``[i][j]``
    of the result compares the i-th and j-th argument.
    """
    return cosine_similarity(list(get_vectors(*strs)))


def get_vectors(*strs):
    """Return a dense TF-IDF matrix with one row per input string.

    A fresh ``TfidfVectorizer`` is fitted on exactly the strings passed in, so
    the vocabulary and IDF weights are local to this call.

    Raises whatever ``TfidfVectorizer`` raises when no vocabulary can be built
    (for example, when none of the strings contains a token).
    """
    text = list(strs)
    # The corpus must NOT be passed to the constructor: its first parameter is
    # ``input`` (a mode string), and constructor arguments are keyword-only in
    # current scikit-learn, so ``TfidfVectorizer(text)`` raises TypeError.
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(text).toarray()


def precision(s, ref):
    """Fraction of whitespace-separated tokens of ``s`` that also occur in ``ref``.

    Returns 0 when ``s`` contains no tokens.
    """
    tokens = s.split()
    if not tokens:
        return 0
    ref_tokens = ref.split()
    hits = sum(1 for token in tokens if token in ref_tokens)
    return hits / len(tokens)


def recall(s, ref):
    """Fraction of whitespace-separated tokens of ``ref`` that also occur in ``s``.

    Returns 0 when ``ref`` contains no tokens.
    """
    ref_tokens = ref.split()
    if not ref_tokens:
        return 0
    tokens = s.split()
    hits = sum(1 for token in ref_tokens if token in tokens)
    return hits / len(ref_tokens)


def f1_score(s, ref):
    """Harmonic mean of ``precision(s, ref)`` and ``recall(s, ref)``.

    Returns 0 when both precision and recall are zero.
    """
    prec, rec = precision(s, ref), recall(s, ref)
    total = prec + rec
    return 2 * prec * rec / total if total != 0 else 0



# Translation table for str.translate that deletes every ASCII punctuation character.
translate_table = str.maketrans('', '', string.punctuation)

# Placeholder estimator; it is never fitted here and `regressor` is rebound to
# a GradientBoostingRegressor in a later cell before any fit/predict call.
regressor = LogisticRegression()

# Training set; later cells read the 'paragraph', 'question' and 'answer' columns.
train_data = pd.read_csv('files/train_qa.csv')



In [None]:
# Build the training set: one row per paragraph sentence.
#   features: [cosine similarity to the question, length in characters, length in words]
#   target:   token-level F1 between the lemmatized sentence and the lemmatized answer
x_train = []
y_train = []
cnt = 0
for cnt, (_, row) in enumerate(train_data.iterrows(), start=1):
    if cnt % 50 == 0:
        print(cnt)  # coarse progress indicator
    candidates = [
        lemmatize_sentence(raw_sentence).translate(translate_table)
        for raw_sentence in sent_tokenize(row['paragraph'], language='russian')
    ]
    # The question has punctuation stripped before lemmatization; the answer
    # (like the candidate sentences) has it stripped afterwards.
    lemmatized_question = lemmatize_sentence(row['question'].translate(translate_table))
    lemmatized_answer = lemmatize_sentence(row['answer']).translate(translate_table)
    for candidate in candidates:
        similarities = get_cosine_sim(candidate, lemmatized_question)
        # similarities is 2x2; [-1][0] is question-vs-candidate.
        x_train.append([similarities[-1][0], len(candidate), len(candidate.split())])
        y_train.append(f1_score(candidate, lemmatized_answer))


In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Keep the fitted scaler in a variable instead of discarding it: x_train is
# overwritten with its scaled version below, so without `scaler` the training
# mean/variance could not be recovered (e.g. to scale new data consistently).
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

# Regress the per-sentence F1 target on the three standardized features.
regressor = GradientBoostingRegressor(learning_rate=0.01, n_estimators=200)
regressor.fit(x_train, y_train)



In [None]:
# Russian month-name stems, January through December. They are matched as
# plain substrings, so short stems such as 'ма' also hit unrelated words.
months = [
    'январ',
    'феврал',
    'март',
    'апрел',
    'ма',
    'июн',
    'июл',
    'август',
    'сентябр',
    'октябр',
    'ноябр',
    'декабр',
]

In [None]:
import re


test_data = pd.read_csv('files/dataset_281937_1.txt', names=["paragraph_id", "question_id", "paragraph", "question"],
                        sep='\t')

# Lemmatized phrasings that mark a question as asking for a date.
date_question_markers = ('когда', 'какой год', 'какой месяц', 'каков дата', 'каков год', 'каков месяц')


def _context_window(words, center):
    """Join the words from three before ``center`` through two after it."""
    return ' '.join(words[max(0, center - 3):center + 3])


def _date_answer(sent, words):
    """Extract a date-like fragment from ``sent`` (``words`` is ``sent.split()``).

    Preference order: a full numeric date, then the context around a
    "<digits> год/году/г" mention, then the context around the first word
    containing a month stem. Returns ``sent`` unchanged if nothing matches.
    """
    full_date = re.search(r'(\d+(/|-|\.){1}\d+(/|-|\.){1}\d{2,4})', sent)
    if full_date is not None:
        return full_date.group(1)
    year = re.search(r'((\d{2,4}) (году|год|г))', sent)
    if year is not None:
        for position, word in enumerate(words):
            if year.group(2) in word:
                return _context_window(words, position)
        return sent
    for position, word in enumerate(words):
        if any(m in word for m in months):
            return _context_window(words, position)
    return sent


def _best_span_answer(words, lemmatized_question):
    """Return the 5-7 word span most similar to the question, with extra context.

    The winning span is widened by two words on the left and three on the
    right. Returns ``None`` when ``words`` is empty (nothing to score).
    """
    if not words:
        return None
    # Track spans by (start, stop) position rather than by content: looking a
    # span up again with words.index() finds the FIRST occurrence of a word,
    # which is the wrong position whenever a word repeats in the sentence.
    bounds = [(start, min(start + width, len(words)))
              for start in range(len(words))
              for width in range(5, 8)]
    spans_str = [' '.join(words[start:stop]) for start, stop in bounds]
    similarities = get_cosine_sim(*spans_str, lemmatized_question)
    # Last row = the question; drop its similarity with itself.
    scores = similarities[-1][:-1].tolist()
    start, stop = bounds[scores.index(max(scores))]
    return ' '.join(words[max(0, start - 2):stop + 3])


x_test = []
for _, row in test_data.iterrows():
    sentences = sent_tokenize(row['paragraph'], language='russian')
    if not sentences:
        # Nothing to choose from: emit an empty answer instead of crashing.
        print(row['question_id'], '', sep='\t')
        continue
    if sentences[0] == 'paragraph':
        # The file's header line is read as a data row because names= is given.
        continue

    lemmatized_question = lemmatize_sentence(row['question'].translate(translate_table))
    features = []
    for s in sentences:
        sentence = lemmatize_sentence(s).translate(translate_table)
        similarities = get_cosine_sim(sentence, lemmatized_question)
        features.append([similarities[-1][0], len(sentence), len(sentence.split())])

    # NOTE(review): features are standardized per paragraph here, whereas the
    # training features were standardized over the whole training set. Left
    # as-is because the fitted training scaler is not kept by the original
    # training cell; confirm whether this mismatch is intended.
    result = regressor.predict(StandardScaler().fit_transform(features)).tolist()
    # Pick the highest-scoring sentence; clamp defensively to a valid index.
    idx = min(result.index(max(result)), len(sentences) - 1)
    # Drop bracketed fragments such as "[1]" (raw string avoids invalid escapes).
    sent = re.sub(r"\[.*?\]", "", sentences[idx])
    words = sent.split()

    if any(marker in lemmatized_question for marker in date_question_markers):
        sent = _date_answer(sent, words)
    else:
        answer = _best_span_answer(words, lemmatized_question)
        if answer is not None and len(answer) > 1:
            sent = answer
    print(row['question_id'], sent, sep='\t')