In [1]:
import pandas as pd
from collections import Counter
import functools
import tqdm
import re
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
dftrain, dftest = pd.read_csv("../train_task1.csv"), pd.read_csv("../test_task1.csv")

In [3]:
@functools.lru_cache(maxsize=2 ** 19)
def uniq_words(text):
    return set(re.findall("\w+", text))

def calculate_idfs(data):
    counter_paragraph = Counter()
    uniq_paragraphs = data['paragraph'].unique()
    for paragraph in tqdm.tqdm(uniq_paragraphs, desc="calc idf"):
        set_words = uniq_words(paragraph)
        counter_paragraph.update(set_words)
        
    num_docs = uniq_paragraphs.shape[0]
    idfs = {}
    for word in counter_paragraph:
        idfs[word] = np.log(num_docs / counter_paragraph[word])
    return idfs

In [4]:
idfs = calculate_idfs(dftrain)

calc idf: 100%|██████████| 5218/5218 [00:00<00:00, 8656.74it/s]


In [5]:
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="build features for " + name):
        question = uniq_words(row.question)
        paragraph = uniq_words(row.paragraph)
        df.loc[index, 'len_paragraph'] = len(paragraph)
        df.loc[index, 'len_question'] = len(question)
        df.loc[index, 'len_intersection'] = len(paragraph & question)
        df.loc[index, 'idf_question'] = np.sum([idfs.get(word, 0.0) for word in question])
        df.loc[index, 'idf_paragraph'] = np.sum([idfs.get(word, 0.0) for word in paragraph])
        df.loc[index, 'idf_intersection'] = np.sum([idfs.get(word, 0.0) for word in paragraph & question])

build features for train: 100%|██████████| 96612/96612 [08:00<00:00, 201.25it/s]
build features for test: 100%|██████████| 75033/75033 [05:18<00:00, 235.69it/s]


In [6]:
dftrain.head()

Unnamed: 0,paragraph_id,question_id,paragraph,question,target,len_paragraph,len_question,len_intersection,idf_question,idf_paragraph,idf_intersection
0,5551,56724,31 августа 2016 года АО Банк Финсервис получил...,Кто подтвердил рейтинг кредитоспособности ПАО ...,0.0,65.0,17.0,3.0,83.650717,320.188531,1.446893
1,6678,64727,Лимфатическая система выступает как дополнение...,Какое положение между одиночным и стадным обра...,0.0,70.0,13.0,4.0,64.781264,382.531147,9.737011
2,5941,56747,Тиристорно-импульсная система управления (ТИСУ...,Чем не создается необходимый по величине ток?,1.0,71.0,7.0,5.0,32.478578,389.340211,20.286093
3,2075,50763,После начала борьбы с православием в Литовском...,"В какой части города многочисленны пруды, обра...",0.0,95.0,13.0,3.0,63.039527,532.465918,1.712881
4,5204,53421,В 1954 году Шостакович писал: Дисциплина труда...,К какой культуре по мнению Шнитке принадлежали...,0.0,109.0,10.0,3.0,43.181146,538.020191,11.646477


In [7]:
columns = ['len_paragraph', 'len_question', 'len_intersection', 'idf_question', 'idf_paragraph', 'idf_intersection']
model = GradientBoostingClassifier().fit(dftrain[columns], dftrain['target'])
dftest['prediction'] = model.predict_proba(dftest[columns])[:, 1]

In [9]:
dftest[['paragraph_id', 'question_id', 'prediction']].to_csv("prediction.csv", index=False)