In [1]:
import pandas as pd
from collections import Counter
import tqdm
import re
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import functools
import sys
from __future__ import division # for python2 compatability

In [6]:
dftrain = pd.read_csv("train_task1_latest.csv", encoding='utf-8')
dftest = pd.read_csv("test_task1_for_users.csv", encoding='utf-8')

In [3]:
def uniq_words(text):
    return set(re.findall("\w+", text, re.UNICODE))

def calculate_idfs(data):
    counter_paragraph = Counter()
    uniq_paragraphs = data['paragraph'].unique()
    for paragraph in tqdm.tqdm(uniq_paragraphs, desc="calc idf"):
        set_words = uniq_words(paragraph)
        counter_paragraph.update(set_words)
        
    num_docs = uniq_paragraphs.shape[0]
    idfs = {}
    for word in counter_paragraph:
        idfs[word] = np.log(num_docs / counter_paragraph[word])
    return idfs

In [7]:
idfs = calculate_idfs(dftrain)

calc idf: 100%|██████████| 9078/9078 [00:01<00:00, 7585.95it/s] 


In [8]:
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="build features for " + name):
        question = uniq_words(row.question)
        paragraph = uniq_words(row.paragraph)
        df.loc[index, 'len_paragraph'] = len(paragraph)
        df.loc[index, 'len_question'] = len(question)
        df.loc[index, 'len_intersection'] = len(paragraph & question)
        df.loc[index, 'idf_question'] = np.sum([idfs.get(word, 0.0) for word in question])
        df.loc[index, 'idf_paragraph'] = np.sum([idfs.get(word, 0.0) for word in paragraph])
        df.loc[index, 'idf_intersection'] = np.sum([idfs.get(word, 0.0) for word in paragraph & question])

build features for train: 100%|██████████| 119398/119398 [11:36<00:00, 171.41it/s]
build features for test: 100%|██████████| 74294/74294 [06:07<00:00, 202.16it/s]


In [9]:
dftrain.head()

Unnamed: 0,paragraph_id,question_id,paragraph,question,target,len_paragraph,len_question,len_intersection,idf_question,idf_paragraph,idf_intersection
0,1094,46273,"В отличие от рыб, земноводные (амфибии) и прес...",С какого года Русское Царство перешло на летои...,0.0,67.0,19.0,3.0,82.337722,355.51046,2.139502
1,7414,19164,В 1049 году Балдуину V удалось отнять у Герман...,Кто упомянул о его первых разногласиях со Штей...,0.0,89.0,31.0,3.0,158.749487,498.261027,3.142724
2,6744,39767,Стремление достичь предельных значений ёмкости...,Как называется имеющая мировое значение эпоха ...,0.0,62.0,20.0,3.0,102.316632,356.768654,0.543727
3,7300,36318,Первый практически пригодный двухтактный газов...,Что усугублялось из-за международного давления...,0.0,66.0,14.0,3.0,58.820079,364.617823,1.035894
4,7077,41534,Требуя от художника углубленного изучения изоб...,Какой характер носят пророчества Леонардо да В...,0.0,93.0,7.0,3.0,43.903109,509.843626,17.245687


In [10]:
columns = ['len_paragraph', 'len_question', 'len_intersection', 'idf_question', 'idf_paragraph', 'idf_intersection']
model = GradientBoostingClassifier().fit(dftrain[columns], dftrain['target'])
dftest['prediction'] = model.predict_proba(dftest[columns])[:, 1]

In [11]:
dftest[['paragraph_id', 'question_id', 'prediction']].to_csv("prediction.csv", index=False)