In [1]:
import pandas as pd
from collections import Counter
import tqdm
import re
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import functools
import sys
from __future__ import division # for python2 compatability
from nltk.stem.snowball import RussianStemmer

In [2]:
dftrain = pd.read_csv("train_task1_latest.csv", encoding='utf-8')[:-1000]
dfval = pd.read_csv("train_task1_latest.csv", encoding='utf-8')[-1000:]
dftest = pd.read_csv("sdsj_A_test.csv", encoding='utf-8')

In [12]:
stemer = RussianStemmer()
regex = re.compile('[^а-яА-Я ]')
stem_cache = {}

def get_stem(token):
    stem = stem_cache.get(token, None)
    if stem:
        return stem
    token = regex.sub('', token).lower()
    stem = stemer.stem(token)
    stem_cache[token] = stem
    return stem

def uniq_words(text):
    words = re.findall("\w+", text, re.UNICODE);
    stem_words = []
    for word in words:
        stem_words.append(get_stem(word))
    return set(stem_words)

def calculate_idfs(data):
    counter_paragraph = Counter()
    uniq_paragraphs = data['paragraph'].unique()
    for paragraph in tqdm.tqdm(uniq_paragraphs, desc="calc idf"):
        set_words = uniq_words(paragraph)
        counter_paragraph.update(set_words)
        
    num_docs = uniq_paragraphs.shape[0]
    idfs = {}
    for word in counter_paragraph:
        idfs[word] = np.log(num_docs / counter_paragraph[word])
    return idfs

In [13]:
idfs = calculate_idfs(dftrain)

calc idf: 100%|██████████| 9062/9062 [00:22<00:00, 404.21it/s]


In [14]:
for name, df in [('train', dftrain), ('test', dftest), ('val', dfval)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="build features for " + name):
        question = uniq_words(row.question)
        paragraph = uniq_words(row.paragraph)
        df.loc[index, 'len_paragraph'] = len(paragraph)
        df.loc[index, 'len_question'] = len(question)
        df.loc[index, 'len_intersection'] = len(paragraph & question)
        df.loc[index, 'idf_question'] = np.sum([idfs.get(word, 0.0) for word in question])
        df.loc[index, 'idf_paragraph'] = np.sum([idfs.get(word, 0.0) for word in paragraph])
        df.loc[index, 'idf_intersection'] = np.sum([idfs.get(word, 0.0) for word in paragraph & question])

build features for train: 100%|██████████| 118398/118398 [14:19<00:00, 137.76it/s]
build features for test: 100%|██████████| 74286/74286 [06:43<00:00, 184.23it/s]
build features for val: 100%|██████████| 1000/1000 [00:03<00:00, 253.89it/s]


In [15]:
dftrain.head()

Unnamed: 0,paragraph_id,question_id,paragraph,question,target,len_paragraph,len_question,len_intersection,idf_question,idf_paragraph,idf_intersection
0,1094,46273,"В отличие от рыб, земноводные (амфибии) и прес...",С какого года Русское Царство перешло на летои...,0.0,65.0,19.0,4.0,58.826132,276.868809,2.562488
1,7414,19164,В 1049 году Балдуину V удалось отнять у Герман...,Кто упомянул о его первых разногласиях со Штей...,0.0,75.0,31.0,4.0,130.963883,320.112592,3.065944
2,6744,39767,Стремление достичь предельных значений ёмкости...,Как называется имеющая мировое значение эпоха ...,0.0,57.0,20.0,5.0,63.495112,234.850278,5.788364
3,7300,36318,Первый практически пригодный двухтактный газов...,Что усугублялось из-за международного давления...,0.0,57.0,14.0,3.0,45.780333,246.37495,0.942725
4,7077,41534,Требуя от художника углубленного изучения изоб...,Какой характер носят пророчества Леонардо да В...,0.0,86.0,7.0,4.0,32.919097,369.838222,20.729693


In [16]:
columns = ['len_paragraph', 'len_question', 'len_intersection', 'idf_question', 'idf_paragraph', 'idf_intersection']
model = GradientBoostingClassifier().fit(dftrain[columns], dftrain['target'])
dftest['prediction'] = model.predict(dftest[columns])

In [17]:
dftest[['paragraph_id', 'question_id', 'prediction']].to_csv("prediction.csv", index=False)

In [18]:
from sklearn.metrics import accuracy_score

predict_val = model.predict(dfval[columns])
accuracy_score(y_true=list(dfval['target']), y_pred=predict_val, normalize=False)

963

In [20]:
from sklearn.metrics import roc_auc_score
roc_auc_score(list(dfval['target']), predict_val)

0.96171804909876846