In [1]:
from multiprocessing import Process, Queue

import pandas as pd
import numpy as np

import warnings
import json
import io
import re

from collections import Counter

from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
train_data = []
with io.open('train_data.json','r',encoding='utf8') as f:
    for line in f.readlines():
        d = json.loads(line)
        train_data.append(d)

In [4]:
test_data = []
with io.open('test_data.json','r',encoding='utf8') as f:
    for line in f.readlines():
        d = json.loads(line)
        test_data.append(d)

In [5]:
def reform_data(Data, index_bias, res_queue):
    Index = index_bias
    result = []
    for data in Data:
        marks = []
        good_marks = set()
        par = data['Paragraph']
        pos = -1
        for i in re.findall(re.compile(u'[!"…\.»?]', re.U), par):
            pos = par.find(i, pos + 1, len(par))
            marks += [{u'Index': Index, u'Pos': pos, u'Mark': par[pos], u'Label': False}]
            Index += 1
            
        for sentence in data[u'Sentences']:
            good_marks.add(par.find(sentence) + len(sentence) - 1)
        
        for mark in marks:
            if(mark['Pos'] in good_marks):
                mark['Label'] = True
        result += [{'Paragraph': par, 'Marks': marks}]
    res_queue.put(result)

In [6]:
chuncs_non_ref = np.array_split(train_data, 24)

train_reform = []
processes = []
res_queue = Queue() 
WORKER_NUM = len(chuncs_non_ref)
for i in xrange(WORKER_NUM):
    process = Process(target=reform_data, args=(chuncs_non_ref[i], 100000 * i, res_queue))
    processes.append(process)
    process.start()
    
complete_workers = 0
while complete_workers != WORKER_NUM:
    item = res_queue.get()
    complete_workers += 1
    train_reform += item
    print complete_workers
        
for process in processes: process.join()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [7]:
class MyLabelOneHotEncoder():
    def __init__(self):
        self.label_enc = LabelEncoder()
        self.one_hot_enc = OneHotEncoder()
        self.result_len = 0
        
    def fit(self, array):
        self.label_enc.fit(array)
        self.one_hot_enc.fit(self.label_enc.transform(array).reshape(-1, 1))
        self.result_len = len(array)
        
    def predict_one(self, label):
        return self.one_hot_enc.transform(self.label_enc.transform([label])[0]).toarray()[0]
    
    def predict_none(self):
        return np.zeros(self.result_len, dtype=np.int)

In [8]:
end_of_sentences = np.array([u'!', u'"', u'.', u'?', u'\xbb', u'\u2026'])
end_encoder = MyLabelOneHotEncoder()
end_encoder.fit(end_of_sentences)

In [9]:
good_ends = [u'!', u'…', u'.', u'?']

In [10]:
stop_words_cand = Counter()
for p in train_reform:
    par = p['Paragraph']
    for cand in p['Marks']:
        pos = cand['Pos']
        ## _'alpha'._
        if (pos >= 3 and pos + 1 < len(par) and
            par[pos - 2] == ' ' and par[pos] == '.' and
            par[pos - 1].isalpha() and par[pos - 1].islower() and
            par[pos + 1] == ' '):
            stop_words_cand[par[pos - 1:pos + 1]] += 1
        ## _'alpha''alpha'._
        if (pos >= 3 and pos + 1 < len(par) and
            par[pos - 3] == ' ' and
            par[pos - 2].isalpha() and par[pos - 2].islower() and
            par[pos - 1].isalpha() and par[pos - 1].islower() and
            par[pos] == '.' and par[pos + 1] == ' '):
            stop_words_cand[par[pos - 2:pos + 1]] += 1
        ## _'alpha'.'alpha'._
        if (pos >= 4 and pos + 1 < len(par) and
            par[pos - 4] == ' ' and
            par[pos - 3].isalpha() and par[pos - 3].islower() and
            par[pos - 2] == '.' and
            par[pos - 1].isalpha() and par[pos - 1].islower() and
            par[pos] == '.' and par[pos + 1] == ' '):
            stop_words_cand[par[pos - 3:pos + 1]] += 1
        ## _'alpha'._'alpha'._    
        if (pos >= 5 and pos + 1 < len(par) and
            par[pos - 5] == ' ' and
            par[pos - 4].isalpha() and par[pos - 4].islower() and
            par[pos - 3] == '.' and par[pos - 2] == ' ' and
            par[pos - 1].isalpha() and par[pos - 1].islower() and
            par[pos] == '.' and par[pos + 1] == ' '):
            stop_words_cand[par[pos - 4:pos + 1]] += 1

In [11]:
for word, cnt in stop_words_cand.most_common():
    if cnt > 5:
        print word, cnt

г. 163
т. 108
т.е. 88
т.д. 57
ст. 57
им. 54
т.п. 54
л.д. 52
см. 40
н. 39
гг. 31
д. 27
же. 25
ч. 24
км. 24
м. 22
он. 21
в. 19
е. 19
мм. 19
др. 18
их. 16
т. д. 16
т.к. 16
с. 16
т. е. 15
э. 14
н. э. 14
кв. 14
ее. 14
я. 13
бы. 13
пр. 10
п. 10
ул. 9
её. 9
кг. 9
св. 8
т. п. 7
да. 7
m. 7
т. н. 7
гр. 6


In [12]:
stop_words_one = set([u' г. ', u' т. ', u' н. ', u' д. ',
                      u' ч. ', u' м. ', u' е. ', u' в. ',
                      u' с. ', u' э. ', u' п. '])
stop_words_two = set([u' им. ', u' ст. ', u' др. ', u' ул. ',
                      u' вв. ', u' см. ', u' гг. ', u' кв. ',
                      u' св. ', u' км. ', u' мм. '])
stop_words_thr = set([u' т.е. ', u' т.д. ', u' т.п. ', u' л.д. ',
                      u' т.к ', u' т.н. ', u' p.m. '])
stop_words_fou = set([u' т. д. ', u' т. е. ', u' н. э. ', u' т. п. ',
                      u' т. н '])

In [13]:
stop_encoder_one = MyLabelOneHotEncoder()
stop_encoder_one.fit(list(stop_words_one))

stop_encoder_two = MyLabelOneHotEncoder()
stop_encoder_two.fit(list(stop_words_two))

stop_encoder_thr = MyLabelOneHotEncoder()
stop_encoder_thr.fit(list(stop_words_thr))

stop_encoder_fou = MyLabelOneHotEncoder()
stop_encoder_fou.fit(list(stop_words_fou))

In [14]:
print stop_encoder_one.result_len
print stop_encoder_two.result_len
print stop_encoder_thr.result_len
print stop_encoder_fou.result_len
print end_encoder.result_len

11
11
7
5
6


In [15]:
def string_to_code(string, code_len=None):
    if not code_len:
        code_len = len(string)
    if len(string) > code_len:
        raise RuntimeError('Code len must be greater string len')
    tmp_str = np.array(list(ord(c) for c in string))
    return np.pad(tmp_str, [(0, code_len - tmp_str.shape[0])], mode='constant', constant_values=0)

In [16]:
def extract_encode_substrs(Data, window):
    X_tmp = []
    for data in Data:
        par = data['Paragraph']
        for cand in data['Marks']:
            pos = cand['Pos']            
            substr = par[pos - window[0]:pos + window[1]]
            X_tmp.append(string_to_code(substr, window[0] + window[1]))
    return X_tmp

In [17]:
train_encode_substrs = extract_encode_substrs(train_reform, [5, 2])
test_encode_substrs = extract_encode_substrs(test_data, [5, 2])

In [18]:
substrs_encoder = OneHotEncoder()
substrs_encoder.fit(train_encode_substrs + test_encode_substrs)

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [19]:
print substrs_encoder.transform(np.zeros(7).reshape(1,-1)).toarray().shape

(1, 1184)


In [20]:
out_data = np.zeros((26476,1))

In [21]:
fea_num = 51 + 1184

In [22]:
def predict(par, cand):
    features = np.zeros([1, 0])
    is_eof = False                      # 0
    is_next_space = False               # 1
    is_next_in_ends = False             # 2
    is_next_alpha_upper = False         # 3
    is_initial = False                  # 4
    is_numerated_list_beg = False       # 5
    is_in_good_ends = False             # 6
    is_city_or_year = False             # 7
    # Сложные правила:
    ## ._'digit'._'upper'
    is_next_list_begin = False          # 8
    # +Нестрогие
    ## 'upper'._'upper'
    is_feature_9 = False                # 9
    ## 'upper''lower'._'upper'
    is_feature_10 = False               # 10
    is_is_stop_words = False            # 11
    
    pos = cand['Pos']
    
    # 0
    if pos == len(par) - 1:
        is_eof = True
    features = np.append(features, is_eof)
    # 1
    if pos + 1 < len(par) and par[pos + 1] == ' ':
        is_next_space = True
    features = np.append(features, is_next_space)
    # 2
    if pos + 2 < len(par) and par[pos + 1] == ' ' and par[pos + 2].isalpha() and par[pos + 2].isupper():
        is_next_alpha_upper = True
    features = np.append(features, is_next_alpha_upper)
    # 3
    if pos + 1 < len(par) and par[pos + 1] in end_of_sentences:
        is_next_in_ends = True
    features = np.append(features, is_next_in_ends)
    # 4
    # 'upper'._'upper'.
    # 'upper'.'upper'.
    if ((pos >= 4 and 
         par[pos - 4].isalpha() and par[pos - 4].isupper() and 
         par[pos - 3] == '.' and par[pos - 2] == ' ' and 
         par[pos - 1].isalpha() and par[pos - 1].isupper() and 
         par[pos] == '.') or 
        (pos >= 3 and 
         par[pos - 3].isalpha() and par[pos - 3].isupper() and 
         par[pos - 2] == '.' and 
         par[pos - 1].isalpha() and par[pos - 1].isupper() and 
         par[pos] == '.')):
        is_initial = True
    features = np.append(features, is_initial)
    # 5
    if ((pos >= 1 and par[pos] == '.' and par[pos - 1].isdigit()) or 
        (pos >= 2 and par[pos] == '.' and par[pos - 1].isdigit() and par[pos - 2].isdigit())):
        is_numerated_list_beg = True
    features = np.append(features, is_numerated_list_beg)
    # 6
    if cand['Mark'] in good_ends:
        is_in_good_ends = True
    features = np.append(features, is_in_good_ends)
    # 7
    if pos >= 2 and pos + 1 < len(par) and par[pos - 2:pos + 2] == u' г. ':
        is_city_or_year = True
    features = np.append(features, is_city_or_year)
    # 8
    ## ._d._A
    ## ._dd._A
    ## ._d.A
    ## ._dd.A
    if (pos + 5 < len(par) and 
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isdigit() and
        par[pos + 3] == '.' and par[pos + 4] == ' ' and
        par[pos + 5].isalpha() and par[pos + 5].isupper()):
        is_next_list_begin = True
    if (pos + 6 < len(par) and 
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isdigit() and
        par[pos + 3].isdigit() and
        par[pos + 4] == '.' and par[pos + 5] == ' ' and
        par[pos + 6].isalpha() and par[pos + 6].isupper()):
        is_next_list_begin = True
    if (pos + 4 < len(par) and 
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isdigit() and
        par[pos + 3] == '.' and
        par[pos + 4].isalpha() and par[pos + 4].isupper()):
        is_next_list_begin = True
    if (pos + 5 < len(par) and 
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isdigit() and
        par[pos + 3].isdigit() and
        par[pos + 4] == '.' and
        par[pos + 5].isalpha() and par[pos + 5].isupper()):
        is_next_list_begin = True
    features = np.append(features, is_next_list_begin)
    # 9
    if (pos >= 1 and pos + 2 < len(par) and
        par[pos - 1].isalpha() and par[pos - 1].isupper() and
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isalpha() and par[pos + 2].isupper()):
        is_feature_9 = True
    features = np.append(features, is_feature_9)
    # 10
    if (pos >= 2 and pos + 2 < len(par) and
        par[pos - 2].isalpha() and par[pos - 2].isupper() and
        par[pos - 1].isalpha() and par[pos - 1].islower() and
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isalpha() and par[pos + 2].isupper()):
        is_feature_10 = True  
    features = np.append(features, is_feature_10)
    # 11
    ## _'alpha'._
    if (pos >= 2 and pos + 1 < len(par) and par[pos - 2:pos + 2] in stop_words_one):
        features = np.append(features, stop_encoder_one.predict_one(par[pos - 2:pos + 2]))
    else:
        features = np.append(features, stop_encoder_one.predict_none())
    # 12
    ## _'alpha''alpha'._
    if (pos >= 3 and pos + 1 < len(par) and par[pos - 3:pos + 2] in stop_words_two):
        features = np.append(features, stop_encoder_two.predict_one(par[pos - 3:pos + 2]))
    else:
        features = np.append(features, stop_encoder_two.predict_none())
    # 13
    ## _'alpha'.'alpha'._
    if (pos >= 4 and pos + 1 < len(par) and par[pos - 4:pos + 2] in stop_words_thr):
        features = np.append(features, stop_encoder_thr.predict_one(par[pos - 4:pos + 2]))
    else:
        features = np.append(features, stop_encoder_thr.predict_none())
    # 14
    ## _'alpha'._'alpha'._
    if (pos >= 5 and pos + 1 < len(par) and par[pos - 5:pos + 2] in stop_words_fou):
        features = np.append(features, stop_encoder_fou.predict_one(par[pos - 5:pos + 2]))
    else:
        features = np.append(features, stop_encoder_fou.predict_none())

    # append type of mark
    features = np.append(features, end_encoder.predict_one(par[pos]))
            
    return features

In [23]:
def extract_features(Data, res_queue):
    Substr_tmp = []
    X_tmp = np.zeros([0, fea_num])
    y_tmp = np.zeros([0, 1])
    
    chunk_encode_substrs = extract_encode_substrs(Data, [5,2])
    chunk_one_hot_codes = substrs_encoder.transform(chunk_encode_substrs).toarray()
    
    idx = 0
    for p in Data:
        par = p['Paragraph']
        for cand in p['Marks']:
            features = predict(par, cand)
            features = np.append(features, chunk_one_hot_codes[idx])
            X_tmp = np.append(X_tmp, features.reshape(1, -1), axis=0)
            y_tmp = np.append(y_tmp, cand['Label'])
            Substr_tmp.append(par[max(0, cand['Pos'] - 15):min(len(par), cand['Pos'] + 15)])
            
            idx += 1
    res_queue.put((X_tmp, y_tmp, Substr_tmp))
    res_queue.put(None)

In [24]:
chuncs = np.array_split(train_reform, 24)

X = np.zeros([0, fea_num])
y = np.zeros([0, 1])
Substrs = []

processes = []
res_queue = Queue() 
WORKER_NUM = len(chuncs)
for i in xrange(WORKER_NUM):
    process = Process(target=extract_features, args=(chuncs[i], res_queue))
    processes.append(process)
    process.start()
    
complete_workers = 0
while complete_workers != WORKER_NUM:
    item = res_queue.get()
    if item == None:
        complete_workers += 1
        print complete_workers
    else:
        X = np.append(X, item[0], axis=0)
        y = np.append(y, item[1])
        Substrs += item[2]
        
for process in processes: process.join()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [26]:
print np.sum(y_test), y_test.shape
print np.sum(y_train), y_train.shape

21977.0 (30298,)
44800.0 (61513,)


In [27]:
model = XGBClassifier(n_estimators=1000, max_depth=13, n_jobs=24)

In [28]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=13, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=24, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [29]:
y_predict = model.predict(X_test)

In [30]:
roc_auc_score(y_predict, y_test)

0.9872020520311922

In [31]:
print np.sum(y_predict != y_test)

334


In [32]:
for idx in xrange(len(y_predict)):
    if y_predict[idx] != y_test[idx]:
        print Substrs[idx]
        print y_predict[idx], y_test[idx]

пондента» «S7-Ъ».
1.0 0.0
2. Судно, исполь
0.0 1.0
 в дверь камеры.
0.0 1.0
главному выходу.
1.0 0.0
 к ней в прачки. Но, глядя на 
0.0 1.0
ми-гимназистами. Через неделю 
1.0 0.0
ень, всю неделю. В конце же не
0.0 1.0
уга и знакомясь. Был один отст
1.0 0.0
т ссылку на них.
1.0 0.0
ней же комнате?». Прикручинилс
1.0 0.0
Отрыгис (галакт. Leto Atreides
0.0 1.0
му пышный приём. В этот момент
1.0 0.0
— Э-э-э… Спасибо за за
1.0 0.0
 и Максимилиана.
0.0 1.0
ишь, просыпайся! Уже без пятна
0.0 1.0
щё одна пятёрка. Приятно иметь
1.0 0.0
или ему кулаком.
0.0 1.0
ией? Кто препод? — начал Митро
1.0 0.0
ло» больше всех.
1.0 0.0
равке и смеётся…
0.0 1.0
ользовался этим!
0.0 1.0
 «любовные игры», но перерыв у
0.0 1.0
— 5 группа! Хм… Двадцать. А д
1.0 0.0
уда Вы денетесь. Но только не 
0.0 1.0
рмально вообще?! Ну ничего, я 
0.0 1.0
 всё же удалась.
1.0 0.0
обще долбанутый?
0.0 1.0
настоящая днюха. Так что, без 
1.0 0.0
уск в пять утра. Утром столь р
1.0 0.0
шёл в аудиторию. Отработка про
1.0 0.0
вливае

In [33]:
X_real = np.zeros([26476, fea_num])

In [34]:
real_encode_substrs = extract_encode_substrs(test_data, [5,2])
real_one_hot_codes = substrs_encoder.transform(real_encode_substrs).toarray()


idx = 0
for p in test_data:
    par = p['Paragraph']
    for cand in p['Marks']:
        if idx % 100 == 0:
            print idx
        features = predict(par, cand)
        features = np.append(features, real_one_hot_codes[idx])
        X_real[idx] = features.reshape(1, -1)
        
        idx += 1

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

In [35]:
y_real = model.predict(X_real)

In [36]:
print np.sum(y_real), y_real.shape

18019.0 (26476,)


In [37]:
idx = 0
fd = open('marks.txt', 'w')
for p in test_data:
    par = p['Paragraph']
    fd.write(par.encode('utf-8') + '\n')
    for cand in p['Marks']:
        out_data[cand['Index']-1] = y_real[idx]
        pos = cand['Pos']
        fd.write(par[max(0,pos-15):pos].encode('utf-8') + '!' + par[pos:min(len(par),pos+15)].encode('utf-8') + ' ' + str(out_data[cand['Index']-1][0]) + '\n')
        
        idx += 1

In [38]:
df = pd.DataFrame(out_data, columns=['Mark'], index=range(1,26477))
df.index.name = 'Id'

In [39]:
df.to_csv("sampleSubmission.csv")