In [1]:
import pandas as pd
import numpy as np
import json
import io
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
train_data = []
with io.open('train_data.json','r',encoding='utf8') as f:
    for line in f.readlines():
        d = json.loads(line)
        train_data.append(d)

In [3]:
test_data = []
with io.open('test_data.json','r',encoding='utf8') as f:
    for line in f.readlines():
        d = json.loads(line)
        test_data.append(d)

In [4]:
k = 0
Index = 0
train_reform = []
for k in range(len(train_data)):
    pos = - 1
    true_positions = []
    for sent in train_data[k][u'Sentences']:
        pos += len(sent) 
        true_positions += [pos]
        pos += 1

    string = train_data[k]['Paragraph']
    Marks = []
    positions = []
    pos = 0
    for i in re.findall(re.compile(u'[!"…\.»?]', re.U), string):
        pos = string.find(i, pos + 1, len(string))
        positions += [pos]
    for pos in positions:
        Index += 1
        if pos in true_positions:
            label = True
        else:
            label = False
        Marks += [{u'Index': Index, u'Pos': pos, u'Mark': string[pos], u'Label': label}]
    train_reform += [{'Paragraph': string, 'Marks': Marks}]

In [5]:
# end_of_sentences = set()
# for paragraph in train_data:
#     for sentence in paragraph['Sentences']:
#         end_of_sentences.add(sentence[-1])

In [6]:
class MyLabelOneHotEncoder():
    def __init__(self):
        self.label_enc = LabelEncoder()
        self.one_hot_enc = OneHotEncoder()
        self.result_len = 0
        
    def fit(self, array):
        self.label_enc.fit(array)
        self.one_hot_enc.fit(self.label_enc.transform(array).reshape(-1, 1))
        self.result_len = len(array)
        
    def predict_one(self, label):
        return self.one_hot_enc.transform(self.label_enc.transform([label])[0]).toarray()[0]
    
    def predict_none(self):
        return np.zeros(self.result_len, dtype=np.int)

In [7]:
end_of_sentences = np.array([u'!', u'"', u'.', u'?', u'\xbb', u'\u2026'])
end_encoder = MyLabelOneHotEncoder()
end_encoder.fit(end_of_sentences)

In [8]:
good_ends = [u'!', u'…', u'.', u'?']

In [9]:
stop_words_cand = Counter()
for p in train_reform:
    par = p['Paragraph']
    for cand in p['Marks']:
        pos = cand['Pos']
        ## _'alpha'._
        if (pos >= 3 and pos + 1 < len(par) and
            par[pos - 2] == ' ' and par[pos] == '.' and
            par[pos - 1].isalpha() and par[pos - 1].islower() and
            par[pos + 1] == ' '):
            stop_words_cand[par[pos - 1:pos + 1]] += 1
        ## _'alpha''alpha'._
        if (pos >= 3 and pos + 1 < len(par) and
            par[pos - 3] == ' ' and
            par[pos - 2].isalpha() and par[pos - 2].islower() and
            par[pos - 1].isalpha() and par[pos - 1].islower() and
            par[pos] == '.' and par[pos + 1] == ' '):
            stop_words_cand[par[pos - 2:pos + 1]] += 1

In [10]:
for word, cnt in stop_words_cand.most_common():
    if cnt > 5:
        print word, cnt

г. 163
т. 108
ст. 57
им. 54
см. 40
н. 39
гг. 31
д. 27
же. 25
км. 24
ч. 24
м. 22
он. 21
е. 19
мм. 19
в. 19
др. 18
их. 16
с. 16
э. 14
ее. 14
кв. 14
я. 13
бы. 13
п. 10
пр. 10
её. 9
кг. 9
ул. 9
св. 8
да. 7
m. 7
гр. 6


In [11]:
stop_words = set([u'им.', u'ст.', u'др.', u'ул.',
                  u'вв.', u'см.', u'гг.', u'кв.',
                  u'св.', u'км', u'мм.', u'г.',
                  u'т.', u'н', u'д', u'ч', 
                  u'м', u'е', u'в', u'с',
                  u'э', u'п'])

In [12]:
stop_encoder = MyLabelOneHotEncoder()
stop_encoder.fit(list(stop_words))

In [13]:
out_data = np.zeros((26476,1))

In [14]:
fea_num = 39

In [15]:
def predict(par, cand):
    features = np.zeros([1, 0])
    is_eof = False                      # 0
    is_next_space = False               # 1
    is_next_in_ends = False             # 2
    is_next_alpha_upper = False         # 3
    is_initial = False                  # 4
    is_numerated_list_beg = False       # 5
    is_in_good_ends = False             # 6
    is_city_or_year = False             # 7
    # Сложные правила:
    ## ._'digit'._'upper'
    is_next_list_begin = False          # 8
    # +Нестрогие
    ## 'upper'._'upper'
    is_feature_9 = False                # 9
    ## 'upper''lower'._'upper'
    is_feature_10 = False               # 10
    is_is_stop_words = False            # 11
    
    pos = cand['Pos']
    
    # 0
    if pos == len(par) - 1:
        is_eof = True
    features = np.append(features, is_eof)
    # 1
    if pos + 1 < len(par) and par[pos + 1] == ' ':
        is_next_space = True
    features = np.append(features, is_next_space)
    # 2
    if pos + 2 < len(par) and par[pos + 2].isalpha() and par[pos +2].isupper():
        is_next_alpha_upper = True
    features = np.append(features, is_next_alpha_upper)
    # 3
    if pos + 1 < len(par) and par[pos + 1] in end_of_sentences:
        is_next_in_ends = True
    features = np.append(features, is_next_in_ends)
    # 4
    if pos >= 2 and par[pos] == '.' and par[pos - 2] == '.' and par[pos - 1].isalpha() and par[pos - 1].isupper():
        is_initial = True
    features = np.append(features, is_initial)
    # 5
    if pos >= 1 and par[pos] == '.' and par[pos - 1].isdigit():
        is_numerated_list_beg = True
    features = np.append(features, is_numerated_list_beg)
    # 6
    if cand['Mark'] in good_ends:
        is_in_good_ends = True
    features = np.append(features, is_in_good_ends)
    # 7
    if pos >= 1 and pos + 1 < len(par) and par[pos - 1:pos + 2] == u'г. ':
        is_city_or_year = True
    features = np.append(features, is_city_or_year)
    # 8
    ## ._d._A
    ## ._dd._A
    ## ._d.A
    ## ._dd.A
    if (pos + 5 < len(par) and 
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isdigit() and
        par[pos + 3] == '.' and par[pos + 4] == ' ' and
        par[pos + 5].isalpha() and par[pos + 5].isupper()):
        is_next_list_begin = True
    if (pos + 6 < len(par) and 
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isdigit() and
        par[pos + 3].isdigit() and
        par[pos + 4] == '.' and par[pos + 5] == ' ' and
        par[pos + 6].isalpha() and par[pos + 6].isupper()):
        is_next_list_begin = True
    if (pos + 4 < len(par) and 
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isdigit() and
        par[pos + 3] == '.' and
        par[pos + 4].isalpha() and par[pos + 4].isupper()):
        is_next_list_begin = True
    if (pos + 5 < len(par) and 
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isdigit() and
        par[pos + 3].isdigit() and
        par[pos + 4] == '.' and
        par[pos + 5].isalpha() and par[pos + 5].isupper()):
        is_next_list_begin = True
    features = np.append(features, is_next_list_begin)
    # 9
    if (pos >= 1 and pos + 2 < len(par) and
        par[pos - 1].isalpha() and par[pos - 1].isupper() and
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isalpha() and par[pos + 2].isupper()):
        is_feature_9 = True
    features = np.append(features, is_feature_9)
    # 10
    if (pos >= 2 and pos + 2 < len(par) and
        par[pos - 2].isalpha() and par[pos - 2].isupper() and
        par[pos - 1].isalpha() and par[pos - 1].islower() and
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos + 2].isalpha() and par[pos + 2].isupper()):
        is_feature_10 = True  
    features = np.append(features, is_feature_10)
    # 11
    if (pos >= 3 and pos + 1 < len(par) and
        par[pos - 3] == ' ' and
        par[pos - 2].isalpha() and par[pos - 2].islower() and
        par[pos - 1].isalpha() and par[pos - 1].islower() and
        par[pos] == '.' and par[pos + 1] == ' ' and
        par[pos - 2:pos + 1] in stop_words):
        is_is_stop_words = True
        features = np.append(features, stop_encoder.predict_one(par[pos - 2:pos + 1]))
    else:
        features = np.append(features, stop_encoder.predict_none())

    # append type of mark
    features = np.append(features, end_encoder.predict_one(par[pos]))
    
#     if is_eof:
#         return 1, features
    
#     if is_initial:
#         return 0, features
    
#     if is_city_or_year:
#         return 0, features
        
#     if is_next_in_ends:
#         return 0, features
    
#     if is_numerated_list_beg:
#         return 0, features
    
#     if is_next_list_begin:
#         return 1, features

#     if not is_in_good_ends:
#         return 0, features
    
#     if is_in_good_ends and (is_next_space and is_next_alpha_upper):
#         return 1, features
            
    return 0, features

In [16]:
X = np.zeros([0, fea_num])
y = np.zeros([0, 1])

In [17]:
for p in train_reform:
    par = p['Paragraph']
    for cand in p['Marks']:
        _, features = predict(par, cand)
        X = np.append(X, features.reshape(1, -1), axis=0)
        y = np.append(y, cand['Label'])

KeyboardInterrupt: 

In [18]:
from xgboost import XGBClassifier

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [21]:
#model = RandomForestClassifier(n_estimators=1000)
model = XGBClassifier(n_estimators=2000, base_score=0.5, reg_lambda = 1)

In [22]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=2000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [23]:
y_predict = model.predict(X_test)

  if diff:


In [24]:
roc_auc_score(y_predict, y_test)

0.9429339640404385

In [25]:
X_real = np.zeros([0, fea_num])

In [26]:
for p in test_data:
    par = p['Paragraph']
    for cand in p['Marks']:
        out_data[cand['Index']-1], features = predict(par, cand)
        X_real = np.append(X_real, features.reshape(1, -1), axis=0)
        pos = cand['Pos']

In [27]:
y_real = model.predict(X_real)

  if diff:


In [28]:
print np.sum(y_real), y_real.shape

17533.0 (26476,)


In [29]:
idx = 0
fd = open('marks.txt', 'w')
for p in test_data:
    par = p['Paragraph']
    fd.write(par.encode('utf-8') + '\n')
    for cand in p['Marks']:
        out_data[cand['Index']-1] = y_real[idx]
        pos = cand['Pos']
        fd.write(par[max(0,pos-15):pos].encode('utf-8') + '!' + par[pos:min(len(par),pos+15)].encode('utf-8') + ' ' + str(out_data[cand['Index']-1][0]) + '\n')
        idx += 1

In [30]:
df = pd.DataFrame(out_data, columns=['Mark'], index=range(1,26477))
df.index.name = 'Id'

In [31]:
df.to_csv("sampleSubmission.csv")

Адвокаты не допущены к задержанным. ovdinfo.org внимательно следит за развитием событий.

In [32]:
par = u'и Центр им. Вс. Мейерхольда (Москва) объявляют о начале'
par = u'Адвокаты не допущены к задержанным. ovdinfo.org внимательно следит за развитием событий.'
print par[87]
cand = {'Pos' : 87, 'Mark' : '.'}
_, fea = predict(par, cand)
print fea.reshape(1, -1)
model.predict(fea.reshape(1, -1))

.
[[1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]


  if diff:


array([1.])

Великий режиссер советского кинематографа, мыслитель, новатор и педагог М. Ромм писал о телевидении
и Центр им. Вс. Мейерхольда (Москва) объявляют о начале