In [1]:
import numpy as np
import pandas as pd

import string
import re
import gc

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split



In [3]:
train = pd.read_csv('data/train.csv', encoding='utf-8')
train.dropna(inplace=True)
train = train[train.sentence_id != 492799]

In [6]:
train['class'].unique()

array(['PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
       'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
       'DIGIT', 'FRACTION', 'MONEY', 'TIME'], dtype=object)

In [22]:
punctuations = [s[0] for s in train[train['class'] == 'PUNCT']['before'].unique().tolist()]
verbatims = train[train['class'] == 'VERBATIM'].query('after != before').before.unique().tolist()
currencies = [u'¥', u'$', u'€', u'£']
power_signs = [u'²', u'³']
fraction_signs = [u'¼', u'½', u'¾', u'⅔', u'⅓']
specials = list(set(string.punctuation) - set(punctuations) - set(currencies))
date_signs = [u'январ', u'феврал', u'март', u'апрел', u'мая', u'май', u'июн', \
        u'июл', u'август', u'сентябр', u'октябр', u'ноябр', u'декабр', u'год']
quantitatives = [u'тыс', u'млн', u'миллион', u'миллиард', u'млрд', u'трлн', u'триллион']
measure_names = [u'см', u'км', u'%', u'с.', u'метров', u'мин', u'st', u'мм', 
                 u'кило', u'кг', u'м.', u'л.', u'т.', u'ярд', u' м']
currency_names = [u'евро', u'руб', u'долл', u'гривен', u'гривн', u'usd', u'eur', u'юан', u'йен', u'cyp', u'php', u'bsd']

In [23]:
def is_latin_letter(letter):
    ord_letter = ord(letter)
    return ord_letter >= 97 and ord_letter <= 122

def is_rus_letter(letter):
    ord_letter = ord(letter)
    return ord_letter >= 1072 and ord_letter <= 1105

def is_digit(letter):
    ord_letter = ord(letter)
    return ord_letter >= 48 and ord_letter <= 57

def is_greek(letter):
    ord_letter = ord(letter)
    return ord_letter >= 945 and ord_letter <= 969

def is_dash(letter):
    return letter == u'—'

def is_dot(letter):
    return letter == u'.'

def is_comma(letter):
    return letter == u','

def is_slash(letter):
    return letter == u'/'

def is_colon(letter):
    return letter == u':'

def is_hyphen(letter):
    return letter == u'-'

def is_date_in_word(word):
    for d in date_signs:
        if d in word:
            return True
    return False

def is_measure_in_word(word):
    for m in measure_names:
        if m in word:
            return True
    return False

def is_domain_in_word(word):
    url_pattern = "(([a-zA-Z0-9]+(-[a-zA-Z0-9]+)*\.)+[a-z]{2,10})"
    return len(re.findall(url_pattern, word)) > 0

def is_quantitative_in_word(word):
    for d in quantitatives:
        if d in word:
            return True
    return False

def is_currency_name_in_word(word):
    for c in currency_names:
        if c in word:
            return True
    return False

def generate_features(value):
    
    value = value.strip()
    num_letters = len(value)
    num_words = len(value.split(" "))
    
    rus_letters_num = 0
    en_letters_num = 0
    digits_num = 0
    punctuations_num = 0
    greek_letters_num = 0
    other_letters_num = 0
    special_letters_num = 0
    whitespaces_num = 0
    uppercase_num = 0
    
    has_dash = 0
    has_currency_sign = 0
    has_power_sign = 0
    has_fraction_sign = 0
    has_dot = 0
    has_comma = 0
    has_slash = 0
    has_colon = 0
    num_hyphens = 0

    for l in value:
        if l.isupper():
            uppercase_num += 1
        
        l = l.lower()

        if is_rus_letter(l):
            rus_letters_num += 1
        elif is_latin_letter(l):
            en_letters_num += 1
        elif is_digit(l):
            digits_num += 1
        elif is_greek(l):
            greek_letters_num += 1
        elif is_hyphen(l):
            num_hyphens += 1
        elif l in punctuations:
            punctuations_num += 1
        elif l in specials:
            special_letters_num += 1
        elif l.isspace():
            whitespaces_num += 1
        else:
            other_letters_num += 1
            
        if is_dash(l):
            has_dash = 1
        elif is_dot(l):
            has_dot = 1
        elif is_comma(l):
            has_comma = 1
        elif is_slash(l):
            has_slash = 1
        elif is_colon(l):
            has_colon = 1
        elif l in currencies:
            has_currency_sign = 1
        elif l in power_signs:
            has_power_sign = 1
        elif l in fraction_signs:
            has_fraction_sign = 1

    value = value.lower()
        
    is_uppercase = 1 if num_letters == uppercase_num else 0
    is_single_char = 1 if num_letters == 1 else 0
    is_russian_only = 1 if num_letters == rus_letters_num else 0
    is_english_only = 1 if num_letters == en_letters_num else 0
    is_digits_only = 1 if num_letters == digits_num else 0
    is_punctuation_only = 1 if num_letters == punctuations_num else 0
    is_one_word = 1 if num_words == 1 else 0
    
    has_date = is_date_in_word(value)
    has_measure = is_measure_in_word(value)
    has_domain = is_domain_in_word(value)
    has_quantitative = is_quantitative_in_word(value)
    is_zero_leading = len(value) > 0 and value[0] == u'0'
    has_currency_name = is_currency_name_in_word(value)
    
    return [num_letters, num_words, rus_letters_num, en_letters_num, digits_num, greek_letters_num, \
            other_letters_num, punctuations_num, uppercase_num, special_letters_num, num_hyphens, \
            is_russian_only, is_english_only, is_digits_only, is_punctuation_only, is_one_word, \
            is_uppercase, is_single_char, is_zero_leading, has_dash, has_currency_sign, has_power_sign, \
            has_fraction_sign, has_date, has_dot, has_comma, has_domain, has_quantitative, has_slash, \
            has_currency_name, has_colon, has_measure]

In [27]:
%%time
gen_data = train.before.apply(generate_features)
gen_data = np.vstack(gen_data.values)

Wall time: 8min 26s


In [65]:
%%time
features_no = len(generate_features(train.before.values[0]))
gen_features = np.zeros((train.shape[0], features_no * 5), dtype = 'int8')

lenghts = train.sentence_id.values
_, lenghts = np.unique(lenghts, return_counts = True)
lenghts = [[count] * count for count in lenghts]
flat_lenghts = [item for sublist in lenghts for item in sublist]
lenghts = np.array(flat_lenghts)

token_ids = train.token_id.values

for index in range(train.shape[0]):
    sentence_length = lenghts[index] - 1
    token_id = token_ids[index]
    
    for offset in range(-2, 3):
        if token_id + offset < 0 or token_id + offset > sentence_length:
            continue
        gen_features[index + offset, ((-offset + 2) * features_no):((-offset + 3) * features_no)] = gen_data[index, :]

Wall time: 3min 30s


In [24]:
sample_sizes = {
    "PUNCT" : 25000,
    "PLAIN" : 30000,
    "CARDINAL" : 20000,
    "LETTERS" : 20000,
    "DATE" : 20000,
    "ORDINAL" : 15000,
    "MEASURE" : 15000
}

In [106]:
train.reset_index(inplace=True)

In [107]:
%%time
train_data = pd.DataFrame(columns=train.columns)
sample_max_size = 10000

for c in train['class'].unique():
    class_sample = train[train['class'] == c]
    sample_size = sample_sizes[c] if c in sample_sizes else sample_max_size
    if len(class_sample) > sample_max_size:
        train_data = train_data.append(class_sample.sample(sample_size))
    else:
        train_data = train_data.append(class_sample)

Wall time: 21.8 s


In [108]:
labels = train_data['class'].values
train_indices = train_data.index.values
train_data = np.take(gen_features, train_indices, axis=0)

In [109]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)



In [110]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
labels = enc.fit_transform(labels)

### Train XGBoost classifier

In [115]:
xtr, xcv, ytr, ycv = train_test_split(train_data, labels, test_size = 0.1,  random_state = 42)

In [116]:
train = xgb.DMatrix(xtr, label=ytr)
dvalid = xgb.DMatrix(xcv, label=ycv)

In [123]:
params = [("objective", "multi:softmax"),
          ("num_class", 15),
          ("booster", "gbtree"),
          ("nthread", 3),
          ("eta", 0.01),
          ("max_depth", 8),
          ("subsample", 0.9),
          ("min_child_weight", 1),
          ("colsample_bytree", 0.7),
          ("eval_metric", 'mlogloss'),
          ("eval_metric", 'merror')
         ]
num_rounds = 2000
stop = 40

In [120]:
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [124]:
gbm = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=stop)

[0]	train-mlogloss:2.63718	train-merror:0.019889	valid-mlogloss:2.63695	valid-merror:0.017357
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 40 rounds.
[1]	train-mlogloss:2.57272	train-merror:0.013918	valid-mlogloss:2.5724	valid-merror:0.013192
[2]	train-mlogloss:2.51126	train-merror:0.012839	valid-mlogloss:2.51095	valid-merror:0.011856
[3]	train-mlogloss:2.45333	train-merror:0.011905	valid-mlogloss:2.45303	valid-merror:0.011109
[4]	train-mlogloss:2.39862	train-merror:0.012049	valid-mlogloss:2.39833	valid-merror:0.011322
[5]	train-mlogloss:2.34873	train-merror:0.011937	valid-mlogloss:2.3485	valid-merror:0.011322
[6]	train-mlogloss:2.29914	train-merror:0.011878	valid-mlogloss:2.2989	valid-merror:0.011376
[7]	train-mlogloss:2.25245	train-merror:0.011712	valid-mlogloss:2.25222	valid-merror:0.011109
[8]	train-mlogloss:2.20946	train-merror:0.011477	valid-mlogloss:2.20931	valid-merror:0.010681
[9]	trai

[85]	train-mlogloss:0.814828	train-merror:0.009907	valid-mlogloss:0.814577	valid-merror:0.008812
[86]	train-mlogloss:0.806355	train-merror:0.009886	valid-mlogloss:0.806086	valid-merror:0.008812
[87]	train-mlogloss:0.798059	train-merror:0.009886	valid-mlogloss:0.797786	valid-merror:0.008812
[88]	train-mlogloss:0.789882	train-merror:0.009897	valid-mlogloss:0.789607	valid-merror:0.008919
[89]	train-mlogloss:0.781631	train-merror:0.009891	valid-mlogloss:0.781351	valid-merror:0.008866
[90]	train-mlogloss:0.773625	train-merror:0.009875	valid-mlogloss:0.77334	valid-merror:0.008866
[91]	train-mlogloss:0.765711	train-merror:0.009859	valid-mlogloss:0.765419	valid-merror:0.008866
[92]	train-mlogloss:0.757846	train-merror:0.009854	valid-mlogloss:0.757547	valid-merror:0.008812
[93]	train-mlogloss:0.750035	train-merror:0.009854	valid-mlogloss:0.749728	valid-merror:0.008812
[94]	train-mlogloss:0.742424	train-merror:0.009816	valid-mlogloss:0.742121	valid-merror:0.008759
[95]	train-mlogloss:0.734909	tr

[170]	train-mlogloss:0.35941	train-merror:0.009208	valid-mlogloss:0.358749	valid-merror:0.008118
[171]	train-mlogloss:0.356166	train-merror:0.009181	valid-mlogloss:0.355501	valid-merror:0.008118
[172]	train-mlogloss:0.352995	train-merror:0.009138	valid-mlogloss:0.352323	valid-merror:0.008118
[173]	train-mlogloss:0.349791	train-merror:0.009149	valid-mlogloss:0.349123	valid-merror:0.008118
[174]	train-mlogloss:0.346601	train-merror:0.009154	valid-mlogloss:0.345929	valid-merror:0.008118
[175]	train-mlogloss:0.343512	train-merror:0.009149	valid-mlogloss:0.342837	valid-merror:0.008118
[176]	train-mlogloss:0.340444	train-merror:0.009117	valid-mlogloss:0.339767	valid-merror:0.008065
[177]	train-mlogloss:0.337438	train-merror:0.009106	valid-mlogloss:0.336756	valid-merror:0.008065
[178]	train-mlogloss:0.3344	train-merror:0.00909	valid-mlogloss:0.333712	valid-merror:0.008065
[179]	train-mlogloss:0.331446	train-merror:0.009079	valid-mlogloss:0.330751	valid-merror:0.008011
[180]	train-mlogloss:0.3

[254]	train-mlogloss:0.17511	train-merror:0.008508	valid-mlogloss:0.174302	valid-merror:0.007691
[255]	train-mlogloss:0.173697	train-merror:0.008476	valid-mlogloss:0.172886	valid-merror:0.007584
[256]	train-mlogloss:0.172298	train-merror:0.008476	valid-mlogloss:0.17149	valid-merror:0.007584
[257]	train-mlogloss:0.17092	train-merror:0.008449	valid-mlogloss:0.170111	valid-merror:0.007584
[258]	train-mlogloss:0.169528	train-merror:0.008417	valid-mlogloss:0.168719	valid-merror:0.007477
[259]	train-mlogloss:0.168189	train-merror:0.008129	valid-mlogloss:0.167381	valid-merror:0.00721
[260]	train-mlogloss:0.166834	train-merror:0.008123	valid-mlogloss:0.166029	valid-merror:0.00721
[261]	train-mlogloss:0.165512	train-merror:0.008102	valid-mlogloss:0.164706	valid-merror:0.00721
[262]	train-mlogloss:0.164205	train-merror:0.008145	valid-mlogloss:0.163392	valid-merror:0.00721
[263]	train-mlogloss:0.162897	train-merror:0.008043	valid-mlogloss:0.162084	valid-merror:0.007103
[264]	train-mlogloss:0.1616

[339]	train-mlogloss:0.092858	train-merror:0.007579	valid-mlogloss:0.091997	valid-merror:0.006729
[340]	train-mlogloss:0.092231	train-merror:0.007579	valid-mlogloss:0.091369	valid-merror:0.006729
[341]	train-mlogloss:0.091603	train-merror:0.007579	valid-mlogloss:0.090736	valid-merror:0.006729
[342]	train-mlogloss:0.090985	train-merror:0.007563	valid-mlogloss:0.090115	valid-merror:0.006729
[343]	train-mlogloss:0.09038	train-merror:0.007563	valid-mlogloss:0.089511	valid-merror:0.006729
[344]	train-mlogloss:0.089776	train-merror:0.007568	valid-mlogloss:0.088905	valid-merror:0.006729
[345]	train-mlogloss:0.089192	train-merror:0.007568	valid-mlogloss:0.088319	valid-merror:0.006729
[346]	train-mlogloss:0.088595	train-merror:0.007568	valid-mlogloss:0.087718	valid-merror:0.006729
[347]	train-mlogloss:0.088026	train-merror:0.007579	valid-mlogloss:0.087151	valid-merror:0.006729
[348]	train-mlogloss:0.087453	train-merror:0.007573	valid-mlogloss:0.08658	valid-merror:0.006676
[349]	train-mlogloss:0

[423]	train-mlogloss:0.056287	train-merror:0.007226	valid-mlogloss:0.055426	valid-merror:0.006409
[424]	train-mlogloss:0.055984	train-merror:0.007221	valid-mlogloss:0.055126	valid-merror:0.006409
[425]	train-mlogloss:0.055703	train-merror:0.007215	valid-mlogloss:0.054849	valid-merror:0.006409
[426]	train-mlogloss:0.055417	train-merror:0.007215	valid-mlogloss:0.054563	valid-merror:0.006409
[427]	train-mlogloss:0.055136	train-merror:0.007215	valid-mlogloss:0.054286	valid-merror:0.006409
[428]	train-mlogloss:0.054861	train-merror:0.007205	valid-mlogloss:0.054011	valid-merror:0.006355
[429]	train-mlogloss:0.054569	train-merror:0.007189	valid-mlogloss:0.05372	valid-merror:0.006302
[430]	train-mlogloss:0.054302	train-merror:0.007189	valid-mlogloss:0.053456	valid-merror:0.006302
[431]	train-mlogloss:0.054034	train-merror:0.007194	valid-mlogloss:0.053188	valid-merror:0.006302
[432]	train-mlogloss:0.053771	train-merror:0.007194	valid-mlogloss:0.052925	valid-merror:0.006302
[433]	train-mlogloss:

In [None]:
gbm.save_model('xgboost_32fwc.model')

In [125]:
(dtrain.num_row() * 0.007189 + dvalid.num_row() * 0.006302)

1464.038252

In [None]:
gc.collect()

#### Predict and validate 

In [None]:
test_data = scaler.transform(gen_features)
test_labels = enc.transform(train['class'].values)



KeyboardInterrupt: 

In [58]:
dtest = xgb.DMatrix(test_data, label=test_labels)

In [59]:
predictions = gbm.predict(dtest)

In [60]:
from sklearn.metrics import accuracy_score

accuracy_score(predictions, test_labels)

0.99012823394692573

In [61]:
np.sum(predictions != test_labels)

104389

In [62]:
train['predicted_class'] = np.apply_along_axis((lambda x: enc.classes_[x]), 0, predictions.astype(np.uint8))

In [63]:
prediction_errors = train[train['class'] != train['predicted_class']]['class'].value_counts()

In [64]:
for i in range(len(prediction_errors)):
    print ("{:>16}{:>12.2f}%".format(prediction_errors.index[i], prediction_errors[i] * 100 / len(train[train['class'] == prediction_errors.index[i]])))

           PLAIN        0.70%
         LETTERS       11.19%
         ORDINAL       41.32%
        CARDINAL        4.10%
            DATE        0.32%
         MEASURE        1.13%
       TELEPHONE        0.97%
           PUNCT        0.00%
           MONEY        1.15%
      ELECTRONIC        0.26%
         DECIMAL        0.21%
            TIME        0.62%
        VERBATIM        0.00%


In [65]:
prediction_errors

PLAIN         51379
LETTERS       21211
ORDINAL       19313
CARDINAL      11175
DATE            586
MEASURE         458
TELEPHONE        98
PUNCT            91
MONEY            31
ELECTRONIC       15
DECIMAL          15
TIME             12
VERBATIM          5
Name: class, dtype: int64

In [66]:
for c in train['class'].unique():
    print ("Errors for class {}:".format(c))
    print (train[(train['class'] == c) & (train['class'] != train['predicted_class'])]['predicted_class'].value_counts())
    print ("\n" * 2)

Errors for class PLAIN:
LETTERS       25168
ORDINAL       13232
PUNCT          6965
VERBATIM       5935
ELECTRONIC       79
Name: predicted_class, dtype: int64



Errors for class DATE:
MEASURE       388
TELEPHONE     124
ORDINAL        43
FRACTION       28
ELECTRONIC      3
Name: predicted_class, dtype: int64



Errors for class PUNCT:
TELEPHONE    91
Name: predicted_class, dtype: int64



Errors for class ORDINAL:
CARDINAL    12242
LETTERS      7058
MEASURE         8
PLAIN           5
Name: predicted_class, dtype: int64



Errors for class VERBATIM:
PLAIN    5
Name: predicted_class, dtype: int64



Errors for class LETTERS:
PLAIN         21199
ELECTRONIC       12
Name: predicted_class, dtype: int64



Errors for class CARDINAL:
LETTERS      7608
ORDINAL      2977
DECIMAL       259
TELEPHONE     246
PLAIN          80
DIGIT           5
Name: predicted_class, dtype: int64



Errors for class MEASURE:
DATE          231
MONEY          81
DECIMAL        80
ORDINAL        53
LETTERS        