In [1]:
import numpy as np
import pandas as pd

import string
import re
import gc

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split



In [3]:
# Read the training CSV, discard rows with any missing value,
# then exclude every row belonging to sentence 492799.
train = pd.read_csv('data/train.csv', encoding='utf-8').dropna()
train = train[train['sentence_id'] != 492799]

In [4]:
# Distinct values of the `class` column, in order of first appearance.
train.loc[:, 'class'].unique()

array(['PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
       'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
       'DIGIT', 'FRACTION', 'MONEY', 'TIME'], dtype=object)

In [5]:
# Character inventories and keyword stems consulted by the feature helpers.

# First character of every distinct PUNCT token in the training data.
punctuations = [
    token[0]
    for token in train.loc[train['class'] == 'PUNCT', 'before'].unique().tolist()
]

# Distinct VERBATIM tokens whose normalised form differs from the raw text.
verbatims = (
    train[train['class'] == 'VERBATIM']
    .query('after != before')
    .before
    .unique()
    .tolist()
)

currencies = [u'¥', u'$', u'€', u'£']
power_signs = [u'²', u'³']
fraction_signs = [u'¼', u'½', u'¾', u'⅔', u'⅓']

# ASCII punctuation that is neither a known PUNCT character nor a currency sign.
specials = list(set(string.punctuation).difference(punctuations, currencies))

date_signs = [
    u'январ', u'феврал', u'март', u'апрел', u'мая', u'май', u'июн',
    u'июл', u'август', u'сентябр', u'октябр', u'ноябр', u'декабр', u'год',
]
quantitatives = [u'тыс', u'млн', u'миллион', u'миллиард', u'млрд']
measure_names = [
    u'см', u'км', u'%', u'с.', u'метров', u'мин', u'st', u'мм',
    u'кило', u'кг', u'м.', u'л.', u'т.', u'ярд', u' м',
]
currency_names = [
    u'евро', u'руб', u'долл', u'гривен', u'гривн', u'usd',
    u'eur', u'юан', u'йен', u'cyp', u'php', u'bsd',
]

In [43]:
def is_latin_letter(letter):
    """Return True when the single character ``letter`` lies in 'a'..'z'.

    Only lowercase ASCII letters qualify; callers lower-case beforehand.
    """
    return ord(u'a') <= ord(letter) <= ord(u'z')

def is_rus_letter(letter):
    """Return True when ``letter`` has a code point in U+0430..U+0451.

    That span runs from lowercase Cyrillic 'а' up to and including 'ё'.
    """
    return 0x0430 <= ord(letter) <= 0x0451

def is_digit(letter):
    """Return True when the single character ``letter`` is an ASCII digit '0'..'9'."""
    return ord(u'0') <= ord(letter) <= ord(u'9')

def is_greek(letter):
    """Return True when ``letter`` has a code point in U+03B1..U+03C9.

    That span covers the lowercase Greek letters from alpha to omega.
    """
    return 0x03B1 <= ord(letter) <= 0x03C9

def is_dash(letter):
    """Return True when ``letter`` is exactly the long dash character '—'."""
    long_dash = u'—'
    return long_dash == letter

def is_dot(letter):
    """Return True when ``letter`` is exactly a full stop ('.')."""
    full_stop = u'.'
    return full_stop == letter

def is_comma(letter):
    """Return True when ``letter`` is exactly a comma (',')."""
    comma = u','
    return comma == letter

def is_slash(letter):
    """Return True when ``letter`` is exactly a forward slash ('/')."""
    forward_slash = u'/'
    return forward_slash == letter

def is_colon(letter):
    """Return True when ``letter`` is exactly a colon (':')."""
    colon = u':'
    return colon == letter

def is_hyphen(letter):
    """Return True when ``letter`` is exactly the ASCII hyphen-minus ('-')."""
    hyphen_minus = u'-'
    return hyphen_minus == letter

def is_date_in_word(word):
    """Return True if any entry of the module-level ``date_signs`` is a substring of ``word``."""
    return any(sign in word for sign in date_signs)

def is_measure_in_word(word):
    """Return True if any entry of the module-level ``measure_names`` is a substring of ``word``."""
    return any(unit in word for unit in measure_names)

def is_domain_in_word(word):
    """Return True when ``word`` contains something shaped like a domain name.

    A match is one or more dot-terminated labels (ASCII letters/digits,
    optionally joined by single hyphens) followed by 2 to 10 lowercase ASCII
    letters, e.g. ``example.com``.

    Parameters
    ----------
    word : str
        Text to scan; the match may occur anywhere inside it.

    Returns
    -------
    bool
    """
    # Raw string: `\.` in a normal string literal is an invalid escape sequence.
    url_pattern = r"(([a-zA-Z0-9]+(-[a-zA-Z0-9]+)*\.)+[a-z]{2,10})"
    # search() stops at the first hit instead of collecting every match.
    return re.search(url_pattern, word) is not None

def is_quantitative_in_word(word):
    """Return True if any entry of the module-level ``quantitatives`` is a substring of ``word``."""
    return any(stem in word for stem in quantitatives)

def is_currency_name_in_word(word):
    """Return True if any entry of the module-level ``currency_names`` is a substring of ``word``."""
    return any(name in word for name in currency_names)

def generate_features(value):
    """Turn one token string into the 32-element feature list fed to the classifier.

    The token is stripped of surrounding whitespace. Each character is then
    lower-cased and counted in exactly one category (Russian, Latin, digit,
    Greek, hyphen, punctuation, special, whitespace, other) and checked against
    the marker characters (dash, dot, comma, slash, colon, currency, power and
    fraction signs). Substring checks for date, measure, domain, quantitative
    and currency-name markers run on the lower-cased token.

    Parameters
    ----------
    value : str
        Raw token text.

    Returns
    -------
    list of int
        In order: num_letters, num_words, rus_letters_num, en_letters_num,
        digits_num, greek_letters_num, other_letters_num, punctuations_num,
        uppercase_num, special_letters_num, num_hyphens, is_russian_only,
        is_english_only, is_digits_only, is_punctuation_only, is_one_word,
        is_uppercase, is_single_char, is_zero_leading, has_dash,
        has_currency_sign, has_power_sign, has_fraction_sign, has_date,
        has_dot, has_comma, has_domain, has_quantitative, has_slash,
        has_currency_name, has_colon, has_measure.
        Every flag is 0 or 1 (plain ints, so the row has a single type).
    """
    value = value.strip()
    num_letters = len(value)
    num_words = len(value.split(" "))

    rus_letters_num = 0
    en_letters_num = 0
    digits_num = 0
    punctuations_num = 0
    greek_letters_num = 0
    other_letters_num = 0
    special_letters_num = 0
    uppercase_num = 0
    num_hyphens = 0

    has_dash = 0
    has_currency_sign = 0
    has_power_sign = 0
    has_fraction_sign = 0
    has_dot = 0
    has_comma = 0
    has_slash = 0
    has_colon = 0

    for original_char in value:
        if original_char.isupper():
            uppercase_num += 1

        l = original_char.lower()
        # Some characters (e.g. U+0130) lower-case to more than one code point,
        # which would make the ord()-based helpers raise TypeError.  Keep the
        # original character in that case; it is then classified as "other".
        if len(l) != 1:
            l = original_char

        # Category counters: each character lands in exactly one bucket.
        if is_rus_letter(l):
            rus_letters_num += 1
        elif is_latin_letter(l):
            en_letters_num += 1
        elif is_digit(l):
            digits_num += 1
        elif is_greek(l):
            greek_letters_num += 1
        elif is_hyphen(l):
            num_hyphens += 1
        elif l in punctuations:
            punctuations_num += 1
        elif l in specials:
            special_letters_num += 1
        elif l.isspace():
            # Whitespace is not a feature; the branch only keeps it out of "other".
            pass
        else:
            other_letters_num += 1

        # Marker flags, independent of the category chosen above.
        if is_dash(l):
            has_dash = 1
        elif is_dot(l):
            has_dot = 1
        elif is_comma(l):
            has_comma = 1
        elif is_slash(l):
            has_slash = 1
        elif is_colon(l):
            has_colon = 1
        elif l in currencies:
            has_currency_sign = 1
        elif l in power_signs:
            has_power_sign = 1
        elif l in fraction_signs:
            has_fraction_sign = 1

    # The keyword lists are lowercase, so substring checks use a lowercase copy.
    value = value.lower()

    is_uppercase = int(num_letters == uppercase_num)
    is_single_char = int(num_letters == 1)
    is_russian_only = int(num_letters == rus_letters_num)
    is_english_only = int(num_letters == en_letters_num)
    is_digits_only = int(num_letters == digits_num)
    is_punctuation_only = int(num_letters == punctuations_num)
    is_one_word = int(num_words == 1)

    has_date = int(is_date_in_word(value))
    has_measure = int(is_measure_in_word(value))
    has_domain = int(is_domain_in_word(value))
    has_quantitative = int(is_quantitative_in_word(value))
    is_zero_leading = int(len(value) > 0 and value[0] == u'0')
    has_currency_name = int(is_currency_name_in_word(value))

    return [
        num_letters, num_words, rus_letters_num, en_letters_num, digits_num, greek_letters_num,
        other_letters_num, punctuations_num, uppercase_num, special_letters_num, num_hyphens,
        is_russian_only, is_english_only, is_digits_only, is_punctuation_only, is_one_word,
        is_uppercase, is_single_char, is_zero_leading, has_dash, has_currency_sign, has_power_sign,
        has_fraction_sign, has_date, has_dot, has_comma, has_domain, has_quantitative, has_slash,
        has_currency_name, has_colon, has_measure,
    ]

In [7]:
# Per-class sample-size overrides; classes not listed here fall back to the
# default cap chosen in the sampling cell.
sample_sizes = dict(
    PUNCT=25000,
    PLAIN=30000,
    CARDINAL=20000,
    LETTERS=20000,
    DATE=20000,
    ORDINAL=15000,
    MEASURE=15000,
)

In [44]:
%%time
# Build a class-balanced training sample.
# Each class contributes at most `sample_sizes[class]` rows, or
# `sample_max_size` rows when it has no override; smaller classes are kept whole.
sample_max_size = 10000
sampled_parts = []

for c in train['class'].unique():
    class_sample = train[train['class'] == c]
    sample_size = sample_sizes.get(c, sample_max_size)
    # Compare against the size actually requested: asking sample() for more
    # rows than the class holds raises ValueError.
    if len(class_sample) > sample_size:
        # Fixed seed so a re-run draws the same rows.
        sampled_parts.append(class_sample.sample(sample_size, random_state=42))
    else:
        sampled_parts.append(class_sample)

# One concat instead of DataFrame.append inside the loop: append is removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration.
train_data = pd.concat(sampled_parts)

Wall time: 20.9 s


In [45]:
%%time
# Keep the class labels, then replace the sampled frame with its feature
# matrix: one generate_features() row per `before` token, stacked into 2-D.
labels = train_data['class']
train_data = np.vstack(train_data['before'].apply(generate_features).values)

Wall time: 7.95 s


In [46]:
# (rows, features) of the stacked training matrix.
np.shape(train_data)

(187236, 32)

In [47]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features and standardise them; the fitted
# `scaler` is reused later to transform the evaluation features.
scaler = StandardScaler().fit(train_data)
train_data = scaler.transform(train_data)



In [48]:
from sklearn.preprocessing import LabelEncoder

# Map class names to integer ids; the fitted `enc` is reused later to encode
# evaluation labels and to translate predictions back to names.
enc = LabelEncoder().fit(labels)
labels = enc.transform(labels)

#### Save XGBoost data 

In [49]:
# Hold out 10% of the sampled rows for validation (fixed seed).
xtr, xcv, ytr, ycv = train_test_split(
    train_data,
    labels,
    test_size=0.1,
    random_state=42,
)

In [50]:
# Wrap the training and validation splits as XGBoost DMatrix objects.
dtrain = xgb.DMatrix(data=xtr, label=ytr)
dvalid = xgb.DMatrix(data=xcv, label=ycv)

In [12]:
# Persist both matrices to binary buffer files in the working directory.
dtrain.save_binary(fname='dtrain.buffer')
dvalid.save_binary(fname='dvalid.buffer')

#### Load XGBoost data 

In [3]:
# Reload the matrices from the buffer files written by the save cell.
dtrain, dvalid = (
    xgb.DMatrix(buffer_path)
    for buffer_path in ('dtrain.buffer', 'dvalid.buffer')
)

In [15]:
# Booster configuration, kept as a list of pairs rather than a dict so that
# "eval_metric" can appear twice.
params = [
    ("objective", "multi:softmax"),
    ("num_class", 15),
    ("booster", "gbtree"),
    ("nthread", 3),
    ("eta", 0.01),
    ("max_depth", 6),
    ("subsample", 0.9),
    ("min_child_weight", 1),
    ("colsample_bytree", 0.7),
] + [("eval_metric", metric) for metric in ('mlogloss', 'merror')]

# Boosting-round budget and early-stopping patience passed to xgb.train.
num_rounds, stop = 2000, 20

In [51]:
# Matrices evaluated after every boosting round, paired with their log names.
watchlist = list(zip((dtrain, dvalid), ('train', 'valid')))

In [52]:
# Train the booster, logging metrics on the watchlist and stopping early once
# the monitored metric has not improved for `stop` rounds.
gbm = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=stop,
)

[0]	train-mlogloss:2.64238	train-merror:0.060649	valid-mlogloss:2.64232	valid-merror:0.061312
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 20 rounds.
[1]	train-mlogloss:2.58269	train-merror:0.065366	valid-mlogloss:2.58243	valid-merror:0.065744
[2]	train-mlogloss:2.52449	train-merror:0.057373	valid-mlogloss:2.52422	valid-merror:0.057146
[3]	train-mlogloss:2.47067	train-merror:0.057836	valid-mlogloss:2.47039	valid-merror:0.057894
[4]	train-mlogloss:2.4195	train-merror:0.057693	valid-mlogloss:2.41925	valid-merror:0.05736
[5]	train-mlogloss:2.37121	train-merror:0.055503	valid-mlogloss:2.37098	valid-merror:0.054743
[6]	train-mlogloss:2.32412	train-merror:0.05529	valid-mlogloss:2.32382	valid-merror:0.054529
[7]	train-mlogloss:2.28234	train-merror:0.054204	valid-mlogloss:2.28193	valid-merror:0.053514
[8]	train-mlogloss:2.23983	train-merror:0.054067	valid-mlogloss:2.2394	valid-merror:0.053034
[9]	train

In [53]:
# Combined train + validation row count scaled by 0.051111
# (presumably an error rate read off the training log; TODO confirm).
0.051111 * (dtrain.num_row() + dvalid.num_row())

9569.819195999999

In [54]:
# Run a full garbage-collection pass before the memory-heavy evaluation step;
# the displayed value is gc.collect()'s return (unreachable objects found).
gc.collect()

35

#### Predict and validate 

In [55]:
%%time
# Evaluation set = the whole filtered training frame: keep its labels and
# build one generate_features() row per `before` token, stacked into 2-D.
test_labels = train['class']
test_data = np.vstack(train['before'].apply(generate_features).values)

Wall time: 7min 32s


In [56]:
test_data.shape

(10574501, 32)

In [57]:
# Reuse the encoder and scaler fitted on the training sample so evaluation
# labels and features share its encoding and scaling.
test_labels = enc.transform(test_labels)
test_data = scaler.transform(test_data)



In [58]:
# Wrap the evaluation features and labels as a DMatrix for prediction.
dtest = xgb.DMatrix(data=test_data, label=test_labels)

In [59]:
# Predicted class id for every evaluation row.
predictions = gbm.predict(data=dtest)

In [60]:
from sklearn.metrics import accuracy_score

# Share of evaluation rows whose predicted id equals the true id.
accuracy_score(y_true=test_labels, y_pred=predictions)

0.99012823394692573

In [61]:
# Absolute number of misclassified rows.
(predictions != test_labels).sum()

104389

In [62]:
# Translate predicted ids back to class names by indexing the encoder's
# class array directly, and store them next to the true labels.
train['predicted_class'] = enc.classes_[predictions.astype(np.uint8)]

In [63]:
# Number of misclassified rows per true class, most frequent first.
prediction_errors = train.loc[train['class'] != train['predicted_class'], 'class'].value_counts()

In [64]:
# Per-class error rate: misclassified rows as a percentage of all rows of
# that true class.
# Class sizes are counted once instead of re-filtering the frame per class.
class_totals = train['class'].value_counts()

# Iterate (label, count) pairs: `prediction_errors[i]` with an integer on a
# string-labelled Series relied on the deprecated positional fallback.
for class_name, error_count in prediction_errors.items():
    print ("{:>16}{:>12.2f}%".format(class_name, error_count * 100 / class_totals[class_name]))

           PLAIN        0.70%
         LETTERS       11.19%
         ORDINAL       41.32%
        CARDINAL        4.10%
            DATE        0.32%
         MEASURE        1.13%
       TELEPHONE        0.97%
           PUNCT        0.00%
           MONEY        1.15%
      ELECTRONIC        0.26%
         DECIMAL        0.21%
            TIME        0.62%
        VERBATIM        0.00%


In [65]:
prediction_errors

PLAIN         51379
LETTERS       21211
ORDINAL       19313
CARDINAL      11175
DATE            586
MEASURE         458
TELEPHONE        98
PUNCT            91
MONEY            31
ELECTRONIC       15
DECIMAL          15
TIME             12
VERBATIM          5
Name: class, dtype: int64

In [66]:
# For every true class, list which classes its misclassified rows were
# assigned to, with counts.
for c in train['class'].unique():
    print ("Errors for class {}:".format(c))
    print (train.loc[(train['class'] == c) & (train['predicted_class'] != c), 'predicted_class'].value_counts())
    print ("\n" * 2)

Errors for class PLAIN:
LETTERS       25168
ORDINAL       13232
PUNCT          6965
VERBATIM       5935
ELECTRONIC       79
Name: predicted_class, dtype: int64



Errors for class DATE:
MEASURE       388
TELEPHONE     124
ORDINAL        43
FRACTION       28
ELECTRONIC      3
Name: predicted_class, dtype: int64



Errors for class PUNCT:
TELEPHONE    91
Name: predicted_class, dtype: int64



Errors for class ORDINAL:
CARDINAL    12242
LETTERS      7058
MEASURE         8
PLAIN           5
Name: predicted_class, dtype: int64



Errors for class VERBATIM:
PLAIN    5
Name: predicted_class, dtype: int64



Errors for class LETTERS:
PLAIN         21199
ELECTRONIC       12
Name: predicted_class, dtype: int64



Errors for class CARDINAL:
LETTERS      7608
ORDINAL      2977
DECIMAL       259
TELEPHONE     246
PLAIN          80
DIGIT           5
Name: predicted_class, dtype: int64



Errors for class MEASURE:
DATE          231
MONEY          81
DECIMAL        80
ORDINAL        53
LETTERS        