In [1]:
import numpy as np
import pandas as pd

import string
import re
import gc

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split



In [3]:
# Load the training table, discard rows with any missing value, and
# leave out sentence 492799.
train = (
    pd.read_csv('data/train.csv', encoding='utf-8')
    .dropna()
    .loc[lambda frame: frame.sentence_id != 492799]
)

In [6]:
# List the distinct values found in the 'class' column.
train['class'].unique()

array(['PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
       'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
       'DIGIT', 'FRACTION', 'MONEY', 'TIME'], dtype=object)

In [181]:
# --- Inventories derived from the training data ---------------------------
# First character of every distinct 'before' token labelled PUNCT.
punctuations = [
    token[0]
    for token in train.loc[train['class'] == 'PUNCT', 'before'].unique().tolist()
]
# Distinct VERBATIM tokens whose 'after' differs from 'before'.
verbatims = (
    train[train['class'] == 'VERBATIM']
    .query('after != before')
    .before.unique()
    .tolist()
)

# --- Hand-written symbol lists ---------------------------------------------
currencies = [u'¥', u'$', u'€', u'£']
power_signs = [u'²', u'³']
fraction_signs = [u'¼', u'½', u'¾', u'⅔', u'⅓']
# ASCII punctuation that is neither an observed PUNCT character nor a
# currency sign.
specials = list(set(string.punctuation).difference(punctuations, currencies))

# --- Substrings looked up in lowercased tokens -----------------------------
date_signs = [
    u'январ', u'феврал', u'март', u'апрел', u'мая', u'май', u'июн',
    u'июл', u'август', u'сентябр', u'октябр', u'ноябр', u'декабр', u'год',
]
quantitatives = [u'тыс', u'млн', u'миллион', u'миллиард', u'млрд']
currency_names = [
    u'евро', u'руб', u'долл', u'гривен', u'гривн', u'usd', u'eur',
    u'юан', u'йен', u'cyp', u'php', u'bsd',
]

In [188]:
def is_latin_letter(letter):
    """Return True when `letter` is a lowercase ASCII letter (code points 97-122).

    `letter` must be a single character; uppercase letters are not matched.
    """
    return 97 <= ord(letter) <= 122

def is_rus_letter(letter):
    """Return True when `letter` is a lowercase Russian letter.

    `letter` must be a single character; uppercase letters are not matched.
    """
    code_point = ord(letter)
    # а..я occupy U+0430..U+044F (1072..1103) and ё sits at U+0451 (1105).
    # U+0450 (ѐ) lies between them but is not part of the Russian alphabet,
    # so a single contiguous range would wrongly accept it.
    return 1072 <= code_point <= 1103 or code_point == 1105

def is_digit(letter):
    """Return True when `letter` is an ASCII digit 0-9 (code points 48-57).

    `letter` must be a single character.
    """
    return 48 <= ord(letter) <= 57

def is_greek(letter):
    """Return True when `letter` has a code point in 945-969 (lowercase Greek α..ω).

    `letter` must be a single character.
    """
    return 945 <= ord(letter) <= 969

def is_dash(letter):
    """Return True when `letter` is the em dash character."""
    em_dash = u'—'
    return letter == em_dash

def is_dot(letter):
    """Return True when `letter` is a full stop."""
    full_stop = u'.'
    return letter == full_stop

def is_comma(letter):
    """Return True when `letter` is a comma."""
    comma = u','
    return letter == comma

def is_slash(letter):
    """Return True when `letter` is a forward slash."""
    forward_slash = u'/'
    return letter == forward_slash

def is_colon(letter):
    """Return True when `letter` is a colon."""
    colon = u':'
    return letter == colon

def is_date_in_word(word):
    """Return True when any entry of `date_signs` occurs as a substring of `word`."""
    return any(sign in word for sign in date_signs)

# Compiled once at definition time. Matches one or more dot-terminated labels
# (ASCII alphanumerics, optionally joined by single hyphens) followed by 2-10
# lowercase ASCII letters. A raw string is required: "\." in a normal string
# literal is an invalid escape sequence and triggers a warning on modern
# Python.
_DOMAIN_PATTERN = re.compile(r"(([a-zA-Z0-9]+(-[a-zA-Z0-9]+)*\.)+[a-z]{2,10})")


def is_domain_in_word(word):
    """Return True when `word` contains something shaped like a domain name.

    Args:
        word: Text to scan.

    Returns:
        bool: True if the domain pattern matches anywhere in `word`.
    """
    # search() stops at the first hit; there is no need to collect every match.
    return _DOMAIN_PATTERN.search(word) is not None

def is_quantitative_in_word(word):
    """Return True when any entry of `quantitatives` occurs as a substring of `word`."""
    return any(marker in word for marker in quantitatives)

def is_currency_name_in_word(word):
    """Return True when any entry of `currency_names` occurs as a substring of `word`."""
    return any(name in word for name in currency_names)

def generate_features(value):
    """Turn one raw token into a flat list of 30 numeric features.

    The token is stripped of surrounding whitespace. Each character is then
    assigned to one category (Russian, Latin, digit, Greek, punctuation,
    special, whitespace, other) and counted, and a set of 0/1 flags records
    which marker characters and keyword substrings the token contains.

    Args:
        value: The token text (a str).

    Returns:
        A list of 30 ints, in this order: num_letters, num_words,
        rus_letters_num, en_letters_num, digits_num, greek_letters_num,
        other_letters_num, punctuations_num, uppercase_num,
        special_letters_num, is_russian_only, is_english_only,
        is_digits_only, is_punctuation_only, is_one_word, is_uppercase,
        is_single_char, is_zero_leading, has_dash, has_currency_sign,
        has_power_sign, has_fraction_sign, has_date, has_dot, has_comma,
        has_domain, has_quantitative, has_slash, has_currency_name,
        has_colon.
    """
    value = value.strip()
    num_letters = len(value)
    # Number of pieces between single spaces; an empty token still yields 1.
    num_words = len(value.split(" "))

    rus_letters_num = 0
    en_letters_num = 0
    digits_num = 0
    punctuations_num = 0
    greek_letters_num = 0
    other_letters_num = 0
    special_letters_num = 0
    uppercase_num = 0

    has_dash = 0
    has_currency_sign = 0
    has_power_sign = 0
    has_fraction_sign = 0
    has_dot = 0
    has_comma = 0
    has_slash = 0
    has_colon = 0

    for raw_char in value:
        if raw_char.isupper():
            uppercase_num += 1

        char = raw_char.lower()
        # str.lower() can expand one character into several (for example
        # 'İ' becomes 'i' plus a combining dot). The ord()-based helpers
        # below only accept a single character, so keep the original
        # character in that case instead of raising TypeError.
        if len(char) != 1:
            char = raw_char

        # Character category: the first matching branch wins.
        if is_rus_letter(char):
            rus_letters_num += 1
        elif is_latin_letter(char):
            en_letters_num += 1
        elif is_digit(char):
            digits_num += 1
        elif is_greek(char):
            greek_letters_num += 1
        elif char in punctuations:
            punctuations_num += 1
        elif char in specials:
            special_letters_num += 1
        elif char.isspace():
            # Inner whitespace is not part of any returned counter.
            pass
        else:
            other_letters_num += 1

        # Marker flags: a character can set at most one of them.
        if is_dash(char):
            has_dash = 1
        elif is_dot(char):
            has_dot = 1
        elif is_comma(char):
            has_comma = 1
        elif is_slash(char):
            has_slash = 1
        elif is_colon(char):
            has_colon = 1
        elif char in currencies:
            has_currency_sign = 1
        elif char in power_signs:
            has_power_sign = 1
        elif char in fraction_signs:
            has_fraction_sign = 1

    # The substring lookups below work on the lowercased token.
    value = value.lower()

    # The "only" flags compare a category count with the total length. For an
    # empty token every count equals the length (0 == 0), which would switch
    # on all of these mutually exclusive flags at once, so require content.
    non_empty = num_letters > 0
    is_uppercase = int(non_empty and num_letters == uppercase_num)
    is_russian_only = int(non_empty and num_letters == rus_letters_num)
    is_english_only = int(non_empty and num_letters == en_letters_num)
    is_digits_only = int(non_empty and num_letters == digits_num)
    is_punctuation_only = int(non_empty and num_letters == punctuations_num)
    is_single_char = int(num_letters == 1)
    is_one_word = int(num_words == 1)

    # Cast to int so every flag in the vector has the same type.
    has_date = int(is_date_in_word(value))
    has_domain = int(is_domain_in_word(value))
    has_quantitative = int(is_quantitative_in_word(value))
    is_zero_leading = int(value.startswith(u'0'))
    has_currency_name = int(is_currency_name_in_word(value))

    return [num_letters, num_words, rus_letters_num, en_letters_num, digits_num, greek_letters_num,
            other_letters_num, punctuations_num, uppercase_num, special_letters_num,
            is_russian_only, is_english_only, is_digits_only, is_punctuation_only, is_one_word,
            is_uppercase, is_single_char, is_zero_leading, has_dash, has_currency_sign, has_power_sign,
            has_fraction_sign, has_date, has_dot, has_comma, has_domain, has_quantitative, has_slash,
            has_currency_name, has_colon]

In [189]:
%%time
# Build a class-capped training sample: at most `sample_max_size` rows per
# class, all rows for classes that are smaller than the cap.
sample_max_size = 10000

class_samples = []
for class_name in train['class'].unique():
    class_sample = train[train['class'] == class_name]
    if len(class_sample) > sample_max_size:
        # A fixed seed makes the sample (and everything trained on it)
        # reproducible across kernel restarts.
        class_sample = class_sample.sample(sample_max_size, random_state=42)
    class_samples.append(class_sample)

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
# on every call; collect the pieces and concatenate once instead.
train_data = pd.concat(class_samples)

Wall time: 20 s


In [190]:
%%time
# Set the class labels aside, then replace the sampled frame with its
# feature matrix (one row per token, one column per feature).
labels = train_data['class']
feature_rows = train_data['before'].apply(generate_features)
train_data = np.vstack(feature_rows.values)

Wall time: 4.56 s


In [191]:
# Show the (rows, columns) dimensions of the training feature matrix.
train_data.shape

(112236, 30)

In [192]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then overwrite them with their
# standardized version. The fitted `scaler` is reused later for the full data.
scaler = StandardScaler()
scaler.fit(train_data)
train_data = scaler.transform(train_data)



In [193]:
from sklearn.preprocessing import LabelEncoder

# Learn the class-name -> integer mapping and replace the string labels with
# their integer codes. `enc` is kept for decoding predictions later.
enc = LabelEncoder().fit(labels)
labels = enc.transform(labels)

#### Save XGBoost data 

In [194]:
# Hold out 10% of the sampled rows for validation; the seed fixes the split.
xtr, xcv, ytr, ycv = train_test_split(
    train_data,
    labels,
    test_size=0.1,
    random_state=42,
)

In [195]:
# Wrap the training and validation splits in XGBoost's DMatrix container.
dtrain, dvalid = (
    xgb.DMatrix(features, label=target)
    for features, target in ((xtr, ytr), (xcv, ycv))
)

In [12]:
# Persist both matrices in XGBoost's binary format so they can be reloaded
# without rebuilding the features.
for matrix, buffer_path in ((dtrain, 'dtrain.buffer'), (dvalid, 'dvalid.buffer')):
    matrix.save_binary(buffer_path)

#### Load XGBoost data 

In [3]:
# Reload the matrices written by the save step.
dtrain, dvalid = (
    xgb.DMatrix(buffer_path)
    for buffer_path in ('dtrain.buffer', 'dvalid.buffer')
)

In [200]:
# Booster settings. `params` is a list of (name, value) pairs rather than a
# dict so that "eval_metric" can be listed twice; per the training log, the
# last one ('merror') is the metric used for early stopping.
booster_settings = {
    "objective": "multi:softmax",
    "num_class": 15,
    "booster": "gbtree",
    "nthread": 3,
    "eta": 0.01,
    "max_depth": 6,
    "subsample": 0.9,
    "min_child_weight": 1,
    "colsample_bytree": 0.7,
}
params = list(booster_settings.items())
params += [("eval_metric", metric) for metric in ('mlogloss', 'merror')]

num_rounds = 2000  # upper bound on boosting rounds
stop = 20          # early-stopping patience, in rounds

In [201]:
# Datasets to evaluate after every boosting round, paired with display names.
watchlist = list(zip((dtrain, dvalid), ('train', 'valid')))

In [204]:
# NOTE(review): 0.07386 is an unexplained constant - presumably an error rate
# being converted into a row count; confirm and name it.
total_rows = dtrain.num_row() + dvalid.num_row()
total_rows * 0.07386

8289.75096

In [203]:
# Train the booster, reporting metrics on `watchlist` each round and stopping
# early once the validation metric stalls for `stop` rounds.
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=stop,
)

[0]	train-mlogloss:2.65157	train-merror:0.129628	valid-mlogloss:2.65177	valid-merror:0.125445
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 20 rounds.
[1]	train-mlogloss:2.59283	train-merror:0.089564	valid-mlogloss:2.59301	valid-merror:0.086511
[2]	train-mlogloss:2.53836	train-merror:0.080901	valid-mlogloss:2.53863	valid-merror:0.078938
[3]	train-mlogloss:2.48856	train-merror:0.079951	valid-mlogloss:2.48886	valid-merror:0.076443
[4]	train-mlogloss:2.44215	train-merror:0.079792	valid-mlogloss:2.44241	valid-merror:0.075552
[5]	train-mlogloss:2.39922	train-merror:0.079565	valid-mlogloss:2.39964	valid-merror:0.075285
[6]	train-mlogloss:2.35506	train-merror:0.079149	valid-mlogloss:2.35552	valid-merror:0.075909
[7]	train-mlogloss:2.31698	train-merror:0.08202	valid-mlogloss:2.31727	valid-merror:0.078136
[8]	train-mlogloss:2.27632	train-merror:0.079357	valid-mlogloss:2.27665	valid-merror:0.076532
[9]	tr

In [214]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return.
gc.collect()

7

#### Predict and validate 

In [208]:
%%time
# NOTE(review): this evaluates on all of `train`, which includes the rows
# that were sampled to fit the model, so it is not a clean held-out score.
test_labels = train['class']

# generate_features depends only on the token text, so compute it once per
# distinct token and expand to all rows through integer codes instead of
# re-running it for every repeated token.
# factorize() marks missing values with code -1; none are expected here
# because rows with missing values were dropped when `train` was loaded.
token_codes, unique_tokens = pd.factorize(train['before'])
unique_features = np.vstack([generate_features(token) for token in unique_tokens])
test_data = unique_features[token_codes]

Wall time: 6min 39s


In [209]:
# Show the (rows, columns) dimensions of the full-data feature matrix.
test_data.shape

(10574501, 30)

In [210]:
# Apply the already-fitted scaler and label encoder to the full data set.
test_data, test_labels = (
    scaler.transform(test_data),
    enc.transform(test_labels),
)



In [213]:
# Wrap the full-data features and encoded labels for prediction.
dtest = xgb.DMatrix(data=test_data, label=test_labels)

In [215]:
# Predict for every row of `dtest`; the values are used below as encoded
# class indices (compared with `test_labels` and decoded via `enc.classes_`).
predictions = gbm.predict(dtest)

In [218]:
from sklearn.metrics import accuracy_score

# accuracy_score expects (y_true, y_pred). Accuracy happens to be symmetric,
# but keeping the documented order avoids wrong results if this call is ever
# swapped for an asymmetric metric.
accuracy_score(test_labels, predictions)

0.93097139997433453

In [219]:
# Number of rows whose predicted class index differs from the true one.
(predictions != test_labels).sum()

729943

In [236]:
# Decode predicted class indices back to class names by indexing
# `enc.classes_` directly. The predictions arrive as floats, so cast to int
# first; a plain int (rather than uint8) cannot wrap around if the number of
# classes ever exceeds 255, and no apply_along_axis wrapper is needed for a
# single fancy-indexing operation.
train['predicted_class'] = enc.classes_[predictions.astype(int)]

In [245]:
# Count misclassified rows per true class (most frequent first).
misclassified = train['class'] != train['predicted_class']
prediction_errors = train.loc[misclassified, 'class'].value_counts()

In [267]:
# Print, for every class with at least one error, the percentage of its rows
# that were misclassified.
# Class sizes are computed once instead of re-filtering the full frame inside
# the loop, and the Series is walked by label: integer indexing on a
# string-indexed Series (`prediction_errors[i]`) is deprecated in pandas.
class_totals = train['class'].value_counts()
for class_name, error_count in prediction_errors.items():
    print("{:>16}{:>12.2f}%".format(class_name, error_count * 100 / class_totals[class_name]))

           PUNCT       24.35%
           PLAIN        1.47%
         LETTERS       16.69%
         ORDINAL       35.11%
        CARDINAL        4.77%
            DATE        1.17%
         MEASURE        2.01%
       TELEPHONE        0.79%
           MONEY        1.34%
        FRACTION        1.26%
         DECIMAL        0.41%
      ELECTRONIC        0.31%
            TIME        0.57%
        VERBATIM        0.00%
