In [1]:
import numpy as np
import pandas as pd

import string
import gc

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split



In [3]:
# Load the training tokens and drop every row that contains a missing value.
# NOTE(review): read_csv's default NA parsing may turn literal tokens such as
# "NA" or "null" into NaN, which dropna() then removes -- confirm this is intended.
train = pd.read_csv('data/train.csv', encoding='utf-8').dropna()
# Sentence 492799 is excluded as a whole; the reason is not recorded in this
# notebook -- TODO document why.
train = train.loc[train['sentence_id'] != 492799]

In [4]:
# Character inventories consulted by generate_features().

# First character of every distinct token labelled PUNCT in the training data.
punctuations = [
    token[0]
    for token in train.loc[train['class'] == 'PUNCT', 'before'].unique().tolist()
]

# Distinct VERBATIM tokens whose normalised form differs from the raw text.
verbatims = (
    train.loc[(train['class'] == 'VERBATIM') & (train['after'] != train['before']), 'before']
    .unique()
    .tolist()
)

currencies = [u'¥', u'$', u'€', u'£']
power_signs = [u'²', u'³']
fraction_signs = [u'¼', u'½', u'¾', u'⅔', u'⅓']

# ASCII punctuation that is neither an observed PUNCT character nor a currency
# sign.  Built from a set, so the order of the list is unspecified.
specials = list(set(string.punctuation).difference(punctuations, currencies))

In [5]:
def is_latin_letter(letter):
    """Return True when `letter` is a lowercase ASCII letter (code points 97-122).

    `letter` must be a single character, because it is passed to ord().
    """
    return 97 <= ord(letter) <= 122

def is_rus_letter(letter):
    """Return True when `letter` is a lowercase Russian letter.

    Accepts the contiguous run U+0430..U+044F (code points 1072-1103) and the
    letter at U+0451 (code point 1105), which sits outside that run.  Code
    point 1104 (U+0450) lies between them but is not part of the Russian
    alphabet, so it is rejected.

    `letter` must be a single character, because it is passed to ord().
    Callers are expected to lower-case the character first; uppercase
    Cyrillic letters return False.
    """
    ord_letter = ord(letter)
    return 1072 <= ord_letter <= 1103 or ord_letter == 1105

def is_digit(letter):
    """Return True when `letter` is an ASCII digit (code points 48-57).

    `letter` must be a single character, because it is passed to ord().
    """
    code_point = ord(letter)
    return not (code_point < 48 or code_point > 57)

def is_greek(letter):
    """Return True when `letter` falls in code points 945-969 (U+03B1..U+03C9).

    `letter` must be a single character, because it is passed to ord().
    """
    return 945 <= ord(letter) <= 969

def is_dash(letter):
    """Return True when `letter` is exactly the dash character u'—'."""
    dash_char = u'—'
    return dash_char == letter

def generate_features(value):
    """Turn one token into a fixed-length list of 19 character-level features.

    The token is stripped of surrounding whitespace and scanned one character
    at a time.  Each character is lower-cased and assigned to exactly one
    class, tested in this order: Russian letter, Latin letter, digit, Greek
    letter, member of ``punctuations``, member of ``specials``, whitespace,
    other.  Whitespace inside the token is not added to any per-class counter,
    although it still counts towards ``num_letters``.

    Parameters
    ----------
    value : str
        Raw token text.

    Returns
    -------
    list of int
        ``[num_letters, num_words, rus_letters_num, en_letters_num,
        digits_num, greek_letters_num, other_letters_num, punctuations_num,
        uppercase_num, special_letters_num, is_russian_only, is_english_only,
        is_digits_only, is_punctuation_only, is_one_word, has_dash,
        has_currency_sign, has_power_sign, has_fraction_sign]``

    Notes
    -----
    * The ``is_*_only`` flags are 0 for a token that is empty after stripping;
      otherwise a zero-length token would be flagged as Russian-only,
      English-only, digits-only and punctuation-only all at once.
    * Relies on the module-level lists ``punctuations``, ``specials``,
      ``currencies``, ``power_signs`` and ``fraction_signs``.
    """
    value = value.strip()
    num_letters = len(value)
    num_words = len(value.split(" "))

    rus_letters_num = 0
    en_letters_num = 0
    digits_num = 0
    punctuations_num = 0
    greek_letters_num = 0
    other_letters_num = 0
    special_letters_num = 0
    uppercase_num = 0

    has_dash = 0
    has_currency_sign = 0
    has_power_sign = 0
    has_fraction_sign = 0

    for ch in value:
        # Case is recorded before lower-casing the character.
        if ch.isupper():
            uppercase_num += 1

        # str.lower() may return more than one character (on Python 3,
        # u'\u0130' becomes 'i' plus a combining dot).  The ord()-based helpers
        # below would raise TypeError on that, so in that case the original
        # character is classified instead.
        lowered = ch.lower()
        if len(lowered) == 1:
            ch = lowered

        if is_rus_letter(ch):
            rus_letters_num += 1
        elif is_latin_letter(ch):
            en_letters_num += 1
        elif is_digit(ch):
            digits_num += 1
        elif is_greek(ch):
            greek_letters_num += 1
        elif ch in punctuations:
            punctuations_num += 1
        elif ch in specials:
            special_letters_num += 1
        elif ch.isspace():
            # Interior whitespace is deliberately left uncounted; this branch
            # only keeps it out of other_letters_num.
            pass
        else:
            other_letters_num += 1

        if is_dash(ch):
            has_dash = 1
        elif ch in currencies:
            has_currency_sign = 1
        elif ch in power_signs:
            has_power_sign = 1
        elif ch in fraction_signs:
            has_fraction_sign = 1

    # An empty token belongs to no script, so every "*_only" flag needs at
    # least one character.
    non_empty = num_letters > 0
    is_russian_only = 1 if non_empty and num_letters == rus_letters_num else 0
    is_english_only = 1 if non_empty and num_letters == en_letters_num else 0
    is_digits_only = 1 if non_empty and num_letters == digits_num else 0
    is_punctuation_only = 1 if non_empty and num_letters == punctuations_num else 0
    is_one_word = 1 if num_words == 1 else 0

    return [num_letters, num_words, rus_letters_num, en_letters_num, digits_num, greek_letters_num,
            other_letters_num, punctuations_num, uppercase_num, special_letters_num,
            is_russian_only, is_english_only, is_digits_only, is_punctuation_only, is_one_word,
            has_dash, has_currency_sign, has_power_sign, has_fraction_sign]

In [6]:
%%time
train_data = train.before.apply(generate_features)
train_data = np.vstack(train_data.values)

Wall time: 3min 56s


In [7]:
# Sanity check: one row per token and 19 feature columns are expected.
train_data.shape

(10574501, 19)

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column using statistics computed on the full
# feature matrix.  `scaler` is kept so the same transform can be reused later.
# NOTE(review): the scaler is fitted before the train/validation split below.
scaler = StandardScaler().fit(train_data)
train_data = scaler.transform(train_data)



In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class names as integer labels.  `enc` is kept so the
# integer predictions can be mapped back to class names.
enc = LabelEncoder().fit(train['class'])
labels = enc.transform(train['class'])

#### Save XGBoost data 

In [10]:
# Hold out 5% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
xtr, xcv, ytr, ycv = train_test_split(
    train_data,
    labels,
    test_size=0.05,
    random_state=42,
)

In [11]:
# Wrap the training and validation splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=xtr, label=ytr)
dvalid = xgb.DMatrix(data=xcv, label=ycv)

In [12]:
# Persist both matrices to the working directory so that training can later be
# started from these files instead of rebuilding the features.
dtrain.save_binary('dtrain.buffer')
dvalid.save_binary('dvalid.buffer')

#### Load XGBoost data 

In [3]:
# Reload the matrices written by the save_binary() calls above.  This needs
# only `xgb` to be imported, so it can be run from a fresh kernel without
# repeating the feature generation.
dtrain = xgb.DMatrix('dtrain.buffer')
dvalid = xgb.DMatrix('dvalid.buffer')

In [4]:
# XGBoost training configuration.  A list of pairs is used instead of a dict so
# that "eval_metric" can be given twice.  The order matters: the training log
# reports that the last metric listed (merror) is the one used for early stopping.
params = [("objective", "multi:softmax"),
          ("num_class", 15),  # NOTE(review): hard-coded; presumably the number of distinct values in train['class'] -- confirm
          ("booster", "gbtree"),
          ("nthread", 2),  # number of threads
          ("eta", 0.1),  # learning rate
          ("max_depth", 4),  # maximum tree depth
          ("subsample", 0.9),  # fraction of rows sampled per tree
          ("min_child_weight", 1),
          ("colsample_bytree", 0.7),  # fraction of feature columns sampled per tree
          ("eval_metric", 'mlogloss'),
          ("eval_metric", 'merror')  # listed last, so it drives early stopping
         ]
num_rounds = 1000  # upper bound on boosting rounds passed to xgb.train
stop = 20  # early_stopping_rounds passed to xgb.train

In [5]:
# Datasets evaluated after every boosting round.  'valid' is listed last; the
# training log shows that valid-merror is the metric used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [None]:
# Train for at most `num_rounds` boosting rounds, reporting metrics on every
# watchlist entry and stopping early after `stop` rounds without improvement.
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=stop,
)

[0]	train-mlogloss:2.05513	train-merror:0.025821	valid-mlogloss:2.05536	valid-merror:0.026106
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 20 rounds.
[1]	train-mlogloss:1.70408	train-merror:0.025505	valid-mlogloss:1.70443	valid-merror:0.025781
[2]	train-mlogloss:1.46555	train-merror:0.022404	valid-mlogloss:1.46598	valid-merror:0.022766
[3]	train-mlogloss:1.28203	train-merror:0.022533	valid-mlogloss:1.28253	valid-merror:0.022885
[4]	train-mlogloss:1.13586	train-merror:0.022831	valid-mlogloss:1.1364	valid-merror:0.023165
[5]	train-mlogloss:1.01307	train-merror:0.022797	valid-mlogloss:1.01366	valid-merror:0.023131
[6]	train-mlogloss:0.91079	train-merror:0.022724	valid-mlogloss:0.91141	valid-merror:0.023031
[7]	train-mlogloss:0.820956	train-merror:0.022671	valid-mlogloss:0.821625	valid-merror:0.022963
[8]	train-mlogloss:0.743227	train-merror:0.022631	valid-mlogloss:0.743935	valid-merror:0.022906
[9

In [29]:
# Force a full garbage-collection pass; the displayed value is the count
# returned by gc.collect().
gc.collect()

0