In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read data/train.csv into the `train` frame, decoding the file as UTF-8.
train = pd.read_csv('data/train.csv', encoding='utf8')

In [49]:
# The PUNCT class was already processed, so drop its rows from `train`.
train = train[train['class'].ne('PUNCT')]

In [125]:
# Subset of `train` whose class label is DIGIT.
train_digit = train[train['class'].eq('DIGIT')]

In [7]:
# `train_card` is not assigned by any earlier cell in this notebook, so build
# it here; otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): the CARDINAL filter is inferred from the rows displayed
# below -- confirm it matches the originally intended subset.
train_card = train[train['class'] == 'CARDINAL']
train_card.head()

Unnamed: 0,sentence_id,token_id,class,before,after
137,9,9,CARDINAL,254,двести пятьдесят четыре
272,21,7,CARDINAL,2014,две тысячи четырнадцать
275,21,10,CARDINAL,12,двенадцать
304,24,4,CARDINAL,2014,две тысячи четырнадцать
343,26,4,CARDINAL,2011,две тысячи одиннадцать


### 1. Predict [DIGIT] 'class' value | $20.56$% error rate

In [40]:
def is_number(value):
    """Return True if ``int(value)`` succeeds, False otherwise.

    The check follows ``int()`` semantics exactly, so anything ``int()``
    accepts (for example a digit string with surrounding whitespace or a
    sign) counts as a number.

    Parameters
    ----------
    value : object
        Candidate token, normally a string.

    Returns
    -------
    bool
        False for values ``int()`` rejects, including ``None``, NaN and
        infinities, instead of letting the conversion error propagate.
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # unsupported types; OverflowError: infinite floats.
        return False
    return True

In [45]:
# Compare the size of the DIGIT subset with the number of its tokens that
# parse as an integer and start with '0'.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(len(train_digit))
print(train_digit.before.apply(lambda x: is_number(x) and x.startswith('0')).sum())

2012
2012


In [53]:
# Count DIGIT tokens that are at most one character long.
(train_digit.before.apply(len) <= 1).sum()

0

All elements of the 'DIGIT' class start with $0$. Moreover, each of these values is at least 2 characters long.

In [56]:
# Tokens that match the DIGIT pattern: start with '0', parse as an integer,
# and are longer than one character.
zero_led_number = train.before.apply(
    lambda x: str(x).startswith('0') and is_number(x) and len(x) > 1
)
# Keep the pattern matches whose actual class is something other than DIGIT.
train_digit_errors = train[(train['class'] != 'DIGIT') & zero_led_number]
train_digit_errors.head()

Unnamed: 0,sentence_id,token_id,class,before,after
47276,3470,23,TELEPHONE,60202,ноль ноль шесть ноль два ноль два
88947,6519,18,TELEPHONE,718830067,ноль семь один восемь восемь три ноль ноль шес...
104315,7610,12,TELEPHONE,155996932,ноль сто пятьдесят пять девятьсот девяносто ше...
134983,9797,8,TELEPHONE,761317538,ноль семьсот шестьдесят один триста семнадцать...
137033,9950,12,TELEPHONE,312048475,ноль три один два ноль четыре восемь четыре се...


In [57]:
# Number of non-DIGIT tokens that match the DIGIT pattern.
# print(...) with one argument works on both Python 2 and Python 3.
print(len(train_digit_errors))

598


In [59]:
# Break the mismatches down by their actual class label.
train_digit_errors.loc[:, 'class'].value_counts()

TELEPHONE    593
CARDINAL       5
Name: class, dtype: int64

This means that a number starting with $0$ is most likely a 'DIGIT', but it could also be a 'CARDINAL' or a 'TELEPHONE'. Let us see whether these classes are easy to separate.

In [60]:
# Show the mismatches that are labelled CARDINAL.
train_digit_errors[train_digit_errors['class'].eq('CARDINAL')]

Unnamed: 0,sentence_id,token_id,class,before,after
605596,43912,12,CARDINAL,8,восемь
3927636,284000,14,CARDINAL,9,девять
9075205,653766,11,CARDINAL,7,семь
9075209,653766,15,CARDINAL,9,девять
9880522,711620,18,CARDINAL,0,ноль


These seem to be just noise or labelling mistakes. To handle such 'CARDINAL' elements, we need to check the token immediately before the element: it must be the letter 'x'.

In [86]:
# Inspect every token of sentence 653766, which contains two of the
# CARDINAL mismatches listed above.
train[train['sentence_id'].eq(653766)]

Unnamed: 0,sentence_id,token_id,class,before,after
9075194,653766,0,CARDINAL,4,четыре
9075196,653766,2,PLAIN,Роуз,Роуз
9075197,653766,3,PLAIN,Уильямс,Уильямс
9075198,653766,4,PLAIN,появляется,появляется
9075199,653766,5,PLAIN,во,во
9075200,653766,6,PLAIN,второстепенной,второстепенной
9075201,653766,7,PLAIN,роли,роли
9075202,653766,8,PLAIN,с,с
9075203,653766,9,CARDINAL,2,двумя
9075204,653766,10,PLAIN,x,на


In [81]:
# Longest `before` value among DIGIT tokens.
train_digit.before.apply(len).max()

4

In [84]:
# Shortest `before` value among the mismatches labelled TELEPHONE.
telephone_errors = train_digit_errors[train_digit_errors['class'] == 'TELEPHONE']
telephone_errors.before.apply(len).min()

5

Now we see that these two classes can be easily separated by value length: the maximum length in the 'DIGIT' class is 4, whereas the digit errors in the 'TELEPHONE' class have a minimum length of 5.

#### Predict 'DIGIT' class label 

In [117]:
def is_number(value):
    """Return True if ``int(value)`` succeeds, False otherwise.

    Values that ``int()`` rejects (non-numeric text, ``None``, NaN,
    infinities) yield False rather than raising.
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        return False
    return True


def predict_digit_class(row, tokens=None):
    """Predict a coarse class label for one token from its ``before`` text.

    Rules, applied to tokens that start with '0' and parse as an integer:

    * one character                      -> 'CARDINAL'
    * two characters, previous token 'x' -> 'CARDINAL'
    * more than four characters          -> 'TELEPHONE'
    * anything else                      -> 'DIGIT'

    Every other token is labelled 'OTHER'.

    Parameters
    ----------
    row : mapping
        Must provide 'before'; 'sentence_id' and 'token_id' are read only
        for two-character candidates.
    tokens : pandas.DataFrame, optional
        Frame with 'sentence_id', 'token_id' and 'before' columns used to
        look up the previous token. Defaults to the notebook-level
        ``train`` frame, so existing one-argument calls keep working.

    Returns
    -------
    str
        One of 'CARDINAL', 'DIGIT', 'TELEPHONE' or 'OTHER'.
    """
    # Work on the string form so a non-string value cannot break len().
    text = str(row['before'])
    if not (text.startswith('0') and is_number(text)):
        return 'OTHER'

    if len(text) == 1:
        return 'CARDINAL'

    if len(text) == 2:
        if tokens is None:
            tokens = train
        # Full scan of `tokens`, but only reached for two-character
        # candidates.
        previous = tokens.loc[
            (tokens.sentence_id == row['sentence_id'])
            & (tokens.token_id == row['token_id'] - 1),
            'before',
        ].values
        # A missing previous token (sentence start, or a row filtered out
        # of `tokens`) falls back to 'DIGIT'. Checking for it explicitly
        # replaces a bare `except:` that also hid NameError and
        # KeyboardInterrupt.
        if len(previous) > 0 and previous[0] == 'x':
            return 'CARDINAL'
        return 'DIGIT'

    if len(text) > 4:
        return 'TELEPHONE'

    return 'DIGIT'

In [None]:
# axis=1 hands each ROW to the predictor. Without it, DataFrame.apply passes
# whole columns, so the row['before'] lookup inside the predictor fails.
train['class_predicted'] = train.apply(predict_digit_class, axis=1)