In [2]:
import numpy as np
import pandas as pd

In [50]:
# Training tokens and the sample submission file, both read from the local data/ folder.
train = pd.read_csv(filepath_or_buffer='data/train.csv', encoding='utf8')
submit = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')

In [38]:
# Test tokens, read from the same data/ folder as the training file.
test = pd.read_csv(filepath_or_buffer='data/test.csv', encoding='utf8')

In [4]:
# `data` starts empty; `classes` holds the distinct values of the 'class' column
# and is displayed as the cell output.
data = dict()
classes = train['class'].unique()
classes

array(['PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
       'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
       'DIGIT', 'FRACTION', 'MONEY', 'TIME'], dtype=object)

In [5]:
# Store, for every class label, the sub-frame of training rows carrying that label.
data.update({label: train.loc[train['class'] == label] for label in classes})

We will consider the classes one by one, starting with the one that has the smallest error rate. For each class we will also create a rule that predicts this class for the test data.

### 1. Predict [PUNCT] 'class' value | $0.00\%$ error rate

In [6]:
# First five rows of the PUNCT sub-frame.
data['PUNCT'].head(5)

Unnamed: 0,sentence_id,token_id,class,before,after
4,0,4,PUNCT,.,.
14,1,9,PUNCT,.,.
18,2,3,PUNCT,",",","
24,2,9,PUNCT,(,(
27,2,12,PUNCT,),)


In [38]:
# Distinct 'before' strings seen in the PUNCT class, in order of first appearance.
punctuations = data['PUNCT']['before'].drop_duplicates().tolist()

In [39]:
# Show the collected punctuation strings.
print(punctuations)

['/', '---', '-----', '(', '¡', '.', ']', '¿', ',', ':', '[', '?', '«', '—', '»', "'", ';', ')', '|', '--', '"', '!']


We want to make sure that the rule we use matches the current class and does not match any other row in the dataset (in other words, each rule MUST identify its class unambiguously).

In [18]:
# Three counts: rows in the PUNCT class, PUNCT rows whose 'before' is in the
# punctuation list, and rows of the whole training set whose 'before' is in it.
# `isin` does the membership test in one vectorised pass instead of calling a
# Python lambda once per row.
print(len(data['PUNCT']))
print(data['PUNCT']['before'].isin(punctuations).sum())
print(train['before'].isin(punctuations).sum())

2288640
2288640
2295197


In [44]:
# Rows whose 'before' is in the punctuation list minus the size of the PUNCT class.
# Computed from the frames rather than from hand-copied totals, so the figure stays
# correct if the data changes; int() keeps the displayed value a plain integer.
int(train['before'].isin(punctuations).sum() - len(data['PUNCT']))

6557

###  3.a Fix errors 

We see a number of rows whose 'before' value also occurs in the 'PUNCT' class, but which don't belong to this class.

In [46]:
# Every training row whose class is something other than PUNCT.
train_not_punct = train.loc[train['class'].ne('PUNCT')]

In [47]:
# Non-PUNCT rows whose 'before' string also appears in the punctuation list.
# A vectorised `isin` on the column replaces the row-by-row DataFrame.apply,
# which builds a Series object for every one of the millions of rows.
train_punct_errors = train_not_punct[train_not_punct['before'].isin(punctuations)]

In [51]:
# First five of the conflicting rows.
train_punct_errors.head(5)

Unnamed: 0,sentence_id,token_id,class,before,after
604,44,5,PLAIN,—,до
1347,96,2,PLAIN,—,до
1371,96,26,PLAIN,—,до
1795,128,2,PLAIN,—,до
2434,172,12,PLAIN,—,до


In [54]:
# How often each punctuation string occurs among the conflicting rows.
train_punct_errors['before'].value_counts()

—    6553
.       3
«       1
Name: before, dtype: int64

#### Fix the '—' symbol

First, we want to make sure that all of these '—' rows have the same 'after' label.

In [53]:
# Conflicting dash rows whose 'after' value is anything other than 'до'.
train_punct_errors.loc[train_punct_errors['before'].eq('—') & train_punct_errors['after'].ne('до')]

Unnamed: 0,sentence_id,token_id,class,before,after
6832525,492799,0,VERBATIM,—,—


This single row is fine: its 'after' value equals its 'before' value, so it will simply be treated as PUNCT rather than VERBATIM. Now we want to look at the rest of the punctuation errors.

In [74]:
# Conflicting dash rows that are normalised to 'до'.
train_punct_dash_errors = train_punct_errors[
    train_punct_errors['before'].eq('—') & train_punct_errors['after'].eq('до')
]

In [76]:
# Which classes these dash rows are labelled with.
train_punct_dash_errors.loc[:, 'class'].unique()

array(['PLAIN'], dtype=object)

To fix the other '—' marks, we need to look at the sentence context; all of these errors come from the "PLAIN" class.

In [73]:
# All tokens of sentence 128, one of the sentences containing a mis-labelled dash.
train.loc[train['sentence_id'] == 128]

Unnamed: 0,sentence_id,token_id,class,before,after
1793,128,0,PLAIN,от,от
1794,128,1,CARDINAL,54,пятидесяти четырех
1795,128,2,PLAIN,—,до
1796,128,3,CARDINAL,64,шестьдесят четыре
1797,128,4,PLAIN,Treherne,т_trans р_trans е_trans е_trans р_trans н_trans
1798,128,5,PUNCT,",",","
1799,128,6,CARDINAL,1989,тысяча девятьсот восемьдесят девять
1800,128,7,PUNCT,",",","
1801,128,8,PLAIN,pp,п_trans п_trans
1802,128,9,PUNCT,.,.


It seems that all these '—' tokens are transformed to 'до' and that the token two positions earlier in the sentence is the word 'от' or 'с' (as in 'от 54 — 64'). Let's check this.

In [93]:
# Is the first token (token_id 0) of sentence 128 the word 'от'?
train.loc[train['sentence_id'].eq(128) & train['token_id'].eq(0), 'before'].iloc[0] == 'от'

True

In [97]:
# For every mis-labelled dash, look up the token two positions earlier in the same
# sentence and count how many of those tokens are 'от' or 'с'.
# One left merge replaces a full scan of `train` per dash row; a dash without such
# a neighbour yields a missing value (reported below) instead of an IndexError.
position_cols = ['sentence_id', 'token_id']
dash_neighbours = (
    train_punct_dash_errors[position_cols]
    .assign(token_id=lambda df: df['token_id'] - 2)
    .merge(
        # keep the first row per position, matching the `.values[0]` pick of a per-row lookup
        train[position_cols + ['before']].drop_duplicates(subset=position_cols),
        on=position_cols,
        how='left',
    )
)
opens_range = dash_neighbours['before'].isin(['от', 'с'])
counter = int(opens_range.sum())

# Print the whole sentence for every dash whose neighbour is something else.
for sentence_id in dash_neighbours.loc[~opens_range, 'sentence_id']:
    print (train[train.sentence_id == sentence_id])

In [98]:
# True when every mis-labelled dash passed the neighbour check.
print(counter == len(train_punct_dash_errors))

True


#### Fix the '.' and '«' symbols

In [55]:
# Conflicting rows whose 'before' is a dot.
train_punct_errors.loc[train_punct_errors['before'] == '.']

Unnamed: 0,sentence_id,token_id,class,before,after
6832527,492799,2,VERBATIM,.,точка
6832528,492799,3,VERBATIM,.,точка
6832529,492799,4,VERBATIM,.,точка


What is the difference between a dot from "PUNCT" class and the one from "VERBATIM"?

In [64]:
# All tokens of sentence 492799, the sentence the VERBATIM dots come from.
train.loc[train['sentence_id'] == 492799]

Unnamed: 0,sentence_id,token_id,class,before,after
6832525,492799,0,VERBATIM,—,—
6832526,492799,1,VERBATIM,«,кавычка
6832527,492799,2,VERBATIM,.,точка
6832528,492799,3,VERBATIM,.,точка
6832529,492799,4,VERBATIM,.,точка


In [57]:
# Number of PUNCT rows whose 'before' is a dot.
print((data['PUNCT']['before'] == '.').sum())  # The majority

920258


We can see that all the other dots are actual punctuation marks, so the sentence with $id = 492799$ is just noise. We will simply remove this sentence from the dataset.

In [90]:
# Is the 'before' value at positional row 6832525 the em dash U+2014?
train['before'].iloc[6832525] == u'\u2014'

True

In [56]:
# dtype of the 'before' column.
train['before'].dtype

dtype('O')

#### Predict class label 

In [189]:
def has_numbers(inputString):
    """Return True when at least one character of `inputString` is a digit."""
    return any(char.isdigit() for char in inputString)


def _before_text(sentence, token_id):
    """Return the lower-cased 'before' string of token `token_id` in `sentence`.

    Returns None when the sentence has no such token or when its 'before'
    value is not a string.
    """
    matches = sentence.loc[sentence.token_id == token_id, 'before'].values
    if len(matches) == 0 or not isinstance(matches[0], str):
        return None
    return matches[0].lower()


def predict_punct_class(row, frame=None, marks=None):
    """Label one token row as 'PUNCT', 'PLAIN' or 'OTHER'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'before'; for an em dash also 'token_id' and 'sentence_id'.
    frame : pandas.DataFrame, optional
        Frame with 'sentence_id', 'token_id' and 'before' columns in which the
        neighbouring tokens are looked up. Defaults to the global `train`; pass
        another frame to apply the rule to rows that are not part of `train`.
    marks : collection of str, optional
        Known punctuation strings. Defaults to the global `punctuations`.

    Returns
    -------
    str
        'OTHER' when 'before' is not a string, or is not a known punctuation
        string that is one character long or starts with '-'.
        'PLAIN' for an em dash (U+2014) with token_id > 1 when the token two
        positions earlier is 'от' or 'с' (case-insensitive) and the next token
        contains a digit.
        'PUNCT' in every other case.
    """
    if marks is None:
        marks = punctuations

    value = row['before']
    # A missing 'before' is read by pandas as a float NaN; len() would fail on it.
    if not isinstance(value, str):
        return 'OTHER'
    if not ((len(value) == 1 or value.startswith('-')) and value in marks):
        return 'OTHER'

    # Only an em dash with at least two tokens in front of it needs sentence context.
    if value != u'\u2014' or row['token_id'] <= 1:
        return 'PUNCT'

    if frame is None:
        frame = train
    sentence = frame[frame.sentence_id == row['sentence_id']]
    value_before = _before_text(sentence, row['token_id'] - 2)
    # None when the dash is the last token of its sentence.
    value_after = _before_text(sentence, row['token_id'] + 1)

    if value_before in (u'от', u'с') and value_after is not None and has_numbers(value_after):
        return 'PLAIN'
    return 'PUNCT'

In [None]:
# Run the class rule on every row of the training frame and store the label.
train['class_predicted'] = train.apply(lambda token_row: predict_punct_class(token_row), axis='columns')

### 2. Predict 'after' value

In [None]:
def predict_punct_after(row, marks=None):
    """Predict the 'after' text for one token row.

    Named `predict_punct_after` because that is the name the cell filling
    `after_predicted` calls; under the name `predict_punct_class` this definition
    would replace the class-label rule defined earlier in the notebook.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'class' and 'before'.
    marks : collection of str, optional
        Known punctuation strings. Defaults to the global `punctuations`.

    Returns
    -------
    'до' when the row's class is 'PLAIN' and its 'before' value is a known
    punctuation string; otherwise the 'before' value unchanged.
    """
    if row['class'] == 'PLAIN':
        known_marks = punctuations if marks is None else marks
        if row['before'] in known_marks:
            return u'до'

    return row['before']

In [None]:
# axis=1 hands the function one row at a time; with the default axis=0 it would
# receive whole columns and the row['class'] lookup inside it would fail.
train['after_predicted'] = train.apply(predict_punct_after, axis=1)