In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import nltk
from nltk.corpus import names
from nltk.classify import apply_features
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show which data files are available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Training data: one row per token.
# NOTE(review): `names` is passed without `header=0`; if the CSV carries its own
# header line, pandas will read that line as the first data row - confirm.
df = pd.read_csv(
    '../input/ru_train.csv',
    delimiter=',',
    lineterminator='\n',
    error_bad_lines=False,
    low_memory=False,
    encoding='utf8',
    names=['sentence_id', 'token_id', 'classes', 'before', 'after'],
)


In [None]:
# Class label of every token, aligned with the rows of `df`.  `classes` was
# previously used without ever being assigned, which fails on a fresh kernel.
classes = df['classes'].tolist()
classes_set = set(classes)
print(classes_set)

In [None]:
# Written form of every token, in file order.
before = df.loc[:, 'before'].tolist()

In [None]:
# Normalised (spoken) form of every token, in file order.
after = df.loc[:, 'after'].tolist()

In [None]:
# Per-row token id column, in file order (read by the feature extractors).
token_id = df.loc[:, 'token_id'].tolist()

The first step is to learn to guess the class based on the context:

In [None]:
def features(input_string, i):
    """Build the feature dict used to guess the class of the token at index ``i``.

    Parameters
    ----------
    input_string : the raw token.  Non-string values (e.g. a float) only get
        the type-based and context features; every text cue is ``False``.
    i : position of the token in the module-level ``before`` / ``token_id`` lists.

    Returns
    -------
    dict
        Feature name -> value: the last one/two characters, boolean surface
        cues, and the previous / next token (``"<START>"`` / ``"<END>"`` at
        the edges or where the neighbouring ``token_id`` is 0).
    """
    # Substring cues.  These must be tested as substrings of the token; testing
    # each character against the tuple can only ever match one-letter entries.
    date_stems = ('год', 'январ', 'феврал', 'март', 'апрел', 'мая', 'июн', 'июл',
                  'август', 'сентябр', 'октябр', 'ноябр', 'декабр')
    measure_marks = ('st', 'мин', 'с.', 'км', 'см', '%', 'метр', 'л.', 'В', 'гб', 'гр',
                     'грам', 'кило', 'га', 'тыс', 'ярд', 'А', 'мм', 'тонн')
    ending_marks = ('-го', '-ом', '-й', '-е', '-я', '-и', '-х', '—')

    is_text = isinstance(input_string, str)
    text = input_string if is_text else ''
    lowered = text.lower()
    has_digit = any(ch.isdigit() for ch in text)

    feats = {}
    if is_text:
        feats["suffix(1)"] = text[-1:]
        feats["suffix(2)"] = text[-2:]
    feats['is float'] = isinstance(input_string, float)
    # bool(text) guards against all() being vacuously true for an empty string.
    feats['contains roman numbers'] = bool(text) and all(c in 'IVXLCDM' for c in text)
    feats['contains dates'] = any(stem in lowered for stem in date_stems)
    feats['year'] = isinstance(input_string, float) and 1000 < input_string < 3000
    feats['contains numbers'] = has_digit
    feats['contains /'] = '/' in text
    feats['short'] = is_text and len(text) < 4
    # These four were previously inverted (they were true when NO such letter existed).
    feats['contains capital latin'] = re.search('[A-Z]', text) is not None
    feats['contains lowercase latin'] = re.search('[a-z]', text) is not None
    feats['contains capital cyrillic'] = re.search('[А-ЯЁ]', text) is not None
    feats['contains lowercase cyrillic'] = re.search('[а-яё]', text) is not None
    feats['contains upper'] = any(c.isupper() for c in text)
    feats['contains lower'] = any(c.islower() for c in text)
    feats['contains measures'] = has_digit and any(mark in text for mark in measure_marks)
    feats['contains endings'] = any(mark in text for mark in ending_marks)
    feats['contains —'] = '—' in text
    feats['contains і'] = 'і' in text
    feats['contains ї'] = 'ї' in text
    feats['contains &'] = '&' in text
    feats['contains -'] = '-' in text
    feats['contains .'] = '.' in text
    feats['contains ,'] = ',' in text  # previously tested '.' by mistake
    feats['is -'] = text == '-'

    # Context: neighbouring tokens, cut where the token id is 0.
    if i == 0 or token_id[i] == 0:
        feats["prev-word"] = "<START>"
    else:
        feats["prev-word"] = before[i - 1]
    if i >= len(before) - 1 or token_id[i + 1] == 0:
        feats["next-word"] = "<END>"
    else:
        feats["next-word"] = before[i + 1]
    return feats

In [None]:
# One feature dict per training token, in the same order as `before`.
features_n = []
for position, token in enumerate(before):
    features_n.append(features(token, position))


In [None]:
# Pair every feature dict with its class label.
featuresets = zip(features_n, classes)
featuresets_to_list = [labeled for labeled in featuresets]

In [None]:
# First 500 examples are the test set, the next 1000 the dev-test set,
# everything after that is used for training.
test_set, devtest_set, train_set = (
    featuresets_to_list[:500],
    featuresets_to_list[500:1500],
    featuresets_to_list[1500:],
)

In [None]:
# Fit the token-class model on the training split only, so that the
# accuracy measured on `test_set` is a held-out estimate.
classifier = nltk.NaiveBayesClassifier.train(train_set) 

In [None]:
# Accuracy of the current `classifier` on `test_set`.
print(nltk.classify.accuracy(classifier, test_set))

In [None]:
# Refit on every labelled example; this rebinds `classifier`, which the final
# prediction loop uses.
classifier = nltk.NaiveBayesClassifier.train(featuresets_to_list) 

In [None]:
# Print the 35 features the model found most informative.
classifier.show_most_informative_features(35)



Here is the function that turns cardinal numbers to words - in nominative case for now:

In [None]:
# Index layout (relied on by the index arithmetic below and in ordinal2words):
#   0-19 -> units and teens, 20-27 -> tens (20..90), 28-36 -> hundreds (100..900)
cardinal_words = "ноль один два три четыре пять шесть семь восемь девять десять одиннадцать двенадцать" + \
" тринадцать четырнадцать пятнадцать шестнадцать семнадцать восемнадцать девятнадцать двадцать" + \
" тридцать сорок пятьдесят шестьдесят семьдесят восемьдесят девяносто" + \
" сто двести триста четыреста пятьсот шестьсот семьсот восемьсот девятьсот"

cardinal_words = cardinal_words.split(" ")

# For each power of 1000: (form after 1, form after 2-4, form after 5+), is_feminine
_SCALE_WORDS = (
    (("тысяча", "тысячи", "тысяч"), True),
    (("миллион", "миллиона", "миллионов"), False),
    (("миллиард", "миллиарда", "миллиардов"), False),
)


def _plural_index(count):
    """Pick which of the three counted forms agrees with ``count`` (teens use the 5+ form)."""
    if count % 10 == 1 and count % 100 != 11:
        return 0
    if count % 10 in (2, 3, 4) and count % 100 not in (12, 13, 14):
        return 1
    return 2


def _below_thousand(value, feminine=False):
    """Spell 1..999; with ``feminine`` a final 1/2 becomes 'одна'/'две'."""
    words = []
    if value >= 100:
        words.append(cardinal_words[27 + value // 100])
        value %= 100
    if value >= 20:
        words.append(cardinal_words[18 + value // 10])
        value %= 10
    if value > 0:
        if feminine and value == 1:
            words.append('одна')
        elif feminine and value == 2:
            words.append('две')
        else:
            words.append(cardinal_words[value])
    return ' '.join(words)


def number2words(n):
    """Spell a cardinal number in Russian (nominative case).

    Parameters
    ----------
    n : int, float or str.  Strings have whitespace removed, are cut at the
        first non-leading '-', and may be Roman numerals (converted with
        ``roman``) or carry a leading minus.

    Returns
    -------
    str with the spelled-out absolute value (the sign is dropped), the input
    unchanged when it cannot be read as a number, or ``None`` when the value
    is 10**12 or larger.
    """
    original = n
    if isinstance(n, str):
        n = ''.join(n.split())
        dash = n.find('-')
        if dash > 0:
            n = n[:dash]
        if n and all(c in 'IVXLCDM' for c in n):
            n = roman(n)
        elif n[:1] == '-' and n[1:].isdigit():
            n = int(n[1:])
    try:
        # abs() after int(): only negative values lose their sign.
        n = abs(int(n))
    except (ValueError, OverflowError, TypeError):
        return original

    if n == 0:
        return cardinal_words[0]

    # Split into groups of three digits, least significant first.
    groups = []
    while n:
        groups.append(n % 1000)
        n //= 1000
    if len(groups) > len(_SCALE_WORDS) + 1:
        return None

    parts = []
    for power in range(len(groups) - 1, -1, -1):
        value = groups[power]
        if value == 0:
            continue
        if power == 0:
            parts.append(_below_thousand(value))
            continue
        forms, feminine = _SCALE_WORDS[power - 1]
        # A group of exactly one is spoken without the numeral: "тысяча", "миллион".
        if value != 1:
            parts.append(_below_thousand(value, feminine))
        parts.append(forms[_plural_index(value)])
    return ' '.join(parts)
        

This function does the same for ordinals:

In [None]:
# Same index layout as cardinal_words: 0-19, then tens at 20-27, hundreds at 28-36.
ordinal_words = "нулевой первый второй третий четвертый пятый шестой седьмой восьмой девятый десятый одиннадцатый двенадцатый" + \
" тринадцатый четырнадцатый пятнадцатый шестнадцатый семнадцатый восемнадцатый девятнадцатый двадцатый" + \
" тридцатый сороковой пятидесятый шестидесятый семидесятый восьмидесятый девяностый" + \
" сотый двухсотый трехсотый четырехсотый пятисотый шестисотый семисотый восьмисотый девятисотый"

ordinal_words = ordinal_words.split(" ")

# Combining forms used to build one-word round-thousand ordinals ("пятитысячный").
_THOUSAND_UNIT_PREFIX = ("", "одно", "двух", "трех", "четырех", "пяти", "шести", "семи", "восьми",
                         "девяти", "десяти", "одиннадцати", "двенадцати", "тринадцати",
                         "четырнадцати", "пятнадцати", "шестнадцати", "семнадцати",
                         "восемнадцати", "девятнадцати")
_THOUSAND_TENS_PREFIX = ("", "", "двадцати", "тридцати", "сорока", "пятидесяти", "шестидесяти",
                         "семидесяти", "восьмидесяти", "девяносто")
_THOUSAND_HUNDREDS_PREFIX = ("", "сто", "двухсот", "трехсот", "четырехсот", "пятисот", "шестисот",
                             "семисот", "восьмисот", "девятисот")


def _round_thousand_ordinal(thousands):
    """Ordinal for an exact multiple of 1000 (``thousands`` in 1..999) as a single word."""
    if thousands == 1:
        return "тысячный"
    prefix = _THOUSAND_HUNDREDS_PREFIX[thousands // 100]
    rest = thousands % 100
    if rest >= 20:
        prefix += _THOUSAND_TENS_PREFIX[rest // 10] + _THOUSAND_UNIT_PREFIX[rest % 10]
    else:
        prefix += _THOUSAND_UNIT_PREFIX[rest]
    return prefix + "тысячный"


def _thousands_phrase(thousands):
    """Cardinal "N thousand" phrase (``thousands`` in 1..999) with correct agreement."""
    if thousands == 1:
        return "тысяча"
    words = number2words(thousands)
    last, last_two = thousands % 10, thousands % 100
    if last == 1 and last_two != 11 and words.endswith('один'):
        return words[:-len('один')] + 'одна тысяча'
    if last == 2 and last_two != 12 and words.endswith('два'):
        return words[:-len('два')] + 'две тысячи'
    if last in (3, 4) and last_two not in (13, 14):
        return words + ' тысячи'
    return words + ' тысяч'


def ordinal2words(n):
    """Spell an ordinal number in Russian (masculine nominative singular).

    Only the last word is an ordinal; the leading part stays cardinal
    ("две тысячи семнадцатый").  Exact multiples of 1000 become one compound
    word ("двухтысячный").

    Parameters
    ----------
    n : int, expected in 0 .. 999999.

    Returns
    -------
    str, or ``None`` when ``n`` is negative or 1000000 and above.
    """
    if n < 0 or n >= 1000000:
        return None
    if n < 20:
        return ordinal_words[n]
    if n < 100:
        tens, unit = divmod(n, 10)
        if unit == 0:
            return ordinal_words[18 + tens]
        return cardinal_words[18 + tens] + ' ' + ordinal_words[unit]
    if n < 1000:
        hundreds, rest = divmod(n, 100)
        if rest == 0:
            return ordinal_words[27 + hundreds]
        return cardinal_words[27 + hundreds] + ' ' + ordinal2words(rest)
    thousands, rest = divmod(n, 1000)
    if rest == 0:
        return _round_thousand_ordinal(thousands)
    return _thousands_phrase(thousands) + ' ' + ordinal2words(rest)

In [None]:
# Rows labelled ORDINAL.  Note: the name `ordinals` is rebound to a function
# further down in this notebook.
ordinals = df.loc[df['classes'] == 'ORDINAL']

In [None]:
# Rows labelled CARDINAL.
cardinals = df.loc[df['classes'] == 'CARDINAL']

There are also plenty of Roman numerals in the data, so this function turns them into regular numbers that can be handled by the functions above to turn them into ordinal or cardinal expressions.

In [None]:
def roman(n):
    """Convert a string of Roman numeral letters (I, V, X, L, C, D, M) to an int.

    A letter that is immediately followed by a larger one is subtracted,
    every other letter is added.  An empty string gives 0; any other
    character raises ``KeyError``.
    """
    letter_values = {u"I": 1, u"V": 5, u"X": 10, u"L": 50,
                     u"C": 100, u"D": 500, u"M": 1000}
    values = [letter_values[letter] for letter in n]
    total = 0
    # Walk over neighbouring pairs; the last letter is handled afterwards.
    for current, following in zip(values, values[1:]):
        if following > current:
            total -= current
        else:
            total += current
    if values:
        total += values[-1]
    return int(total)
        
    

This dictionary maps every ending to its gender-number-case label. Note that a single ending can sometimes be associated with multiple cases - hence the ambiguous labels.

In [None]:
# Two-letter word endings grouped by the label they map to.
_ENDINGS_BY_LABEL = (
    ('MSN', ('ый', 'ий')),
    ('FSN', ('ая', 'ья')),
    ('NSN', ('ое', 'ье')),
    ('Pl_N', ('ые', 'ьи')),
    ('MSG_A', ('го',)),
    ('FSG', ('ую', 'ью')),
    ('Pl_G_A_P', ('ых', 'их')),
    ('MSD', ('му',)),
    ('FD_D_I_P', ('ой', 'ей')),
    ('Pl_D', ('ым', 'им')),
    ('Pl_I', ('ми',)),
    ('MSP', ('ом', 'ем')),
)

# ending -> label lookup used when labelling the ordinals in the training data
endings_dictionary = {
    ending: label
    for label, endings in _ENDINGS_BY_LABEL
    for ending in endings
}

Now we make a list of all the ordinal endings in the data:

In [None]:
# For every row: the label of the last two letters of the spoken form when the
# row is an ORDINAL with a string `after` ('No ending' if the ending is not in
# the dictionary), otherwise None.
gender_number_case = [
    endings_dictionary.get(spoken[-2:], 'No ending')
    if df.classes[row] == 'ORDINAL' and isinstance(spoken, str)
    else None
    for row, spoken in enumerate(after)
]

... and learn the features to predict gender-number-case

In [None]:
def suffix_features(word, i):
    """Features used to guess the gender-number-case label of an ordinal token.

    Parameters
    ----------
    word : the raw token; only strings containing '-' contribute a 'suffix'
        feature (the text after the first '-').
    i : position of the token in the module-level ``before`` / ``token_id`` lists.

    Returns
    -------
    dict with optional 'suffix' plus up to two tokens of context on each side
    ('prev-word', 'prev-prev-word', 'next-word', 'next-next-word').
    """
    feats = {}
    if isinstance(word, str) and '-' in word:
        feats['suffix'] = word[word.index('-') + 1:]

    if i == 0 or token_id[i] == 0:
        feats["prev-word"] = "<START>"
    else:
        feats["prev-word"] = before[i - 1]
    if i > 1 and token_id[i - 2] < token_id[i]:
        feats["prev-prev-word"] = before[i - 2]

    # Bound the look-ahead by the lists actually being indexed.  The old check
    # used len(after), which is a different list once `before` is rebound.
    last = min(len(before), len(token_id)) - 1
    if i >= last or token_id[i + 1] == 0:
        feats["next-word"] = "<END>"
    else:
        feats["next-word"] = before[i + 1]
        if i + 2 <= last and token_id[i + 2] > token_id[i]:
            feats["next-next-word"] = before[i + 2]
    return feats

In [None]:
# (features, label) pairs for every ORDINAL row whose spoken form is a single
# purely alphabetic word.
suffix_featuresets = [
    (suffix_features(written, row), gender_number_case[row])
    for row, written in enumerate(before)
    if isinstance(df.after[row], str)
    and df.classes[row] == 'ORDINAL'
    and df.after[row].isalpha()
]

In [None]:
# Number of examples to hold out: one tenth of the data, rounded down.
size = int(0.1 * len(suffix_featuresets))


In [None]:
# The first `size` examples are held out for testing; the rest are for training.
test_set = suffix_featuresets[:size]
train_set = suffix_featuresets[size:]

In [None]:
# Fit the ending model on the training split only.
suffix_classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Refit on all examples (this includes `test_set`); this rebinds
# `suffix_classifier`, which `ordinals` uses for prediction.
suffix_classifier = nltk.NaiveBayesClassifier.train(suffix_featuresets)

In [None]:
# Held-out accuracy.  `suffix_classifier` has by now been refit on ALL examples
# (including `test_set`), so scoring it here would leak the test data; score a
# model fitted on `train_set` only instead.
nltk.classify.accuracy(nltk.NaiveBayesClassifier.train(train_set), test_set)

In [None]:
# Print the 30 features the ending model found most informative.
suffix_classifier.show_most_informative_features(30)

There are two sets of endings, because the stem for "third"  is a special case ending in a "ь"

In [None]:
# Endings appended to the stem of a regular ordinal (ordinal minus its last two
# letters), keyed by gender-number-case label.  'MSN' is the dictionary form
# itself, so its ending is empty.
yj={'MSN': '',
    'MSG_A' : 'ого',
    'MSD': 'ому',
    'Pl_D_MSI':'ым',
    'MSP':'ом',
    'NSN' : 'ое',  # Cyrillic letters; a Latin "oe" here would leak into the output
    'FSN':'ая',
    'FSG':'ую',
    'FD_D_I_P':'ой',
    'Pl_N':'ые',
    'Pl_G_A_P':'ых',
     'Pl_D':'ым',
     'Pl_I':'ыми',
       }

In [None]:
# Endings for the soft stem of "третий" (stem + "ь" + ending), keyed by
# gender-number-case label.  'MSN' is the dictionary form, so it adds nothing.
ij = dict(
    MSN='',
    MSG_A='ьего',
    MSD='ьему',
    MSA='ьего',
    Pl_D_MSI='ьим',
    MSP='ьем',
    NSN='ье',
    FSN='ья',
    FSG='ью',
    FSD='ьей',
    FSA='ью',
    FD_D_I_P='ьей',
    Pl_N='ьи',
    Pl_G_A_P='ьих',
    Pl_D='ьим',
    Pl_I='ьими',
)

In [None]:
def ordinal_ending(number, gender_number_case):
    """Spell ``number`` as an ordinal carrying the requested ending.

    Parameters
    ----------
    number : int, or anything ``int()`` accepts.
    gender_number_case : label selecting the ending (a key of ``yj`` / ``ij``).
        Labels are matched exactly first, then case-insensitively.

    Returns
    -------
    The inflected ordinal.  The dictionary form from ``ordinal2words`` is
    returned as is when the label is 'MSN', when the label is unknown (this
    used to raise ``KeyError``), or when ``ordinal2words`` does not return a
    string.
    """
    value = int(number)
    base = ordinal2words(value)
    if gender_number_case == 'MSN' or not isinstance(base, str):
        return base

    # Ordinals whose last word is "третий" take the soft endings.
    soft_stem = value == 3 or (value > 20 and value % 10 == 3 and value % 100 != 13)
    table = ij if soft_stem else yj

    ending = table.get(gender_number_case)
    if ending is None:
        wanted = str(gender_number_case).upper()
        for label, candidate in table.items():
            if label.upper() == wanted:
                ending = candidate
                break
    if ending is None:
        return base
    # Drop the two-letter nominative ending and attach the requested one.
    return base[:-2] + ending
    

Now we can combine the correct stem with correct ending for all ordinals. 

In [None]:
def ordinals(n,i):
    """Normalise the ordinal token ``n`` found at position ``i``.

    The number is taken from an int/float, a Roman numeral string, or the
    digits inside any other string.  The ending label is predicted by
    ``suffix_classifier`` from ``suffix_features(n, i)``.

    Returns
    -------
    The inflected ordinal; the dictionary form when the predicted label has
    no ending in ``ij``; or ``n`` unchanged when no number can be extracted
    (previously such tokens came back as an empty string).
    """
    number = None
    if isinstance(n, (int, float)):
        # n == n is False for NaN; infinity cannot be converted either.
        if n == n and abs(n) != float('inf'):
            number = int(abs(n))
    elif isinstance(n, str):
        if n and all(c in 'IVXLCDM' for c in n):
            number = roman(n)
        else:
            digits = ''.join(c for c in n if c.isdigit())
            if digits:
                try:
                    number = int(digits)
                except ValueError:
                    number = None
    if number is None:
        return n

    guess_ending = suffix_classifier.classify(suffix_features(n, i))
    if guess_ending not in ij:
        guess_ending = 'MSN'
    guess = ordinal_ending(number, guess_ending)
    # ordinal_ending gives a non-string for numbers it cannot spell.
    return guess if isinstance(guess, str) else n
            

In [None]:
# Rows labelled DATE.
dates = df.loc[df['classes'] == 'DATE']

In [None]:
# Genitive month names indexed by month number; index 0 is an empty placeholder.
months = [''] + 'января февраля марта апреля мая июня июля августа сентября октября ноября декабря'.split()

Here we need to learn the correct endings for dates

In [None]:

def date_features(word, i):
    """Features used to guess the ending of the day number in a date.

    Parameters
    ----------
    word : the date text; non-strings are converted with ``str`` for the
        suffix and per-word features.
    i : position of the token in the module-level ``before`` / ``token_id`` lists.

    Returns
    -------
    dict with the whole text ('word'), its last one/two characters, every
    whitespace-separated part keyed by its integer position, and up to two
    tokens of context on each side.
    """
    text = word if isinstance(word, str) else str(word)
    feats = {"word": word,
             "suffix(1)": text[-1:],
             "suffix(2)": text[-2:]}
    for position, part in enumerate(text.split()):
        feats[position] = part

    if i == 0 or token_id[i] == 0:
        feats["prev-word"] = "<START>"
    else:
        feats["prev-word"] = before[i - 1]
    if i > 1 and token_id[i - 2] < token_id[i]:
        feats["prev-prev-word"] = before[i - 2]

    # Bound the look-ahead by the lists actually being indexed.  The old check
    # used len(after), which is a different list once `before` is rebound.
    last = min(len(before), len(token_id)) - 1
    if i >= last or token_id[i + 1] == 0:
        feats["next-word"] = "<END>"
    else:
        feats["next-word"] = before[i + 1]
        if i + 2 <= last and token_id[i + 2] > token_id[i]:
            feats["next-next-word"] = before[i + 2]
    return feats

In [None]:
# Training data for the day-number ending: DATE rows whose spoken form has a
# month name as its second word.  The label is the last two letters of the
# first spoken word.  Features are built from the WRITTEN token (`before`),
# because that is what `date_to_words` passes at prediction time; building them
# from `after` would put the label itself into the features.
date_featuresets = []
for i, n in enumerate(after):
    if df.classes[i] != 'DATE' or not isinstance(n, str) or not isinstance(before[i], str):
        continue
    spoken_words = n.split()
    if len(spoken_words) > 1 and spoken_words[1] in months:
        date_featuresets.append((date_features(before[i], i), spoken_words[0][-2:]))
date_classifier = nltk.NaiveBayesClassifier.train(date_featuresets)
def date_to_words(date, i):
    """Spell out a date token.

    Two layouts are handled:

    * purely numeric ``day.month[.year]`` (separators '.', '-' or '/'):
      ordinal day + month name + optional year followed by "года"; returned
      unchanged when the month is not in 1..12;
    * free text such as "5 мая 2015" or "2015 г.": a day number followed by a
      month name gets the ending predicted by ``date_classifier``, every other
      number is read as a year whose ending depends on the following word, and
      the abbreviations "г"/"г." and "гг"/"гг." are expanded.

    Parameters
    ----------
    date : str, the written token.
    i : position of the token, forwarded to ``date_features``.

    Returns
    -------
    str
    """
    def year_words(value, following):
        """Ordinal form of a year, agreeing with the word that follows it."""
        if following is None:
            return ordinal_ending(value, 'MSG_A') + ' года'
        if following == 'год':
            return ordinal2words(value)
        # Checked before the 'г.' test below because 'гг.' contains 'г.'.
        if following in ('гг', 'гг.') or 'ды' in following:
            return ordinal_ending(value, 'Pl_N')
        # A trailing "г." arrives here as "г": the final dot was stripped.
        if following in ('г', 'г.') or 'ду' in following or 'г.' in following:
            return ordinal_ending(value, 'MSP')
        if 'дом' in following:
            return ordinal_ending(value, 'Pl_D_MSI')
        if following == 'годах':
            return ordinal_ending(value, 'Pl_G_A_P')
        return ordinal_ending(value, 'MSG_A')

    date = date.strip('.')
    # A character class is needed here; the pattern '.-/' only matched
    # "any character followed by -/".
    numeric_parts = re.split(r'[.\-/]', date)
    if len(numeric_parts) > 1 and all(part.isdigit() for part in numeric_parts):
        month_number = int(numeric_parts[1])
        if not 1 <= month_number <= 12:
            return date
        day = ordinal_ending(int(numeric_parts[0]), 'NSN')
        year = ''
        if len(numeric_parts) == 3:
            year = ' ' + ordinal_ending(int(numeric_parts[2]), 'MSG_A') + ' года'
        return day + ' ' + months[month_number] + year

    date_to_list = date.split()
    if len(date_to_list) > 1 and date_to_list[1] in months and date_to_list[0].isdigit():
        ending = date_classifier.classify(date_features(date, i))
        if ending == 'ое':
            date_to_list[0] = ordinal_ending(int(date_to_list[0]), 'NSN')
        elif ending == 'го':
            date_to_list[0] = ordinal_ending(int(date_to_list[0]), 'MSG_A')

    spoken = []
    for position, part in enumerate(date_to_list):
        if part in ('г', 'г.'):
            spoken.append('году')
        elif part in ('гг', 'гг.'):
            spoken.append('годы')
        elif part.isdigit():
            has_next = position < len(date_to_list) - 1
            following = date_to_list[position + 1] if has_next else None
            spoken.append(year_words(int(part), following))
        else:
            spoken.append(part)
    return ' '.join(spoken).strip()

In [None]:
def date_to_words1(date, i):
    """Spell out a date token without using the date classifier.

    Handles numeric ``day.month[.year]`` dates (returned unchanged when the
    month is not in 1..12) and free text in which every number is read as a
    year whose ending depends on the following word.  The abbreviations
    "г"/"г." and "гг"/"гг." are expanded.

    Parameters
    ----------
    date : str, the written token.
    i : position of the token; not used by this variant, kept for a signature
        parallel to ``date_to_words``.

    Returns
    -------
    str
    """
    def year_words(value, following):
        """Ordinal form of a year, agreeing with the word that follows it."""
        # Only labels that exist in the ending tables are used here; the
        # earlier 'MSG', 'MSI' and 'PL_N' are not keys of those tables.
        if following is None:
            return ordinal_ending(value, 'MSG_A') + ' года'
        if following == 'год':
            return ordinal2words(value)
        # Checked before the 'г.' test below because 'гг.' contains 'г.'.
        if following in ('гг', 'гг.') or 'ды' in following:
            return ordinal_ending(value, 'Pl_N')
        # A trailing "г." arrives here as "г": the final dot was stripped.
        if following in ('г', 'г.') or 'ду' in following or 'г.' in following:
            return ordinal_ending(value, 'MSP')
        if 'дом' in following:
            return ordinal_ending(value, 'Pl_D_MSI')
        if following == 'годах':
            return ordinal_ending(value, 'Pl_G_A_P')
        return ordinal_ending(value, 'MSG_A')

    date = date.strip('.')
    numeric_parts = date.split('.')
    if '.' in date and all(part.isdigit() for part in numeric_parts):
        month_number = int(numeric_parts[1])
        if not 1 <= month_number <= 12:
            return date
        day = ordinal_ending(int(numeric_parts[0]), 'NSN')
        year = ''
        if len(numeric_parts) == 3:
            year = ' ' + ordinal_ending(int(numeric_parts[2]), 'MSG_A') + ' года'
        return day + ' ' + months[month_number] + year

    date_to_list = date.split()
    spoken = []
    for position, part in enumerate(date_to_list):
        if part in ('г', 'г.'):
            spoken.append('году')
        elif part in ('гг', 'гг.'):
            spoken.append('годы')
        elif part.isdigit():
            has_next = position < len(date_to_list) - 1
            following = date_to_list[position + 1] if has_next else None
            spoken.append(year_words(int(part), following))
        else:
            spoken.append(part)
    return ' '.join(spoken).strip()

In [None]:
def telephone(tel):
    """Spell out a phone-number-like token.

    The token is split on '-'.  In every all-digit part the leading zeros are
    read one by one as "ноль"; the remaining digits are read in chunks
    (up to 3 digits: one chunk; 4: 2+2; 5-6: 3+rest; 7-9: 3+3+rest; longer:
    3+3+3+rest).  Other parts are lower-cased.  Parts are joined with " sil ".

    Parameters
    ----------
    tel : str

    Returns
    -------
    str
    """
    spoken_parts = []
    for part in tel.split('-'):
        if not part.isdigit():
            spoken_parts.append(part.lower())
            continue
        # Only LEADING zeros are spelled out; the old loop re-sliced the string
        # while iterating over it and lost digits ("007" dropped the 7).
        digits = part.lstrip('0')
        words = ['ноль'] * (len(part) - len(digits))
        count = len(digits)
        if count == 0:
            chunks = []
        elif count < 4:
            chunks = [digits]
        elif count == 4:
            chunks = [digits[:2], digits[2:]]
        elif count < 7:
            chunks = [digits[:3], digits[3:]]
        elif count < 10:
            chunks = [digits[:3], digits[3:6], digits[6:]]
        else:
            chunks = [digits[:3], digits[3:6], digits[6:9], digits[9:]]
        for chunk in chunks:
            # Keep the raw digits if the chunk is too large to be spelled.
            words.append(number2words(int(chunk)) or chunk)
        spoken_parts.append(' '.join(words))
    return ' sil '.join(spoken_parts).strip()
            

In [None]:
# Rows labelled TELEPHONE.
telephones = df.loc[df['classes'] == 'TELEPHONE']

In [None]:
# Written unit / abbreviation -> spoken form in the nominative singular
# (the 'USD' entry is a fixed phrase).
measures = dict([
    ('%', 'процент'),
    ('с', 'секунда'),
    ('т', 'тонна'),
    ('км', 'километр'),
    ('А', 'ампер'),
    ('км²', 'квадратный километр'),
    ('См', 'сименс'),
    ('см', 'сантиметр'),
    ('м', 'метр'),
    ('мин', 'минута'),
    ('мм', 'миллиметр'),
    ('см3', 'кубический сантиметр'),
    ('м3', 'кубический метр'),
    ('В', 'вольт'),
    ('л', 'литр'),
    ('га', 'гектар'),
    ('ГБ', 'гигабайт'),
    ('г', 'грамм'),
    ('К', 'кельвин'),
    ('st', 'стоун'),
    ('мА', 'миллиампер'),
    ('тыс.', 'тысяча'),
    ('долл.', 'доллар'),
    ('млрд', 'миллиард'),
    ('млн', 'миллион'),
    ('USD', 'долларов сэ ш а'),
])

In [None]:
# Number of fraction digits -> (form after 1, form otherwise)
_FRACTION_WORDS = {
    1: ('десятая', 'десятых'),
    2: ('сотая', 'сотых'),
    3: ('тысячная', 'тысячных'),
}


def _count_feminine(digits, words, singular, plural):
    """Attach a feminine counted word to a spelled number.

    ``digits`` is the written number and ``words`` its spelling.  A final
    "один"/"два" (not 11/12) becomes "одна"/"две"; only the LAST word is
    changed, so "двадцать" and "одиннадцать" are left intact.
    """
    not_teen = digits[-2:-1] != '1'
    ends_in_one = digits[-1] == '1' and not_teen
    ends_in_two = digits[-1] == '2' and not_teen
    if ends_in_one and words.endswith('один'):
        words = words[:-len('один')] + 'одна'
    elif ends_in_two and words.endswith('два'):
        words = words[:-len('два')] + 'две'
    return words + ' ' + (singular if ends_in_one else plural)


def decimal_to_words(decimal):
    """Spell out a decimal number written with a comma ("3,5").

    Parameters
    ----------
    decimal : str

    Returns
    -------
    str
        "<whole> целая/целых и <fraction> десятая/сотых/..." for a numeric
        whole part and a fraction of 1-3 digits; the plain cardinal when there
        is no comma and the text is all digits; otherwise the input unchanged.
        Anything after a second comma is ignored.
    """
    dec_to_list = decimal.split(',')
    if len(dec_to_list) == 1:
        return number2words(int(decimal)) if decimal.isdigit() else decimal

    whole, fraction = dec_to_list[0], dec_to_list[1]
    if not whole.isdigit():
        return decimal
    # A fraction that is not 1-3 digits cannot be named; keep the token as written
    # (a non-numeric fraction used to be dropped silently).
    if not fraction.isdigit() or len(fraction) not in _FRACTION_WORDS:
        return decimal

    whole_words = number2words(int(whole))
    if whole_words:
        whole_text = _count_feminine(whole, whole_words, 'целая', 'целых')
    else:
        # number2words could not spell the value; fall back to the digits.
        whole_text = str(int(whole))

    singular, plural = _FRACTION_WORDS[len(fraction)]
    fraction_text = _count_feminine(fraction, number2words(int(fraction)), singular, plural)
    return whole_text + ' и ' + fraction_text

In [None]:
# Leading adjectives of two-word unit names; they are inflected with the noun.
_MEASURE_ADJECTIVES = ('квадратный', 'кубический')


def _ends_in(digits, last_digits):
    """True when ``digits`` ends with one of ``last_digits`` and is not in 11..19."""
    return digits[-1] in last_digits and digits[-2:-1] != '1'


def _genitive_singular(unit):
    """Form used after 2-4 and after decimals: -а -> -ы (-и after ч), otherwise add -а."""
    if unit[-1] == 'а':
        return unit[:-1] + ('и' if unit[-2] == 'ч' else 'ы')
    return unit + 'а'


def _swap_last_word(words, old, new):
    """Replace ``old`` with ``new`` only where it ends ``words``."""
    if isinstance(words, str) and words.endswith(old):
        return words[:-len(old)] + new
    return words


def decimal_measure(phrase):
    """Spell out a measure / decimal / money phrase such as "5 км" or "2,5 %".

    Whole numbers go through ``number2words``, comma decimals through
    ``decimal_to_words``, and a known unit from ``measures`` that follows a
    number is inflected to agree with it (a preceding "один"/"два" is also
    switched to the feminine for feminine units).  Other words are kept.

    Parameters
    ----------
    phrase : str

    Returns
    -------
    str with surrounding dots and whitespace removed.
    """
    # "5км" -> "5 км", but only when the tail is a known unit.  (The earlier
    # check used len(phrase) == 1 and therefore never fired.)
    if len(phrase.split()) == 1 and not phrase.isdigit():
        glued = re.match(r'^(\d+(?:,\d+)?)([^\d,\s].*)$', phrase.strip())
        if glued and (glued.group(2) in measures or glued.group(2).rstrip('.') in measures):
            phrase = glued.group(1) + ' ' + glued.group(2)

    to_list = phrase.split()
    # "1 000 км": join the first two digit groups into one number.
    if len(to_list) >= 2 and to_list[0].isdigit() and to_list[1].isdigit():
        to_list[:2] = [to_list[0] + to_list[1]]

    result = list(to_list)
    for position, token in enumerate(to_list):
        bare = token.rstrip('.')
        if token.isdigit():
            spoken = number2words(token)
        elif ',' in token and token.split(',')[0].isdigit() and token.split(',')[1].isdigit():
            spoken = decimal_to_words(token)
        elif position != 0 and (token in measures or bare in measures):
            unit = measures[bare] if bare in measures else measures[token]
            previous = to_list[position - 1]
            if ' ' in unit and not unit.startswith(_MEASURE_ADJECTIVES):
                # Fixed multi-word phrase (the 'USD' entry): never inflected.
                spoken = unit
            elif ',' in previous:
                spoken = _genitive_singular(unit)
                spoken = spoken.replace('квадратный', 'квадратного').replace('кубический', 'кубического')
            elif previous.isdigit():
                feminine = unit[-1] == 'а'
                if _ends_in(previous, '1'):
                    spoken = unit
                    if feminine:
                        result[position - 1] = _swap_last_word(result[position - 1], 'один', 'одна')
                elif _ends_in(previous, '234'):
                    spoken = _genitive_singular(unit)
                    if feminine and previous[-1] == '2':
                        result[position - 1] = _swap_last_word(result[position - 1], 'два', 'две')
                elif feminine:
                    spoken = unit[:-1]
                else:
                    spoken = unit + 'ов'
                if not _ends_in(previous, '1'):
                    spoken = spoken.replace('квадратный', 'квадратных').replace('кубический', 'кубических')
            else:
                spoken = unit
        else:
            spoken = token
        # Keep the written token when nothing could be produced for it.
        if spoken:
            result[position] = spoken
    return ' '.join(result).strip('.').strip()

Now that we can guess the class of every element and have an algorithm for every class, we can test it:

In [None]:
test = pd.read_csv('../input/ru_test_2.csv', delimiter=',', lineterminator='\n', error_bad_lines=False, low_memory=False, encoding = 'utf8',  names=['sentence_id', 'token_id', 'before'])
test['id'] = test['sentence_id'].astype(str) + '_' + test['token_id'].astype(str)

# The feature extractors read the module-level `before` and `token_id` by
# position, so BOTH must point at the test data from here on.  (Only `before`
# used to be rebound, leaving `token_id` aligned with the training rows.)
before = test['before'].tolist()
token_id = test['token_id'].tolist()

# Start from the written tokens and overwrite those whose predicted class has a handler.
test_after = list(before)
for index, item in enumerate(before):
    test_class = classifier.classify(features(item, index))
    if test_class == 'CARDINAL':
        test_after[index] = number2words(item)
    elif test_class == 'ORDINAL':
        test_after[index] = ordinals(item, index)
    elif not isinstance(item, str):
        # The remaining handlers only work on text; leave other values untouched.
        continue
    elif test_class == 'DATE':
        test_after[index] = date_to_words(item, index)
    elif test_class == 'TELEPHONE':
        test_after[index] = telephone(item)
    elif test_class in ('MEASURE', 'DECIMAL', 'MONEY'):
        test_after[index] = decimal_measure(item)
    elif test_class == 'LETTERS':
        test_after[index] = ' '.join([l for l in item.lower() if l not in string.punctuation])

In [None]:
# Attach the predictions, drop the first row and write the id/after pairs.
test['after'] = test_after
test = test.iloc[1:]
submission_columns = ['id', 'after']
test[submission_columns].to_csv('submission.csv', index=False)
