# 0. imports, functions and variables

In [1]:
import os
import operator
import gc
import pandas as pd
from tqdm import tqdm
from random import *
import pickle
import csv
import re
from num2words import num2words
from transliterate import translit

# Where the competition files and the extra typed corpus live.
INPUT_PATH = './input'
DATA_INPUT_PATH = INPUT_PATH + '/ru_with_types'
SUBM_PATH = INPUT_PATH

# Digit character -> Russian cardinal; used to read long numbers digit by digit.
ch = dict(zip(
    "0123456789",
    ("ноль", "один", "два", "три", "четыре",
     "пять", "шесть", "семь", "восемь", "девять"),
))

# Translation tables mapping subscript / superscript digits to ASCII digits.
_ASCII_DIGITS = "0123456789"
SUB = str.maketrans("₀₁₂₃₄₅₆₇₈₉", _ASCII_DIGITS)
SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", _ASCII_DIGITS)

# Tokens that the main loop copies through unchanged (punct, short) or treats
# as a dash.
punct = set("«.»,")
dash = set("-—")
short = set("по англ ее что есть где кто две ибн ту".split())

# Shared accumulator filled by insert_key(): {key: {normalized form: count}}.
res_new = {}

# Preceding words that select the full_date() / space_date() readings.
full = set("проверено архивировано с от умер родился первоисточника".split())
space = set("на по".split())

# Unit / currency abbreviation -> spoken Russian form, used by the main loop
# for tokens that are not found in the frequency dictionary.
m = {
    'км²': 'квадратных километров',
    'км2': 'квадратных километров',
    'km²': 'квадратных километров',
    'км': 'километрах',
    'km': 'километрах',
    'кг': 'килограмма',
    'kg': 'килограмма',
    'm²': 'квадратных метров',
    'м²': 'квадратных метров',
    'м³': 'кубических метров',
    "млн": "миллионов",
    "м/с": "метров в секунду",
    "мм": "миллиметров",
    "м": "метров",
    "ч": "часов",
    "л": "лет",
    "тыс": "тысяч",
    "тонн": "тонн",
    "га": "гектара",
    "гг": "годы",
    "млрд": "миллиардов",
    "км/ч": "километров в час",
    "руб": "рублей",
    "с": "секунд",
    "м3": "кубических метров",
    "н. э.": "нашей эры",
    "трлн": "триллионов",
    "$": "долларов сэ ш а",
    "€": "евро",
    "%": "процентов",
    "долл": "долларов",
    "ГВт": "гигаватт",
    "МВт": "мегаватт",
    "кВт": "киловатт",
    "куб.см": "кубических сантиметров",
    "мкм": "микрометров",
    "об/мин": "оборотов в минуту",
    "нм": "нанометра",
}

# Month number (with or without a leading zero) -> month name in the genitive.
_MONTHS_GENITIVE = (
    "января", "февраля", "марта", "апреля", "мая", "июня",
    "июля", "августа", "сентября", "октября", "ноября", "декабря",
)
mm = {str(number): name for number, name in enumerate(_MONTHS_GENITIVE, 1)}
mm.update(
    ("0" + str(number), name)
    for number, name in enumerate(_MONTHS_GENITIVE[:9], 1)
)

# Day-of-month / last-two-digits-of-year string -> ordinal in the genitive.
# "0" maps to the empty string; 1..9 are reachable both bare and zero-padded.
_DAY_ORDINALS = (
    "первого", "второго", "третьего", "четвертого", "пятого",
    "шестого", "седьмого", "восьмого", "девятого", "десятого",
    "одиннадцатого", "двенадцатого", "тринадцатого", "четырнадцатого",
    "пятнадцатого", "шестнадцатого", "семнадцатого", "восемнадцатого",
    "девятнадцатого", "двадцатого",
    "двадцать первого", "двадцать второго", "двадцать третьего",
    "двадцать четвертого", "двадцать пятого", "двадцать шестого",
    "двадцать седьмого", "двадцать восьмого", "двадцать девятого",
    "тридцатого", "тридцать первого",
)
d = {"0": ""}
d.update((str(day), word) for day, word in enumerate(_DAY_ORDINALS[:9], 1))
d.update(("%02d" % day, word) for day, word in enumerate(_DAY_ORDINALS, 1))

# Tens digit -> cardinal tens word (used in front of a unit ordinal).
dd = dict(zip(
    "23456789",
    ("двадцать", "тридцать", "сорок", "пятьдесят",
     "шестьдесят", "семьдесят", "восемьдесят", "девяносто"),
))

# Tens digit -> ordinal (genitive) for round tens.
dd0 = dict(zip(
    "123456789",
    ("десятого", "двадцатого", "тридцатого", "сорокового", "пятидесятого",
     "шестидесятого", "семидесятого", "восьмидесятого", "девяностого"),
))

# Hundreds digit -> cardinal hundreds word.
ddd = dict(zip(
    "123456789",
    ("сто", "двести", "триста", "четыреста", "пятьсот",
     "шестьсот", "семьсот", "восемьсот", "девятьсот"),
))

# Latin letter groups consulted by trans().
gl = list("aeuioy")
zv = list("bvgdzlmnr")

def thous(x):
    """Spell a four-digit year string as a Russian genitive phrase ending in " года".

    Args:
        x: string whose first four characters are the digits of the year.
           Only years starting with "1" or "2" are supported.

    Returns:
        The thousands word, the hundreds word (if the digit is non-zero),
        the ordinal for the last two digits, then " года".

    Raises:
        ValueError: if the first character is not "1" or "2" (previously this
            surfaced as an accidental UnboundLocalError), or a digit position
            holds a non-digit.
        IndexError: if ``x`` has fewer than four characters.
        KeyError: if a digit has no entry in the lookup tables.

    NOTE(review): for round centuries such as "1900" or "2000" the last two
    digits map to the empty string, so the result is e.g.
    "тысяча девятьсот  года" (double space, cardinal hundreds) — left as is.
    """
    if x[0] == "1":
        res = "тысяча "
    elif x[0] == "2":
        res = "две тысячи "
    else:
        raise ValueError("thous() supports only years starting with 1 or 2: %r" % (x,))

    hundreds, tens, units = x[1], x[2], x[3]

    if int(hundreds) != 0:
        res += ddd[hundreds] + " "

    if int(tens) > 1:
        if units == "0":
            # round tens: "двадцатого", "тридцатого", ...
            res += dd0[tens]
        else:
            res += dd[tens] + " " + d[units]
    elif tens == "0":
        # 00..09: only the unit ordinal ("" for "0")
        res += d[units]
    elif units == "0":
        # exactly 10
        res += dd0[tens]
    else:
        # 11..19 are stored under their two-digit key
        res += d[tens + units]

    return res + " года"

def year(q):
    """Return the spoken form of the year phrase ``q``, or "" if none is found.

    Scans every key of the module-level ``res_upd`` dictionary (keys are
    indexed as tuples; element 2 is compared with ``q``), and falls back to
    ``thous`` when nothing matches. A result taken from ``res_upd`` via the
    exact-match branch starts with a space.
    """
    res = ""
    
    for x in res_upd:
        # Key whose token ends with q (presumably a longer date phrase such as
        # "<day> <month> <year> ..." — confirm against the data).
        if x[2].endswith(q):
            try:
                # third-from-last whitespace-separated word of the token
                y = x[2].split()[-3]
                # first recorded normalization (insertion order, not the most frequent)
                z = list(res_upd[x])[0]
                # keep whatever follows the last occurrence of y in that normalization
                res = z.split(y)[-1]
                break
            except:
                # e.g. the token has fewer than three words: keep scanning
                pass
        # Key whose token is exactly q: use its first recorded normalization.
        if q == x[2]:
            try:
                res = " "+list(res_upd[x])[0]
                break
            except:
                pass
    # Fallback: spell the first four characters with thous().
    # NOTE(review): q[3]!="" is always true for a string of length >= 4 and
    # raises IndexError for a shorter one; int(q[0]) raises ValueError for a
    # non-digit. Neither is inside the try below.
    if res == "" and int(q[0])<3 and q[3]!="":
        try:
            res = thous(q[:4])
        except:
            pass
    return res

def full_date(t):
    """Read a "<day> <month> <year>[ <year word>]" token with a genitive day.

    Handles two shapes: three words ending in a four-character numeric year,
    or four words ending in one of the year markers below. Returns the
    day ordinal from ``d``, the month word as written, and the year text from
    ``year()``. Returns "" when the shape does not match, the day is not in
    ``d``, or ``year()`` finds nothing (in which case the year word is printed).
    """
    year_markers = {'г', 'г.', 'гг.'} | {'год', 'года', 'году'}
    parts = t.split()

    if len(parts) == 3 and len(parts[-1]) == 4 and parts[-1].isnumeric():
        year_query = parts[2] + " года"
    elif len(parts) == 4 and parts[-1] in year_markers:
        year_query = parts[2] + " " + parts[-1]
    else:
        return ""

    if parts[0] not in d:
        return ""

    spoken_year = year(year_query)
    if spoken_year == "":
        # nothing usable for this year: report it and give up
        print(parts[2])
        return ""
    return d[parts[0]] + " " + parts[1] + spoken_year

def space_date(t):
    """Read a "<day> <month word> <yyyy>" token with the day ordinal ending in "е".

    " года" is removed from the token first. Returns "" when the token does
    not have exactly three words of the expected kinds or when any lookup
    or year conversion fails.
    """
    parts = t.replace(" года", '').split()
    if len(parts) != 3:
        return ""

    day, month, yr = parts
    shape_ok = (len(yr) == 4 and yr.isnumeric()
                and day.isnumeric() and not month.isnumeric())
    if not shape_ok:
        return ""

    try:
        # genitive ordinal -> drop the last two letters and append "е";
        # a resulting double "ее" is collapsed to a single "е"
        day_words = (d[day][:-2] + "е ").replace("ее", 'е')
        return day_words + month + " " + thous(yr)
    except:
        return ""

def dot_date(t):
    """Read a numeric date "dd.mm.yyyy" (also with "-" or "/" as separator).

    The first separator among ".", "-", "/" that splits the token into exactly
    three parts is used. A two-character year gets "19" prepended. Returns ""
    when the token has a different shape or any lookup / conversion fails.
    """
    for separator in (".", "-", "/"):
        s = t.split(separator)
        if len(s) == 3:
            break
    else:
        return ""

    if len(s[2]) == 2:
        s[2] = "19" + s[2]
    if len(s[2]) != 4:
        return ""

    if not all(part.isnumeric() for part in s):
        return ""

    try:
        # genitive ordinal -> drop the last two letters and append "е";
        # a resulting double "ее" is collapsed to a single "е"
        day_words = (d[s[0]][:-2] + "е ").replace("ее", 'е')
        return day_words + mm[s[1]] + " " + thous(s[2])
    except:
        return ""

def hasNumbers(inputString):
    """Return True if ``inputString`` contains at least one ``\\d`` character."""
    match = re.search(r'\d', inputString)
    return match is not None

def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` using the highest pickle protocol.

    The ``obj`` directory is created if it does not exist yet, so a first run
    in a fresh checkout no longer fails with FileNotFoundError.
    """
    os.makedirs('obj', exist_ok=True)
    with open('obj/'+ name + '.pkl', 'wb') as f:
        # protocol -1 selects the highest protocol available
        pickle.dump(obj, f, -1)

def load_obj(name):
    """Unpickle and return the object stored in ``obj/<name>.pkl``.

    NOTE: ``pickle.load`` executes arbitrary code from the file; only load
    files produced by this notebook.
    """
    path = 'obj/' + name + '.pkl'
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def insert_key(key, arr, k = 1):
    """Add ``k`` to the count of normalization ``arr`` under ``key``.

    Updates the module-level ``res_new`` dictionary, which has the shape
    ``{key: {arr: count}}``; missing entries are created on the fly.
    """
    #if hasNumbers(arr): #no_digits
    #    return
    counts = res_new.setdefault(key, {})
    counts[arr] = counts.get(arr, 0) + k
            
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for character in s:
        if ord(character) >= 128:
            return False
    return True

def name_num(t):
    """Read a digit string the way a phone/ID number is read aloud.

    Leading zeros are spoken one by one as "ноль"; a four-digit group is read
    either as one number or as two two-digit halves; anything longer is read
    in three-digit chunks. Every non-empty result ends with a single space,
    which ``dash_num`` relies on when trimming its output.

    Args:
        t: string of digits (may be empty).

    Returns:
        The spoken form with a trailing space, or "" for an empty string.
    """
    if len(t) == 0:
        return ""
    # was `while ...: return ...`, which only ever ran once
    if t[0] == "0":
        return "ноль " + name_num(t[1:])

    if len(t) == 4:
        if t[:2] == "20" and t[2] != "0":
            return "двадцать " + num2words(t[2:], lang='ru') + " "
        if t[1] == "0" or t[2] == "0":
            return num2words(t, lang='ru') + " "
        return num2words(t[:2], lang='ru') + " " + num2words(t[2:], lang='ru') + " "
    if len(t) <= 3:
        # The original computed this value, dropped it (missing `return`) and
        # then recomputed it through the generic path below; the result,
        # including the trailing space, is unchanged.
        return num2words(t, lang='ru') + " "
    return num2words(t[:3], lang='ru') + " " + name_num(t[3:])

def dash_num(t):
    """Read a dash-separated number group by group, with "sil" between groups.

    Each group is spoken by ``name_num`` and followed by "sil "; the last five
    characters of the concatenation are then cut off.
    """
    spoken = "".join(name_num(group) + "sil " for group in t.split("-"))
    return spoken[:-5]

def url(x):
    """Spell a fraction "a/b" or an ASCII URL-like token in Russian words.

    Fractions: when ``x`` is "<int>/<int>" and the module-level ``last``
    (the previous token, set by the main loop) is non-empty, both numbers are
    read with ``num2words`` and the ending of the phrase is turned into a
    plural ordinal ("две третьих"-style). Otherwise the token falls through
    to the URL reading below.

    URLs: an optional "http://www." / "http://" prefix is spelled out, then
    each "/"-separated segment is split on "." and its pieces are read with
    ``num2words`` (digits) or ``trans`` (ASCII letters), joined by "точка"
    and "косая черта". Any piece that is not purely ASCII letters or digits
    makes the function return "".

    Fixes relative to the previous version:
      * "восемь" also ends with "семь", so eighths were rendered as
        "...воседьмых"; the longer ending is now tested first.
      * ``str.replace`` rewrote every occurrence of the cardinal, not just
        the ending (e.g. "тридцать три" -> "третьихдцать третьих"); only the
        suffix is replaced now.
      * a token that is empty after prefix removal no longer raises
        IndexError on ``x[-1]``.
    """
    s = x.split("/")
    if len(s) == 2 and s[0].isnumeric() and s[1].isnumeric():
        res = num2words(s[0], lang='ru') + " " + num2words(s[1], lang='ru')
        if last:
            # (cardinal ending, ordinal ending); "восемь" must precede "семь"
            endings = (
                ("один", "первых"),
                ("два", "вторых"),
                ("три", "третьих"),
                ("четыре", "четвертых"),
                ("восемь", "восьмых"),
                ("семь", "седьмых"),
                ("сто", "сотых"),
            )
            for cardinal, ordinal in endings:
                if res.endswith(cardinal):
                    return res[:-len(cardinal)] + ordinal
            # default: swap the last letter for "ых" ("пять" -> "пятых")
            return res[:-1] + "ых"

    res = ""
    if x.startswith("http://www."):
        res = "h t t p w w w точка "
        x = x.replace("http://www.", "")
    elif x.startswith("http://"):
        res = "h t t p "
        x = x.replace("http://", "")

    ss = x.split("/")
    for z in ss:
        if z == "":
            continue
        s = z.split(".")
        for y in s:
            if y == "":
                continue
            if y.isnumeric():
                res += num2words(y, lang='ru') + " точка "
            elif not y.isalpha() or not is_ascii(y):
                # not something this reader can spell
                return ""
            else:
                res += trans(y) + " точка "
        # replace the trailing " точка " (7 chars) with a slash separator
        res = res[:-7] + " косая черта "
    if x.endswith("/"):
        res += " косая черта"
    # drop the trailing " косая черта " separator (13 chars)
    res = res[:-13]
    return res

def trans(x):
    """Return a letter-by-letter reading of the Latin-script word ``x``.

    For words longer than one character a known answer is returned first:
    the entry in the module-level ``s`` dictionary, or else the most frequent
    normalization stored in ``res_upd`` under the context-free key
    ``(None, None, x, None, None)``.

    Otherwise the lower-cased word is passed through ``translit(word, 'ru')``
    and every resulting character is tagged with the suffix "_trans", the
    tokens being separated by single spaces. A chain of string replacements
    then rewrites specific letter sequences. The replacements are applied in
    order and later ones see the output of earlier ones, so the order matters.

    Each token "<letter>_trans" is 7 characters long (8 with its separating
    space); the slice offsets below depend on that.
    """
    if len(x)>1:
        # exact single-meaning answer
        if x in s:
            return s[x]
        
        # most frequent normalization seen for this word without context
        key = (None, None, x, None, None)
        if key in res_upd:
            srtd = sorted(res_upd[key].items(), key=operator.itemgetter(1), reverse=True)
            t = srtd[0][0]
            return t
    
    word = x.lower()
    # one "<char>_trans" token per transliterated character
    t = ("_trans ".join(translit(word, 'ru')) + "_trans")
    # letter-sequence rewrites, applied in order
    t = t.replace("w_trans","в_trans")
    t = t.replace("ы_trans","и_trans")
    t = t.replace("щ_trans","ш_trans")
    t = t.replace("ц_trans","к_trans")
    t = t.replace("а_trans л_trans л_trans","о_trans л_trans")
    t = t.replace("т_trans и_trans о_trans","ш_trans е_trans")
    t = t.replace("и_trans е_trans в_trans","ь_trans ю_trans")
    t = t.replace("т_trans у_trans р_trans","ч_trans е_trans р_trans")
    t = t.replace("е_trans в_trans","ь_trans ю_trans")
    t = t.replace("а_trans и_trans","э_trans й_trans")
    t = t.replace("г_trans у_trans","г_trans а_trans")
    t = t.replace("о_trans и_trans","о_trans й_trans")
    t = t.replace("т_trans х_trans","т_trans")
    t = t.replace("о_trans о_trans","у_trans")
    t = t.replace("е_trans е_trans","и_trans")
    t = t.replace("п_trans х_trans","ф_trans")
    t = t.replace("с_trans ч_trans","ш_trans")
    t = t.replace("а_trans у_trans","о_trans")
    t = t.replace("о_trans а_trans","о_trans")
    t = t.replace("в_trans х_trans","у_trans")
    t = t.replace("к_trans к_trans","к_trans")
    t = t.replace("х_trans н_trans","н_trans")
    t = t.replace("х_trans л_trans","л_trans")
    t = t.replace("х_trans р_trans","р_trans")
    t = t.replace("х_trans к_trans","к_trans")
    t = t.replace("x_trans","к_trans с_trans")
    t = t.replace("х_trans у_trans","х_trans а_trans")
    t = t.replace("е_trans и_trans","е_trans й_trans")
    t = t.replace("с_trans у_trans","с_trans а_trans")
    t = t.replace("п_trans у_trans","п_trans а_trans")
    t = t.replace("г_trans и_trans я_trans","д_trans ж_trans и_trans я_trans")
    t = t.replace("у_trans н_trans т_trans","а_trans н_trans т_trans")
    t = t.replace("у_trans н_trans д_trans","а_trans н_trans д_trans")
    t = t.replace("у_trans н_trans к_trans","а_trans н_trans к_trans")
    t = t.replace("у_trans б_trans","а_trans б_trans")
    t = t.replace("с_trans в_trans","с_trans у_trans")
    #t = t.replace("к_trans у_trans","с_trans у_trans")

    # drop a final "е" token together with its separating space
    if t.endswith("е_trans"):
        t = t[:-8]

    # drop the final "у" token after "г"
    if t.endswith("г_trans у_trans"):
        t = t[:-8]

    # a leading "е" becomes "э" (only the first character is swapped)
    if t.startswith("е_trans"):
        t = "э" + t[1:]

    # word starting with "w": the first character of t is replaced by "у"
    if word[0] == "w" and len(word)>1:
        t = "у" + t[1:]

    # word starting with "j": the first token of t is replaced by "д ж",
    # plus one extra token when the second letter is "u" or "a"
    if word[0] == "j" and len(word)>1:
        tt = "д_trans ж_trans"
        if word[1]=="u":
            tt+=" y_trans"
        if word[1]=="a":
            tt+=" е_trans"
        t = tt + t[7:]

    #if word.startswith("wi"):
    #    t = "в" + t[1:]

    # "...ts": the final "к" token is replaced by "т с"
    if word.endswith("ts") and t.endswith("к_trans"):
        t = t[:-8] + " т_trans с_trans"

    # "...ux": cut the last 16 characters (two space-separated tokens)
    if word.endswith("ux"):
        t = t[:-16]

    # "...ge": the final "г" token is replaced by "д ж"
    if word.endswith("ge") and t.endswith("г_trans"):
        t = t[:-8] + " д_trans ж_trans"

    # leading "а и р" -> "а й р" (t[15:] starts at the space before the third token)
    if t.startswith("а_trans и_trans р_trans"):
        t = "а_trans й_trans" + t[15:]

    # final "и а" -> "и я"
    if t.endswith("и_trans а_trans"):
        t = t[:-8] + " я_trans"

    # final "е с" -> "с", except for words ending in "ses" / "hes"
    if t.endswith("е_trans с_trans") and not word.endswith("ses") and not word.endswith("hes"):
        t = t[:-15] + "с_trans"

    # final "д г" -> "д ж"
    if t.endswith("д_trans г_trans"):
        t = t[:-8] + " ж_trans"

    # "ch" followed by a letter not listed in gl: first character of t becomes "к"
    if len(word)>2 and word[0]=='c' and word[1]=='h' and word[2] not in gl:
        t = "к" + t[1:]

    # words containing "ce": every "к" token becomes "с"
    if "ce" in word:
        t = t.replace("к_trans","с_trans") 

    # re-applied because the steps above may have produced this sequence again
    t = t.replace("у_trans н_trans д_trans","а_trans н_trans д_trans")
    
    return t    

# Load the test file, show its first and last rows, and collect the distinct
# raw tokens it contains.
df = pd.read_csv("input/ru_test_2.csv")
for preview in (df.head(), df.tail()):
    print(preview)
test_set_old = set(df["before"])
print(len(test_set_old))

   sentence_id  token_id        before
0            0         0           Эта
1            0         1         книга
2            0         2             ,
3            0         3  отличающаяся
4            0         4             «
        sentence_id  token_id     before
989875        69999        17     убедил
989876        69999        18        его
989877        69999        19  выполнить
989878        69999        20     приказ
989879        69999        21          .
175991


# 1.1 Build dict of different word meanings based on train set

In [2]:
#1 min 
# Build res_new = {raw token: {normalized form: count}} from the training CSV.
# The file is parsed by hand: everything after the first '","' is taken as
# '"before","after"'.
# The file is opened in a `with` block so the handle is released even if the
# loop raises.
with open(os.path.join(INPUT_PATH, "ru_train.csv"), encoding='UTF8') as train:
    line = train.readline()  # first line is read and discarded (presumably the header)
    res_new = dict()
    last = ""
    for i in tqdm(range(10574517)):
        line = train.readline().strip()
        # NOTE(review): an empty string means end of file, but a blank line in
        # the middle of the file would stop the loop as well.
        if line == '':
            print("Finish")
            break

        pos = line.find('","')
        text = line[pos + 2:]
        # skip rows whose remaining text itself starts with '","'
        # (presumably the "," token — confirm against the data)
        if text[:3] == '","':
            continue
        # strip the outer quotes, then split '<before>","<after>'
        text = text[1:-1]
        arr = text.split('","')

        insert_key(arr[0], arr[1])

gc.collect()
len(res_new)
#801234

100%|█████████▉| 10571822/10574517 [00:43<00:00, 270164.47it/s]

Finish


801234

          100%|█████████▉| 10571822/10574517 [01:00<00:00, 176043.23it/s]

In [3]:
#save_obj(res_new, "word_meanings_dict_train") 

# 1.2 The same for external data set

In [4]:
#18 min 
# Extend the training dictionary with counts from every file of the external
# corpus (tab-separated; the first column is dropped, then <before>\t<after>).
res_new = load_obj("word_meanings_dict_train") 

files = os.listdir(DATA_INPUT_PATH)
for file in tqdm(files):
    # `with` guarantees the handle is closed even if a line fails to parse
    with open(os.path.join(DATA_INPUT_PATH, file), encoding='UTF8') as train:
        while 1:
            line = train.readline().strip()
            # NOTE(review): an empty string means end of file, but a blank
            # line inside the file would end the read early as well.
            if line == '':
                break

            pos = line.find('\t')
            text = line[pos + 1:]
            if text[:3] == '':
                continue
            arr = text.split('\t')
            # sentence terminator rows carry no token
            if arr[0] == '<eos>':
                continue

            # '<self>' and 'sil' both mean "pronounced as written" here
            if arr[1] == '<self>' or arr[1] == 'sil':
                arr[1] = arr[0]

            insert_key(arr[0], arr[1])

    gc.collect()
len(res_new)
#4922825


  0%|          | 0/100 [00:00<?, ?it/s][A
100%|██████████| 100/100 [17:28<00:00, 11.21s/it]


4922825

In [5]:
#save_obj(res_new, "word_meanings_dict_all") 

# 2. Produce single/multi meaning word sets 

In [10]:
#10 sec
# Split the test vocabulary into tokens with exactly one known normalization
# (stored with that normalization in `s`) and everything else (`multi_words`).
res_new = load_obj("word_meanings_dict_all")
res = [word for word, forms in res_new.items() if len(forms) == 1]
print(len(res))

unambiguous = set(res)
single = unambiguous & test_set_old
print(len(single))

# punctuation, dashes and the short-word list are handled by dedicated rules
multi_words = list(test_set_old - unambiguous - punct - dash - short)
print(len(multi_words))

s = {word: next(iter(res_new[word])) for word in tqdm(single)}

#4835693
#153976
#22001

4835693
153976


 69%|██████▉   | 106815/153976 [00:00<00:00, 520124.69it/s]

22001


100%|██████████| 153976/153976 [00:00<00:00, 534343.64it/s]


In [11]:
#save_obj(s, "single_meaning_words")  
#save_obj(multi_words, "multi_meaning_words")  

# 3. Make 5 word seq dict for test set (multi meaning) 

In [12]:
#10 sec
# Collect, for every ambiguous token of the test file, the nine context keys
# (two tokens back .. two tokens ahead, with progressively more positions
# replaced by None) that the frequency dictionaries will be filtered by.
test_set = set()

multi_words = load_obj("multi_meaning_words") 

df = pd.read_csv("input/ru_test_2.csv")
# rows whose raw token is ambiguous
dp = df[df.before.isin(multi_words)] 

before = df.before
token = df.token_id

for i in tqdm(dp.index):
    arr = before[i]
    
    # lost = token two positions back; empty when the previous token is the
    # first token of its sentence, or when the lookup fails (start of file)
    try:
        if token[i-1] == 0:
            lost = ""
        else:
            lost = before[i-2]
    except:
        lost = ""
        
    # last = previous token; empty when the current token starts a sentence
    try:
        if token[i] == 0:
            last = ""
        else:
            last = before[i-1]
    except:
        last = ""
    
    # no previous token implies no token before it either
    if last=="":
        lost=""
        
    # nex = next token; empty when the next row starts a new sentence or
    # does not exist
    try:
        if token[i+1]==0:
            nex = ""
        else:
            nex = before[i+1]
    except:
        nex = ""
        
    # nexx = token two positions ahead, with the same boundary rule
    try:
        if token[i+2]==0:
            nexx = ""
        else:
            nexx = before[i+2]
    except:
        nexx = ""
    if nex=="":
        nexx=""
    
    # from the full five-token window down to the bare token;
    # None marks a position that is ignored
    for key in [(lost, last, arr, nex, nexx),
                
                (lost, last, arr, nex, None),
                (None, last, arr, nex, nexx),
                
                (lost, last, arr, None, None),
                (None, last, arr, nex, None),
                (None, None, arr, nex, nexx),
                
                (None, last, arr, None, None), 
                (None, None, arr, nex, None),
                
                (None, None, arr, None, None)]:
        test_set.add(key)
len(test_set)
#462131

100%|██████████| 72692/72692 [00:12<00:00, 5659.18it/s]


462131

In [13]:
#save_obj(test_set, "multi_seq_test")  

# 4.1 Build freq dict for train set

In [14]:
#2 min
# Count, on the training file, how often each context key of an ambiguous
# token maps to each normalization, and keep only the keys that also occur
# in the test set. The result is accumulated in res_new via insert_key().
test_set = load_obj("multi_seq_test")
multi_words = load_obj("multi_meaning_words")

res_new = dict()
big_set = set()     # every context key seen in the training file
big_dict = dict()   # (context key, normalization) -> count

df = pd.read_csv("input/ru_train.csv")
dp = df[df.before.isin(multi_words)] 

before = df.before
after = df.after
token = df.token_id

for i in tqdm(dp.index):
    # arr = [raw token, normalized form]
    arr = [before[i], after[i]]
    
    # lost = token two positions back; empty at a sentence boundary or when
    # the lookup fails
    try:
        if token[i-1] == 0:
            lost = ""
        else:
            lost = before[i-2]
    except:
        lost = ""
        
    # last = previous token; empty when the current token starts a sentence
    try:
        if token[i] == 0:
            last = ""
        else:
            last = before[i-1]
    except:
        last = ""
    
    if last=="":
        lost=""
        
    # nex / nexx = the next two tokens, cut at the sentence boundary
    try:
        if token[i+1]==0:
            nex = ""
        else:
            nex = before[i+1]
    except:
        nex = ""
        
    try:
        if token[i+2]==0:
            nexx = ""
        else:
            nexx = before[i+2]
    except:
        nexx = ""
    if nex=="":
        nexx=""        
    
    # same nine context keys as for the test set; None = position ignored
    for key in [(lost, last, arr[0], nex, nexx),
                
                (lost, last, arr[0], nex, None),
                (None, last, arr[0], nex, nexx),
                
                (lost, last, arr[0], None, None),
                (None, last, arr[0], nex, None),
                (None, None, arr[0], nex, nexx),
                
                (None, last, arr[0], None, None), 
                (None, None, arr[0], nex, None),
                
                (None, None, arr[0], None, None)]:
        big_set.add(key)
        # increment, creating the counter on first sight
        try:
            big_dict[(key, arr[1])]+= 1
        except:
            big_dict[(key, arr[1])] = 1
            
# keep only the contexts that can actually be looked up for the test set
inter = big_set & test_set
print(len(inter))
for key, arr in tqdm(list(big_dict)):
    if key in inter:
        insert_key(key, arr, big_dict[(key, arr)])

len(res_new)
#108578

100%|██████████| 461864/461864 [01:55<00:00, 4001.49it/s]
  1%|          | 11011/2157638 [00:00<00:35, 59750.56it/s]

108578


100%|██████████| 2157638/2157638 [00:03<00:00, 651184.55it/s]


108578

In [15]:
#save_obj(res_new, "freq_dict_train")  

# 4.2 The same for external data set

In [19]:
#1 hour
# Same counting as for the training file, repeated for every file of the
# external corpus and added on top of the training frequency dictionary.
# Sentence boundaries are marked here by "<eos>" rows instead of token ids.
test_set = load_obj("multi_seq_test")
res_new = load_obj("freq_dict_train") 
multi_words = load_obj("multi_meaning_words")

files = os.listdir(DATA_INPUT_PATH)
for file in tqdm(files):
    # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed
    # in 2.0 in favour of `on_bad_lines` — confirm against the installed version.
    df = pd.read_csv(DATA_INPUT_PATH+"/"+file, sep="\t", low_memory = False, error_bad_lines = False, quoting=3)
    # column 1 is used as the raw token, column 2 as the normalized form
    df.columns = [0,1,2]
    
    dp = df[df[1].isin(multi_words)] 
    
    before = df[1]
    after = df[2]
    
    # per-file accumulators
    big_set = set()
    big_dict = dict()
    
    for i in dp.index:
        arr = [before[i], after[i]]
        # '<self>' and 'sil' are replaced by the raw token itself
        if arr[1] == '<self>' or arr[1] == 'sil':
            arr[1] = arr[0]
        
        # lost / last = the two preceding tokens; an "<eos>" row or a failed
        # lookup yields ""
        try:
            if before[i-2]=="<eos>":
                lost = ""
            else:
                lost = before[i-2]
        except:
            lost = ""
        
        try:
            if before[i-1]=="<eos>":
                last = ""
            else:
                last = before[i-1]
        except:
            last = ""
            
        if last=="":
            lost=""
            
        # nex / nexx = the two following tokens, with the same rule
        try:
            if before[i+1]=="<eos>":
                nex = ""
            else:
                nex = before[i+1]
        except:
            nex = ""
        try:
            if before[i+2]=="<eos>":
                nexx = ""
            else:
                nexx = before[i+2]
        except:
            nexx = ""
        if nex=="":
            nexx=""
        
        # nine context keys, from the full window down to the bare token
        for key in [(lost, last, arr[0], nex, nexx),
                
                    (lost, last, arr[0], nex, None),
                    (None, last, arr[0], nex, nexx),

                    (lost, last, arr[0], None, None),
                    (None, last, arr[0], nex, None),
                    (None, None, arr[0], nex, nexx),

                    (None, last, arr[0], None, None), 
                    (None, None, arr[0], nex, None),

                    (None, None, arr[0], None, None)]:
            
            big_set.add(key)
            try:
                big_dict[(key, arr[1])]+= 1
            except:
                big_dict[(key, arr[1])] = 1
    
    # merge this file's counts for contexts that occur in the test set
    inter = big_set & test_set
    for key, arr in list(big_dict):
        if key in inter:
            insert_key(key, arr, big_dict[(key, arr)])
print(len(res_new))
#207820


100%|██████████| 100/100 [1:00:26<00:00, 44.39s/it]

207820





In [20]:
#save_obj(res_new, "freq_dict_all")

# 5. main loop

In [21]:
#4 min
# Main prediction loop: for every row of the test file choose a normalization
# (`after`) and record a numeric code (`reason`) telling which rule produced it.
#
# Changes relative to the previous version:
#   * the inner loop that glues digit groups used `i` as its index and so
#     overwrote the row index that is stored in `single_set` afterwards;
#     it now uses `j`;
#   * the dash rule read `before[i-2]` directly, which can fail for the first
#     two rows of the file; it now uses the already boundary-checked `lost`
#     (same result whenever the old expression did not fail).
res_upd  = load_obj("freq_dict_all")
s = load_obj("single_meaning_words")

multi_new = set()
multi_upd = list()
single_set = set()

df = pd.read_csv("input/ru_test_2.csv")

before = df.before
token = df.token_id
sentence = df.sentence_id

ides = list()
after = list()
reason = list()

for i in tqdm(range(len(df))):
    arr = before[i]
    i1 = sentence[i]
    i2 = token[i]
    
    ides.append(str(i1) + '_' + str(i2))
    
    # lost / last = the two preceding tokens, "" across a sentence boundary
    try:
        if token[i-1] == 0:
            lost = ""
        else:
            lost = before[i-2]
    except:
        lost = ""
        
    try:
        if token[i] == 0:
            last = ""
        else:
            last = before[i-1]
    except:
        last = ""
    
    if last=="":
        lost=""
        
    # nex / nexx = the two following tokens, "" across a sentence boundary
    try:
        if token[i+1]==0:
            nex = ""
        else:
            nex = before[i+1]
    except:
        nex = ""
        
    try:
        if token[i+2]==0:
            nexx = ""
        else:
            nexx = before[i+2]
    except:
        nexx = ""
    if nex=="":
        nexx=""
    
    #check if word has single meaning
    if arr in s: 
        tmp = s[arr]
        q=0
    #punctuation or short word with almost unique meaning 
    elif arr in punct or arr in short:
        tmp = arr
        q=1
    #special case of dash usage
    elif arr in dash:
        # "от X - Y": the dash is read as "до"
        if lost == "от":
            tmp = "до"
            q=3
        else:
            tmp = arr
            q=2
    #5 word sequences
    else:
        found = False
        q = 1
        # try the context keys from most to least specific
        for key in [(lost, last, arr, nex, nexx),
                
                    (lost, last, arr, nex, None),
                    (None, last, arr, nex, nexx),

                    (lost, last, arr, None, None),
                    (None, last, arr, nex, None),
                    (None, None, arr, nex, nexx),

                    (None, last, arr, None, None), 
                    (None, None, arr, nex, None),

                    (None, None, arr, None, None)
                   ]:

            if key in res_upd:
                srtd = sorted(res_upd[key].items(), key=operator.itemgetter(1), reverse=True)
                
                #in case of multiple meanings
                if len(srtd)>1:
                    
                    #check first date format 
                    tmp = space_date(arr)
                    if tmp !="" and last.lower() in space|{",","—",'('}:
                        found = True
                        q = -22
                        break
                    
                    #check second date format 
                    tmp = full_date(arr)
                    if tmp !="" and last.lower() in full:
                        found = True
                        q = -21
                        break
                    
                # otherwise take the most frequent normalization for this key
                tmp = srtd[0][0]
                found = True
                q*=10
                                
                break
            q+=1
        if not found:
            #NaN
            if str(arr) == "nan":
                tmp = "н_trans а_trans н_trans" #"n a" 
                q = 0
            else:
                #check first date format 
                if last.lower() in space|{",","—",'('}:
                    tmp = space_date(arr)
                    q = -1.2
                #check second date format 
                else:    
                    tmp = full_date(arr)
                    q = -1.1
                if tmp=="":
                    #check third date format 
                    ar = arr.replace(" г.", '').replace(" года", '')
                    tmp = dot_date(ar)
                    q=-10
                    if tmp=="":
                        #long number
                        if len(arr)>10 and arr.isnumeric():
                            tmp = " ".join([ch[x] for x in arr])
                            q=-7
                        else:
                            #$ at the beginning
                            if arr[0] == "$":
                                arr = arr[1:] + " $"
                            l = []
                            
                            #remove spaces between numbers
                            spl = arr.split(" ")
                            if len(spl)>1:
                                ar = ''
                                # `j`, not `i`: `i` is the row index used below
                                for j in range(len(spl)):
                                    ar += spl[j]
                                    if j+1 < len(spl):
                                        if not spl[j].isnumeric() or not spl[j+1].isnumeric():
                                            ar += " "
                                arr = ar
                            
                            #split into separate words
                            for word in arr.split(" "):
                                key = (None, None, word, None, None)
                                if key in res_upd:
                                    srtd = sorted(res_upd[key].items(), key=operator.itemgetter(1), reverse=True)
                                    t = srtd[0][0]
                                    q = -2
                                    if len(srtd)>1:
                                        p = space_date(word)
                                        if p !="" and last.lower() in space|{",","—",'('}:
                                            q = -24
                                            t = p

                                        p = full_date(arr)
                                        if p !="" and last.lower() in full:
                                            q = -23
                                            t = p
                                    
                                    l.append(t)
                                else:
                                    #custom endings
                                    if word in m:
                                        l.append(m[word])
                                    elif word.replace(".","") in m:
                                        l.append(m[word.replace(".","")])
                                    elif is_ascii(word):
                                        #eng word
                                        if word.isalpha():
                                            t = trans(word)
                                            l.append(t)
                                            q = -3
                                        else:
                                            try:
                                                #number
                                                l.append(num2words(word, lang='ru'))
                                                q = -4
                                            except:
                                                try:
                                                    l.append(num2words(word.translate(SUB).translate(SUP), lang='ru'))
                                                    q = -4
                                                except:
                                                    #number like 123-12314-52345
                                                    if (word.count("-")>0 and word.count(".")==0 
                                                        and word.count("/")==0 and word.count("(")==0
                                                        and word.count("B")==0):

                                                        l.append(dash_num(word.replace(" ","-")).replace("одна ",""))
                                                        q=-50
                                                    else:
                                                        l.append(word)
                                                        q = -5

                                    else:
                                        try:
                                            #number
                                            l.append(num2words(word, lang='ru'))
                                            q = -4
                                        except:
                                            #if nothing works leave the word as is
                                            l.append(word)
                                            q = -6
                            tmp = " ".join(l) 
                            single_set.add(((lost, last, arr, nex, nexx),i))

    #external set was inconsistent with train set
    if "_letter_latin" in tmp:
        t = tmp
        tmp = tmp.replace("_letter_latin", "_latin")
        q+=1000
        multi_new.add((t,tmp))
    
    # NOTE(review): raises IndexError if tmp is empty at this point
    if tmp[0] == " ":
        tmp = tmp[1:]
    
    #special case of url
    if q == -5:
        x = arr
        t = url(arr)
        if t!="":
            tmp = t
            q = -13
    
    after.append(tmp)
    reason.append(q)

 82%|████████▏ | 814453/989880 [03:03<00:55, 3151.24it/s]

0201
0201


 85%|████████▌ | 843498/989880 [03:08<00:28, 5120.76it/s]

701
701
701


 91%|█████████ | 898289/989880 [03:20<00:28, 3208.55it/s]

676
676


100%|██████████| 989880/989880 [03:39<00:00, 4505.69it/s]


# 6. Save prediction

In [25]:
#10 sec
# Write the submission file (only the id and after columns) and print the
# sizes of the diagnostic collections filled by the main loop.
dic = dict(id=ides, after=after, type=reason)
dr = pd.DataFrame(dic)
dr.to_csv(
    "baseline_ext_rus_my.csv",
    columns=["id", "after"],
    index=False,
    sep=",",
    quoting=csv.QUOTE_ALL,
)

collection_sizes = [len(multi_new), len(multi_upd), len(single_set)]
print(*collection_sizes)
#dr.type.value_counts()

18 0 11951


# 7. Compare to final solution

In [28]:
#1 sec
# Compare the fresh predictions with a previously saved submission and report
# when the two agree on every row.
dc = pd.read_csv("baseline_ext_rus_my_9926.csv")
dc.head()

dc["new"] = dr["after"]
mismatches = dc[dc["after"] != dc["new"]]
if not len(mismatches):
    print("no diff")

no diff
