In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "ticks" theme globally so every matplotlib/seaborn figure below shares it.
sns.set_theme(style="ticks", color_codes=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from colorama import Fore

from pandas_profiling import ProfileReport
import seaborn as sns
from sklearn import metrics
from scipy import stats
import math

from tqdm.notebook import tqdm
from copy import deepcopy
from sklearn.preprocessing import LabelEncoder

In [None]:
# Notebook-wide colour palette (hex codes), shared by the plotly and matplotlib charts.
primary_blue = "#496595"
primary_blue2 = "#85a1c1"
primary_blue3 = "#3f4d63"
primary_grey = "#c6ccd8"
primary_black = "#202022"
primary_bgcolor = "#f4f0ea"

# Third entry of plotly's default qualitative colour sequence.
primary_green = px.colors.qualitative.Plotly[2]

# Give every matplotlib axes the palette's background colour.
plt.rcParams.update({'axes.facecolor': primary_bgcolor})

# Render the whole palette as a strip of swatches for a quick visual check.
colors = [
    primary_blue,
    primary_blue2,
    primary_blue3,
    primary_grey,
    primary_black,
    primary_bgcolor,
    primary_green,
]
sns.palplot(sns.color_palette(colors))

In [None]:
# Read the zipped competition CSVs into DataFrames.
train = pd.read_csv(
    '../input/text-normalization-challenge-russian-language/ru_train.csv.zip',
    encoding='utf-8',
)
test = pd.read_csv(
    '../input/text-normalization-challenge-russian-language/ru_test_2.csv.zip',
)

In [None]:
# Summarise the training frame: columns, dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Same summary for the test frame, to compare its columns against the training frame.
test.info()

In [None]:
# Display the training frame as the cell output (pandas truncates the rendering of large frames).
train

In [None]:
# Display the test frame as the cell output (pandas truncates the rendering of large frames).
test

# 1. Data visualization

In [None]:
# Count of training rows per `class` value, one coloured bar per class.
class_hist_layout = dict(
    title_text='Class distribution',  # figure title
    xaxis_title_text='Value',  # x-axis label
    yaxis_title_text='Count',  # y-axis label
    bargap=0.2,  # gap between neighbouring bars
    paper_bgcolor=primary_bgcolor,
    plot_bgcolor=primary_bgcolor,
)

fig = px.histogram(
    train,
    x='class',
    color='class',
    color_discrete_sequence=[primary_blue, primary_grey],
)
fig.update_layout(**class_hist_layout)
fig.show()

# 2. Missing data

In [None]:
# Percentage of missing values per column, keeping the (at most) six worst columns.
null_counts = train.isna().sum().sort_values(ascending=False)
nan_data = (null_counts / len(train) * 100)[:6]

fig, ax = plt.subplots(figsize=(7, 5))
bar_width = 0.6

# Grey full-height bars act as the 100 % backdrop behind the real values.
ax.bar(nan_data.index, 100, width=bar_width, color=primary_grey)

# Blue bars show the actual null percentage and carry the numeric labels.
bar = ax.bar(nan_data.index, nan_data, width=bar_width, color=primary_blue)
ax.bar_label(bar, fmt='%.01f %%')

# The bar labels replace the y axis, so hide its spine and ticks.
ax.spines['left'].set_visible(False)
ax.set_yticks([])
ax.set_title('Null Data Ratio', fontweight='bold')

plt.show()

# 3. Text normalization

In [None]:
!pip install num2words

In [None]:
import gc
import operator
import os

from num2words import num2words

# --- Configuration ----------------------------------------------------------
INPUT_PATH = r'../input/text-normalization-challenge-russian-language'
DATA_INPUT_PATH = r'../input/ru-with-types/ru_with_types'
SUBM_PATH = INPUT_PATH

TRAIN_CSV_PATH = '../input/ru-train/ru_train.csv'
TEST_CSV_PATH = '../input/ru-test/ru_test/ru_test.csv'
SUBMISSION_PATH = './submission6.csv'

# Language passed to num2words; without it the library spells numbers in English.
NUM2WORDS_LANG = 'ru'

# Translation tables mapping subscript / superscript / Ethiopic digits to ASCII digits.
SUB = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")
SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")
OTH = str.maketrans("፬", "4")


def _add_pair(table, before, after):
    """Increment the occurrence count of the ``before -> after`` mapping in ``table``.

    ``table`` is a dict of dicts: ``table[before][after]`` is the number of times
    ``before`` was seen normalised to ``after``.
    """
    variants = table.setdefault(before, {})
    variants[after] = variants.get(after, 0) + 1


def _spell_number(token):
    """Return ``token`` spelled out in words, or ``None`` if it cannot be converted.

    ``token`` is expected to satisfy ``str.isdigit()``. Sub/superscript digits are
    first mapped to ASCII. ``str.isdigit()`` also accepts characters that ``int()``
    rejects (e.g. circled digits), hence the ``ValueError`` fallback.
    """
    ascii_digits = token.translate(SUB).translate(SUP).translate(OTH)
    try:
        # int, not float: the token holds digits only, and a float would be
        # treated as a decimal fraction by some num2words language back-ends.
        return num2words(int(ascii_digits), lang=NUM2WORDS_LANG)
    except (ValueError, OverflowError):
        # NOTE(review): exception types for out-of-range values may differ per
        # num2words back-end -- confirm for 'ru'.
        return None


def _normalize_token(token):
    """Normalise one test token.

    Returns a ``(text, changed)`` tuple: ``text`` is what goes into the ``after``
    column and ``changed`` tells whether the token counts towards the "Changed"
    statistic. Lookup order: learned dictionary ``res``, then number spelling,
    then word-by-word handling of multi-word tokens, else the token unchanged.
    """
    if token in res:
        # Most frequent normalisation seen in training; on ties the first one
        # inserted wins (same result as a stable descending sort).
        best_after = max(res[token].items(), key=operator.itemgetter(1))[0]
        return best_after, True

    # Join "123,456"-style digit groups into a single number.
    # NOTE(review): in Russian a comma is usually the decimal separator, so
    # reading "1,5" as 15 is questionable -- kept as in the original logic.
    candidate = token
    parts = token.split(',')
    if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
        candidate = ''.join(parts)

    if candidate.isdigit():
        spelled = _spell_number(candidate)
        if spelled is None:
            return token, False
        return spelled, True

    if ' ' in candidate:
        words = candidate.split(' ')
        for i, word in enumerate(words):
            if word.isdigit():
                spelled = _spell_number(word)
                if spelled is not None:
                    words[i] = spelled
            elif word in sdict:
                words[i] = sdict[word]
        return ' '.join(words), True

    return candidate, False


# --- Build the before -> after frequency table from the training data -------
print('Train start...')

file = "ru_train.csv"
res = dict()
total = 0
not_same = 0
# File handles get their own names so the `train` / `test` DataFrames loaded
# earlier in the notebook are not overwritten.
with open(TRAIN_CSV_PATH, encoding='UTF8') as train_file:
    train_file.readline()  # skip the header row
    for raw_line in train_file:
        line = raw_line.strip()
        if not line:
            continue  # a blank line is not the end of the file
        total += 1
        # Rows are parsed by hand; assumes the layout id,id,"class","before","after".
        pos = line.find('","')
        text = line[pos + 2:]
        if text[:3] == '","':
            # `before` starts with a comma, which would break the split below.
            continue
        arr = text[1:-1].split('","')
        if len(arr) < 2:
            continue  # malformed row
        if arr[0] != arr[1]:
            not_same += 1
        _add_pair(res, arr[0], arr[1])
print(file + ':\tTotal: {} Have diff value: {}'.format(total, not_same))

# Extra tab-separated training data: class<TAB>before<TAB>after, with <eos> marker rows.
files = os.listdir(DATA_INPUT_PATH)
for file in files:
    with open(os.path.join(DATA_INPUT_PATH, file), encoding='UTF8') as typed_file:
        for raw_line in typed_file:
            line = raw_line.strip()
            if not line:
                continue
            total += 1
            arr = line[line.find('\t') + 1:].split('\t')
            if arr[0] == '<eos>' or len(arr) < 2:
                continue
            if arr[1] != '<self>':
                not_same += 1
            # '<self>' and 'sil' both mean "keep the token as it is".
            if arr[1] == '<self>' or arr[1] == 'sil':
                arr[1] = arr[0]
            _add_pair(res, arr[0], arr[1])
    print(file + ':\tTotal: {} Have diff value: {}'.format(total, not_same))
    gc.collect()

# Unit abbreviations (Cyrillic and Latin spellings) expanded inside multi-word tokens.
sdict = {
    'км²': 'квадратных километров',
    'км2': 'квадратных километров',
    'km²': 'квадратных километров',
    'км': 'километров',
    'km': 'километров',
    'кг': 'килограмм',
    'kg': 'килограмм',
    'm²': 'квадратных метров',
    'м²': 'квадратных метров',
}

# --- Write the submission ---------------------------------------------------
total = 0
changes = 0
with open(SUBMISSION_PATH, "w", encoding='UTF8') as out:
    out.write('"id","after"\n')
    with open(TEST_CSV_PATH, encoding='UTF8') as test_file:
        test_file.readline()  # skip the header row
        for raw_line in test_file:
            line = raw_line.strip()
            if not line:
                continue
            # Assumes rows of the form sentence_id,token_id,"before".
            fields = line.split(',', 2)
            if len(fields) != 3:
                continue  # malformed row
            sentence_id, token_id, quoted = fields
            # Strip the surrounding quotes; the field is otherwise written back
            # verbatim, so CSV escaping inside it passes through unchanged.
            after, changed = _normalize_token(quoted[1:-1])
            out.write('"' + sentence_id + '_' + token_id + '","' + after + '"\n')
            if changed:
                changes += 1
            total += 1

print('Total: {} Changed: {}'.format(total, changes))