# Подготовка данных для ASR

## Импорты

In [1]:
import re

import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from torch.utils.data import Dataset

from dataset import IPS1ASRDataset
from utils import clean_text

## Датасеты

In [2]:
# Instantiate the dataset wrapper once per split directory (train / valid).
ips_dataset_train = IPS1ASRDataset('../tatar_tts/train/')
ips_dataset_valid = IPS1ASRDataset('../tatar_tts/valid/')

In [3]:
def from_dataset_to_df(dataset: Dataset) -> pd.DataFrame:
    """Collect per-item text statistics from a dataset into a DataFrame.

    For every index in ``range(len(dataset))`` the function calls
    ``dataset.get_metadata(index)``, passes element ``[2]`` through
    ``clean_text`` and records the cleaned text together with its character
    count and whitespace-separated word count.

    Args:
        dataset: Object exposing ``__len__`` and ``get_metadata(index)``.
            Element ``[0]`` of the metadata is handled as a ``/``-separated
            path and element ``[2]`` as the text to clean (presumably the
            audio file path and its transcript -- TODO confirm against
            ``IPS1ASRDataset``).

    Returns:
        DataFrame indexed by ``id`` with columns ``text``, ``len`` and
        ``word_count``. An empty dataset yields an empty frame with the same
        index name and columns instead of raising ``KeyError``.
    """
    # Naming the columns up front keeps the layout stable even when the
    # dataset is empty and no record is produced.
    columns = ['id', 'text', 'len', 'word_count']
    records = []
    for index in range(len(dataset)):
        item = dataset.get_metadata(index)
        text = clean_text(item[2])
        records.append({
            # File name without its last four characters, i.e. a
            # three-letter extension plus the dot is assumed.
            'id': str(item[0].split('/')[-1][:-4]),
            'text': text,
            'len': len(text),
            'word_count': len(text.split()),
        })
    return pd.DataFrame(records, columns=columns).set_index('id')

In [4]:
# Build the text-statistics frame (text, len, word_count indexed by id) for each split.
df_train = from_dataset_to_df(ips_dataset_train)
df_valid = from_dataset_to_df(ips_dataset_valid)

FileNotFoundError: [Errno 2] No such file or directory: '../tatar_tts/train/331.26.txt'

In [None]:
df_train.head()

In [None]:
df_valid.head()

## Исследование данных

### Длина текста

#### value_counts

In [None]:
df_train['len'].value_counts()

In [None]:
df_train['word_count'].value_counts()

#### Гистограммы

In [None]:
# Trailing semicolon keeps the Axes repr out of the cell output.
df_train['len'].hist(bins=25);

In [None]:
# Trailing semicolon keeps the Axes repr out of the cell output.
df_train['word_count'].hist(bins=25);

#### Ящик с усами

In [None]:
# Trailing semicolon keeps the Axes repr out of the cell output.
df_train.boxplot(column='len');

In [None]:
# Trailing semicolon keeps the Axes repr out of the cell output.
df_train.boxplot(column='word_count');

#### describe

In [None]:
df_train['word_count'].describe()

### Вывод

In [None]:
df_train.loc[(2 < df_train['word_count']) & (df_train['word_count'] < 11)].shape[0] / df_train.shape[0] * 100

Удалим 10% данных, которые являются выбросами по количеству слов: оставим только тексты длиной от 3 до 10 слов.

In [None]:
# Keep only texts with 3..10 words (strict bounds 2 and 11 on an integer count).
# .copy() detaches the result from the original frame so that columns added in
# later cells do not trigger pandas' SettingWithCopyWarning.
df_train = df_train.loc[(2 < df_train['word_count']) & (df_train['word_count'] < 11)].copy()
df_valid = df_valid.loc[(2 < df_valid['word_count']) & (df_valid['word_count'] < 11)].copy()

In [None]:
# Persist the filtered frames; to_csv writes the 'id' index as the first column by default.
df_train.to_csv('../tatar_tts/train.csv')
df_valid.to_csv('../tatar_tts/valid.csv')

In [None]:
df_train.loc['331.90']

### Цифры

In [None]:
def get_digits(text):
    """Report whether ``text`` is free of digits.

    Note the polarity: the result is ``True`` when NO digit occurs in
    ``text`` and ``False`` as soon as at least one digit is present.
    """
    first_digit = re.search(r'\d', text)
    return first_digit is None

In [None]:
# get_digits returns True for digit-free text and False otherwise.
assert get_digits('бөгелеп төшмәве'), 'Неверная работа функции get_digits'
assert not get_digits('бөгелеп төшмәве 1'), 'Неверная работа функции get_digits'
assert not get_digits('1 бөгелеп төшмәве 2'), 'Неверная работа функции get_digits'

In [None]:
# 'numbers' is True for rows WITHOUT digits (see get_digits), False otherwise.
df_train['numbers'] = df_train['text'].apply(get_digits)
df_valid['numbers'] = df_valid['text'].apply(get_digits)

In [None]:
df_train.loc[df_train['numbers'] == False].head()

In [None]:
df_valid.loc[df_valid['numbers'] == False].head()

In [None]:
df_train.loc[df_train['numbers'] == False].shape[0]

Найдено 33 строки в тренировочном наборе данных, в которых встречаются цифры.

### Специальные символы

In [None]:
def get_char_dijits(text):
    """Report whether ``text`` is free of special characters.

    Returns ``True`` when none of the characters of ``text`` belongs to the
    special-character set below, and ``False`` if at least one does.
    """
    special_characters = '@#$%^&*()-+_=<>/\'":;[]{}\\|~`!?,.'
    return not any(symbol in special_characters for symbol in text)

In [None]:
# get_char_dijits returns True for text without special characters and False
# otherwise; the failure message names the function actually under test.
assert get_char_dijits('бөгелеп төшмәве'), 'Неверная работа функции get_char_dijits'
assert not get_char_dijits('бөгелеп төшмәве ""'), 'Неверная работа функции get_char_dijits'
assert not get_char_dijits('1 бөгелеп төшмәве ..\\||'), 'Неверная работа функции get_char_dijits'

In [None]:
# 'char_dijits' is True for rows WITHOUT special characters (see get_char_dijits).
df_train['char_dijits'] = df_train['text'].apply(get_char_dijits)
df_valid['char_dijits'] = df_valid['text'].apply(get_char_dijits)

In [None]:
df_train.loc[df_train['char_dijits'] == False].head()

In [None]:
df_train.loc[df_train['char_dijits'] == False].shape[0]

In [None]:
df_valid.loc[df_valid['char_dijits'] == False].head()

In [None]:
# Count validation rows containing special characters; the mask must come from
# df_valid itself, since a df_train mask has a different index.
df_valid.loc[df_valid['char_dijits'] == False].shape[0]

Мы разрешаем оставить в данных символы `!`, `?` и `,`. Остальные специальные символы нужно будет удалить из датасета.