In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import re
from data_print import data_class_display
from ukrainian_stemmer import UkrainianStemmer

In [None]:
# Load the speech-to-text dataset CSV into a DataFrame and print it
# (pandas abbreviates long frames when printing).
DATASET_CSV_PATH = '../input/ukrainian-ostotextdataset42-dataframe/Ukrainian_Open_Speech_To_Text Dataset.csv'
data = pd.read_csv(DATASET_CSV_PATH)
print(data)

In [None]:
# Distinct values of the 'kaggle_dataset' column, in order of first appearance.
kaggle_dataset_names = data['kaggle_dataset'].unique()
print('kaggle datasets:', kaggle_dataset_names)
# Identity mapping (value -> value); passed to data_class_display as its label mapping.
kaggle_datasets = {name: name for name in kaggle_dataset_names}

# Same for the 'dataset' column. The label now says 'datasets:' -- it used to
# repeat 'kaggle datasets:', which mislabelled this output.
dataset_names = data['dataset'].unique()
print('datasets:', dataset_names)
# Later cells iterate over this dict's keys to analyse each dataset separately.
datasets = {name: name for name in dataset_names}


In [None]:
# Call the project-local data_class_display helper once per label column,
# pairing each column with its value -> label mapping built above.
# NOTE(review): the helper presumably renders the class distribution -- confirm in data_print.
for class_column, labels in (('kaggle_dataset', kaggle_datasets), ('dataset', datasets)):
    data_class_display(dataframe=data, class_column=class_column, expl_lables=labels)

In [None]:
# Quote and apostrophe characters are deleted (not replaced by a space) so that a
# word written with an apostrophe collapses into a single alphabetic token.
# Besides the original set (' ’ " ` and U+FFFD) this also covers U+02BC and U+2018:
# U+02BC counts as a letter for str.isalpha(), so without removing it the same word
# typed with different apostrophes would produce different tokens.
_UA_APOSTROPHES_RE = re.compile(r"""['’"`�\u02BC\u2018]""")
# Cyrillic block or ASCII letter. The explicit A-Za-z replaces the former [A-z]
# range, which also matched the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
_UA_LETTER = r"[\u0400-\u04FFA-Za-z]"
_UA_DIGIT_THEN_LETTER_RE = re.compile(r"([0-9])(" + _UA_LETTER + r")")
_UA_LETTER_THEN_DIGIT_RE = re.compile(r"(" + _UA_LETTER + r")([0-9])")
_UA_SEPARATORS_RE = re.compile(r"""[\-.,:+*/_]""")


def ua_tokenizer(text, ua_stemmer=True, stop_words=None):
    """ Tokenizer for Ukrainian language, returns only alphabetic tokens.

    The text is cleaned (quotes/apostrophes removed, digits split from adjacent
    letters, common separators turned into spaces), tokenized with
    nltk.word_tokenize, and every purely alphabetic token is lower-cased,
    optionally stemmed, and kept unless it is a stop word.

    Keyword arguments:
    text -- text to tokenize; an empty or None value yields an empty list
    ua_stemmer -- if truthy use UkrainianStemmer for stemming words (default True)
    stop_words -- collection of stop words (default None, meaning no stop words);
                  tokens are compared AFTER lower-casing and stemming, so the
                  entries must be given in that same form

    Returns:
    list of tokens (str) in their order of appearance
    """
    if not text:
        return []

    # Lists/tuples are turned into a set once so that the per-token membership
    # test does not rescan the whole list; other containers are used as given.
    if stop_words is None:
        stop_lookup = frozenset()
    elif isinstance(stop_words, (list, tuple)):
        stop_lookup = frozenset(stop_words)
    else:
        stop_lookup = stop_words

    text = _UA_APOSTROPHES_RE.sub('', text)
    # Insert a space at digit/letter boundaries so "5кг" yields the token "кг".
    text = _UA_DIGIT_THEN_LETTER_RE.sub(r"\1 \2", text)
    text = _UA_LETTER_THEN_DIGIT_RE.sub(r"\1 \2", text)
    text = _UA_SEPARATORS_RE.sub(' ', text)

    tokens = []
    for token in nltk.word_tokenize(text):
        if not token.isalpha():
            continue
        token = token.lower()
        if ua_stemmer:
            token = UkrainianStemmer(token).stem_word()
        if token not in stop_lookup:
            tokens.append(token)
    return tokens



def ngrams_info(series,n=1,most_common=20,ua_stemmer=True,stop_words=[]):
    """ ngrams_info - Print n-gram statistics for a string pandas.Series and plot them.

    All values of the series are joined with single spaces, tokenized with
    ua_tokenizer, grouped into n-grams and counted with nltk.FreqDist.
    Nothing is returned; the results are printed and plotted.

    Keyword arguments:
    series -- pandas.Series object holding text
    n -- size of the n-grams to count (default 1)
    most_common -- how many of the most frequent n-grams to print and plot (default 20)
    ua_stemmer -- forwarded to ua_tokenizer: stem words with UkrainianStemmer (default True)
    stop_words -- forwarded to ua_tokenizer: list of stop words (default [])

    """
    print(n, '- grams')
    print('ua_stemmer:', ua_stemmer)

    joined_text = series.str.cat(sep=' ')
    # "Number of characters" in the joined text.
    print('Кількість символів: ', len(joined_text))

    tokens = ua_tokenizer(joined_text, ua_stemmer=ua_stemmer, stop_words=stop_words)
    freq_dist = nltk.FreqDist(nltk.ngrams(tokens, n))

    # "Number of tokens": total n-gram occurrences counted.
    print('Кількість токенів: ', freq_dist.N())
    # "Number of unique tokens": distinct n-grams.
    print('Кількість унікальних токенів: ', freq_dist.B())
    # "Most used tokens": the top `most_common` n-grams with their counts.
    print('Найбільш уживані токени: ', freq_dist.most_common(most_common))

    # Cumulative frequency plot limited to the top `most_common` n-grams.
    freq_dist.plot(most_common, cumulative=True)
# Unigram and bigram statistics over the whole 'text' column, without stemming.
for n in range(1, 3):
    ngrams_info(data['text'], n=n, ua_stemmer=False)

In [None]:
# Unigram statistics for each dataset separately (iterating the dict yields its keys).
for dataset in datasets:
    print(dataset)
    dataset_text = data.loc[data['dataset'] == dataset, 'text']
    ngrams_info(dataset_text, n=1, ua_stemmer=False)

In [None]:
# Bigram statistics for each dataset separately (iterating the dict yields its keys).
for dataset in datasets:
    print(dataset)
    dataset_text = data.loc[data['dataset'] == dataset, 'text']
    ngrams_info(dataset_text, n=2, ua_stemmer=False)