In [1]:
from linguaf import descriptive_statistics as ds
from linguaf import syntactical_complexity as sc
from linguaf import lexical_diversity as ld
from linguaf import readability as r

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Load the two question corpora and parse their timestamp columns.
human = pd.read_csv("../data/covid-rus-dataset - human.csv")
human["message_date"] = pd.to_datetime(human["message_date"])

chatbot = pd.read_csv("../data/covid-rus-dataset - chatbot.csv")
chatbot["date"] = pd.to_datetime(chatbot["date"])

# Keep only rows whose question is present and longer than two characters.
# .copy() detaches each filtered frame from the frame it was sliced from, so the
# column assignment below writes to an independent frame and does not trigger
# pandas' SettingWithCopyWarning.
human = human[human["question"].notna() & (human["question"].str.len() > 2)].copy()
chatbot = chatbot[chatbot["question"].notna() & (chatbot["question"].str.len() > 2)].copy()

# Every later cell passes the questions to linguaf as plain Python strings.
human["question"] = human["question"].astype(str)
chatbot["question"] = chatbot["question"].astype(str)

In [3]:
# Preview the first five rows of the human-operator corpus.
# NOTE(review): the rendered preview includes name and nickname columns —
# confirm they are safe to publish before sharing the notebook.
human.head(n=5)

Unnamed: 0,chat_id,user_id,message_date,first_name,second_name,nickname,lang,question,answer,Unnamed: 9
0,1.0,1.0,2021-03-16 00:00:00,Н,К,1.0,ru,Привет!,Доброго времени суток! Отправьте мне вопрос в ...,
1,1.0,1.0,2021-03-16 20:18:00,Н,К,1.0,ru,Какие основные симптомы у вируса?,К наиболее распространенным симптомам COVID-19...,
2,1.0,1.0,2021-03-16 20:20:00,Н,К,1.0,ru,Можно ли переболеть и не понять что болел?,Теоретически человек может заболеть COVID-19 в...,
3,1.0,1.0,2021-03-16 20:22:00,Н,К,1.0,ru,"Куда обращаться если вызвал врача, а он не при...","Если есть подозрения на COVID-19, необходимо п...",
4,1.0,1.0,2021-03-16 20:23:00,Н,К,1.0,ru,Какой срок выздоровления?,Основная фаза COVID-19 в легкой форме протекае...,


In [4]:
# Preview the first five rows of the chatbot corpus.
# NOTE(review): the rendered preview includes user ids, names and nicknames —
# confirm they are safe to publish before sharing the notebook.
chatbot.head(n=5)

Unnamed: 0,chat_id,user_id,date,name,surname,nickname,language,question,answer
0,1001052349,1001052349,2021-03-19 15:24:55+00:00,Cro,Vin,Crovine,ru,Помогает маска?,Использование одноразовой медицинской маски сн...
1,1001052349,1001052349,2021-03-19 15:25:20+00:00,Cro,Vin,Crovine,ru,Безопасна ли вакцина?,"В вакцине содержится часть вируса - антиген, о..."
2,1001052349,1001052349,2021-04-14 15:31:44+00:00,Cro,Vin,Crovine,ru,Какие вакцины используются сегодня?,"Эксперты ВОЗ говорили, что будут считать успеш..."
3,1001052349,1001052349,2021-04-14 15:32:09+00:00,Cro,Vin,Crovine,ru,Перечень вакцин?,"Эксперты ВОЗ говорили, что будут считать успеш..."
4,1001052349,1001052349,2021-04-14 15:33:05+00:00,Cro,Vin,Crovine,ru,Если маску не менять это опасно?,"Не совсем понимаю, о чём ты."


In [5]:
# Number of distinct user ids, shown as (human, chatbot).
tuple(corpus["user_id"].nunique() for corpus in (human, chatbot))

(35, 68)

In [6]:
# Number of distinct chat ids, shown as (human, chatbot).
tuple(corpus["chat_id"].nunique() for corpus in (human, chatbot))

(35, 68)

In [7]:
# Earliest and latest timestamp of each corpus: human first, then chatbot.
(
    human["message_date"].min(),
    human["message_date"].max(),
    chatbot["date"].min(),
    chatbot["date"].max(),
)

(Timestamp('2021-03-16 00:00:00'),
 Timestamp('2021-06-29 18:59:00'),
 Timestamp('2021-03-17 13:57:01+0000', tz='UTC'),
 Timestamp('2021-07-29 14:30:19+0000', tz='UTC'))

In [7]:
# Mean number of questions per user in the human-operator corpus.
human.groupby("user_id")["question"].count().mean()

9.428571428571429

In [8]:
# Mean number of questions per user in the chatbot corpus.
chatbot.groupby("user_id")["question"].count().mean()

5.264705882352941

## Descriptive Statistics

In [9]:
# Corpus-level ds.char_count over every question, shown as (human, chatbot).
tuple(ds.char_count(corpus.question.tolist()) for corpus in (human, chatbot))

(13145, 7017)

In [10]:
# Corpus-level ds.letter_count over every question, shown as (human, chatbot).
tuple(ds.letter_count(corpus.question.tolist()) for corpus in (human, chatbot))

(12700, 6764)

In [11]:
# Corpus-level ds.digit_count over every question, shown as (human, chatbot).
tuple(ds.digit_count(corpus.question.tolist()) for corpus in (human, chatbot))

(18, 35)

In [12]:
# Corpus-level ds.punctuation_count over every question, shown as (human, chatbot).
tuple(ds.punctuation_count(corpus.question.tolist()) for corpus in (human, chatbot))

(427, 218)

In [13]:
# Corpus-level ds.sentence_count over every question, shown as (human, chatbot).
tuple(ds.sentence_count(corpus.question.tolist()) for corpus in (human, chatbot))

(356, 369)

In [14]:
# Corpus-level ds.syllable_count (Russian) over every question, shown as (human, chatbot).
tuple(
    ds.syllable_count(corpus.question.tolist(), lang='ru')
    for corpus in (human, chatbot)
)

(4259, 2193)

In [15]:
# Empty per-question accumulators for the descriptive statistics,
# one list per metric and per corpus (hum_* = human, bot_* = chatbot).
hum_avg_wrd_per_sentence = []
hum_avg_sent_len = []
hum_avg_word_len = []
hum_avg_syl_per_wrd = []

bot_avg_wrd_per_sentence = []
bot_avg_sent_len = []
bot_avg_word_len = []
bot_avg_syl_per_wrd = []

In [16]:
# Per-question descriptive statistics: each question is handed to linguaf as a
# one-document list so that every metric is computed for that question alone.
#
# The accumulators are re-created here rather than relying on lists created in
# another cell: re-running this cell then starts from empty lists instead of
# appending a second copy of every row.
hum_avg_wrd_per_sentence, hum_avg_sent_len = [], []
hum_avg_word_len, hum_avg_syl_per_wrd = [], []

bot_avg_wrd_per_sentence, bot_avg_sent_len = [], []
bot_avg_word_len, bot_avg_syl_per_wrd = [], []

for q in human.question.tolist():
    hum_avg_wrd_per_sentence.append(ds.avg_words_per_sentence([q], lang='ru'))
    hum_avg_sent_len.append(ds.avg_sentence_length([q]))
    hum_avg_word_len.append(ds.avg_word_length([q], lang='ru'))
    hum_avg_syl_per_wrd.append(ds.avg_syllable_per_word([q], lang='ru'))

for q in chatbot.question.tolist():
    bot_avg_wrd_per_sentence.append(ds.avg_words_per_sentence([q], lang='ru'))
    bot_avg_sent_len.append(ds.avg_sentence_length([q]))
    bot_avg_word_len.append(ds.avg_word_length([q], lang='ru'))
    bot_avg_syl_per_wrd.append(ds.avg_syllable_per_word([q], lang='ru'))

In [17]:
# Assemble the per-question descriptive statistics into one frame per corpus;
# each column is one metric, each row one question.
ds_human_df = pd.DataFrame(
    data={
        'avg_words_per_sentence': hum_avg_wrd_per_sentence,
        'avg_sentence_len': hum_avg_sent_len,
        'avg_word_len': hum_avg_word_len,
        'avg_syllable_per_word': hum_avg_syl_per_wrd,
    }
)

ds_chatbot_df = pd.DataFrame(
    data={
        'avg_words_per_sentence': bot_avg_wrd_per_sentence,
        'avg_sentence_len': bot_avg_sent_len,
        'avg_word_len': bot_avg_word_len,
        'avg_syllable_per_word': bot_avg_syl_per_wrd,
    }
)

In [18]:
# Persist the descriptive-statistics frames as comma-separated files without the index.
ds_human_df.to_csv(path_or_buf="../data/ds_human.csv", index=False, sep=',')
ds_chatbot_df.to_csv(path_or_buf="../data/ds_bot.csv", index=False, sep=',')

## Syntactical Complexity

In [19]:
# Empty per-question accumulators for the mean dependency distance.
hum_mdd = []
bot_mdd = []

In [20]:
# Mean dependency distance per question; each question is passed to linguaf as
# a one-document list.
#
# The lists are built in one step instead of being appended to: re-running this
# cell therefore replaces the results rather than duplicating every row.
hum_mdd = [
    sc.mean_dependency_distance([q], lang='ru')
    for q in tqdm(human.question.tolist())
]

bot_mdd = [
    sc.mean_dependency_distance([q], lang='ru')
    for q in tqdm(chatbot.question.tolist())
]

  0%|          | 0/330 [00:00<?, ?it/s]

  0%|          | 0/358 [00:00<?, ?it/s]

In [21]:
# Wrap the per-question mean dependency distances in single-column frames.
sc_human_df = pd.DataFrame(data={'mean_dependency_distance': hum_mdd})

sc_chatbot_df = pd.DataFrame(data={'mean_dependency_distance': bot_mdd})

In [22]:
# Persist the syntactical-complexity frames as comma-separated files without the index.
sc_human_df.to_csv(path_or_buf="../data/sc_human.csv", index=False, sep=',')
sc_chatbot_df.to_csv(path_or_buf="../data/sc_bot.csv", index=False, sep=',')

## Lexical Diversity

In [23]:
# Empty per-question accumulators for the lexical-diversity metrics,
# one list per metric and per corpus (hum_* = human, bot_* = chatbot).
hum_lexical_density = []
hum_log_type_token_ratio = []
hum_root_type_token_ratio = []
hum_type_token_ratio = []
hum_summer_index = []

bot_lexical_density = []
bot_log_type_token_ratio = []
bot_root_type_token_ratio = []
bot_type_token_ratio = []
bot_summer_index = []

In [25]:
# Per-question lexical-diversity metrics; each question is passed to linguaf as
# a one-document list.
#
# The accumulators are re-created here rather than relying on lists created in
# another cell: re-running this cell then starts from empty lists instead of
# appending a second copy of every row.
#
# NOTE(review): log_type_token_ratio is called without lang, unlike the other
# metrics in this cell — confirm the library default is appropriate for Russian.
hum_lexical_density = []
hum_log_type_token_ratio = []
hum_root_type_token_ratio = []
hum_type_token_ratio = []
hum_summer_index = []

bot_lexical_density = []
bot_log_type_token_ratio = []
bot_root_type_token_ratio = []
bot_type_token_ratio = []
bot_summer_index = []

for q in tqdm(human.question.tolist()):
    hum_lexical_density.append(ld.lexical_density([q], lang='ru'))
    hum_log_type_token_ratio.append(ld.log_type_token_ratio([q]))
    hum_root_type_token_ratio.append(ld.root_type_token_ratio([q], lang='ru'))
    hum_type_token_ratio.append(ld.type_token_ratio([q], lang='ru'))
    hum_summer_index.append(ld.summer_index([q], lang='ru'))

for q in tqdm(chatbot.question.tolist()):
    bot_lexical_density.append(ld.lexical_density([q], lang='ru'))
    bot_log_type_token_ratio.append(ld.log_type_token_ratio([q]))
    bot_root_type_token_ratio.append(ld.root_type_token_ratio([q], lang='ru'))
    bot_type_token_ratio.append(ld.type_token_ratio([q], lang='ru'))
    bot_summer_index.append(ld.summer_index([q], lang='ru'))

  0%|          | 0/330 [00:00<?, ?it/s]

  0%|          | 0/358 [00:00<?, ?it/s]

In [26]:
# Assemble the per-question lexical-diversity metrics into one frame per corpus;
# each column is one metric, each row one question.
ld_human_df = pd.DataFrame(
    data={
        'lexical_density': hum_lexical_density,
        'log_type_token_ratio': hum_log_type_token_ratio,
        'root_type_token_ratio': hum_root_type_token_ratio,
        'type_token_ratio': hum_type_token_ratio,
        'summer_index': hum_summer_index,
    }
)

ld_chatbot_df = pd.DataFrame(
    data={
        'lexical_density': bot_lexical_density,
        'log_type_token_ratio': bot_log_type_token_ratio,
        'root_type_token_ratio': bot_root_type_token_ratio,
        'type_token_ratio': bot_type_token_ratio,
        'summer_index': bot_summer_index,
    }
)

In [27]:
# Persist the lexical-diversity frames as comma-separated files without the index.
ld_human_df.to_csv(path_or_buf="../data/ld_human.csv", index=False, sep=',')
ld_chatbot_df.to_csv(path_or_buf="../data/ld_bot.csv", index=False, sep=',')

## Readability

In [28]:
# Empty per-question accumulators for the readability metrics,
# one list per metric and per corpus (hum_* = human, bot_* = chatbot).
hum_automated_readability_index = []
hum_automated_readability_index_simple = []
hum_coleman_readability = []
hum_easy_listening = []
hum_flesch_kincaid_grade = []
hum_flesch_reading_ease = []

bot_automated_readability_index = []
bot_automated_readability_index_simple = []
bot_coleman_readability = []
bot_easy_listening = []
bot_flesch_kincaid_grade = []
bot_flesch_reading_ease = []

In [29]:
# Per-question readability metrics; each question is passed to linguaf as a
# one-document list.
#
# The accumulators are re-created here rather than relying on lists created in
# another cell: re-running this cell then starts from empty lists instead of
# appending a second copy of every row.
#
# NOTE(review): automated_readability_index_simple is called without lang,
# unlike the other metrics in this cell — confirm the library default is
# appropriate for Russian.
hum_automated_readability_index = []
hum_automated_readability_index_simple = []
hum_coleman_readability = []
hum_easy_listening = []
hum_flesch_kincaid_grade = []
hum_flesch_reading_ease = []

bot_automated_readability_index = []
bot_automated_readability_index_simple = []
bot_coleman_readability = []
bot_easy_listening = []
bot_flesch_kincaid_grade = []
bot_flesch_reading_ease = []

for q in tqdm(human.question.tolist()):
    hum_automated_readability_index.append(r.automated_readability_index([q], lang='ru'))
    hum_automated_readability_index_simple.append(r.automated_readability_index_simple([q]))
    hum_coleman_readability.append(r.coleman_readability([q], lang='ru'))
    hum_easy_listening.append(r.easy_listening([q], lang='ru'))
    hum_flesch_kincaid_grade.append(r.flesch_kincaid_grade([q], lang='ru'))
    hum_flesch_reading_ease.append(r.flesch_reading_ease([q], lang='ru'))

for q in tqdm(chatbot.question.tolist()):
    bot_automated_readability_index.append(r.automated_readability_index([q], lang='ru'))
    bot_automated_readability_index_simple.append(r.automated_readability_index_simple([q]))
    bot_coleman_readability.append(r.coleman_readability([q], lang='ru'))
    bot_easy_listening.append(r.easy_listening([q], lang='ru'))
    bot_flesch_kincaid_grade.append(r.flesch_kincaid_grade([q], lang='ru'))
    bot_flesch_reading_ease.append(r.flesch_reading_ease([q], lang='ru'))

  0%|          | 0/330 [00:00<?, ?it/s]

  0%|          | 0/358 [00:00<?, ?it/s]

In [30]:
# Assemble the per-question readability metrics into one frame per corpus;
# each column is one metric, each row one question.
r_human_df = pd.DataFrame(
    data={
        'automated_readability_index': hum_automated_readability_index,
        'automated_readability_index_simple': hum_automated_readability_index_simple,
        'coleman_readability': hum_coleman_readability,
        'easy_listening': hum_easy_listening,
        'flesch_kincaid_grade': hum_flesch_kincaid_grade,
        'flesch_reading_ease': hum_flesch_reading_ease,
    }
)

r_chatbot_df = pd.DataFrame(
    data={
        'automated_readability_index': bot_automated_readability_index,
        'automated_readability_index_simple': bot_automated_readability_index_simple,
        'coleman_readability': bot_coleman_readability,
        'easy_listening': bot_easy_listening,
        'flesch_kincaid_grade': bot_flesch_kincaid_grade,
        'flesch_reading_ease': bot_flesch_reading_ease,
    }
)

In [31]:
# Persist the readability frames as comma-separated files without the index.
r_human_df.to_csv(path_or_buf="../data/r_human.csv", index=False, sep=',')
r_chatbot_df.to_csv(path_or_buf="../data/r_bot.csv", index=False, sep=',')