## Загрузка библиотек

In [193]:
import os
import sys
import numpy as np
import pandas as pd
from collections import Counter
from pprint import pprint
import re
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

## Объявление функций

In [194]:
# предобработка
def preprocess_transc (text): 
    text = re.sub(r'[\!\?]', r'.', text)
    text = re.sub(r'\.\.\.', r'.', text)
    return text

In [195]:
# подсчет количества предложений
def count_sentenies(text):
    text = preprocess_transc(text)
    return text.count('.') + 1 if text else 0

In [197]:
# создание ngram
def get_ngrams(text, n):
    ngrams = []
    if text:
        for sentence in text.split('.'):
            words = re.findall(r'\w+[-]?\w*', sentence)
            while True:
                a, b = tee(words)
                l = tuple(islice(a, n))
                if len(l) == n:
                    ngrams.append('_'.join(l))
                    next(b)
                    words = b
                else:
                    break
    return ngrams

In [196]:
# подсчет частей речи
def count_pos(words):
    return dict(Counter([str(morph.parse(word)[0].tag.POS) for word in words])) if words else {}

## Загрузка корпусов

In [198]:
nds_df = pd.read_excel("NDS.xlsx", encoding="utf-8")
pic_df = pd.read_csv("Pic.csv", encoding="utf-8")
sib_df = pd.read_csv("Sib.csv", encoding="utf-8")

In [201]:
# Берем только текст, предобрабатываем и соединяем в одну строку
#nds = list(nds_df['Text'])
#pic=list(pic_df['Text'])
#sib=list(sib_df['Text'])
nds_gs = " ".join(str(x) for x in list(nds_df['Text']))
pic_gs = " ".join(str(x) for x in list(pic_df['Text']))
sib_gs = " ".join(str(x) for x in list(sib_df['Text']))

In [202]:
# подсчет количества предложений
print(count_sentencies(nds_gs))
print(count_sentencies(pic_gs))
print(count_sentencies(sib_gs))

215
378
416


In [203]:
c = count_pos(get_ngrams(nds_gs,1))
c_df = pd.DataFrame(c, index = ['nds'])
p = count_pos(get_ngrams(pic_gs,1))
p_df = pd.DataFrame(p, index = ['pic'])
s = count_pos(get_ngrams(sib_gs,1))
s_df = pd.DataFrame(s, index = ['sib'])

In [204]:
# создаем датафрейм с распределением по частям речи
s_p_df = pd.merge(s_df,p_df, how='outer')
df_merged = pd.merge(s_p_df,c_df, how='outer')
df_merged = df_merged.rename(index={0:'sib',1:'pic',2:'nds'})
df_pos = df_merged.T

In [205]:
df_pos

Unnamed: 0,sib,pic,nds
NPRO,391,358,383
VERB,735,625,430
NOUN,1106,1091,611
CONJ,499,383,390
INTJ,237,103,68
PRCL,408,237,222
PREP,464,392,265
ADVB,398,338,276
ADJF,362,234,164
INFN,109,142,43


In [206]:
# добавляем столбцы с суммой слов трех датасетов по частям речи 
# и процент от общего числа слов 
df_pos['all']= df_pos.sum(axis=1)
df_pos['percentage'] =  df_pos['all']/df_pos['all'].sum(axis=0)

In [207]:
df_pos

Unnamed: 0,sib,pic,nds,all,percentage
NPRO,391,358,383,1132,0.094935
VERB,735,625,430,1790,0.150117
NOUN,1106,1091,611,2808,0.235491
CONJ,499,383,390,1272,0.106676
INTJ,237,103,68,408,0.034217
PRCL,408,237,222,867,0.07271
PREP,464,392,265,1121,0.094012
ADVB,398,338,276,1012,0.084871
ADJF,362,234,164,760,0.063737
INFN,109,142,43,294,0.024656


In [208]:
df_pos.to_csv('pos_stats_gs.csv')