In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import operator
import os
import re
import warnings
from nltk.corpus import stopwords
from PIL import Image  
from pymystem3 import Mystem
from stop_words import get_stop_words 
from wordcloud import WordCloud

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Single shared Mystem analyzer, reused by the lemma / part-of-speech helpers.
mystem = Mystem()
# Russian stop words from the `stop_words` package; used to filter lemmas.
stops = get_stop_words('ru')

In [None]:
def list_of_files(folder):
    """Return the entry names found in ``./<folder>/``, minus ``.DS_Store``.

    Args:
        folder: Directory name, resolved relative to the current working
            directory.

    Returns:
        list[str]: Entry names in the order reported by ``os.listdir``.
    """
    directory = "./{}/".format(folder)
    return [entry for entry in os.listdir(path=directory) if entry != '.DS_Store']

In [None]:
def text(file):
    """Load one JSON file from the current group folder and return its values.

    The directory comes from the module-level ``folder`` variable (set by the
    driver loop), so this must be called while that variable is bound.

    Args:
        file: File name inside ``./<folder>/``.

    Returns:
        list: The values of the top-level JSON object, in file order.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding, which breaks on Cyrillic text under non-UTF-8 locales.
    with open('./{}/{}'.format(folder, file), encoding='utf-8') as f:
        data = json.load(f)
    return list(data.values())

In [None]:
def get_lemmas(drama):
    """Lemmatize every text fragment and count the non-stop-word lemmas.

    Args:
        drama: Iterable of text fragments; each is passed to
            ``mystem.analyze``.

    Returns:
        tuple: ``(lemmas, lemmas_)`` where ``lemmas`` is the list of all
        lemmas (stop words included) in order of appearance, and ``lemmas_``
        maps each lemma that is not in ``stops`` to its occurrence count.
    """
    lemmas = []
    for fragment in drama:
        # Tokens without a non-empty "analysis" (punctuation, spaces, ...)
        # carry no lemma and are skipped.
        lemmas.extend(
            token["analysis"][0]["lex"]
            for token in mystem.analyze(fragment)
            if token.get("analysis")
        )

    lemmas_ = {}
    for lemma in lemmas:
        if lemma in stops:
            continue
        lemmas_[lemma] = lemmas_.get(lemma, 0) + 1
    return lemmas, lemmas_

In [None]:
def saving_lemmas_all(lemmas_all):
    """Write the aggregated lemma counts to ``./<folder>/words_all_.json``.

    The directory comes from the module-level ``folder`` variable. Non-ASCII
    characters are written as-is (``ensure_ascii=False``).

    Args:
        lemmas_all: JSON-serializable mapping of lemma to count.
    """
    target = './{}/words_all_.json'.format(folder)
    with open(target, 'w') as out:
        serialized = json.dumps(lemmas_all, ensure_ascii=False)
        out.write(serialized)

In [None]:
def list_of_poses(lemmas):
    """Count part-of-speech tags of the given lemmas and collect the nouns.

    Each lemma is re-analysed on its own with ``mystem.analyze``; the tag is
    the first run of upper-case Latin letters in the ``gr`` string of the
    first analysis of the first token.

    Args:
        lemmas: Iterable of lemma strings.

    Returns:
        tuple: ``(poses, pos_N)`` where ``poses`` maps tag -> count and
        ``pos_N`` lists the lemmas tagged ``'S'`` (noun in Mystem's tagset)
        that are not in ``stops``, duplicates preserved.
    """
    reg_pos = re.compile("([A-Z]+)")
    poses = dict()
    pos_N = []
    for lemma in lemmas:
        lemma_analyses = mystem.analyze(lemma)
        # Mystem may return no tokens or a token without analyses; skip such
        # lemmas instead of crashing with IndexError / KeyError.
        if not lemma_analyses or not lemma_analyses[0].get("analysis"):
            continue
        all_grammar = lemma_analyses[0]["analysis"][0].get("gr", "")
        match = reg_pos.search(all_grammar)
        if match is None:
            # No recognisable tag in the grammar string.
            continue
        tag = match.group(1)
        if tag == 'S' and lemma not in stops:
            pos_N.append(lemma)
        poses[tag] = poses.get(tag, 0) + 1
    return poses, pos_N

In [None]:
def plotting_bar(poses_all_):
    """Save a bar chart of part-of-speech shares for the current group.

    Only tags whose share exceeds 2 % of all counted tags are drawn, in
    alphabetical tag order. The image is written to
    ``./<folder>/POSes_<folder>.png`` using the module-level ``folder``.

    Args:
        poses_all_: Mapping of part-of-speech tag -> count.
    """
    total = sum(poses_all_.values())
    x = []
    labels = []
    for tag in sorted(poses_all_):
        share = poses_all_[tag] / total * 100
        if share > 2:  # drop marginal tags to keep the chart readable
            x.append(share)
            labels.append(tag)
    fig = plt.figure(figsize=(10,4))
    plt.title("Части речи в группе {}".format(folder), fontsize=20)
    plt.grid(zorder=0)
    plt.bar(labels, x)
    plt.savefig('./{}/POSes_{}.png'.format(folder, folder))
    # Close (not just clear) the figure so figures do not accumulate in
    # memory while the driver loop iterates over groups.
    plt.close(fig)

In [None]:
def custom_colours(*args, **kwargs):
    """Colour callback for WordCloud: highlight kinship terms.

    The word is taken from the first positional argument or, if there is
    none, from the ``word`` keyword (some callers, e.g. ``WordCloud.recolor``,
    appear to pass it by keyword). All other arguments are ignored.

    Returns:
        str: ``"#FFD700"`` for parent/spouse terms, ``'#00D632'`` for
        child/sibling terms, ``"#4f4f4f"`` for everything else.
    """
    word = args[0] if args else kwargs.get('word')
    a = ('мама', 'мать', 'папа', 'папочка', 'отец', 'папаша', 'маменька', 'мамаша', 'папенька', 'батюшка', 'матушка', 'муж', 'жена')
    b = ('сын', 'дочь', 'сыночек', 'доченька', 'брат', 'братец', 'сестра', 'сестрица', 'братик', 'братишка', 'сестренка', 'ребенок')
    if word in a:
        return "#FFD700"
    if word in b:
        return '#00D632'
    return "#4f4f4f"

In [None]:
def plotting_cloud():
    """Render a word cloud from ``./<folder>/words_all_.json`` and save it.

    The JSON file is expected to hold a word -> frequency mapping; it is
    passed to ``WordCloud.generate_from_frequencies``. ``1.png`` in the
    working directory is used as the mask, and the result is written to
    ``./<folder>/cloud_<folder>.png``. Uses the module-level ``folder``.
    """
    # Read with the default encoding on purpose: it matches how the file is
    # written by saving_lemmas_all().
    with open('./{}/words_all_.json'.format(folder)) as f:
        frequencies = json.load(f)
    # Load the mask inside a context manager so the image file handle is
    # released as soon as the pixel data has been copied into the array.
    with Image.open("1.png") as mask_image:
        wave_mask = np.array(mask_image)
    wordcloud = WordCloud(width = 2500,
                          height = 2500,
                          max_words=2500,
                          background_color ='white',
                          color_func=custom_colours,
                          mask=wave_mask).generate_from_frequencies(frequencies)
    fig = plt.figure(figsize = (20, 20), facecolor = None)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.savefig('./{}/cloud_{}.png'.format(folder, folder))
    # Close the (large) figure instead of clearing it so it is freed before
    # the next plot is created.
    plt.close(fig)

In [None]:
def plotting_cloud_N(poses_N_all_):
    """Render a word cloud of the collected nouns and save it.

    All words are joined into one space-separated string and handed to
    ``WordCloud.generate``. ``1.png`` in the working directory is the mask;
    the image goes to ``./<folder>/cloud_N_<folder>.png`` (module-level
    ``folder``).

    Args:
        poses_N_all_: Iterable of word lists (one list per processed file).
    """
    # Build the text in one pass; each word is followed by a single space,
    # exactly as before, but without quadratic string concatenation.
    s = ''.join(word + ' ' for group in poses_N_all_ for word in group)
    # Context manager releases the mask file handle once the array is built.
    with Image.open("1.png") as mask_image:
        wave_mask = np.array(mask_image)
    # NOTE(review): generate() tokenizes the string itself; with WordCloud's
    # defaults it may also count pairs of adjacent words - confirm that this
    # is intended for a flat list of nouns.
    wordcloud = WordCloud(width = 2500,
                          height = 2500,
                          max_words=2500,
                          background_color ='white',
                          color_func=custom_colours,
                          mask=wave_mask).generate(s)
    fig = plt.figure(figsize = (20, 20), facecolor = None)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.savefig('./{}/cloud_N_{}.png'.format(folder, folder))
    # Close the figure so large canvases do not pile up across groups.
    plt.close(fig)

In [None]:
def all_words():
    """Convert saved lemma counts to percentages, ranked high to low.

    Reads ``./<folder>/words_all_.json`` (module-level ``folder``), expresses
    every count as a percentage of the sum of all counts, sorts the
    ``(word, percentage)`` pairs by percentage in descending order, writes
    them to ``./<folder>/all_words_.json`` and returns them.

    Returns:
        list[tuple]: ``(word, percentage)`` pairs, largest share first.
    """
    source = './{}/words_all_.json'.format(folder)
    with open(source) as src:
        counts = json.load(src)

    total = sum(counts.values())
    shares = {word: count / total * 100 for word, count in counts.items()}
    sorted_all_words = sorted(shares.items(), key=operator.itemgetter(1), reverse=True)

    destination = './{}/all_words_.json'.format(folder)
    with open(destination, 'w') as dst:
        dst.write(json.dumps(sorted_all_words, ensure_ascii=False))
    return sorted_all_words

In [None]:
def plotting_bar_words(sorted_all_words):
    """Save a bar chart of the ten most frequent words of the current group.

    Args:
        sorted_all_words: Sequence of ``(word, percentage)`` pairs already
            sorted by descending percentage (as returned by ``all_words``).

    The image is written to ``./<folder>/TOP-10_<folder>.png`` using the
    module-level ``folder``.
    """
    top = sorted_all_words[:10]
    # Words go on the category axis, percentages are the bar heights.
    # (Previously the two were swapped, so the words were used as heights.)
    labels = [item[0] for item in top]
    x = [item[1] for item in top]
    fig = plt.figure(figsize=(6,6))
    plt.title("TOP-10 слов в группе {}".format(folder), fontsize=14)
    plt.grid(zorder=0)
    plt.bar(labels, x)
    plt.savefig('./{}/TOP-10_{}.png'.format(folder, folder))
    # Close the figure so it is released instead of lingering empty.
    plt.close(fig)

In [None]:
def get_emotions(drama):
    """Count normalized punctuation marks that follow words in the text.

    For every fragment, each run of 1-3 non-word characters that sits between
    a word character and a whitespace character is taken, stripped of
    dashes, quotes, slashes, brackets, ``; , : > »`` and no-break spaces, and
    then normalized:

    * ``!..`` and ``!!``            -> ``!``
    * ``?..``, ``? .`` and ``?!``   -> ``?``
    * ``...``, ``. .`` and ``….``   -> ``…``

    Args:
        drama: Iterable of text fragments (strings).

    Returns:
        dict: Normalized mark -> number of occurrences; marks that end up
        empty are not counted.
    """
    boundary = re.compile(r'\w(\W{1,3})\s')
    noise = re.compile(r'[–—\-"/();,:>»\xa0]')
    # Dots are escaped so they match literal full stops only; unescaped they
    # matched any character and e.g. turned every three-character run into
    # an ellipsis.
    exclamation = re.compile(r'!\.\.|!!')
    question = re.compile(r'\?\.\.|\? \.|\?!')
    ellipsis = re.compile(r'\.\.\.|\. \.|…\.')

    emotions = {}
    for fragment in drama:
        for mark in boundary.findall(fragment):
            mark = noise.sub('', mark)
            mark = exclamation.sub('!', mark)
            mark = question.sub('?', mark)
            mark = ellipsis.sub('…', mark)
            mark = mark.strip(' ')
            if mark != '':
                emotions[mark] = emotions.get(mark, 0) + 1
    return emotions

In [None]:
def plotting_bar_emotions(emotions_all):
    """Save a bar chart with the share of each punctuation mark.

    Marks are drawn in sorted key order; each bar height is the mark's count
    as a percentage of the sum of all counts. The image is written to
    ``./<folder>/emotions_<folder>.png`` using the module-level ``folder``.

    Args:
        emotions_all: Mapping of punctuation mark -> count.
    """
    total = sum(emotions_all.values())
    labels = sorted(emotions_all)
    x = [emotions_all[mark] / total * 100 for mark in labels]
    fig = plt.figure(figsize=(10,4))
    plt.title("Эмоциональный окрас в группе {}".format(folder), fontsize=20)
    plt.grid(zorder=0)
    plt.bar(labels, x)
    plt.savefig('./{}/emotions_{}.png'.format(folder, folder))
    # Close the figure rather than clearing it so it does not stay allocated.
    plt.close(fig)

In [None]:
# Alternative grouping, currently disabled:
#folders = ['родственники', 'супруги', 'родители']
folders = ['1 родитель 1 ребенок', '1 родитель 2 ребенка', '2 родителя 1 ребенок', '2 родителя 2 ребенка']


# For every group folder: aggregate lemma, part-of-speech and punctuation
# statistics over all source files, then write the JSON summaries and charts
# back into that folder. `folder` is read as a global by the helper functions.
for folder in folders:
    files = list_of_files(folder)
    lemmas_all = dict()
    poses_all_ = dict()
    emotions_all = dict()
    poses_N_all_ = []
    for file in files:
        # Skip artefacts produced by earlier runs (summary JSON files, images).
        if file.endswith('_.json') or file.endswith('.png'):
            continue
        print(file)
        # Drop the '.json' suffix. str.rstrip('.json') would strip any
        # trailing '.', 'j', 's', 'o', 'n' characters, not the suffix.
        file_ = file[:-len('.json')] if file.endswith('.json') else file
        drama = text(file)
        lemmas, lemmas_ = get_lemmas(drama)
        e = get_emotions(drama)
        for k, count in lemmas_.items():
            lemmas_all[k] = lemmas_all.get(k, 0) + count
        poses, pos_N = list_of_poses(lemmas)
        for k, count in poses.items():
            poses_all_[k] = poses_all_.get(k, 0) + count
        for k, count in e.items():
            # Marks seen only once in a file are ignored.
            if count > 1:
                emotions_all[k] = emotions_all.get(k, 0) + count
        poses_N_all_.append(pos_N)
    saving_lemmas_all(lemmas_all)
    plotting_bar(poses_all_)
    sorted_all_words = all_words()
    plotting_cloud()
    plotting_cloud_N(poses_N_all_)
    plotting_bar_words(sorted_all_words)
    plotting_bar_emotions(emotions_all)