In [None]:
from spacy import load
from pymorphy2 import MorphAnalyzer
from gensim.models import KeyedVectors
from transformers import AutoTokenizer, AutoModel
from torch import no_grad

from nltk.corpus import stopwords
from string import punctuation as punct
from nltk import sent_tokenize

from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from tqdm import tqdm
from re import sub
from numpy import array, zeros
from random import choice, seed
seed(1509)
# from seaborn import heatmap

# for W2V sim

# spaCy pipeline for Russian; the functions below use it for sentence splitting,
# PoS tags and lemmas (get_sem_sim, contains_related_words).
nlp = load("ru_core_news_lg")
# Pre-trained Word2Vec vectors in binary word2vec format. get_sem_sim queries it
# with keys of the form "<lemma>_<UPOS>".
W2V_model = KeyedVectors.load_word2vec_format('ruwikiruscorpora_upos_cbow_300_10_2021/model.bin', binary=True)

# Resources for the WordNet criterion (criterion 1).
# Author's note: WordNet data proved to be unnecessary for the final metric.

from ruwordnet import RuWordNet
wn = RuWordNet()
# freq_lemmas collects the second ';'-separated field of every line after the
# header, with double quotes stripped (presumably the lemma column of the
# frequency dictionary - TODO confirm against the CSV layout).
# contains_related_words skips lemmas that appear in this set.
freq_lemmas = set()
with open('ruscorpora_freqdict.csv', encoding='utf-8') as f:
    f.readline()  # skip the first (header) line
    for line in f:
        freq_lemmas.add(line.split(';')[1].strip('"'))

# Resources for the deixis / anaphora criterion (criterion 2).

morph = MorphAnalyzer()
# Normal forms of third-person pronouns that contains_text_deixis looks for.
personal = ['он', 'она', 'оно', 'они']

# Resources for BERT sentence similarity (criterion 3): tokenizer and model are
# loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
BERT_model = AutoModel.from_pretrained("sberbank-ai/sbert_large_nlu_ru")

### Functions for application

In [2]:
def get_sem_sim(para, same=True):
    '''
    Build a sentence-by-sentence matrix of the best word-level Word2Vec similarity.

    Parameters:
    para (str): paragraph text; it is split into sentences by the spaCy pipeline `nlp`
    same (bool): if False, pairs that resolve to the same vocabulary key are ignored,
        so a repeated word (similarity 1) cannot establish a link on its own

    Returns:
    numpy array of shape (n_sentences, n_sentences); cell [i][j] holds the maximum
    similarity between a content word of sentence i and a content word of sentence j,
    rounded to 2 digits, or 0 when either sentence has no usable content words
    (or, with same=False, when every pair was an identical word).

    Out-of-vocabulary words are replaced by a randomly chosen vocabulary key; the
    replacement is remembered so that one word always maps to the same key within a call.
    '''

    content_pos = ('ADJ', 'NOUN', 'PROPN', 'VERB')
    # build the stop-word set once instead of re-reading the list for every token
    stop_words = set(stopwords.words('russian'))

    # apply spacy to get PoS tags

    doc = nlp(para)
    para_POSed = []
    for sent in doc.sents:
        para_POSed.append([
            tok.lemma_ + '_' + tok.pos_
            for tok in sent  # feature PoS are of questionable value, ADV too
            if tok.pos_ in content_pos
            and tok.text not in stop_words
            and tok.text != '-'
        ])

    # map every word to a key that exists in the model (random key for OOV)

    mem = {}

    def to_vocab_key(key):
        try:
            W2V_model[key]
            return key
        except KeyError:  # ascribe random vector to OOV
            if key not in mem:
                mem[key] = choice(W2V_model.index_to_key)
            return mem[key]

    # Resolving sentence by sentence, word by word, consumes the random generator
    # in the same order as resolving lazily inside the comparison loops would.
    para_keys = [[to_vocab_key(word) for word in sent] for sent in para_POSed]

    # compute vector similarity

    skip_identical = not same
    sims = []
    for i in range(len(para_keys)):  # sents
        sent_comp_to_others = []

        for j in range(len(para_keys)):
            best = 0  # also used for empty sentences generated previously
            if para_keys[i] and para_keys[j]:
                pair_sims = [
                    W2V_model.similarity(word, slovo)
                    for word in para_keys[j]
                    for slovo in para_keys[i]
                    # ignore same words by comparing keys: float similarity of a
                    # word with itself is not guaranteed to be exactly 1
                    if not (skip_identical and word == slovo)
                ]
                if pair_sims:  # may be empty when only identical words were shared
                    best = round(max(pair_sims), 2)
            sent_comp_to_others.append(best)

        sims.append(sent_comp_to_others)

    return array(sims)

In [19]:
def contains_related_words(two_sents_l, same=True):
    '''
    Check whether the second sentence contains a word related (via RuWordNet)
    to a word of the first sentence.

    Parameters:
    two_sents_l (list): two sentence strings; index 0 is the source of relations,
        index 1 is the sentence that is searched
    same (bool): if True, a lemma that equals the related title itself also counts;
        if False, only a lemma found inside a longer, different title counts

    Returns:
    bool: True as soon as one related lemma is found, otherwise False
    '''

    content_pos = ['ADJ', 'NOUN', 'PROPN', 'VERB']
    # synset attributes whose members contribute their lowercased `title`
    synset_relations = (
        'hyponyms', 'hypernyms', 'domains', 'domain_items', 'holonyms',
        'meronyms', 'classes', 'instances', 'premises', 'conclusions',
        'causes', 'effects', 'pos_synonyms', 'antonyms', 'related',
    )

    # lemmatize both sentences, keeping content words absent from freq_lemmas
    sent_pair = [
        [
            token.lemma_
            for token in nlp(two_sents_l[idx])
            if token.pos_ in content_pos and token.lemma_ not in freq_lemmas
        ]
        for idx in range(2)
    ]

    # collect everything related to the lemmas of the first sentence
    target_words = set()
    for lemma in sent_pair[0]:
        try:
            for sense in wn[lemma]:  # different senses of the lemma
                for relation in synset_relations:
                    for neighbour in getattr(sense.synset, relation):
                        target_words.add(neighbour.title.lower())
                for derived in sense.derivations:
                    target_words.add(derived.name.lower())
        except KeyError:
            continue

    # look for any of the related words in the second sentence
    for lemma in sent_pair[1]:
        for target in target_words:
            if lemma not in target.split(' '):
                continue
            if same == True or lemma != target:
                return True

    return False

In [4]:
def contains_text_deixis(sent_pair, init_thr=6, core=True):
    '''
    Check whether a third-person pronoun near the start of the second sentence
    has a morphologically compatible antecedent candidate in the first sentence.

    Parameters:
    sent_pair (list): two sentence strings, [previous sentence, current sentence];
        the list is not modified
    init_thr (int): how many tokens from the beginning of the current sentence are checked
    core (bool): if True, an antecedent candidate must also be in Nom or Acc case

    Returns:
    bool: True if a pronoun from `personal` is found among the checked tokens and some
    parse of some word of the previous sentence agrees with it (plural number for
    'они', same gender as the pronoun for 'он' / 'она' / 'оно'), otherwise False
    '''

    strip_chars = punct + '«»'
    # tokenize into a local structure instead of overwriting the caller's list
    tokens = [[w.strip(strip_chars) for w in sent.split(' ')] for sent in sent_pair]

    core_cases = ('nomn', 'accs')

    def has_antecedent(agrees):
        # any parse of any word of the previous sentence may serve as a candidate
        for slovo in tokens[0]:
            for razbor in morph.parse(slovo):
                if core and razbor.tag.case not in core_cases:
                    continue
                if agrees(razbor.tag):
                    return True
        return False

    for i, word in enumerate(tokens[1]):

        if i == init_thr:  # init_thr - how many tokens we check from the beginning of sentence
            break

        pronoun = morph.parse(word)[0]
        if pronoun.normal_form not in personal:
            continue

        if pronoun.normal_form == 'они':
            found = has_antecedent(lambda tag: tag.number == 'plur')
        else:  # он, она, оно
            gender = pronoun.tag.gender
            # gender agreement applies with and without the core-case restriction;
            # the non-core branch used to test for plural number, which was copied
            # from the 'они' branch
            found = has_antecedent(lambda tag: tag.gender == gender)

        if found:
            return True

    return False

In [5]:
def encode(sent, tokenizer, model):
    '''
    Encode one sentence and return the vector at position 0 of the model output.

    Parameters:
    sent (str): sentence to encode
    tokenizer: callable that turns the sentence into keyword inputs for `model`
        (called with padding, truncation to 64 tokens and PyTorch tensors)
    model: callable that receives the tokenizer output as keyword arguments

    Returns:
    numpy array taken from model_output[0][0][0], i.e. the first position of the
    first sequence in the first output of the model
    '''
    encoded_input = tokenizer(
        sent,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors='pt',
    )
    with no_grad():  # inference only, no gradients needed
        model_output = model(**encoded_input)

    first_output = model_output[0]
    return first_output[0][0].numpy()

def bert_sent_sim(para_list, deep=1):
    '''
    Compute pairwise cosine similarity between sentence embeddings.

    Parameters:
    para_list (list): sentences of one paragraph
    deep (int): accepted for signature compatibility; not used here

    Returns:
    square matrix of cosine similarities between the `encode` vectors of all sentences
    '''
    embeddings = [encode(sentence, tokenizer, BERT_model) for sentence in para_list]
    return cos_sim(array(embeddings))

In [29]:
def process_para(para, deep=1, sim_thr=0.4, b_sim_thr=0.78, ratio_thr=0.6, deix=6, core=True, same=True, crit=[0,2,3]):
    '''
    Decide whether a paragraph is coherent by checking every sentence against
    the sentence(s) before it with the selected criteria.

    Parameters:
    para (str): paragraph that we check for coherence
    deep (int): how many sentences backwards semantic similarity is checked (crit. 0 and 3)
    sim_thr (float): words with W2V cosine similarity at or above this threshold are said to establish coherence (crit. 0)
    b_sim_thr (float): sentences with BERT CLS cosine similarity at or above this threshold are said to be coherent (crit. 3)
    ratio_thr (float): paragraph is deemed coherent if the ratio of coherent sentences reaches this threshold
    deix (int): number of words from the beginning of sentence that are checked for deixis (crit. 2)
    core (bool): check if candidate for antecedent is in Nom or Acc case (crit. 2)
    same (bool): allow same words and similarity 1 to be present in similarity matrix (criteria 0 and 1)
    crit (list): this list specifies which criteria will be applied (it is only read, never modified)

    Returns:
    (verdict, sent_coh_index): verdict is a bool; sent_coh_index is an int array of
    shape (n_sentences, 4) where column k is 1 if criterion k linked the sentence to
    a preceding one. An empty paragraph yields (False, empty array).
    '''

    # initial preparation

    para_list = sent_tokenize(para)
    n_sents = len(para_list)
    sent_coh_index = zeros([n_sents, 4], dtype = int)

    if n_sents == 0:  # nothing to assess; avoids division by zero below
        return False, sent_coh_index

    def linked_to_previous(matrix, i, threshold):
        # Look back at most `deep` sentences, never past the first one: a negative
        # start would wrap around and compare against sentences at the end.
        start = max(0, i - deep)
        return any(matrix[i][j] >= threshold for j in range(start, i))

    # Criterion 0: similarity associates

    if 0 in crit:
        # NOTE(review): get_sem_sim splits sentences with spaCy while para_list comes
        # from sent_tokenize; the indices below assume both splits agree - confirm.
        arr = get_sem_sim(para, same)
    #     heatmap(data = arr, annot = True, cmap = 'Blues')
        for i in range(1, n_sents):
            if linked_to_previous(arr, i, sim_thr):
                sent_coh_index[i][0] = 1

    # Criterion 1: related words

    if 1 in crit:
        for i in range(1, n_sents):
            sent_coh_index[i][1] = contains_related_words([para_list[i - 1], para_list[i]], same)

    # Criterion 2: basic anaphora

    if 2 in crit:
        if deix > 0:
            for i in range(1, n_sents):
                sent_coh_index[i][2] = contains_text_deixis([para_list[i - 1], para_list[i]], deix, core)

    # Criterion 3: BERT sentence similarity

    if 3 in crit:
        B_arr = bert_sent_sim(para_list, deep)
    #     heatmap(data = B_arr, annot = True, cmap = 'Greens')
        for i in range(1, n_sents):
            if linked_to_previous(B_arr, i, b_sim_thr):
                sent_coh_index[i][3] = 1

    # final return

    # number of sentences linked by at least one criterion;
    # note that [0] sent never supplies 1
    ct = int(sent_coh_index.any(axis=1).sum())
    verdict = bool(ct / n_sents >= ratio_thr)

    return verdict, sent_coh_index

In [7]:
para1 = 'Всегда почему-то казалось, что океанская дорога между Старым и Новым Светом очень оживлена, что то и дело навстречу попадаются веселые пароходы, с музыкой и флагами. На самом же деле океан – это штука величественная и пустынная, и пароходик, который штормовал в четырехстах милях от Европы, был единственным кораблем, который мы встретили за пять дней пути. «Нормандия» раскачивалась медленно и важно. Она шла, почти не уменьшив хода, уверенно расшвыривая высокие волны, которые лезли на нее со всех сторон, и только иногда отвешивала океану равномерные поклоны. Это не было борьбой мизерного создания человеческих рук с разбушевавшейся стихией. Это была схватка равного с равным.'
process_para(para1)

(True,
 array([[0, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 1]]))

In [19]:
para2 = 'Все задрожало на корме, где мы помещались. Дрожали палубы, стены, иллюминаторы, шезлонги, стаканы над умывальником, сам умывальник. Вибрация парохода была столь сильной, что начали издавать звуки даже такие предметы, от которых никак этого нельзя было ожидать. Впервые в жизни мы слышали, как звучит полотенце, мыло, ковер на полу, бумага на столе, занавески, воротничок, брошенный на кровать. Звучало и гремело все, что находилось в каюте. Достаточно было пассажиру на секунду задуматься и ослабить мускулы лица, как у него начинали стучать зубы. Всю ночь казалось, что кто-то ломится в двери, стучит в окна, тяжко хохочет. Мы насчитали сотню различных звуков, которые издавала наша каюта.'
process_para(para2)

(True,
 array([[0, 0, 0, 0],
        [1, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 1],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0]]))

### Metrics

In [8]:
def compute_text_metric(
    filename, deep=1, sim_thr=0.4, b_sim_thr=0.78, ratio_thr=0.6, deix=6, core=True, same=True, para=4, crit=[0,2,3]):
    '''
    Score how well process_para separates the natural paragraphs of a text
    from copies of the same paragraphs with their sentences reordered.

    Parameters:
    filename (str): path to text file that we assess
    para (int): we only consider paragraphs that contain at least this many sentences

    Other parameters are passed on to process_para unchanged.

    Returns:
    dict with
        'natural': share of original paragraphs judged coherent,
        'shuffled': share of reordered paragraphs judged incoherent,
        'total': mean of the two shares

    Raises:
    ValueError: if no paragraph of the file passes the filters
    '''

    # preprocessing

    with open(filename, encoding='utf-8') as f:
        text = f.read()
    text = text.replace('\xa0', ' ')
    text = sub('\n{2,}', '\n', text)

    # keep paragraphs that are non-empty, do not start with a dash followed by a
    # space, and are long enough; sentences are kept alongside to avoid re-tokenizing
    kept = []
    for raw_para in text.split('\n'):
        if raw_para == '' or raw_para.startswith('– '):
            continue
        sentences = sent_tokenize(raw_para)
        if len(sentences) < para:  # paragraph length filter
            continue
        kept.append((raw_para, sentences))

    if not kept:
        raise ValueError(
            'no paragraph in %r has at least %d sentences' % (filename, para))

    # making a shuffled copy

    shuffled_text = []
    for _, sentences in kept:
        half = len(sentences) // 2
        p = []
        for j in range(half):  # alternate sentences from the start and from the end
            p.append(sentences[j])
            p.append(sentences[-(j + 1)])
        if len(sentences) % 2 == 0:
            p = p[-1:] + p[:-1]  # move the last interleaved sentence to the front
        else:
            # the middle sentence goes last; indexing by `half` also works for a
            # one-sentence paragraph, where the loop above never runs
            p.append(sentences[half])
        shuffled_text.append(' '.join(p))

    # apply index calculations

    ct = 0
    for para_text, _ in kept:
        if process_para(para_text, deep, sim_thr, b_sim_thr, ratio_thr, deix, core, same, crit)[0]:
            ct += 1

    ct_2 = 0
    for sh_para in shuffled_text:
        if not process_para(sh_para, deep, sim_thr, b_sim_thr, ratio_thr, deix, core, same, crit)[0]:
            ct_2 += 1

    n_paras = len(kept)  # shuffled_text has exactly one entry per kept paragraph
    return {'natural': ct / n_paras,
            'shuffled':  ct_2 / n_paras,
            'total': (ct + ct_2) / (n_paras * 2) }

In [9]:
def tuning_small(
    text, shuffled_text, deep=1, sim_thr=0.4, b_sim_thr=0.78, ratio_thr=0.6, deix=6, core=True, same=True, crit=[0,2,3]):
    '''
    Score one parameter combination on already prepared paragraphs.

    Parameters:
    text (list): natural paragraphs (strings)
    shuffled_text (list): reordered counterparts of the paragraphs

    Other parameters are passed on to process_para unchanged.

    Returns:
    dict with the share of natural paragraphs judged coherent ('natural'), the share
    of shuffled paragraphs judged incoherent ('shuffled') and the combined share
    computed over twice the number of natural paragraphs ('total')
    '''

    settings = (deep, sim_thr, b_sim_thr, ratio_thr, deix, core, same, crit)

    # natural paragraphs that were accepted
    natural_hits = sum(
        1 for paragraph in text
        if process_para(paragraph, *settings)[0] == True
    )

    # shuffled paragraphs that were rejected
    shuffled_hits = sum(
        1 for paragraph in shuffled_text
        if process_para(paragraph, *settings)[0] == False
    )

    scores = {}
    scores['natural'] = natural_hits / len(text)
    scores['shuffled'] = shuffled_hits / len(shuffled_text)
    scores['total'] = (natural_hits + shuffled_hits) / (len(text) * 2)
    return scores

In [10]:
def tuning_large(filename, para=4, crit=[0,2,3]):
    '''
    Grid-search sim_thr, b_sim_thr and ratio_thr on one text file.

    Parameters:
    filename (str): path to text file used for tuning
    para (int): we only consider paragraphs that contain at least this many sentences
    crit (list): criteria passed on to tuning_small / process_para

    Returns:
    list of (parameters, scores) tuples sorted by scores['total'] in descending order;
    parameters is a dict with 'sim_thr', 'b_sim_thr' and 'ratio_thr', scores is the
    dict returned by tuning_small

    Raises:
    ValueError: if no paragraph of the file passes the filters
    '''

    # preprocessing

    with open(filename, encoding='utf-8') as f:
        text = f.read()
    text = text.replace('\xa0', ' ')
    text = sub('\n{2,}', '\n', text)

    # keep paragraphs that are non-empty, do not start with a dash followed by a
    # space, and are long enough
    paragraphs = []
    para_sentences = []
    for raw_para in text.split('\n'):
        if raw_para == '' or raw_para.startswith('– '):
            continue
        sentences = sent_tokenize(raw_para)
        if len(sentences) < para:  # paragraph length filter
            continue
        paragraphs.append(raw_para)
        para_sentences.append(sentences)

    if not paragraphs:
        raise ValueError(
            'no paragraph in %r has at least %d sentences' % (filename, para))

    # making a shuffled copy

    shuffled_text = []
    for sentences in para_sentences:
        half = len(sentences) // 2
        p = []
        for j in range(half):  # alternate sentences from the start and from the end
            p.append(sentences[j])
            p.append(sentences[-(j + 1)])
        if len(sentences) % 2 == 0:
            p = p[-1:] + p[:-1]  # move the last interleaved sentence to the front
        else:
            # the middle sentence goes last; indexing by `half` also works for a
            # one-sentence paragraph, where the loop above never runs
            p.append(sentences[half])
        shuffled_text.append(' '.join(p))

    # trying different parameter combinations

    # Results are collected locally: the function must not depend on a
    # `prev_results` variable left over in the kernel.
    results = []
    for i in tqdm(range(40, 81, 10)):
        sim_thr = i / 100
        for j in range(78, 81, 2):
            b_sim_thr = j / 100
            for k in range(30, 61, 10):
                ratio_thr = k / 100
                params = {'sim_thr': sim_thr,
                          'b_sim_thr': b_sim_thr,
                          'ratio_thr': ratio_thr}
                scores = tuning_small(paragraphs,
                                      shuffled_text,
                                      sim_thr=sim_thr,
                                      b_sim_thr=b_sim_thr,
                                      ratio_thr=ratio_thr,
                                      crit=crit)
                results.append((params, scores))

    return sorted(results, key=lambda x: x[1]['total'], reverse=True)

In [54]:
results = tuning_large('texts/America_chapter.txt')
results

[({'sim_thr': 0.8, 'b_sim_thr': 0.78, 'ratio_thr': 0.5},
  {'natural': 0.5333333333333333,
   'shuffled': 0.9333333333333333,
   'total': 0.7333333333333333}),
 ({'sim_thr': 0.7, 'b_sim_thr': 0.78, 'ratio_thr': 0.5},
  {'natural': 0.5333333333333333,
   'shuffled': 0.8666666666666667,
   'total': 0.7}),
 ({'sim_thr': 0.6, 'b_sim_thr': 0.78, 'ratio_thr': 0.4},
  {'natural': 0.8,
   'shuffled': 0.5333333333333333,
   'total': 0.6666666666666666}),
 ({'sim_thr': 0.8, 'b_sim_thr': 0.78, 'ratio_thr': 0.4},
  {'natural': 0.6,
   'shuffled': 0.7333333333333333,
   'total': 0.6666666666666666}),
 ({'sim_thr': 0.5, 'b_sim_thr': 0.78, 'ratio_thr': 0.5},
  {'natural': 0.8666666666666667,
   'shuffled': 0.4666666666666667,
   'total': 0.6666666666666666}),
 ({'sim_thr': 0.6, 'b_sim_thr': 0.78, 'ratio_thr': 0.5},
  {'natural': 0.5333333333333333,
   'shuffled': 0.8,
   'total': 0.6666666666666666}),
 ({'sim_thr': 0.4, 'b_sim_thr': 0.8, 'ratio_thr': 0.6},
  {'natural': 0.6666666666666666,
   'shuffl

#### WordNet proves capable but redundant once Word2Vec data is already considered

In [10]:
compute_text_metric('texts/America_chapter.txt', sim_thr=0.4, ratio_thr=0.6, crit=[1, 2])

{'natural': 0.13333333333333333, 'shuffled': 1.0, 'total': 0.5666666666666667}

In [11]:
compute_text_metric('texts/America_chapter.txt', sim_thr=0.4, ratio_thr=0.6, crit=[0, 2])

{'natural': 0.6666666666666666, 'shuffled': 0.6, 'total': 0.6333333333333333}

In [12]:
compute_text_metric('texts/America_chapter.txt', sim_thr=0.4, ratio_thr=0.6, crit=[0, 1])

{'natural': 0.5333333333333333, 'shuffled': 0.6, 'total': 0.5666666666666667}