In [114]:
#!pip install spacy
# !python -m spacy download ru_core_news_lg

In [115]:
import pickle
import numpy as np
import statistics
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from collections import defaultdict
from tqdm.notebook import tqdm
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import json

In [116]:
class Token: # trees parse
    """One token of a parsed tree.

    Stores the surface text, type, lemma, form/lemma ids, a homonymy flag and
    a dict of properties. Properties whose value contains ``"0-"`` are dropped
    at construction time (see ``hide_zero_prop``).
    """

    def __init__(self, token_text: str = "", token_type: str = "", token_lemma: str = "", token_form_id: int = 0, token_lemma_id: int = 0, token_homonymous: bool = False, token_properties: dict = None):
        """Store the given fields and filter the properties.

        ``token_properties`` defaults to ``None`` (treated as an empty dict)
        instead of a shared mutable ``{}`` default.
        """
        self.token_text = token_text
        self.token_type = token_type
        self.token_lemma = token_lemma
        self.token_form_id = token_form_id
        self.token_lemma_id = token_lemma_id
        self.token_homonymous = token_homonymous
        self.token_properties = {} if token_properties is None else token_properties
        self.hide_zero_prop()

    def hide_zero_prop(self):
        """Rebind ``token_properties`` to a new dict without the values containing ``"0-"``.

        The dict passed by the caller is not modified.
        """
        self.token_properties = {
            name: value
            for name, value in self.token_properties.items()
            if "0-" not in value
        }

    def __str__(self):
        """Return the token text and lemma in a labelled form."""
        return f"token_text: {self.token_text}, token_lemma: {self.token_lemma}"

    def __repr__(self):
        """Return only the token text."""
        return self.token_text

In [117]:
# Load the model
# Word vectors in binary word2vec format. Later cells look words up with keys
# of the form "<text>_<POS>", built from spaCy's token text and `pos_` tag.
model = KeyedVectors.load_word2vec_format('../W2V_Model/model.bin', binary=True)  # ruwikiruscorpora_upos_cbow_300_10_2021
# spaCy pipeline used below to obtain the POS tag for each candidate word.
nlp = spacy.load('ru_core_news_lg')

In [118]:
# Load the parsed trees. Both objects are indexed by lower-cased scenario text:
# correct_trees[text]['tokens'] is one tree, bad_trees[text] is a sequence of
# trees, each with its own 'tokens' entry.
# NOTE(review): the trees presumably contain pickled Token instances, so the
# Token class must be defined before this cell runs - confirm.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open("correct_trees.pickle", "rb") as p_file:
 correct_trees = pickle.load(p_file)
with open("bad_trees.pickle", "rb") as p_file:
 bad_trees = pickle.load(p_file)

In [119]:
# Load the scenarios. Layout as used below:
# scenario id -> "CENTRAL" / "BEFORE" / "AFTER" / "INTERP" -> "scenarios"
#   -> scenario text -> "semantics" -> "semantics" -> list of candidates with "facts".
with open("scenarios.json", "r", encoding="utf8") as in_file:
    scenarios = json.load(in_file)

In [120]:
# Display the loaded scenarios dict to inspect its structure.
scenarios

{'@поклониться_15804_999': {'CENTRAL': {'scenarios': {'гость поклонился королю': {'semantics': {'semantics': [{'facts': [{'meta': {'metadata': {'semantics_proximity': {'proximity': 0.20885194001495166,
             'tree_proximity': 0.20885194001495166,
             'scenario': '@поклониться_15804_999'}}},
          'valencies': [{'valency_alias': 'p',
            'lemma': 'поклониться',
            'wordform': 'поклонился'},
           {'valency_alias': 'ag', 'lemma': 'гость', 'wordform': 'гость'},
           {'valency_alias': 'ben',
            'lemma': 'король',
            'wordform': 'королю'}]}]}]}}}},
  'BEFORE': {'scenarios': {'гость пришёл к королю': {'semantics': {'semantics': [{'facts': [{'meta': {'metadata': {'semantics_proximity': {'proximity': 0.2761527398489404,
             'tree_proximity': 0.2761527398489404,
             'scenario': '@прийти_4121_VERB_13'}}},
          'valencies': [{'valency_alias': 'p',
            'lemma': 'прийти',
            'wordform': 'пришел

In [121]:
# Load the valency table: token property name -> relation, or a list of
# relations (the list case is handled as polysemy in parse_tokens).
with open("vals.json", "r", encoding="UTF8") as j_file:
    valencies = json.load(j_file)

In [122]:
def branch():
    """Return an empty ``defaultdict`` whose missing keys map to new lists."""
    return defaultdict(list)

# incidents_dict is indexed as incidents_dict[lemma][relation] (see get_incidents).
# NOTE(review): `branch` is presumably the default factory stored in the pickle,
# so it has to be defined before loading - confirm.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open("incidents_dict.pkl", "rb") as p_file:
    incidents_dict = pickle.load(p_file)

In [123]:
def get_best_semantics(semantics):
    """Pick one candidate from a list of semantics dicts.

    Candidates are ranked by ``(number of facts, -tree_proximity of the first
    fact)`` in ascending order; a candidate with an empty ``facts`` list ranks
    as ``(0, 0)``. The first candidate with the smallest key is returned.

    If any candidate lacks the expected keys or holds values that cannot be
    ranked, the first element of ``semantics`` is returned instead.

    Raises:
        IndexError: if ``semantics`` is empty.
    """
    def rank(candidate):
        facts = candidate['facts']
        if not facts:
            return (0, 0)
        proximity = facts[0]['meta']['metadata']['semantics_proximity']['tree_proximity']
        return (len(facts), -proximity)

    try:
        # min() returns the first minimal item, matching sorted(...)[0].
        return min(semantics, key=rank)
    except (LookupError, TypeError, ValueError):
        # Best-effort fallback for malformed input; unlike a bare `except`,
        # KeyboardInterrupt and SystemExit are not swallowed.
        return semantics[0]

In [124]:
def get_incidents(lemma, relation):
    """Return the incidents stored for ``lemma`` / ``relation``, highest count first.

    Reads ``incidents_dict[lemma][relation]`` and orders the entries by their
    second element in descending order; entries with equal values keep their
    stored order. An empty or missing-valued entry yields ``[]``.

    The result is a new list, so the stored data is not reordered by a lookup.
    """
    incidents = incidents_dict[lemma][relation]
    # The previous `== {}` check never matched an empty list, which is what
    # `branch()` produces for unknown relations.
    if not incidents:
        return []
    return sorted(incidents, key=lambda x: x[1], reverse=True)

In [125]:
# Collects the highest word-to-centre similarity from each call of
# intersect_check_for_compactness; its mean is written to the report at the end.
max_similarity_list = []

In [126]:
def intersect_check_for_compactness(words, compare_word, threshold = 0.15, show_plot: bool = False, debug: bool = False):
    """Keep the words that lie close to the centre of their own vector cloud.

    Args:
        words: sequence of hashable pairs; element ``[0]`` is the key looked up
            in ``model`` and element ``[1]`` is reported as the frequency.
        compare_word: key that must be present in ``model``; otherwise a
            message is printed and an empty list is returned.
        threshold: minimum cut-off. The effective cut-off is the larger of this
            value and the median similarity.
        show_plot: unused; kept for interface compatibility.
        debug: print intermediate values to stdout.

    Returns:
        Strings of the form ``"<word>_sim:<similarity>_freq:<frequency>"`` for
        the words at or above the cut-off, ordered by similarity and then
        frequency, both descending.

    Side effects:
        Appends the highest similarity seen (0 if none was computed) to the
        module-level ``max_similarity_list``.
    """
    if debug: print(f"in words: {len(words)}")
    remaining_words = []
    if compare_word not in model:
        print("compare word not in w2v model")
        return remaining_words

    # Look every word up once and skip the ones the model does not know.
    known = [(word, model[word[0]]) for word in words if word[0] in model]

    similarities = {}
    max_similarity = 0
    # Guard: np.mean over an empty array warns and yields nan.
    if known:
        # Create a word cloud center from the vectors of words
        word_cloud_center = np.mean(np.array([vector for _, vector in known]), axis=0).reshape(1, -1)
        for word, vector in known:
            # Calculate cosine similarity with the word cloud center
            similarity = cosine_similarity(word_cloud_center, vector.reshape(1, -1))[0][0]
            similarities[word] = similarity
            if similarity > max_similarity:
                max_similarity = similarity
    max_similarity_list.append(max_similarity)

    # Calculate the threshold as the median of the similarities but not less than a constant
    if similarities:
        threshold = max(np.median(list(similarities.values())), threshold)

    # Sort by similarity and then by frequency in descending order
    similarities_list = sorted(similarities.items(), key=lambda x: (x[1], x[0][1]), reverse=True)

    for word, similarity in similarities_list:
        # Skip words whose similarity is below the cut-off
        if similarity < threshold:
            continue
        remaining_words.append(f"{word[0]}_sim:{'{:.3f}'.format(round(similarity, 3))}_freq:{word[1]}")

    if debug: print(f"similarities: {similarities}")
    return remaining_words

In [127]:
def parse_tokens(scenario_tokens, p_list, possible_pairs, word_lemma_to_wordform, word_wordform_to_lemma, central_possible_pairs, text, central_scenario_text, central_wordform_to_lemma, scenario_type, outfile, debug: bool = False):
    """Pair the words of one scenario tree with the central scenario and build replacement variants.

    Steps:
      1. Collect, for every ``root`` token whose text is in ``p_list``, the
         token properties that are keys of ``valencies``.
      2. For every word under a ``p_list`` token whose text is in
         ``possible_pairs`` and that has a ``SemVal`` property, record the
         relation of each collected property whose last ``-``-separated part
         equals that ``SemVal``.
      3. For every recorded word that is also a key of
         ``central_possible_pairs``, fetch the incidents for the scenario and
         for the central scenario, intersect them by word, keep the words
         known to ``model`` and filter them with
         ``intersect_check_for_compactness``.
      4. For every surviving word, substitute it into both texts.

    Args:
        scenario_tokens: mapping with a ``'root'`` entry (list of tokens) and
            further entries mapping a token text to its dependent tokens.
        p_list: token texts treated as predicates.
        possible_pairs: token texts that may be paired.
        word_lemma_to_wordform: mapping indexed by token text; its values are
            used as keys of the result (built by the caller).
        word_wordform_to_lemma: mapping indexed by those result keys; the value
            is the substring replaced in ``text``.
        central_possible_pairs: mapping word -> ``{verb: relation}`` for the
            central scenario.
        text: scenario text.
        central_scenario_text: central scenario text.
        central_wordform_to_lemma: mapping giving the substring replaced in
            ``central_scenario_text``.
        scenario_type: label used in the report messages.
        outfile: unused; kept for interface compatibility.
        debug: print intermediate values to stdout.

    Returns:
        Tuple ``(found_variants, paired_words_properties, polysemy,
        central_polysemy, selected_valency, central_selected_valency)``.
        ``found_variants`` maps a word to a list of
        ``"<central text> --- <scenario text>"`` strings. The four message
        values are ``None`` unless set while processing a word.
    """
    p_properties = {}
    p_tokens = scenario_tokens['root']
    for p_token in p_tokens:
        if p_token.token_text in p_list:
            p_properties.setdefault(p_token.token_text, []).extend(token_property for token_property in p_token.token_properties if token_property in valencies)
    if debug: print("p_properties:", p_properties)
    paired_words_properties = {}
    for p_token, words in scenario_tokens.items():
        if p_token == 'root' or p_token not in p_list:
            continue
        for word in words:
            if debug: print("try match:",word, possible_pairs)
            if word.token_text in possible_pairs:
                if 'SemVal' not in word.token_properties:
                    continue
                sem_val = word.token_properties['SemVal']
                for p, properties in p_properties.items():
                    for p_property in properties:
                        # The part after the last "-" is compared with SemVal.
                        val = p_property.split("-")[-1]
                        if sem_val == val:
                            paired_words_properties[word_lemma_to_wordform[word.token_text]] = {word_lemma_to_wordform[p]: valencies[p_property]}
    if debug: print("paired_words_properties:", paired_words_properties)
    found_variants = {}
    polysemy = None
    central_polysemy = None
    selected_valency = None
    central_selected_valency = None
    for val in paired_words_properties:
        if val not in central_possible_pairs:
            continue
        scenario_verb_lemma, scenario_relationship = list(paired_words_properties[val].items())[0]
        if debug: print(f"scenario_verb_lemma, scenario_relationship: {scenario_verb_lemma, scenario_relationship}")
        if isinstance(scenario_relationship, list):
            # Several candidate relations: use the one with the most incidents.
            if debug: print(f"polysemy detected in {scenario_type} scenario: {scenario_relationship}")
            polysemy = f"polysemy detected in {scenario_type} scenario: {scenario_relationship}"
            scenario_incidents_selector = []
            for scenario_val in scenario_relationship:
                scenario_incidents_selector.append(get_incidents(scenario_verb_lemma, scenario_val))
            scenario_incidents = max(scenario_incidents_selector, key=len)
            if debug: print(f"selected valency for scenario: {scenario_relationship[scenario_incidents_selector.index(scenario_incidents)]}, len: {len(scenario_incidents)}")
            selected_valency = f"selected valency for {scenario_type} scenario: {scenario_relationship[scenario_incidents_selector.index(scenario_incidents)]}, len: {len(scenario_incidents)}"
        else:
            scenario_incidents = get_incidents(scenario_verb_lemma, scenario_relationship)
            if debug: print(f"selected valency for scenario: {scenario_relationship}, len: {len(scenario_incidents)}")
            selected_valency = f"selected valency for {scenario_type} scenario: {scenario_relationship}, len: {len(scenario_incidents)}"
        if debug: print(f"scenario_incidents: {scenario_incidents}")
        central_scenario_verb_lemma, central_scenario_relationship = list(central_possible_pairs[val].items())[0]
        if debug: print(f"central_scenario_verb_lemma: {central_scenario_verb_lemma}")
        if isinstance(central_scenario_relationship, list):
            if debug: print(f"polysemy detected in central scenario: {central_scenario_relationship}")
            central_polysemy = f"polysemy detected in central scenario: {central_scenario_relationship}"
            central_scenario_incidents_selector = []
            for scenario_val in central_scenario_relationship:
                central_scenario_incidents_selector.append(get_incidents(central_scenario_verb_lemma, scenario_val))
            central_scenario_incidents = max(central_scenario_incidents_selector, key=len)
            if debug: print(f"selected valency for central scenario: {central_scenario_relationship[central_scenario_incidents_selector.index(central_scenario_incidents)]}, len: {len(central_scenario_incidents)}")
            central_selected_valency = f"selected valency for central scenario: {central_scenario_relationship[central_scenario_incidents_selector.index(central_scenario_incidents)]}, len: {len(central_scenario_incidents)}"
        else:
            central_scenario_incidents = get_incidents(central_scenario_verb_lemma, central_scenario_relationship)
            if debug: print(f"selected valency for central scenario: {central_scenario_relationship}, len: {len(central_scenario_incidents)}")
            central_selected_valency = f"selected valency for central scenario: {central_scenario_relationship}, len: {len(central_scenario_incidents)}"
        if debug: print(f"central_scenario_incidents: {central_scenario_incidents}")
        # Build the lookup set once instead of rebuilding a list per central item.
        scenario_incident_words = {x[0] for x in scenario_incidents}
        intersect_incidents = [item for item in central_scenario_incidents if item[0] in scenario_incident_words]
        if debug: print(f"intersect_incidents: {intersect_incidents}")
        # TODO: add w2v close words
        words_nlp = [nlp(word[0])[0] for word in intersect_incidents]
        val_nlp = nlp(val)[0]
        # Model keys have the form "<text>_<POS>".
        val_nlp = f"{val_nlp.text}_{val_nlp.pos_}"
        if debug: print(f"val_nlp: {val_nlp}")
        words_incident = [word[1] for word in intersect_incidents]
        if debug: print([word[0] for word in intersect_incidents])
        if debug: print(words_incident)
        words_pos = [f"{token.text}_{token.pos_}" for token in words_nlp]
        intersect_incidents_checked_with_w2v = [(word, words_incident[i]) for i, word in enumerate(words_pos) if word in model.key_to_index]
        checked_for_compactness = intersect_check_for_compactness(intersect_incidents_checked_with_w2v, val_nlp, debug=debug)
        if debug: print("intersect_incidents_checked_with_w2v:",intersect_incidents_checked_with_w2v)
        if debug: print("checked_for_compactness:", checked_for_compactness)
        if debug: print(f"scenario_incidents_amount: {len(scenario_incidents)}, central_scenario_incidents: {len(central_scenario_incidents)}, w2v check: {len(intersect_incidents_checked_with_w2v)}, compact_check: {len(checked_for_compactness)}")
        if len(checked_for_compactness) > 0:
            found_variants[val] = []
            for word in checked_for_compactness:
                replaced_scenario_text = text.lower().replace(word_wordform_to_lemma[val], f"({word})")
                replaced_central_scenario_text = central_scenario_text.lower().replace(central_wordform_to_lemma[val], f"({word})")
                found_variants[val].append(f"{replaced_central_scenario_text} --- {replaced_scenario_text}")
    return found_variants, paired_words_properties, polysemy, central_polysemy, selected_valency, central_selected_valency

In [128]:
def parse_scenario(scenario, scenario_type, central_possible_pairs, central_scenario_text, central_wordform_to_lemma, outfile, debug: bool = False, include_central_polysemy = False):
    """Report replacement variants for every text of one scenario group.

    For each text in ``scenario["scenarios"]`` the best semantics is chosen
    and its valencies are collected. The tree is then taken from
    ``correct_trees`` if the lower-cased text is there; otherwise the trees in
    ``bad_trees`` are tried in order until one yields variants.

    Report lines go to stdout when ``debug`` is true and to ``outfile``
    otherwise. The three "skipped" notices always go to ``outfile``. The
    central polysemy messages are reported only when
    ``include_central_polysemy`` is true.
    """
    # print(file=None) writes to sys.stdout, which is where debug output goes.
    sink = None if debug else outfile

    def emit(*parts):
        print(*parts, file=sink)

    def emit_variants(variants_by_word):
        for variant_lines in variants_by_word.values():
            for line in variant_lines:
                emit(line)
        emit()

    def emit_messages(first, second):
        emit(first)
        emit(second)

    def skip(reason, header):
        print(reason, file=outfile)
        print(f"{header}\n", file=outfile)

    for text in scenario["scenarios"]:
        lowered = text.lower()
        header = f"{scenario_type}: {lowered}"
        if debug: print("text:",text)
        if lowered == 'неадекв':
            skip("неадекв", header)
            continue

        candidates = scenario['scenarios'][text]['semantics']['semantics']
        best = get_best_semantics(candidates)
        if 'facts' not in best:
            skip("нету фактов", header)
            continue
        facts = best['facts']
        if not facts:
            skip("нету семантики", header)
            continue

        # (lemma, wordform, valency alias) for every valency of every fact.
        semantics = [
            (sem.get('lemma', ''), sem.get('wordform', ''), sem.get('valency_alias', ''))
            for fact in facts
            for sem in fact['valencies']
        ]
        if debug: print("semantics:",semantics)
        by_wordform = {wordform: lemma for lemma, wordform, _ in semantics}
        by_lemma = {lemma: wordform for lemma, wordform, _ in semantics}
        possible_pairs = {
            wordform
            for lemma, wordform, alias in semantics
            if alias != 'p' and (lemma in central_possible_pairs or wordform in central_possible_pairs)
        }
        if debug: print(f"possible pairs: {possible_pairs}")
        p_list = {wordform for _, wordform, alias in semantics if alias == 'p'}

        def run_parse(tokens):
            return parse_tokens(tokens, p_list, possible_pairs, by_wordform, by_lemma, central_possible_pairs, text, central_scenario_text, central_wordform_to_lemma, scenario_type, outfile, debug)

        if lowered in correct_trees:
            found_variants, paired_word_properties, polysemy, central_polysemy, selected_valency, central_selected_valency = run_parse(correct_trees[lowered]['tokens'])
            if include_central_polysemy and central_polysemy is not None:
                emit_messages(central_polysemy, central_selected_valency)
            emit("scenario found in correct_trees")
            if found_variants:
                emit("parsed correct tree")
                emit(f"paired word properties: {paired_word_properties}")
                if polysemy is not None:
                    emit_messages(polysemy, selected_valency)
                emit(f"Creating pairs for {header}")
                emit_variants(found_variants)
            continue

        # No correct tree: try the bad trees one after another.
        trees_tried = 0
        trees_with_variants = 0
        paired_word_properties = {}
        fallback_trees = bad_trees[lowered] if lowered in bad_trees else ()
        for tree in fallback_trees:
            found_variants, paired_word_properties, polysemy, central_polysemy, selected_valency, central_selected_valency = run_parse(tree['tokens'])
            trees_tried += 1
            if not found_variants:
                continue
            trees_with_variants += 1
            if trees_with_variants == 1:
                if include_central_polysemy and central_polysemy is not None:
                    emit_messages(central_polysemy, central_selected_valency)
                emit(f"Creating pairs for {header}")
                emit(f"paired word properties: {paired_word_properties}")
                if polysemy is not None:
                    emit_messages(polysemy, selected_valency)
            emit(f"parsed bad tree {trees_with_variants}")
            emit_variants(found_variants)
            break # comment if want all trees

        if trees_with_variants == 0:
            emit(f"valencies not found for any tree in Sketches, trees_parsed: {trees_tried}")
            if debug:
                print(header)
                print()
            else:
                print(f"{header}\n", file=outfile)
                print(file=outfile)

In [129]:
def main(full_scenario, outfile, debug: bool = False):
    """Process one scenario id: analyse its central scenario, then each related group.

    The first text under ``scenarios[full_scenario]["CENTRAL"]`` is taken as
    the central scenario. Its tree must be present in ``correct_trees``;
    otherwise a notice is reported and the function returns. The words paired
    with a predicate in the central tree are then passed to ``parse_scenario``
    for each of the ``BEFORE``, ``AFTER`` and ``INTERP`` groups that exist.

    Args:
        full_scenario: key of ``scenarios``.
        outfile: file object that receives the report when ``debug`` is false.
        debug: print the report and intermediate values to stdout instead.

    Returns:
        None.
    """
    central_scenario = scenarios[full_scenario]["CENTRAL"]["scenarios"]
    central_scenario_text = list(central_scenario.keys())[0]
    if debug:
        print(f"Parsing scenario: {full_scenario} - {central_scenario_text}\n")
    else:
        print(f"Parsing scenario: {full_scenario} - {central_scenario_text}\n", file=outfile)
    central_scenario_semantic = central_scenario[central_scenario_text]['semantics']['semantics']
    central_scenario_best_semantic = get_best_semantics(central_scenario_semantic)
    if 'facts' not in central_scenario_best_semantic:
        print(central_scenario_text, "has no facts")
        return
    # Reuse the candidate chosen above instead of ranking a second time.
    central_facts = central_scenario_best_semantic['facts']
    # (lemma, wordform, valency alias) for every valency of every fact.
    semantics = [(sem.get('lemma', ''), sem.get('wordform', ''), sem.get('valency_alias', '')) for fact in central_facts for sem in fact['valencies']]
    if debug: print("semantics:", semantics)
    # Despite its name, this dict is keyed by wordform and yields the lemma.
    word_lemma_to_wordform = {lemma[1]: lemma[0] for lemma in semantics}
    word_wordform_to_lemma = {lemma[0]: lemma[1] for lemma in semantics}
    possible_pairs = {lemma[1] for lemma in semantics if lemma[2] != 'p'}
    if central_scenario_text.lower() not in correct_trees:
        if debug:
            print("central scenario not in correct trees\n")
        else:
            print("central scenario not in correct trees\n", file=outfile)
        return
    central_scenario_tokens = correct_trees[central_scenario_text.lower()]['tokens']
    central_p_list = {lemma[1] for lemma in semantics if lemma[2] == 'p'}
    central_p_properties = {}
    p_tokens = central_scenario_tokens['root']
    for p_token in p_tokens:
        if p_token.token_text in central_p_list:
            central_p_properties.setdefault(p_token.token_text, []).extend(token_property for token_property in p_token.token_properties if token_property in valencies)
    if debug: print("central_p_properties:", central_p_properties)
    central_paired_words_properties = {}
    for p_token, words in central_scenario_tokens.items():
        if p_token == 'root' or p_token not in central_p_list:
            continue
        for word in words:
            # Use whichever of text / lemma matched as the lookup key. Looking
            # up token_text after a lemma-only match raised KeyError.
            if word.token_text in possible_pairs:
                matched_key = word.token_text
            elif word.token_lemma in possible_pairs:
                matched_key = word.token_lemma
            else:
                continue
            if 'SemVal' not in word.token_properties:
                continue
            sem_val = word.token_properties['SemVal']
            for p, properties in central_p_properties.items():
                for p_property in properties:
                    # The part after the last "-" is compared with SemVal.
                    val = p_property.split("-")[-1]
                    if sem_val == val:
                        central_paired_words_properties[word_lemma_to_wordform[matched_key]] = {word_lemma_to_wordform[p]: valencies[p_property]}
    if debug: print(f"central_possible words with properties: {central_paired_words_properties}")
    scenario_types = ["BEFORE", "AFTER", "INTERP"]
    possible_types_in_current = [scenario_type for scenario_type in scenario_types if scenario_type in scenarios[full_scenario]]
    # Central polysemy messages are requested only for the first group.
    first_scenario = True
    for scenario_type in possible_types_in_current:
        if debug: print("scenario_type:",scenario_type)
        parse_scenario(scenarios[full_scenario][scenario_type], scenario_type, central_paired_words_properties, central_scenario_text.lower(), word_wordform_to_lemma, outfile, debug, first_scenario)
        first_scenario = False

In [130]:
# Run every scenario and write the report to test_on_scenarios.txt.
# Start from an empty list so that re-running this cell does not mix in the
# similarities collected by an earlier run.
max_similarity_list.clear()
# The `with` block closes the report file even if a scenario raises.
with open("test_on_scenarios.txt", mode="w", encoding="UTF8") as res_file:
    with tqdm(total=len(scenarios), desc="Parsing scenarios") as pbar:
        for scenario in scenarios:
            pbar.update()
            scenario_text = main(scenario, res_file)  # main() returns None
    # statistics.mean raises StatisticsError on an empty list.
    if max_similarity_list:
        print(f"mean sim: {statistics.mean(max_similarity_list)}", file=res_file)
    else:
        print("mean sim: n/a", file=res_file)

Parsing scenarios:   0%|          | 0/3 [00:00<?, ?it/s]