# Tasakaalus korpuse verbide statistika kogumine


In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
from datetime import datetime

from data_helpers.syntax_graph import SyntaxGraph
from data_helpers.tasak_reader import TasakReader
from data_helpers.utils import ListUtils

In [35]:
%%time

# Collect verb frequency statistics from the corpus file.
#
# Produces three module-level results that later cells read:
#   verb_plain_stat      -- verb lemma -> number of occurrences
#   verbs_compound_stat  -- (verb lemma, joined particle lemmas) -> number of occurrences
#   verbs_total          -- total number of verb nodes seen
file_name = 'data/tasak.vert'

my_reader = TasakReader(file_name=file_name)

# NOTE(review): `stats` is not read anywhere in the visible cells; kept in case
# a later cell depends on it.
stats = {}

# Timestamp captured once so that output files written by later cells share it.
date_time = datetime.now().strftime("%Y%m%d-%H%M%S")

# These accumulators are kept in memory from start to finish.
verb_plain_stat = {}
verbs_compound_stat = {}
verbs_total = 0

# NOTE(review): `count` is never updated or read in the visible cells; kept for
# backward compatibility with any later cell.
count = 0
for collection_id, graph in my_reader.get_sentences():
    # Matrix of node distances for this sentence graph.
    dpath = graph.get_distances_matrix()

    # Keep only the verb nodes.
    verb_nodes = graph.get_nodes_by_attributes(attrname="POS", attrvalue="V")
    # Nodes attached with the `compound:prt` dependency relation.
    compound_nodes = graph.get_nodes_by_attributes(
        attrname="deprel", attrvalue="compound:prt"
    )
    for verb in verb_nodes:
        verbs_total += 1
        lemma = graph.nodes[verb]['lemma']
        verb_plain_stat[lemma] = verb_plain_stat.get(lemma, 0) + 1

        # Nodes at distance 1 from the verb (treated as its children here).
        distances_from_verb = dpath[verb]
        kids = [k for k in distances_from_verb if distances_from_verb[k] == 1]
        # Children that are also compound:prt nodes.
        n_compounds = ListUtils.list_intersection(kids, compound_nodes)

        # Joining an empty list yields "", so a verb without particles gets the
        # key (lemma, "") without a separate branch. The `if n` filter is kept
        # from the original and skips falsy node ids.
        verb_compound = ", ".join(
            [graph.nodes[n]["lemma"] for n in sorted(n_compounds) if n]
        )
        key = (lemma, verb_compound, )
        verbs_compound_stat[key] = verbs_compound_stat.get(key, 0) + 1

print('Verbs total:', verbs_total)
 

data/tasak.vert


TSV lines: 100%|██████████| 20058039/20058039 [01:18<00:00, 255824.35it/s]

Verbs total: 2767812
CPU times: user 1min 18s, sys: 801 ms, total: 1min 18s
Wall time: 1min 19s





In [36]:
import pandas as pd

In [37]:
# Build a two-column table (verb lemma, total occurrences) from the counts
# gathered in `verb_plain_stat`.
verb_plain_list = list(verb_plain_stat.items())
df_plain = pd.DataFrame(verb_plain_list, columns=["verb", "total"])
# A bare `df_plain.head()` that is not the last expression of the cell renders
# nothing, so the preview is shown explicitly.
display(df_plain.head())

# Write the table as TSV without the row index; the file name carries the run timestamp.
df_plain.to_csv(f"tasak_verb_{date_time}.tsv", index=False, sep="\t")

In [38]:
# Flatten the (verb lemma, compound label) -> count mapping into rows of
# (verb, compound, total).
verbs_compound_list = [
    (verb, compound, total)
    for (verb, compound), total in verbs_compound_stat.items()
]
df_compound = pd.DataFrame(verbs_compound_list, columns=["verb", "compound", "total"])
display(df_compound.head())

# Write the table as TSV without the row index; the file name carries the run timestamp.
df_compound.to_csv(f"tasak_verb_afiksaaladverb_{date_time}.tsv", index=False, sep="\t")

Unnamed: 0,verb,compound,total
0,olema,,1
1,ignoreerima,,1
2,võima,,1
3,võitma,,1
4,toimima,,1
