In [1]:
import random
from pathlib import Path
from preprocessing import tokenize
from collocations import make_graph, cosine_similarities
from plotting import draw_graph

In [2]:
def read_all(directory, encoding=None):
    """Concatenate the text of every regular file directly inside ``directory``.

    Files are read in sorted path order so the result is identical on every
    run and every filesystem (``Path.glob`` itself yields entries in no
    guaranteed order). The glob is not recursive and sub-directories are
    skipped.

    Args:
        directory: Folder to read, as a ``str`` or ``Path``.
        encoding: Text encoding forwarded to ``Path.read_text``. ``None``
            (the default) keeps the platform's default encoding.

    Returns:
        The contents of all files joined by a single newline; an empty
        string when no file matches.
    """
    files = sorted(path for path in Path(directory).glob('*') if path.is_file())
    return "\n".join(path.read_text(encoding=encoding) for path in files)

def neighbourhood(graph, node, filter_function=lambda edges: edges):
    """Return the sub-graph made of (a selection of) ``node``'s outgoing edges.

    ``graph.out_edges(node, data=True)`` is passed through ``filter_function``
    (the identity by default). The first two elements of every edge that
    survives the filter form the edge key handed to ``graph.edge_subgraph``.

    Args:
        graph: Object exposing ``out_edges`` and ``edge_subgraph``.
        node: Node whose outgoing edges are collected.
        filter_function: Callable receiving the outgoing edges and returning
            the edges to keep.

    Returns:
        Whatever ``graph.edge_subgraph`` returns for the kept edges.
    """
    outgoing = graph.out_edges(node, data=True)
    endpoint_pairs = []
    for edge in filter_function(outgoing):
        # Drop the attached data; only the two endpoints identify the edge.
        endpoint_pairs.append((edge[0], edge[1]))
    return graph.edge_subgraph(endpoint_pairs)

def top_n_filter(n):
    """Build an edge filter that keeps the ``n`` highest-weight edges.

    The returned callable takes an iterable of edges whose third element is
    a mapping with a ``"weight"`` entry, orders them from the largest weight
    to the smallest and returns the first ``n`` of them as a list.
    """
    def keep_heaviest(edges):
        ranked = list(edges)
        ranked.sort(key=lambda edge: edge[2]["weight"], reverse=True)
        return ranked[:n]

    return keep_heaviest

def draw(graph):
    """Render ``graph`` through ``draw_graph`` using its ``"weight"`` attribute.

    ``"weight"`` is passed as both the label attribute and the width
    attribute; label values are formatted with three decimal places.

    Returns:
        Whatever ``draw_graph`` returns.
    """
    def format_weight(x):
        return f"{x:.3f}"

    return draw_graph(
        graph,
        label_attribute="weight",
        label_function=format_weight,
        width_attribute="weight",
    )

In [3]:
# Folders holding the two sets of text files that are read and compared below.
directory1 = "data/1"
directory2 = "data/2"

In [4]:
# Read each directory into one string and tokenize both texts in a single
# call; `tokens` is indexed per text further down (tokens[0], tokens[1]).
tokens, vocabulary = tokenize(
    [read_all(directory) for directory in (directory1, directory2)]
)

In [5]:
# Graph built by `make_graph` from the first entry of `tokens`
# (presumably the text read from directory1 — assumes `tokenize` keeps input order).
G1 = make_graph(tokens[0], vocabulary)

In [6]:
# Graph built by `make_graph` from the second entry of `tokens`
# (presumably the text read from directory2 — assumes `tokenize` keeps input order).
G2 = make_graph(tokens[1], vocabulary)

In [7]:
# Compare the two graphs with `cosine_similarities` and keep the result as a
# list of (key, score) pairs ordered from the highest score to the lowest.
# NOTE: slow — the progress bar saved with this cell shows roughly 1 h 26 min.
similarities = sorted(
    cosine_similarities(G1, G2, vocabulary).items(),
    key=lambda pair: pair[1],
    reverse=True,
)

100%|██████████| 61642/61642 [1:26:09<00:00, 11.92it/s]


In [8]:
# Pick one (node, score) pair whose score lies strictly between 0 and 1;
# entries scoring exactly 0 or exactly 1 are left out.
# A dedicated, seeded generator makes the pick — and the figures drawn from
# it — repeatable after a kernel restart. Change SEED to inspect another node.
SEED = 0
candidates = [(name, value) for name, value in similarities if 0 < value < 1]
node, score = random.Random(SEED).choice(candidates)

In [9]:
# Display the similarity score of the node picked in the previous cell.
score

0.0008561736159030396

In [10]:
# Sub-graph of G1 made of the 10 highest-weight outgoing edges of `node`.
neighbourhood_g1 = neighbourhood(G1, node, filter_function=top_n_filter(10))
draw(neighbourhood_g1)

In [11]:
# Sub-graph of G2 made of the 10 highest-weight outgoing edges of `node`.
neighbourhood_g2 = neighbourhood(G2, node, filter_function=top_n_filter(10))
draw(neighbourhood_g2)