To do: Convert this to a class

In [1]:
import networkx as nx

In [2]:
class NetworkLM():
    """Next-word language model backed by a word graph loaded from GML.

    Nodes are words; an edge from word A to word B carries, in the edge
    attribute named by ``count``, how often B followed A.
    NOTE(review): ``out_edges`` is used below, so the loaded graph is
    assumed to be directed -- confirm the GML file declares ``directed 1``.
    """

    def __init__(self, graph_path, freq = 'freq', count = 'count') -> None:
        """Initialise the language model
        Parameters:
            graph_path (str): filepath to the network GML
            freq (str): the node attribute that stores word frequency (default: freq)
            count (str): the edge attribute that stores edge frequency (default: count)
        Returns:
            None"""

        self.G = nx.read_gml(graph_path)
        self.freq = freq
        self.count = count

    def k_most_common_from(self, target, k = 10) -> dict:
        """Find the k most common words after a target word
        Parameters:
            target (str): the target word
            k (int): the limit (default: 10), if None then all are retrieved
        Returns:
            dict: next words and their probabilities sorted desc; empty if
                the target is not in the graph, has no out-edges, or its
                out-edge counts sum to zero"""

        # An unknown target must be rejected explicitly: when the argument
        # is not a node, networkx treats it as an iterable of nodes, so a
        # string would be expanded into its individual characters.
        if target not in self.G:
            return {}

        # find all possible next words and counts using out-edges of target
        counts = {
            successor: attrs[self.count]
            for _, successor, attrs in self.G.out_edges(target, data=True)
        }

        total = sum(counts.values())  # total weight over all out-edges
        if not total:
            # no out-edges, or nothing to normalise by
            return {}

        # get next words sorted desc by probability (ties keep edge order)
        ranked = sorted(
            ((word, n / total) for word, n in counts.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Only None means "no limit"; k == 0 legitimately yields nothing.
        if k is not None:
            ranked = ranked[:k]

        return dict(ranked)

In [3]:
# Build the language model from the GML graph stored at ./Graphs/corpus_vocab.gml
# (uses the default 'freq' node attribute and 'count' edge attribute names).
nlm = NetworkLM('./Graphs/corpus_vocab.gml')