In [1]:
import networkx as nx
import numpy as np

# from gensim.models import Word2Vec
from random import choices

In [2]:
vocab_G = nx.read_gml('./Graphs/corpus_vocab.gml')

In [3]:
class NetworkLM():
    def __init__(self, G, freq = 'freq', count = 'count') -> None:
        """Initialise the language model
        Parameters:
            G (nx.DiGraph): the vocabulary network
            freq (str): the node attribute that stores word frequency (default: freq)
            count (str): the edge attribute that stores edge frequency (default: count)
        Returns:
            None"""
        
        self.G = G
        self.freq = freq
        self.count = count
        self.SENT_BEG = '<s>'
        self.SENT_END = '</s>'
        
    def k_most_common_from(self, target, k = 10) -> dict:
        """Find the k most common words after a target word
        Parameters:
            target (str): the target word
            k (int): the limit (default: 10), if None then all are retrieved
        Returns:
            dict: next words and their probabilities sorted desc"""
        
        # find all possible next words and counts using out-edges of target
        next_words = {i[1]: i[2][self.count] for i in self.G.out_edges(target, data = True)}
        
        if len(next_words) > 0:
            total = sum(next_words.values())  # calculate total out-edges
            
            # get next words sorted desc by probability
            next_words = sorted({i: next_words[i] / total for i in next_words}.items(), key = lambda x: x[1], reverse = True)
            
            if k:
                return dict(next_words[: k])
            else:
                return dict(next_words)
            
        return dict()
    
    def k_most_common_to(self, target, k = 10) -> dict:
        """Find the k most common words before a target word
        Parameters:
            target (str): the target word
            k (int): the limit (default: 10), if None then all are retrieved
        Returns:
            dict: prev words and their probabilities sorted desc"""
        
        # find all possible prev words and counts using in-edges of target
        prev_words = {i[0]: i[2][self.count] for i in self.G.in_edges(target, data = True)}
        
        if len(prev_words) > 0:
            total = sum(prev_words.values())  # calculate total in-edges
            
            # get prev words sorted desc by probability
            prev_words = sorted({i: prev_words[i] / total for i in prev_words}.items(), key = lambda x: x[1], reverse = True)
            
            if k:
                return dict(prev_words[: k])
            else:
                return dict(prev_words)
            
        return dict()
    
    def perplexity(self, prob, n) -> float:
        """Calculate the perplexity given the probability
        Parameters:
            prob (float): the probability
            n (int): sentence length
        Returns:
            float: the perplexity"""
    
        return prob ** (-1 / n)
    
    def generate_sentence_shannon(self, seed, max_len = 10, mode = 1) -> (list, float):
        """Generate a sentence from a seed word
        Parameters:
            seed (str): the seed word
            max_len (int): the max sentence length (default: 10)
            mode (int): mode of operation - 1 uses out-edges, 0 uses in-edges (default: 1)
        Returns:
            list: the tokens of the generated sentence
            float: the sentence probability"""
        
        score = 0
        sentence, sent_len = [], 0 if seed == self.SENT_BEG else 1
        
        sentence.append(seed)  # append the seed to the empty sentence
        
        # generate the next words
        while sent_len <= max_len:
            words = None
            
            if mode == 1:  # get possible words using out edges
                words = self.k_most_common_from(sentence[-1], k = None)
            else:
                words = self.k_most_common_to(sentence[-1], k = None)
                
            if len(words) == 0:
                break
            
            # select word and add to sentence
            word = choices(list(words.keys()), list(words.values()))
            word = word[0]
            
            sentence.append(word)
            sent_len += 1
            
            score += np.log10(words[word])  # get probability of the selected word 
            
            # break conditions
            if mode == 1 and word == self.SENT_END:
                break
            elif mode == 0 and word == self.SENT_BEG:
                break
                
        if mode == 1:
            if sentence[-1] != self.SENT_END:
                sentence.append(self.SENT_END)

            if sentence[0] != self.SENT_BEG:
                sentence.insert(0, self.SENT_BEG)
                
        else:
            if sentence[-1] != self.SENT_BEG:
                sentence.append(self.SENT_BEG)
                
            if sentence[0] != self.SENT_END:
                sentence.insert(0, self.SENT_END)
                
        return sentence, 10 ** score
    
    def generate_sentence_inside_out(self, seed, before = 5, after = 4) -> (str, float):
        """Generate a sentence inside-out
        Parameters:
            seed (str): the seed word
            before (int): the number of words require before the seed (default: 5)
            after (int): the number of sentences required after the seed (default: 4)
        Returns:
            list: the tokens of the generated sentence
            float: the sentence probability"""
        
        sentence_before, prob_before = self.generate_sentence_shannon(seed, before, mode = 0)
        
        sentence_after, prob_after = self.generate_sentence_shannon(seed, after)
        
        sentence_before = sentence_before[:: -1][: -1]  # reverse the list and remove the last token i.e. </s>
        sentence_after = sentence_after[2 :]  # remove the first 2 tokens i.e. <s> and the seed
        
        sentence_before.extend(sentence_after)
        
        total_score = np.log10(prob_before) + np.log10(prob_after)  # get total score
        
        return sentence_before, 10 ** total_score
        
    def generate_sentences(self, seed, technique = 1, n = 5, max_len = 10, before = 5, after = 4) -> (list, list):
        """Generate n sentences
        Parameters:
            technique
            seend (str): the seed word
            technique (int): whether to use standard shannon (1) or inside-out (2) (default: 1)
            n (int): the number of sentences to generate (default: 5)
            max_len (int): the max sentence length if using standard shannon (default: 10)
            before (int): the number of words require before the seed if using inside-out (default:  5)
            after (int): the number of sentences required after the seed if using inside-out (default: 4)
        Returns:
            list: a list of sentences
            list: a list of associated probabilities"""
        
        sentences, probs = [], []
        
        if technique == 1:
            for _ in range(n):
                sentence, prob = self.generate_sentence_shannon(seed, max_len = max_len)
                
                sentences.append(' '.join(sentence))
                probs.append(prob)
                
        elif technique == 2:
            for _ in range(n):
                sentence, prob = self.generate_sentence_inside_out(seed, before = before, after = after)
                
                sentences.append(' '.join(sentence))
                probs.append(prob)
                
        return sentences, probs

In [4]:
nlm = NetworkLM(vocab_G)

In [5]:
sents, probs = nlm.generate_sentences('smart', technique = 1)

sents

['<s> smart wonder to the detective s bay of six miles that </s>',
 '<s> smart hundred steps from sir francis </s>',
 '<s> smart revelation and breathing was the thick weighs 1 500 lb </s>',
 '<s> smart wager the brightness with a bad fortune favors the frigate </s>',
 '<s> smart breath at 5 600 atmospheres </s>']

In [7]:
sents, probs = nlm.generate_sentences('fogg', technique = 2, before = 7, after = 100)

sents

['<s> sir scorched his hand my dear phileas fogg who denies you going on to the fire was the frigate have made all kinds heaped up to dig a candidate for a bag as they will be placed near his command the lashings like a meeting a chimerical creature your opinion and yet </s>',
 '<s> say that slid to be to phileas fogg betrayed the nautilus </s>',
 '<s> phileas fogg snugly ensconced himself to mr fogg </s>',
 '<s> protector and a sly dog said mr fogg like perambulating pagodas tigers wanting and the coal </s>',
 '<s> this region through phileas fogg applied to lose a word of silence </s>']

In [7]:
# s, p = nlm.generate_sentences('nemo', technique = 1)
# pp = []

# for i in range(len(s)):
#     pp.append(nlm.perplexity(p[i], len(s[i])))
    
# s, p, pp