In [0]:
# Change the working directory to the course folder (the path looks like a
# mounted Google Drive location in Colab). Presumably this is what lets the
# local `brown` module imported in a later cell be found -- TODO confirm.
# NOTE(review): the path is hard-coded and only exists in that environment.
import os ; os.chdir('/content/drive/My Drive/Colab Notebooks/Lazy courses/NLP2')

In [0]:
import numpy as np
import os
import sys
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

In [0]:
# import nltk
# nltk.download('brown')

In [0]:
def get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=1):
    """Estimate a smoothed bigram transition matrix from index-encoded sentences.

    The result is laid out as ``[previous word, current word] -> probability``.

    Args:
        sentences: iterable of sentences, each a sequence of integer word
            indexes in ``range(V)``.
        V: vocabulary size; the returned matrix is ``V x V``.
        start_idx: index used as the "previous word" of a sentence's first word.
        end_idx: index used as the "current word" following a sentence's
            last word.
        smoothing: pseudo-count every cell starts with before real counts are
            added (additive smoothing; the default of 1 is add-one).

    Returns:
        A ``(V, V)`` float array whose rows have been divided by their sums.

    Note: the row for ``end_idx`` is not expected to be used, since nothing
    follows the END token.
    """
    # Every cell starts at the smoothing pseudo-count.
    counts = np.ones((V, V)) * smoothing

    for sentence in sentences:
        # Walk the sentence while remembering the word that came before;
        # the first word is preceded by the START token.
        previous = start_idx
        for current in sentence:
            counts[previous, current] += 1
            previous = current

        # Close a non-empty sentence with the last word --> END transition.
        # An empty sentence contributes no counts at all.
        if len(sentence) > 0:
            counts[previous, end_idx] += 1

    # Turn each row of counts into a probability distribution.
    counts /= counts.sum(axis=1, keepdims=True)
    return counts

In [0]:
def get_score(sentence):
    """Return the length-normalized log-probability of an index-encoded sentence.

    Uses the module-level ``bigram_probs`` matrix together with ``start_idx``
    and ``end_idx``. The log-probabilities of START --> first word, every
    consecutive word pair, and last word --> END are summed, then divided by
    the number of transitions (``len(sentence) + 1``).

    Raises:
        IndexError: if ``sentence`` is empty (there is no last word to look up).
    """
    log_prob = 0
    previous = start_idx  # the first word is conditioned on the START token
    for current in sentence:
        log_prob += np.log(bigram_probs[previous, current])
        previous = current

    # Transition from the final word to the END token.
    log_prob += np.log(bigram_probs[sentence[-1], end_idx])

    # Normalize by the number of bigrams so sentences of different lengths
    # are comparable.
    n_transitions = len(sentence) + 1
    return log_prob / n_transitions

def get_words(sentence):
    """Map a sequence of word indexes back to a space-separated string.

    Looks each index up in the module-level ``idx2word`` dictionary.
    """
    words = [idx2word[index] for index in sentence]
    return ' '.join(words)

In [12]:
if __name__ == '__main__':
    # Load the data.
    # The sentences are already converted to sequences of word indexes.
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(10000)
    #sentences, word2idx = get_sentences_with_word2idx()

    # Vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # The beginning and the end of a sentence are treated as bigrams too:
    #   START --> first word
    #   last word --> END
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # bigram_probs is a matrix where:
    #   row = last word
    #   col = current word
    #   value at [row, col] = p(current word | last word)
    # These names stay at module level because get_score / get_words read them.
    bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)
    idx2word = dict((v, k) for k, v in word2idx.items())

    # When sampling a fake sentence, draw uniformly over the vocabulary but
    # never pick the START or END token.
    sample_probs = np.ones(V)
    sample_probs[start_idx] = 0
    sample_probs[end_idx] = 0
    sample_probs /= sample_probs.sum()

    # Test the model on real and fake sentences
    while True:
        # Real sentence: a random one from the corpus
        real_idx = np.random.choice(len(sentences))
        real = sentences[real_idx]

        # Fake sentence: random words, same length as the real one
        fake = np.random.choice(V, size=len(real), p=sample_probs)

        print("REAL:", get_words(real), "SCORE:", get_score(real))
        print("FAKE:", get_words(fake), "SCORE:", get_score(fake))

        # Input your own custom sentence
        custom = input("Enter your own custom sentence: \n")
        custom = custom.lower().split()

        if not custom:
            # An empty / whitespace-only line yields no tokens; get_score
            # indexes sentence[-1] and would raise IndexError on it.
            print("Sorry you entered an empty sentence")
        elif any(token not in word2idx for token in custom):
            # Every token must exist in word2idx, otherwise it can't be scored
            print("Sorry you entered words not in vocabulary")
        else:
            # Convert the sentence into a list of indexes
            custom = [word2idx[token] for token in custom]
            print("SCORE:", get_score(custom))

        cont = input("Continue? [y/n]")
        # Only an explicit "n" (any case, surrounding whitespace ignored)
        # stops the loop; anything else, including just Enter, continues.
        if cont.strip().lower() == 'n':
            break

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
enforcement 19
commissioner 19
appeals 19
supervision 19
interviews 19
tangible 19
politicians 19
elementary 19
respective 19
stresses 19
directors 19
continental 19
filing 19
males 19
guards 19
vincent 19
salem 19
lodge 19
specialists 19
wiped 19
slender 19
snapped 19
string 19
whip 19
ray 19
achievements 19
span 19
drank 19
fathers 19
stroke 19
1927 19
frederick 19
addressed 19
ethics 19
toast 19
lover 19
calif. 19
solved 19
theology 19
crown 19
convenience 19
men's 19
victims 19
arrested 19
cottage 19
lid 19
packed 19
lacked 19
condemned 19
documents 19
corporate 19
eve 19
entries 19
wildlife 19
livestock 19
businesses 19
attract 19
companion 19
rid 19
shipping 19
earnings 19
makers 19
gains 19
venture 19
affects 19
demanding 19
delivery 19
allows 19
toes 19
loves 19
mexico 19
likes 19
ham 19
label 19
ladder 19
dreamed 19
resting 19
guitar 19
pamela 19
traveling 19
slip 19
spell 19
neatly 19
people's 19
decisive 19
ten