In [23]:
import nltk
from nltk.corpus import brown
import string
# nltk.download('punkt') # Download the 'punkt' tokenizer
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
import operator
import numpy as np

In [None]:
def contains_punctuation(word):
    """Return True if at least one character of ``word`` is in ``string.punctuation``."""
    for symbol in word:
        if symbol in string.punctuation:
            # one punctuation character is enough; stop scanning
            return True
    return False

In [None]:
# Brown corpus tokens -> lowercase -> drop every token that contains punctuation
words = brown.words()
lowercase = list(map(str.lower, words))
remove_punc = list(filter(lambda token: not contains_punctuation(token), lowercase))
processed_words = remove_punc  # alias of the same list object, not a copy

In [None]:
# Peek at the first 100 processed tokens rather than the whole list.
print(processed_words[:100])

In [None]:
# Small example sentence used to try out the tokenizer.
text = "You are learning from Geeks for, Geeks."
# Split the example text into tokens with NLTK's word_tokenize.
tokens = word_tokenize(text)
print(tokens)

In [None]:
# Build the pairs through the module-qualified ``nltk.bigrams`` instead of the
# bare imported ``bigrams`` name: this cell rebinds ``bigrams`` to the result
# list, so calling the bare name would raise
# "TypeError: 'list' object is not callable" the second time the cell is run.
bigram_prelist = nltk.bigrams(tokens)
# Materialise the lazy generator so the pairs can be printed and reused.
bigrams = list(bigram_prelist)

In [None]:
# Show the list of adjacent-token pairs built from the example tokens.
print(bigrams)


In [16]:
# Words that must stay in the vocabulary even when it is truncated to the
# most frequent entries (used as the default ``keep_words`` below).
KEEP_WORDS = {
  'king', 'man', 'queen', 'woman',
  'italy', 'rome', 'france', 'paris',
  'london', 'britain', 'england',
}


def get_sentences():
  """Return the sentences of the Brown corpus.

  The original note on this function gives the corpus size as 57340
  sentences. Each sentence is represented as a list of individual string
  tokens.
  """
  corpus_sentences = brown.sents()
  return corpus_sentences


def get_sentences_with_word2idx():
  """Index every corpus sentence with a lowercase token -> integer vocabulary.

  Ids 0 and 1 are reserved for the 'START' and 'END' markers; every other id
  is handed out in order of first appearance of the lowercased token. The
  final vocabulary size (markers included) is printed.

  Returns:
    (indexed_sentences, word2idx): the sentences as lists of integer ids, and
    the token -> id dictionary.
  """
  word2idx = {'START': 0, 'END': 1}
  indexed_sentences = []

  for raw_sentence in get_sentences():
    # setdefault inserts only on a miss; len(word2idx) is then the next free id
    encoded = [
      word2idx.setdefault(raw_token.lower(), len(word2idx))
      for raw_token in raw_sentence
    ]
    indexed_sentences.append(encoded)

  print("Vocab size:", len(word2idx))
  return indexed_sentences, word2idx


def get_sentences_with_word2idx_limit_vocab(n_vocab=2000, keep_words=KEEP_WORDS):
  """Index the corpus sentences using a vocabulary capped at ``n_vocab`` entries.

  Tokens are lowercased and counted. The ``n_vocab`` highest-count entries
  are kept, where 'START', 'END' and every word in ``keep_words`` are given
  an infinite count so they rank first. Every other token is mapped to one
  extra 'UNKNOWN' id placed after the kept entries.

  Args:
    n_vocab: number of vocabulary entries to keep, not counting 'UNKNOWN'.
    keep_words: iterable of lowercase words that must survive the cut.

  Returns:
    (sentences_small, word2idx_small): the re-indexed sentences (sentences
    with fewer than two tokens are dropped) and the reduced word -> id map.

  Raises:
    KeyError: if a word in ``keep_words`` never occurs in the corpus.
    AssertionError: if 'START', 'END' or a keep word did not make the cut.
  """
  sentences = get_sentences()

  word2idx = {'START': 0, 'END': 1}
  idx2word = ['START', 'END']
  # the two markers get an infinite count so they always rank first
  word_idx_count = dict.fromkeys((0, 1), float('inf'))

  indexed_sentences = []
  for raw_sentence in sentences:
    encoded = []
    for raw_token in raw_sentence:
      lowered = raw_token.lower()
      idx = word2idx.get(lowered)
      if idx is None:
        # first sighting: the next free id is the current vocabulary length
        idx = len(idx2word)
        idx2word.append(lowered)
        word2idx[lowered] = idx

      # tally occurrences per id for the frequency ranking below
      word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
      encoded.append(idx)
    indexed_sentences.append(encoded)

  # force the must-keep words to the top of the ranking
  for word in keep_words:
    word_idx_count[word2idx[word]] = float('inf')

  # stable sort: ties keep their first-seen order
  ranked_indices = sorted(word_idx_count, key=word_idx_count.get, reverse=True)
  kept_indices = ranked_indices[:n_vocab]

  # old id -> new compact id, in ranking order
  idx_new_idx_map = {old_idx: new_idx for new_idx, old_idx in enumerate(kept_indices)}
  word2idx_small = {idx2word[old_idx]: new_idx for old_idx, new_idx in idx_new_idx_map.items()}

  # 'UNKNOWN' takes the id right after the last kept entry
  unknown = len(kept_indices)
  word2idx_small['UNKNOWN'] = unknown

  for required in ('START', 'END', *keep_words):
    assert required in word2idx_small

  # re-index the sentences, dropping those with fewer than two tokens
  sentences_small = [
    [idx_new_idx_map.get(idx, unknown) for idx in sentence]
    for sentence in indexed_sentences
    if len(sentence) > 1
  ]

  return sentences_small, word2idx_small

In [8]:
# Load the corpus sentences and show the first three as token lists.
sentences = get_sentences()
for position in (1, 2, 3):
    print(f"Sentence {position}: {sentences[position - 1]}")

Sentence 1: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
Sentence 2: ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
Sentence 3: ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']


In [10]:
# Index the whole corpus; the call also prints the resulting vocabulary size.
indexed_sentences, word2idx = get_sentences_with_word2idx()

Vocab size: 49817


In [11]:
# Preview only the first 20 entries: the full mapping holds every distinct
# token (the indexing run reports a vocab size of 49817), so printing it whole
# floods the notebook output.
print(dict(list(word2idx.items())[:20]))



In [17]:
# Re-index the corpus keeping at most 2000 vocabulary entries; every other
# token is mapped to the extra 'UNKNOWN' id.
sentences_small, word2idx_small = get_sentences_with_word2idx_limit_vocab(n_vocab=2000)

In [20]:
# Show the reduced word -> id mapping.
print(word2idx_small)

{'START': 0, 'END': 1, 'man': 2, 'paris': 3, 'britain': 4, 'england': 5, 'king': 6, 'woman': 7, 'rome': 8, 'london': 9, 'queen': 10, 'italy': 11, 'france': 12, 'the': 13, ',': 14, '.': 15, 'of': 16, 'and': 17, 'to': 18, 'a': 19, 'in': 20, 'that': 21, 'is': 22, 'was': 23, 'he': 24, 'for': 25, '``': 26, "''": 27, 'it': 28, 'with': 29, 'as': 30, 'his': 31, 'on': 32, 'be': 33, ';': 34, 'at': 35, 'by': 36, 'i': 37, 'this': 38, 'had': 39, '?': 40, 'not': 41, 'are': 42, 'but': 43, 'from': 44, 'or': 45, 'have': 46, 'an': 47, 'they': 48, 'which': 49, '--': 50, 'one': 51, 'you': 52, 'were': 53, 'her': 54, 'all': 55, 'she': 56, 'there': 57, 'would': 58, 'their': 59, 'we': 60, 'him': 61, 'been': 62, ')': 63, 'has': 64, '(': 65, 'when': 66, 'who': 67, 'will': 68, 'more': 69, 'if': 70, 'no': 71, 'out': 72, 'so': 73, 'said': 74, 'what': 75, 'up': 76, 'its': 77, 'about': 78, ':': 79, 'into': 80, 'than': 81, 'them': 82, 'can': 83, 'only': 84, 'other': 85, 'new': 86, 'some': 87, 'could': 88, 'time': 89,

In [31]:
def get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=1):
  """Estimate a bigram transition-probability matrix from indexed sentences.

  The matrix is laid out as (last word, current word) --> probability.
  Each sentence contributes START -> first word, every adjacent word pair,
  and last word -> END. Empty sentences contribute nothing.

  Args:
    sentences: iterable of sentences, each a sequence of integer word ids
      in ``range(V)``.
    V: vocabulary size; the result has shape (V, V).
    start_idx: id of the START marker.
    end_idx: id of the END marker.
    smoothing: pseudo-count added to every cell before normalising
      (1 gives add-one smoothing, 0 gives unsmoothed estimates).

  Returns:
    A (V, V) float array whose rows each sum to 1. A row whose total count is
    zero is left all-zero instead of NaN; this can only happen with
    ``smoothing=0``, e.g. the END row, which is never followed by anything.
  """
  # every cell starts at the smoothing pseudo-count
  bigram_probs = np.ones((V, V)) * smoothing

  for sentence in sentences:
    if len(sentence) == 0:
      # nothing to count; no START -> END transition is recorded
      continue

    # pad with the markers so one pass over adjacent pairs covers
    # START -> first, every middle pair, and last -> END
    padded = [start_idx, *sentence, end_idx]
    for previous, current in zip(padded, padded[1:]):
      bigram_probs[previous, current] += 1

  # normalize the counts along the rows to get probabilities, skipping
  # all-zero rows so they do not become NaN through a 0/0 division
  row_totals = bigram_probs.sum(axis=1, keepdims=True)
  np.divide(bigram_probs, row_totals, out=bigram_probs, where=row_totals != 0)
  return bigram_probs

In [34]:
# Look up the ids of the sentence-boundary markers in the reduced vocabulary.
start_idx, end_idx = word2idx_small['START'], word2idx_small['END']
print(start_idx, end_idx)

0 1


In [36]:
# V is len(word2idx_small), which includes the trailing 'UNKNOWN' entry, so
# every id that appears in sentences_small is a valid row/column index.
bigram_probs = get_bigram_probs(sentences_small, len(word2idx_small), start_idx, end_idx, smoothing=1)

In [38]:
# Persist the full V x V probability matrix as comma-separated text.
np.savetxt('bigram_probs.csv', bigram_probs, delimiter=',')