<a href="https://colab.research.google.com/github/sonakshisen1234/NLP_Course/blob/master/bigrams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import nltk
from nltk.corpus import brown
import operator
import numpy as np
from future.utils import iteritems
import random

nltk.download('brown')

KEEP_WORDS = set([
  'king', 'man', 'queen', 'woman',
  'italy', 'rome', 'france', 'paris',
  'london', 'britain', 'england',
])

def get_sentences():
  # returns 57340 of the Brown corpus
  # each sentence is represented as a list of individual string tokens
  return brown.sents()

def get_sentences_with_word2idx_limit_vocab(n_vocab=2000, keep_words=KEEP_WORDS):
  sentences = get_sentences()
  indexed_sentences = []

  i = 2
  word2idx = {'START': 0, 'END': 1}
  idx2word = ['START', 'END']

  word_idx_count = {
    0: float('inf'),
    1: float('inf'),
  }

  for sentence in sentences:
    indexed_sentence = []
    for token in sentence:
      token = token.lower()
      if token not in word2idx:
        idx2word.append(token)
        word2idx[token] = i
        i += 1

      # keep track of counts for later sorting
      idx = word2idx[token]
      word_idx_count[idx] = word_idx_count.get(idx, 0) + 1

      indexed_sentence.append(idx)
    indexed_sentences.append(indexed_sentence)



  # restrict vocab size

  # set all the words I want to keep to infinity
  # so that they are included when I pick the most
  # common words
  for word in keep_words:
    word_idx_count[word2idx[word]] = float('inf')

  sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
  word2idx_small = {}
  new_idx = 0
  idx_new_idx_map = {}
  for idx, count in sorted_word_idx_count[:n_vocab]:
    word = idx2word[idx]
    #print(word, count)
    word2idx_small[word] = new_idx
    idx_new_idx_map[idx] = new_idx
    new_idx += 1
  # let 'unknown' be the last token
  word2idx_small['UNKNOWN'] = new_idx 
  unknown = new_idx

  assert('START' in word2idx_small)
  assert('END' in word2idx_small)
  for word in keep_words:
    assert(word in word2idx_small)

  # map old idx to new idx
  sentences_small = []
  for sentence in indexed_sentences:
    if len(sentence) > 1:
      new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown for idx in sentence]
      sentences_small.append(new_sentence)

  return sentences_small, word2idx_small

def get_bigram_probs(sentences,V,start_index,end_index,smoothing=1): #function to get bigram probability matrix
  matrix = np.ones((V,V)) * smoothing # create matrix of V*V and intialize with smoothing parameter
  for sentence in sentences:
    for i in range (len(sentence)):
      if(i==0):
        matrix[start_index,sentence[i]]+=1
      else:
        matrix[sentence[i-1],sentence[i]]+=1
      if(i==len(sentence)-1):
        matrix[sentence[i],end_index]+=1
  x = matrix.sum(axis=1,keepdims=True) #sum each row(to attain normalization)(P(B/A) = count(A followed by B)/count(A))
  matrix/=x
  return matrix          

def get_score(sentence): #we calculate the probability for each sentence but not normal probability(by multiplying various bigram probabilities)
  score = 0              #instead we calculate logarithimic probability because normal probability will keep on getting smaller on multiplying(tend to zero)
  for i in range (len(sentence)):
    if(i==0):
      score+=np.log(matrix[start_index,sentence[i]])
    else:
      score+=np.log(matrix[sentence[i-1],sentence[i]])
    if(i==len(sentence)-1):
      score+=np.log(matrix[sentence[i],end_index])
  return score/(len(sentence) + 1) #(normalization) we divide by sentence length to prevent bias to shorter sentences(log proabilities are negative)  

sentences, word2idx = get_sentences_with_word2idx_limit_vocab(10000)
V = len(word2idx)
print("Vocab size:", V)
start_index = word2idx['START']
end_index = word2idx['END'] 
matrix = get_bigram_probs(sentences, V, start_index, end_index, smoothing=0.1) 
# a function to map word indexes back to real words
idx2word = dict((v, k) for k, v in iteritems(word2idx))
def get_words(sentence):
    return ' '.join(idx2word[i] for i in sentence)  


# when we sample a fake sentence, we want to ensure not to sample
# start token or end token  
sample_probs = np.ones(V)  
sample_probs[start_index] = 0  
sample_probs[end_index] = 0  
sample_probs /= sample_probs.sum()    

real_idx = np.random.choice(len(sentences))
real = sentences[real_idx]    
print(real)
# fake sentence    
fake = np.random.choice(V, size=len(real), p=sample_probs)    
print(fake)

print("REAL:", get_words(real), "SCORE:", get_score(real))    
print("FAKE:", get_words(fake), "SCORE:", get_score(fake))    



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
Vocab size: 10001
[28, 22, 41, 47, 10000, 18, 193, 21, 13, 134, 228, 64, 133, 45, 71, 888, 439, 103, 91, 1251, 16, 228, 15]
[8925 3168 3489 8459 8223 1308 6830 1775  809  715 9765 1478 3042 1698
 5566  603 8668 5862 9236 8282 5248 5158 4837]
REAL: it is not an UNKNOWN to say that the state government has little or no fiscal control over these units of government . SCORE: -4.565574228697625
FAKE: loneliness producing tube assessing limitation broad ironic location month below compost store moves passing lest move tearing notably fbi twenties halfway rugged dominated SCORE: -9.319580136357972


In [0]:
!git init

Initialized empty Git repository in /content/.git/


In [0]:
!git config user.email "sakshi.1331agarwal@gmail.com"
!git config user.name "sonakshisen1234"

In [0]:
!git add -A

In [0]:
!git commit -m "first commit"

[master (root-commit) cdd89ab] first commit
 20 files changed, 50775 insertions(+)
 create mode 100644 .config/.last_opt_in_prompt.yaml
 create mode 100644 .config/.last_survey_prompt.yaml
 create mode 100644 .config/.last_update_check.json
 create mode 100644 .config/.metricsUUID
 create mode 100644 .config/active_config
 create mode 100644 .config/config_sentinel
 create mode 100644 .config/configurations/config_default
 create mode 100644 .config/gce
 create mode 100644 .config/logs/2020.06.10/16.27.11.187541.log
 create mode 100644 .config/logs/2020.06.10/16.27.29.206693.log
 create mode 100644 .config/logs/2020.06.10/16.27.42.054200.log
 create mode 100644 .config/logs/2020.06.10/16.27.46.450102.log
 create mode 100644 .config/logs/2020.06.10/16.28.00.359521.log
 create mode 100644 .config/logs/2020.06.10/16.28.00.889996.log
 create mode 100755 sample_data/README.md
 create mode 100755 sample_data/anscombe.json
 create mode 100644 sample_data/california_housing_test.csv
 create mo

In [0]:
!git remote add origin https://sonakshisen1234:gitgithub123@github.com/sonakshisen1234/NLP_Course.git

In [0]:
!git push -u origin master

Counting objects: 27, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (19/19), done.
Writing objects: 100% (27/27), 8.42 MiB | 2.22 MiB/s, done.
Total 27 (delta 4), reused 0 (delta 0)
remote: Resolving deltas: 100% (4/4), done.[K
To https://github.com/sonakshisen1234/NLP_Course.git
 * [new branch]      master -> master
Branch 'master' set up to track remote branch 'master' from 'origin'.
