<a href="https://colab.research.google.com/github/sugarh777/hangman/blob/master/count_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /content/drive so later cells can read and write
# files under it. The call prompts interactively for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
%matplotlib inline

import sys, os, pickle
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from sklearn.utils.extmath import randomized_svd


In [0]:
def process(text):
  """Tokenize ``text`` and build a vocabulary from it.

  The text is lower-cased, every '.' is separated from the preceding
  characters so it becomes its own token, and the result is split on
  whitespace. Ids are assigned in order of first appearance, starting at 0.

  Args:
    text: Raw input string.

  Returns:
    A tuple ``(corpus, word_to_id, id_to_word)``:
      corpus: 1-D integer NumPy array with the id of every token, in order.
      word_to_id: dict mapping each distinct token to its id.
      id_to_word: dict mapping each id back to its token.
  """
  txt = text.lower()
  txt = txt.replace('.',' .')
  # split() without an argument collapses runs of whitespace and ignores
  # leading/trailing whitespace, so repeated spaces (or an empty text) never
  # put an empty-string "word" into the vocabulary.
  words = txt.split()

  word_to_id = {}
  id_to_word = {}

  for word in words:
    if word not in word_to_id:
      new_id = len(word_to_id)
      word_to_id[word] = new_id
      id_to_word[new_id] = word

  # An explicit integer dtype keeps an empty corpus usable as an index array
  # (np.array([]) would otherwise default to float64).
  corpus = np.array([word_to_id[w] for w in words], dtype=int)

  return corpus, word_to_id, id_to_word


In [0]:
def create_co_matrix(corpus,vocab_size,window_size=1):
  """Count how often each word id appears near each other word id.

  For every position in ``corpus`` and every distance from 1 to
  ``window_size``, the neighbour on the left and the neighbour on the right
  (when they exist) each add one to the cell
  ``[id at position, id of neighbour]``.

  Args:
    corpus: Sequence of integer word ids.
    vocab_size: Number of rows and columns of the returned matrix.
    window_size: Maximum distance, in tokens, that counts as a neighbour.

  Returns:
    A ``(vocab_size, vocab_size)`` int32 NumPy array of counts.
  """
  n_tokens = len(corpus)
  counts = np.zeros((vocab_size,vocab_size),dtype=np.int32)

  for pos in range(n_tokens):
    center_id = corpus[pos]

    for offset in range(1,window_size+1):
      before = pos - offset
      after = pos + offset

      # Neighbours that would fall off either end of the corpus are skipped.
      if before >= 0:
        counts[center_id,corpus[before]] += 1

      if after < n_tokens:
        counts[center_id,corpus[after]] += 1

  return counts
  

In [0]:
def cos_similarity(x,y,eps=1e-8):
  """Return the cosine similarity of vectors ``x`` and ``y``.

  Each vector is divided by ``sqrt(sum(v**2) + eps)`` before the dot product
  is taken; ``eps`` keeps the divisor non-zero for an all-zero vector.

  Args:
    x: First vector (NumPy array).
    y: Second vector (NumPy array).
    eps: Small constant added under the square root.

  Returns:
    The dot product of the two scaled vectors.
  """
  def _scaled(vec):
    # Divide by the (eps-padded) Euclidean norm.
    return vec / np.sqrt(np.sum(vec**2) + eps)

  return np.dot(_scaled(x),_scaled(y))
  

In [0]:
def most_similar(query,word_to_id,id_to_word,word_matrix,top=5):
  """Print the words whose vectors are closest to ``query``'s vector.

  Similarity is measured with ``cos_similarity`` between each row of
  ``word_matrix`` and the row belonging to ``query``. Results are printed in
  descending order of similarity, skipping the query word itself, and
  printing stops once ``top`` words have been shown.

  Args:
    query: Word to look up.
    word_to_id: dict mapping words to row indices of ``word_matrix``.
    id_to_word: dict mapping row indices back to words.
    word_matrix: Array-like whose i-th row is the vector of word id ``i``.
    top: Number of words to print.

  Returns:
    None. A message is printed instead when ``query`` is not in
    ``word_to_id``.
  """
  if query not in word_to_id:
    print('{} is not found.'.format(query))
    return

  print('\n[query] {}'.format(query))
  query_vec = word_matrix[word_to_id[query]]

  n_words = len(id_to_word)
  scores = np.zeros(n_words)
  for row in range(n_words):
    scores[row] = cos_similarity(word_matrix[row],query_vec)

  shown = 0

  # Sorting the negated scores ascending walks them from most to least similar.
  for idx in (-1 * scores).argsort():
    candidate = id_to_word[idx]
    if candidate != query:
      print('{}: {}'.format(candidate,scores[idx]))
      shown += 1

      if shown >= top:
        return



In [0]:
def ppmi(c,verbose=False,eps=1e-8):
  """Convert a co-occurrence count matrix to positive PMI values.

  For every cell the value ``log2(c[i,j] * n / (s[i] * s[j]) + eps)`` is
  computed, where ``n`` is the sum of all counts and ``s`` holds the column
  sums of ``c``; negative results are clipped to 0.

  Args:
    c: 2-D NumPy array of co-occurrence counts. ``s`` (column sums) is
      indexed by both the row and the column index, so this presumably
      expects a square, symmetric matrix -- TODO confirm with callers.
    verbose: When True, print progress roughly every 10% of the cells.
    eps: Small constant added inside the logarithm so a zero count does not
      produce ``log2(0)``.

  Returns:
    A float32 NumPy array with the same shape as ``c``.
  """
  m = np.zeros_like(c,dtype=np.float32)
  n = np.sum(c)
  s = np.sum(c,axis=0)
  total = c.shape[0] * c.shape[1]
  # Progress interval in cells. Clamp to at least 1: for matrices with fewer
  # than 10 cells ``total // 10`` is 0 and ``cnt % 0`` would raise
  # ZeroDivisionError.
  step = max(1, total // 10)
  cnt = 0

  for i in range(c.shape[0]):
    for j in range(c.shape[1]):
      pmi = np.log2(c[i,j] * n / (s[i] * s[j]) + eps)
      m[i,j] = max(0,pmi)

      if verbose:
        cnt += 1
        if cnt % step == 0:
          print('{:.1f}% done'.format(100 * cnt / total))

  return m


In [0]:
# Toy example: tokenize one sentence and count co-occurrences
# (create_co_matrix is called with its default window size).
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = process(text)
vocab_size = len(word_to_id)

c = create_co_matrix(corpus,vocab_size)


In [0]:
# Compare the co-occurrence rows of 'you' and 'i'; the last expression is
# the cell's displayed result.
c0 = c[word_to_id['you']]
c1 = c[word_to_id['i']]

cos_similarity(c0,c1)

In [0]:
# Rank the toy vocabulary by cosine similarity to 'you', using raw count rows as vectors.
most_similar('you',word_to_id,id_to_word,c,top=5)


In [0]:
# Convert the toy co-occurrence counts to PPMI and show both matrices.
w = ppmi(c,verbose=True)

np.set_printoptions(precision=3)
# `c` holds co-occurrence counts (from create_co_matrix), not covariances,
# so label it accordingly.
print('co-occurrence matrix')
print(c)
print('-' * 50)
print('ppmi')
print(w)


In [0]:
# Full SVD of the toy PPMI matrix; rows of `u` are used as word vectors below.
u, s, v = np.linalg.svd(w)

np.set_printoptions(precision=3)
# Show the first row of each representation, each followed by a separator,
# then the singular values.
for label, first_row in (('c:', c[0]), ('w:', w[0]), ('u:', u[0])):
  print(label)
  print(first_row)
  print('-'*50)
print('s:')
print(s)

In [0]:
# Plot every word at the coordinates given by the first two columns of `u`
# and label each point with the word.
fig, ax = plt.subplots()
ax.scatter(u[:,0],u[:,1],alpha=0.5)

for word, word_id in word_to_id.items():
  x_pos, y_pos = u[word_id,0], u[word_id,1]
  ax.annotate(word,(x_pos,y_pos))

plt.show()


In [0]:
# Working directory on the mounted Drive; it is also added to sys.path.
path = os.path.join('/','content','drive','My Drive','Colab Notebooks','Neural Network','RNN')
sys.path.append(path)

# Base URL the training text is downloaded from.
url = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/'

# Raw text, saved corpus array and pickled vocabulary all live under `path`.
file_name, corp_name, vocab_name = 'ptb.train.txt', 'ptb.train.npy', 'ptb.vocab.pkl'
file_path, corp_path, vocab_path = (
  os.path.join(path,name) for name in (file_name,corp_name,vocab_name)
)


In [0]:
# Make sure the target directory exists; otherwise the download and the
# saves below fail with FileNotFoundError.
os.makedirs(path,exist_ok=True)

# Download the training text only when it is not already cached, so that
# re-running the notebook does not hit the network again. Delete the file to
# force a fresh download.
if not os.path.exists(file_path):
  urllib.request.urlretrieve(url+file_name,file_path)

# Explicit encoding so the result does not depend on the platform default.
# Each newline is replaced by the literal marker '<eos>'.
with open(file_path,'r',encoding='utf-8') as f:
  words = f.read().replace('\n','<eos>').strip()

corpus, word_to_id, id_to_word = process(words)

# Persist the id sequence and both vocabulary dicts for later cells.
np.save(corp_path,corpus)

with open(vocab_path,'wb') as f:
  pickle.dump((word_to_id,id_to_word),f)


In [0]:
# Quick sanity check of the corpus and of both vocabulary mappings.
print('corpus size:', len(corpus))
print('corpus[:30]:', corpus[:30])
print('-'*50)
for sample_id in range(3):
  print('id_to_word[{}]:'.format(sample_id), id_to_word[sample_id])
print('-'*50)
for sample_word in ('car', 'happy', 'lexus'):
  print("word_to_id['{}']:".format(sample_word), word_to_id[sample_word])


corpus size: 946312
corpus[:30]: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
--------------------------------------------------
id_to_word[0]: aer
id_to_word[1]: banknote
id_to_word[2]: berlitz
--------------------------------------------------
word_to_id['car']: 3843
word_to_id['happy']: 4411
word_to_id['lexus']: 7391


In [0]:
# Peek at the first line of the raw text file.
with open(file_path,'r') as f:
  print(f.readline())

# Reload the vocabulary and corpus from the files at vocab_path / corp_path.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust;
# this file is the one written with pickle.dump earlier in this notebook.
with open(vocab_path,'rb') as v:
  word_to_id, id_to_word = pickle.load(v)

corpus = np.load(corp_path)


 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 



In [0]:
# Count-based word vectors for the corpus: co-occurrence counts -> PPMI ->
# truncated SVD, followed by a nearest-neighbour check for a few query words.
window_size = 2
wordvec_size = 100
vocab_size = len(word_to_id)

print('counting  co-occurrence ...')
cmat = create_co_matrix(corpus,vocab_size,window_size)

print('calculating PPMI ...')
pmi = ppmi(cmat,verbose=True)

print('calculating SVD ...')
# randomized_svd is stochastic; a fixed seed makes the word vectors (and the
# similarity lists printed below) reproducible across runs.
U, S, V = randomized_svd(pmi,n_components=wordvec_size,
                         n_iter=5,random_state=0)

print(U.shape)
word_vec = U[:,:wordvec_size]

querys = ['you', 'year', 'car', 'toyota']
for q in querys:
  most_similar(q,word_to_id,id_to_word,word_vec,top=5)


counting  co-occurrence ...
calculating PPMI ...
10.0% done
20.0% done
30.0% done
40.0% done
50.0% done
60.0% done
70.0% done
80.0% done
90.0% done
100.0% done
calculating SVD ...
(9944, 100)

[query] you
i: 0.6367372870445251
we: 0.6046673655509949
do: 0.5772789716720581
'd: 0.5405998229980469
've: 0.532917857170105

[query] year
earlier: 0.626132607460022
month: 0.6040111780166626
last: 0.6022427082061768
quarter: 0.5998589396476746
months: 0.5788155794143677

[query] car
auto: 0.68401700258255
luxury: 0.5734304189682007
vehicle: 0.5165008306503296
cars: 0.49752214550971985
lexus: 0.4760116636753082

[query] toyota
motor: 0.6689599752426147
motors: 0.6526650786399841
nissan: 0.6430892944335938
honda: 0.6414303183555603
lexus: 0.52358478307724
