In [2]:
import requests

In [3]:
base_url="https://raw.githubusercontent.com/tomsercu/lstm/master/data"
filenames={split : f"ptb.{split}.txt" for split in ["train","valid","test"]}

In [4]:
import pathlib

datasets_folder=pathlib.Path(r"C:\Users\amrul\programming\deep_learning\dl_projects\nlp_with_dl_from_scratch\datasets")

res = requests.get(f"{base_url}/{filenames["train"]}")

ptb_file=datasets_folder/filenames["train"]
ptb_file.write_text(res.text)
print(f"wrote {filenames["train"]} to {ptb_file}")


wrote ptb.train.txt to C:\Users\amrul\programming\deep_learning\dl_projects\nlp_with_dl_from_scratch\datasets\ptb.train.txt


In [5]:
text = ptb_file.read_text()
print(f"text size : {len(text):,}")

text size : 5,101,618


In [6]:
import re
from utils import tokenize

words = tokenize(text,special_words=["<unk>"])
print(f"total of {len(words):,} words")

total of 937,128 words


In [6]:
vocab = list(set(words))
print(f"vocab size : {len(vocab):,}")

vocab size : 9,654


In [7]:
from collections import Counter

wcounter = Counter(words)
n = 100
for word,wfreq in wcounter.most_common(n):
    print(f"{word:<10} : {wfreq:,}")


the        : 50,869
<unk>      : 45,020
N          : 32,481
of         : 24,406
to         : 23,662
a          : 21,639
in         : 18,010
and        : 17,498
.          : 16,709
'          : 14,755
s          : 11,934
for        : 8,936
that       : 8,931
$          : 7,727
is         : 7,337
it         : 6,112
said       : 6,027
-          : 5,953
on         : 5,653
at         : 4,950
by         : 4,915
as         : 4,833
from       : 4,724
million    : 4,627
with       : 4,585
mr         : 4,326
was        : 4,073
be         : 3,936
are        : 3,914
its        : 3,846
he         : 3,632
n          : 3,598
t          : 3,556
but        : 3,541
has        : 3,494
an         : 3,477
will       : 3,270
have       : 3,245
year       : 2,957
new        : 2,809
or         : 2,704
company    : 2,686
they       : 2,562
this       : 2,438
which      : 2,362
would      : 2,321
about      : 2,220
market     : 2,101
says       : 2,092
more       : 2,065
were       : 2,009
u          : 1,942
o

In [11]:
len(wcounter)

9656

In [12]:
import numpy as np

def build_co_matrix(corpus,word_to_id,window_size):
    vocab_size = len(word_to_id)
    cooccur_mat = np.zeros((vocab_size,vocab_size))
    for word_pos,word_id in enumerate(corpus):
        for context_idx in range(1, window_size+1):
            left_word_pos = word_pos - context_idx
            right_word_pos = word_pos + context_idx

            if left_word_pos > 0:
                left_word_id = corpus[left_word_pos]
                cooccur_mat[word_id, left_word_id] += 1
            
            if right_word_pos < len(corpus):
                right_word_id = corpus[right_word_pos]
                cooccur_mat[word_id, right_word_id] += 1
    return cooccur_mat

In [13]:
word_to_id = {word : idx for idx,(word,wfreq) in enumerate(wcounter.most_common(len(wcounter)))}
id_to_word = {id:word for word,id in word_to_id.items()}

In [14]:
corpus = [word_to_id[word] for word in words]
co_matrix = build_co_matrix(corpus,word_to_id,window_size=1)

In [15]:
co_matrix.shape

(9656, 9656)

In [16]:
co_matrix[word_to_id["king"]]

array([3., 3., 0., ..., 0., 0., 0.])

In [17]:
# I won't do this again
# U,S,VT = np.linalg.svd(co_matrix)

In [18]:
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds

# Example sparse matrix
# Using csc_matrix (Compressed Sparse Column matrix) for demonstration
rows, cols = np.array([0, 1, 2]), np.array([0, 2, 2])
data = np.array([1, 2, 3])
sparse_matrix = csc_matrix((data, (rows, cols)), shape=(3, 3))
sparse_matrix = sparse_matrix.astype(np.float64)

# Perform SVD
# k is the number of singular values and vectors to compute
# You can adjust k based on your needs, but it must be less than the size of the matrix
k = 2
u, s, vt = svds(sparse_matrix, k=k)

# u is the matrix of left singular vectors
# vt is the matrix of right singular vectors, already transposed
print("U matrix:\n", u)
print("V^T matrix:\n", vt)


U matrix:
 [[-1.00000000e+00 -2.50185378e-16]
 [ 1.38777878e-16 -5.54700196e-01]
 [ 2.08166817e-16 -8.32050294e-01]]
V^T matrix:
 [[-1.00000000e+00  2.22044605e-16  6.93889390e-17]
 [-6.93889390e-17 -2.22044605e-16 -1.00000000e+00]]


In [19]:
k=100
u,s,vt = svds(co_matrix, k=k)

In [20]:
u[word_to_id["king"]].dot(u[word_to_id["man"]])

9.868049310015605e-05

In [21]:
import numpy as np

In [22]:
np.log2(10000*5/(10*20))

7.965784284662087

In [25]:
picked_up_word="king"
picked_up_word_id=word_to_id[picked_up_word]

print(f"co-occurrence matrix row sum of word {picked_up_word}(its id {picked_up_word_id}) : {co_matrix[picked_up_word_id].sum()}")
print(f"co-occurrence matrix column sum of word {picked_up_word}(its id {picked_up_word_id}) : {co_matrix[:,picked_up_word_id].sum()}")

co-occurrence matrix row sum of word king(its id 2235) : 86.0
co-occurrence matrix column sum of word king(its id 2235) : 86.0


In [27]:
print(f"total count : {co_matrix.sum():,}")
print(f"corpus size : {len(corpus):,}")

total count : 2,054,333.0
corpus size : 1,027,168


In [28]:
np.log2(0)

  np.log2(0)


-inf