<a href="https://colab.research.google.com/github/royn5618/Medium_Blog_Codes/blob/master/Sparsity_and_Density.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import gensim
import sklearn

# Report the library versions this notebook was run with, so that readers can
# reproduce the outputs shown in the cells that follow.
print("This notebook uses the following Python Libraries:")
for _label, _version in (
    ("Scikit Learn, version: ", sklearn.__version__),
    ("Gensim, version: ", gensim.__version__),
):
    print(_label, _version)

This notebook uses the following Python Libraries:
Scikit Learn, version:  1.2.1
Gensim, version:  4.1.1


# Generating a Sparse Matrix Using CountVectorizer

In [9]:
# Toy corpus: four short text documents used by every example in this notebook.
# Long entries are wrapped with implicit string concatenation; each list item
# is still a single string.
docs = [
    'a demonstration of a product or technique',
    'a public meeting or march protesting against something '
    'or expressing views on a political issue',
    'record a song or piece of music to demonstrate the capabilities '
    'of a musical group or performer or as preparation for a full recording',
    'demonstrate the capabilities of software or another product',
]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counter with all-default settings. Note that the one-letter
# token "a" does not appear in the vocabulary printed further down
# (presumably dropped by the default token pattern -- see the scikit-learn docs).
vectorizer = CountVectorizer()

In [11]:
# Learn the vocabulary from `docs` and encode every document as a row of token
# counts. The result is a SciPy sparse matrix: one row per document, one column
# per vocabulary term.
_cv = vectorizer.fit_transform(docs)

In [27]:
# vectorizer.vocabulary_  # uncomment to inspect the learned term -> column-index mapping

In [13]:
# Vocabulary terms as an array; the term at position i labels column i of the
# count matrix.
vocabulary = vectorizer.get_feature_names_out()
vocabulary

array(['against', 'another', 'as', 'capabilities', 'demonstrate',
       'demonstration', 'expressing', 'for', 'full', 'group', 'issue',
       'march', 'meeting', 'music', 'musical', 'of', 'on', 'or',
       'performer', 'piece', 'political', 'preparation', 'product',
       'protesting', 'public', 'record', 'recording', 'software',
       'something', 'song', 'technique', 'the', 'to', 'views'],
      dtype=object)

In [14]:
# Vocabulary size, i.e. the number of columns in the count matrix.
len(vocabulary)

34

In [15]:
# Show the sparse matrix summary: shape, dtype, number of stored elements and
# storage format.
_cv

<4x34 sparse matrix of type '<class 'numpy.int64'>'
	with 43 stored elements in Compressed Sparse Row format>

In [16]:
# Expand to a dense array purely for display; this is only reasonable because
# the toy matrix is tiny.
print(_cv.toarray())

[[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 2 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 1 1 1 0 0 1 1 1 0 0 0 1 1 2 0 3 1 1 0 1 0 0 0 1 1 0 0 1 0 1 1 0]
 [0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0]]


In [17]:
import numpy as np

# Total number of cells in the document-term matrix, derived from the matrix
# itself so the figure stays correct if `docs` (and hence the shape) changes.
num_elements = _cv.shape[0] * _cv.shape[1]
# Count the non-zero entries directly on the sparse matrix; this avoids
# materialising a dense copy just to count them.
num_zeros = num_elements - int(_cv.count_nonzero())

In [18]:
# Number of zero-valued entries in the document-term matrix.
num_zeros

93

In [19]:
# Sparsity: the fraction of matrix entries that are zero.
num_zeros / num_elements

0.6838235294117647

# Generating a Dense Matrix Using Word2Vec

In [20]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import word2vec

# word_tokenize needs the Punkt tokenizer data. Fetch it only when it is not
# already installed, so the notebook runs on a fresh environment (e.g. Colab)
# without re-downloading on every execution.
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource
# instead -- confirm against the installed NLTK version.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nroy0\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [21]:
# Dimensionality of each word embedding; passed to Word2Vec as `vector_size`.
vector_size = 10

In [22]:
# Split every document into a list of word tokens (one token list per document)
# and print the result for inspection.
tokenized_docs = list(map(word_tokenize, docs))
print(tokenized_docs)

[['a', 'demonstration', 'of', 'a', 'product', 'or', 'technique'], ['a', 'public', 'meeting', 'or', 'march', 'protesting', 'against', 'something', 'or', 'expressing', 'views', 'on', 'a', 'political', 'issue'], ['record', 'a', 'song', 'or', 'piece', 'of', 'music', 'to', 'demonstrate', 'the', 'capabilities', 'of', 'a', 'musical', 'group', 'or', 'performer', 'or', 'as', 'preparation', 'for', 'a', 'full', 'recording'], ['demonstrate', 'the', 'capabilities', 'of', 'software', 'or', 'another', 'product']]


In [24]:
# Train a small Word2Vec model on the tokenized corpus.
# Training is stochastic, so the seed is pinned explicitly and training is
# limited to one worker thread; per the gensim documentation, multi-threaded
# training introduces ordering jitter, and fully identical results across
# interpreter launches additionally require a fixed PYTHONHASHSEED.
w2v_model = word2vec.Word2Vec(
    tokenized_docs,
    vector_size=vector_size,  # Dimensionality of the word vectors
    window=2,  # Maximum distance between the current and the predicted word
    min_count=1,  # Keep every token, even those that occur only once
    sg=1,  # 1 for skip-gram; otherwise CBOW
    seed=1,  # Explicit RNG seed (1 is also gensim's default)
    workers=1,  # Single thread for deterministic training order
)

<gensim.models.word2vec.Word2Vec at 0x1df7bcb23d0>

In [26]:
# Look up the learned embedding for a single token: a dense float32 vector with
# `vector_size` components.
w2v_model.wv.get_vector('demonstration')

array([-0.03709092, -0.08746651,  0.05438888,  0.06511763, -0.00784039,
       -0.06706186, -0.07086928, -0.02501981,  0.05135532, -0.03659423],
      dtype=float32)

In [32]:
# Dense representation of the first document only: one embedding per token,
# in token order (repeated tokens yield repeated vectors).
dense_vector_document_1 = list(map(w2v_model.wv.get_vector, tokenized_docs[0]))
dense_vector_document_1

[array([-0.00543602,  0.00242176,  0.05117757,  0.09023254, -0.09288687,
        -0.07112895,  0.06502526,  0.08983966, -0.05024805, -0.03765193],
       dtype=float32),
 array([-0.03709092, -0.08746651,  0.05438888,  0.06511763, -0.00784039,
        -0.06706186, -0.07086928, -0.02501981,  0.05135532, -0.03659423],
       dtype=float32),
 array([ 0.07311484,  0.05067236,  0.06759576,  0.00768621,  0.063453  ,
        -0.03407172, -0.00933318,  0.05775234, -0.07525568, -0.0394193 ],
       dtype=float32),
 array([-0.00543602,  0.00242176,  0.05117757,  0.09023254, -0.09288687,
        -0.07112895,  0.06502526,  0.08983966, -0.05024805, -0.03765193],
       dtype=float32),
 array([-0.0960355 ,  0.05007293, -0.08759587, -0.04391825, -0.000351  ,
        -0.00296183, -0.0766124 ,  0.09614742,  0.04982056,  0.09233143],
       dtype=float32),
 array([ 0.07357611, -0.01511498, -0.04508233,  0.06564156, -0.04849871,
        -0.01822503,  0.02962095,  0.01011256, -0.08299582, -0.09466723],
   

Thanks for visiting!

Recommended next steps: check out FastText, GloVe, and BERT to explore more ways of building dense representations of text.