In [1]:
# Count Vectorizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

message = ["Shaw likes to play cricket",
          "Mary likes to play tennis",
          "John likes to play volleyball or cricket",
           "Heena likes to play tennis or throwball"]

vec = CountVectorizer()

vec.fit(message)

In [2]:
transform_message = vec.transform(message)
print("Unique words are:", vec.get_feature_names_out())

Unique words are: ['cricket' 'heena' 'john' 'likes' 'mary' 'or' 'play' 'shaw' 'tennis'
 'throwball' 'to' 'volleyball']


In [3]:
data = pd.DataFrame(transform_message.toarray())
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1,0,0,1,0,0,1,1,0,0,1,0
1,0,0,0,1,1,0,1,0,1,0,1,0
2,1,0,1,1,0,1,1,0,0,0,1,1
3,0,1,0,1,0,1,1,0,1,1,1,0


In [4]:
data.columns = vec.get_feature_names_out()
data

Unnamed: 0,cricket,heena,john,likes,mary,or,play,shaw,tennis,throwball,to,volleyball
0,1,0,0,1,0,0,1,1,0,0,1,0
1,0,0,0,1,1,0,1,0,1,0,1,0
2,1,0,1,1,0,1,1,0,0,0,1,1
3,0,1,0,1,0,1,1,0,1,1,1,0


In [5]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
transform_message = vec.fit_transform(message)
feature_names = vec.get_feature_names_out()
print("Uniqure features :", feature_names)

Uniqure features : ['cricket' 'heena' 'john' 'likes' 'mary' 'or' 'play' 'shaw' 'tennis'
 'throwball' 'to' 'volleyball']


In [6]:
matrix = transform_message.todense()
matrix

matrix([[0.50487895, 0.        , 0.        , 0.3341742 , 0.        ,
         0.        , 0.3341742 , 0.64037493, 0.        , 0.        ,
         0.3341742 , 0.        ],
        [0.        , 0.        , 0.        , 0.3341742 , 0.64037493,
         0.        , 0.3341742 , 0.        , 0.50487895, 0.        ,
         0.3341742 , 0.        ],
        [0.39127526, 0.        , 0.49628305, 0.25898108, 0.        ,
         0.39127526, 0.25898108, 0.        , 0.        , 0.        ,
         0.25898108, 0.49628305],
        [0.        , 0.49628305, 0.        , 0.25898108, 0.        ,
         0.39127526, 0.25898108, 0.        , 0.39127526, 0.49628305,
         0.25898108, 0.        ]])

In [7]:
denselist = matrix.tolist()
denselist

[[0.5048789499185483,
  0.0,
  0.0,
  0.3341742038105307,
  0.0,
  0.0,
  0.3341742038105307,
  0.6403749295935449,
  0.0,
  0.0,
  0.3341742038105307,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.3341742038105307,
  0.6403749295935449,
  0.0,
  0.3341742038105307,
  0.0,
  0.5048789499185483,
  0.0,
  0.3341742038105307,
  0.0],
 [0.39127525900506077,
  0.0,
  0.49628305255642946,
  0.25898108481225013,
  0.0,
  0.39127525900506077,
  0.25898108481225013,
  0.0,
  0.0,
  0.0,
  0.25898108481225013,
  0.49628305255642946],
 [0.0,
  0.49628305255642946,
  0.0,
  0.25898108481225013,
  0.0,
  0.39127525900506077,
  0.25898108481225013,
  0.0,
  0.39127525900506077,
  0.49628305255642946,
  0.25898108481225013,
  0.0]]

In [8]:
df = pd.DataFrame(denselist, columns = feature_names)
df

Unnamed: 0,cricket,heena,john,likes,mary,or,play,shaw,tennis,throwball,to,volleyball
0,0.504879,0.0,0.0,0.334174,0.0,0.0,0.334174,0.640375,0.0,0.0,0.334174,0.0
1,0.0,0.0,0.0,0.334174,0.640375,0.0,0.334174,0.0,0.504879,0.0,0.334174,0.0
2,0.391275,0.0,0.496283,0.258981,0.0,0.391275,0.258981,0.0,0.0,0.0,0.258981,0.496283
3,0.0,0.496283,0.0,0.258981,0.0,0.391275,0.258981,0.0,0.391275,0.496283,0.258981,0.0


In [9]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

with open("word2vec.txt", "r", encoding = 'utf-8') as f:
    sentences = f.readlines()
sentences

['With Bag of Words and TF-IDF text vectorization techniques we did not get semantic meaning of words But for most of the applications of NLP tasks like sentiment classification, sarcasm detection etc require semantic meaning of a word and semantic relationships of a word with other words.\n',
 'Word embeddings captures semantic and syntactic relationships between words and also the context of words in a document. \n',
 'Word2vec technique used to implement word embeddings.\n',
 'Word2vec model takes input as a large size of corpus and produces output to vector space. \n',
 'This vector space size may be in hundred of dimensionality. \n',
 'Each word vector will be placed on this vector space. \n',
 'In vector space words that share context commonly in a corpus are closer to each other. \n',
 'Word vector having positions of corresponding words in a vector space.\n',
 'Word2vec models predict the context words of a center word using skip-gram method. \n',
 'Skip-gram works well with a 

In [10]:
print("First sentence : ", sentences[0])

First sentence :  With Bag of Words and TF-IDF text vectorization techniques we did not get semantic meaning of words But for most of the applications of NLP tasks like sentiment classification, sarcasm detection etc require semantic meaning of a word and semantic relationships of a word with other words.



In [11]:
import re
for i in range (len(sentences)):
    sentences[i] = re.sub("[^a-zA-Z]", " ", str(sentences[i]))

In [12]:
print("No. of sentences: ", len(sentences))

No. of sentences:  11


In [13]:
sentences[0]

'With Bag of Words and TF IDF text vectorization techniques we did not get semantic meaning of words But for most of the applications of NLP tasks like sentiment classification  sarcasm detection etc require semantic meaning of a word and semantic relationships of a word with other words  '

In [14]:
word_tokens = []
for sent in sentences:
    word_tokens.append(word_tokenize(sent))
    
print("Tokenized sentence: ", word_tokens[0])

Tokenized sentence:  ['With', 'Bag', 'of', 'Words', 'and', 'TF', 'IDF', 'text', 'vectorization', 'techniques', 'we', 'did', 'not', 'get', 'semantic', 'meaning', 'of', 'words', 'But', 'for', 'most', 'of', 'the', 'applications', 'of', 'NLP', 'tasks', 'like', 'sentiment', 'classification', 'sarcasm', 'detection', 'etc', 'require', 'semantic', 'meaning', 'of', 'a', 'word', 'and', 'semantic', 'relationships', 'of', 'a', 'word', 'with', 'other', 'words']


In [15]:
model_cbow = Word2Vec( sentences = word_tokens,
                       vector_size = 2,
                       window = 2,
                       min_count = 2,
                       workers = 1,
                       sg = 0)

In [16]:
words_cbow = list(model_cbow.wv.index_to_key)
words_cbow

['of',
 'and',
 'words',
 'a',
 'vector',
 'the',
 'word',
 'space',
 'Word',
 'in',
 'semantic',
 'context',
 'method',
 'gram',
 'to',
 'vec',
 'skip',
 'as',
 'are',
 'we',
 'meaning',
 'for',
 'most',
 'center',
 'be',
 'input',
 'relationships',
 'with',
 'other',
 'corpus',
 'embeddings',
 'well',
 'size',
 'is']

In [17]:
print("Total words: ", len(words_cbow))

Total words:  34


In [18]:
#Get word embeddings:
model_cbow.wv.__getitem__('and')

array([0.25310904, 0.45070305], dtype=float32)