In [1]:
import gensim
from gensim import corpora

# A one-document corpus: the single string below is treated as one document.
text1 = ["""Gensim is a free open-source Python library for representing documents as semantic vectors,
           as efficiently and painlessly as possible. Gensim is designed 
           to process raw, unstructured digital texts using unsupervised machine learning algorithms."""]

# Whitespace tokenization only: punctuation stays attached to words and case
# is preserved (e.g. 'vectors,' and 'Gensim' are kept verbatim).
tokens1 = [line.split() for line in text1]

# Map every distinct token to an integer id.
g_dict1 = corpora.Dictionary(tokens1)

print(f"The dictionary has: {len(g_dict1)} tokens\n")
print(g_dict1.token2id)

The dictionary has: 29 tokens

{'Gensim': 0, 'Python': 1, 'a': 2, 'algorithms.': 3, 'and': 4, 'as': 5, 'designed': 6, 'digital': 7, 'documents': 8, 'efficiently': 9, 'for': 10, 'free': 11, 'is': 12, 'learning': 13, 'library': 14, 'machine': 15, 'open-source': 16, 'painlessly': 17, 'possible.': 18, 'process': 19, 'raw,': 20, 'representing': 21, 'semantic': 22, 'texts': 23, 'to': 24, 'unstructured': 25, 'unsupervised': 26, 'using': 27, 'vectors,': 28}


In [6]:
from gensim.utils import simple_preprocess
from gensim import corpora

# Read sample.txt, split it into sentence-like segments on '.', and tokenize
# each segment with gensim's simple_preprocess (deacc=True asks it to strip
# accent marks).
# The file is opened in a `with` block so the handle is closed as soon as the
# content has been read; the bare open() used before left it open.
# Note: text ending in '.' yields an empty final segment from split('.'),
# which still contributes one (presumably empty) token list to tokens2.
with open('sample.txt', encoding='utf-8') as text2:
    tokens2 = [simple_preprocess(line, deacc=True) for line in text2.read().split('.')]

# Build a separate dictionary from the tokenized file contents.
g_dict2 = corpora.Dictionary(tokens2)

print("The dictionary has: " + str(len(g_dict2)) + " tokens\n")
print(g_dict2.token2id)

The dictionary has: 3 tokens

{'khandagale': 0, 'laxman': 1, 'shruti': 2}


In [8]:
# Extend the first dictionary with the tokens read from the file.
# add_documents mutates g_dict1 in place, so its size grows after this call.
g_dict1.add_documents(tokens2)

print(f"The dictionary has: {len(g_dict1)} tokens\n")
print(g_dict1.token2id)

The dictionary has: 32 tokens

{'Gensim': 0, 'Python': 1, 'a': 2, 'algorithms.': 3, 'and': 4, 'as': 5, 'designed': 6, 'digital': 7, 'documents': 8, 'efficiently': 9, 'for': 10, 'free': 11, 'is': 12, 'learning': 13, 'library': 14, 'machine': 15, 'open-source': 16, 'painlessly': 17, 'possible.': 18, 'process': 19, 'raw,': 20, 'representing': 21, 'semantic': 22, 'texts': 23, 'to': 24, 'unstructured': 25, 'unsupervised': 26, 'using': 27, 'vectors,': 28, 'khandagale': 29, 'laxman': 30, 'shruti': 31}


Creating Bag of Words

In [9]:
# Convert each tokenized document into its bag-of-words form: a list of
# (token_id, count) pairs, with ids looked up in g_dict1.
# allow_update is deliberately left at its default (False): g_dict1 was built
# from tokens1, so every token already has an id, and updating would only
# re-count these documents in the dictionary's corpus statistics each time
# the cell is executed.
g_bow = [g_dict1.doc2bow(doc_tokens) for doc_tokens in tokens1]
print("Bag of Words : ", g_bow)

Bag of Words :  [[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]]


In [10]:
# Bag-of-words for every document in tokens1, expressed as (token_id, count)
# pairs using the ids stored in g_dict1.
# No allow_update=True here: the tokens are already present in g_dict1, and
# enabling updates would make the cell non-idempotent by adding these
# documents to the dictionary's corpus statistics again on each run.
g_bow = [g_dict1.doc2bow(doc_tokens) for doc_tokens in tokens1]
print("Bag of Words : ", g_bow)

Bag of Words :  [[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]]


Creating a TF-IDF Model

In [13]:
from gensim import corpora, models
from gensim.utils import simple_preprocess
import numpy as np
# Three short reviews used as a toy corpus.
text = [
    "The food is excellent but the service can be better",
    "The food is always delicious and loved the service",
    "The food was mediocre and the service was terrible",
]

# Tokenize each review once and reuse the result for both the dictionary
# and the bag-of-words corpus.
docs_tokens = [simple_preprocess(line) for line in text]

g_dict = corpora.Dictionary(docs_tokens)
g_bow = [g_dict.doc2bow(doc) for doc in docs_tokens]

# Show each document's bag-of-words with ids translated back to words.
print("Dictionary : ")
for bow in g_bow:
    readable = [[g_dict[token_id], count] for token_id, count in bow]
    print(readable)

# Fit a TF-IDF model on the bag-of-words corpus using the 'ntc' SMART scheme.
g_tfidf = models.TfidfModel(g_bow, smartirs='ntc')

# Show the TF-IDF weight of every word per document, rounded to 2 decimals.
print("TF-IDF Vector:")
for weighted in g_tfidf[g_bow]:
    readable = [[g_dict[token_id], np.around(weight, decimals=2)] for token_id, weight in weighted]
    print(readable)

Dictionary : 
[['be', 1], ['better', 1], ['but', 1], ['can', 1], ['excellent', 1], ['food', 1], ['is', 1], ['service', 1], ['the', 2]]
[['food', 1], ['is', 1], ['service', 1], ['the', 2], ['always', 1], ['and', 1], ['delicious', 1], ['loved', 1]]
[['food', 1], ['service', 1], ['the', 2], ['and', 1], ['mediocre', 1], ['terrible', 1], ['was', 2]]
TF-IDF Vector:
[['be', 0.43], ['better', 0.43], ['but', 0.43], ['can', 0.43], ['excellent', 0.43], ['food', 0.09], ['is', 0.21], ['service', 0.09], ['the', 0.18]]
[['food', 0.11], ['is', 0.26], ['service', 0.11], ['the', 0.21], ['always', 0.52], ['and', 0.26], ['delicious', 0.52], ['loved', 0.52]]
[['food', 0.08], ['service', 0.08], ['the', 0.16], ['and', 0.2], ['mediocre', 0.39], ['terrible', 0.39], ['was', 0.78]]


Word2Vec Model

In [15]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

# Fetch the text8 corpus through the gensim downloader API.
dataset = api.load("text8")

# Materialise the iterable into a list so it can be sliced.
# Note: this holds every item of the corpus in memory at once.
words = list(dataset)

# Keep only the first 1000 items as training data.
data1 = words[:1000]

# Fit a Word2Vec model on that subset. min_count=1 keeps every word that
# appears at least once; one worker thread is used per CPU core.
w2v_model = Word2Vec(sentences=data1, min_count=1, workers=cpu_count())

# Persist the trained model to disk.
w2v_model.save("word2vec_model_text8")

# Look up the learned embedding for the word 'king' and display it.
word_vector = w2v_model.wv['king']
print(word_vector)


[-1.8031335  -0.21976678 -1.0846417   2.139007    0.15377729 -0.68191826
 -3.0022268  -0.62324005  2.070712   -0.88086414 -0.07410897  2.0458372
  3.308508    1.2110171   3.9215193  -2.0003214   0.09288459  1.0773298
  3.9262824  -1.0983424   1.2539767   1.1885593   0.84988713  2.74991
  0.7938108   0.5388029   2.1001287  -2.819218    0.25290787  0.8201379
  0.23509927 -2.1467423   0.05534726 -1.15489     2.4897785   1.8280953
  1.5219172  -1.0381143  -0.44220635  0.7947314  -0.25239903 -0.62839824
  0.48405114 -0.81509256  1.4324458  -3.7155836  -1.8690464  -3.19993
  1.2248409   1.6176951   1.556343    2.2219458  -2.5986428  -3.190451
 -2.0061145   2.288872    3.3241117   0.41138983  0.60243815 -1.3466425
  0.96312314  2.842241    0.3628329   0.51295614  0.5152199  -0.79482746
 -0.92165357 -1.2423123   0.14244322 -1.3210897   0.3068016   0.88770425
 -0.88662046 -1.2813677  -1.1594144   0.51025367  2.4039226  -1.1943753
 -1.5858489   0.7479321   1.6926975   1.8125972   0.734578    0.5