# Word representation using Gensim

In [1]:
import numpy as np

# Toy corpus: four short, lower-cased sentences that share most of their
# vocabulary, so the models below have overlapping terms to work with.
documents = [
    'this is the first document',
    'this document is the second document',
    'this is the third one not the first nor the third',
    'is this the first document or is is another document',
]
data = np.array(documents)

data

array(['this is the first document',
       'this document is the second document',
       'this is the third one not the first nor the third',
       'is this the first document or is is another document'],
      dtype='<U52')

In [2]:
# Whitespace-tokenize every sentence: one list of word strings per document.
sentWords = [sentence.split() for sentence in data]

sentWords

[['this', 'is', 'the', 'first', 'document'],
 ['this', 'document', 'is', 'the', 'second', 'document'],
 ['this',
  'is',
  'the',
  'third',
  'one',
  'not',
  'the',
  'first',
  'nor',
  'the',
  'third'],
 ['is',
  'this',
  'the',
  'first',
  'document',
  'or',
  'is',
  'is',
  'another',
  'document']]

## I. TF-IDF

In [3]:
# train a dictionary from a corpus
from gensim.corpora import Dictionary

# Fit the vocabulary (token -> integer id) on the tokenized sentences.
dct = Dictionary(documents=sentWords)

# doc2bow encodes a token list as (token_id, count) pairs; a token that is
# not in the fitted vocabulary ("non_existent_word") produces no pair.
dct.doc2bow("this is the document non_existent_word".split())

[(0, 1), (2, 1), (3, 1), (4, 1)]

In [4]:
from gensim.models import TfidfModel

# Encode each tokenized sentence as a bag-of-words: [(token_id, count), ...]
corpus = list(map(dct.doc2bow, sentWords))
# Fit the TF-IDF weighting on the bag-of-words corpus
model = TfidfModel(corpus=corpus)
model

<gensim.models.tfidfmodel.TfidfModel at 0x7fe7b2ade940>

In [5]:
# Re-weight the first document's bag-of-words with the fitted TF-IDF model.
# The result is a list of (token_id, tfidf_weight) pairs.
first_doc_bow = corpus[0]
vector = model[first_doc_bow]
vector

[(0, 0.7071067811865476), (1, 0.7071067811865476)]

## II. LSA

In [6]:
from gensim.models import LsiModel

# Fit a 3-topic LSA (a.k.a. LSI) model on the bag-of-words corpus; id2word
# lets the model report topics with words instead of integer ids.
lsa_model = LsiModel(corpus=corpus, id2word=dct, num_topics=3)

lsa_model

<gensim.models.lsimodel.LsiModel at 0x7fe7dccfa8e0>

In [7]:
# U: Terms x Concepts matrix -- one row per vocabulary term (12 here) and
# one column per topic (num_topics=3).
lsa_model.projection.u

array([[ 0.40414174, -0.48828482,  0.43929742],
       [ 0.26324215,  0.06153243, -0.2836474 ],
       [ 0.54361713, -0.31896294, -0.41302551],
       [ 0.52112281,  0.45620371,  0.18601684],
       [ 0.32806837, -0.01958453,  0.22758559],
       [ 0.06482621, -0.08111696,  0.51123299],
       [ 0.09652722,  0.23789412, -0.02078438],
       [ 0.09652722,  0.23789412, -0.02078438],
       [ 0.09652722,  0.23789412, -0.02078438],
       [ 0.19305444,  0.47578824, -0.04156876],
       [ 0.10777438, -0.14968921, -0.32030555],
       [ 0.10777438, -0.14968921, -0.32030555]])

In [8]:
# S: the singular values, i.e. the diagonal of the Concepts x Concepts
# matrix, returned as a 1-D array with one value per topic.
lsa_model.projection.s

array([5.91216815, 3.40420027, 1.6491539 ])

In [9]:
# D: Documents x Concepts -- applying the model to the corpus yields, for
# each document, a list of (topic_id, weight) pairs.
D = lsa_model[corpus]

# Print one document per line
for doc_topics in D:
    print(doc_topics)

[(0, 2.0601921988067566), (1, -0.3090961544608346), (2, 0.15622694457256267)]
[(0, 2.2659179935801155), (1, -0.9400303667833381), (2, 1.3904047535449264)]
[(0, 3.373986613846059), (1, 2.756854907321676), (2, -0.056527454365849425)]
[(0, 3.7671169748912425), (1, -1.7346852748315205), (2, -0.8711377505388352)]


In [10]:
# Show the first topic (index 0) as (word, weight) pairs; the weights are
# the corresponding entries of the first column of U.
lsa_model.show_topic(0)

[('is', 0.5436171339274469),
 ('the', 0.521122805794372),
 ('document', 0.4041417393082777),
 ('this', 0.3280683650061333),
 ('first', 0.2632421547705262),
 ('third', 0.19305444078823875),
 ('another', 0.10777438446065689),
 ('or', 0.10777438446065689),
 ('nor', 0.09652722039411941),
 ('not', 0.09652722039411939)]

## III. LDA

In [11]:
from gensim.models.ldamodel import LdaModel

# Fit a 4-topic LDA model on the bag-of-words corpus.
# LDA starts from a random initialisation, so random_state is pinned:
# without it every Restart & Run All produces different topics and the
# outputs of the following cells are not reproducible.
lda = LdaModel(
    corpus=corpus,
    id2word=dct,
    num_topics=4,
    update_every=1,
    chunksize=100,
    passes=1,
    random_state=42,
)
lda

<gensim.models.ldamodel.LdaModel at 0x7fe7b2212e20>

In [12]:
# Applying LDA to a new document, step 1: encode its tokens as a
# bag-of-words with the same dictionary the model was trained on.
new_doc_tokens = "this is not the third second or third document".split()
doc_bow = dct.doc2bow(new_doc_tokens)
doc_bow

[(0, 1), (2, 1), (3, 1), (4, 1), (5, 1), (7, 1), (9, 2), (11, 1)]

In [13]:
doc_lda = lda[doc_bow]
# doc_lda is a list of (topic_id, probability) pairs giving the weighted
# presence of each topic in the document; here all 4 topics are listed.
# NOTE(review): gensim may omit topics whose probability falls below its
# minimum threshold, so the list can be shorter than num_topics -- confirm.
doc_lda

[(0, 0.45403457), (1, 0.30465317), (2, 0.21605769), (3, 0.025254535)]

## IV. Word2Vec

In [14]:
from gensim.models import Word2Vec

# Train a tiny Word2Vec model (4-dimensional vectors) on the tokenized
# sentences. min_count=1 keeps every word, even those seen only once.
# workers=1 removes the ordering jitter of multi-threaded training, and the
# seed is spelled out (1 is gensim's default), so the embeddings shown in
# the following cells can be reproduced on a re-run.
model_w2v = Word2Vec(
    sentences=sentWords,
    vector_size=4,
    window=3,
    epochs=20,
    min_count=1,
    seed=1,
    workers=1,
)
# The trained word vectors live in the model's KeyedVectors object
word_vectors = model_w2v.wv

word_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x7fe7f82b9af0>

In [15]:
# Vocabulary: dict mapping each known word to its integer index
word_vectors.key_to_index

{'the': 0,
 'is': 1,
 'document': 2,
 'this': 3,
 'first': 4,
 'third': 5,
 'another': 6,
 'or': 7,
 'nor': 8,
 'not': 9,
 'one': 10,
 'second': 11}

In [16]:
# Get a word's embedding, method 1: index the KeyedVectors by the word
word_vectors["first"]

array([ 0.07200756,  0.02396637, -0.20704502, -0.23569417], dtype=float32)

In [17]:
# Get a word's embedding, method 2: explicit get_vector() call
word_vectors.get_vector("document")

array([-0.1260561 , -0.09467296,  0.1847378 , -0.03786594], dtype=float32)

In [18]:
# Cosine DISTANCE between two words, i.e. 1 - cosine similarity (not the
# similarity itself): it ranges over [0, 2] and larger means less similar.
# Use word_vectors.similarity(...) to get the cosine similarity instead.
word_vectors.distance("first","document")

1.5126422047615051

In [19]:
# Save the word2vec model.
# The path is relative to the notebook's working directory instead of an
# absolute path inside one user's home directory, so the notebook runs
# unchanged on any machine.
w2vpath = 'gensim_word2vec.model'
model_w2v.save(w2vpath)

In [20]:
# Reload the previously saved model from disk
model2 = Word2Vec.load(w2vpath)

# Find the 5 words most similar to a query word.
# The lookup only runs when the word is in the vocabulary; otherwise the
# result stays an empty list.
word = "another"
sim_words = model2.wv.most_similar(word, topn=5) if word in model2.wv else []
sim_words

[('nor', 0.6649483442306519),
 ('second', 0.5571070313453674),
 ('the', 0.4490523636341095),
 ('not', 0.4280834197998047),
 ('third', 0.2622111737728119)]

## V. FastText

In [21]:
from gensim.models import FastText

# Train a tiny FastText model (4-dimensional vectors) on the tokenized
# sentences; min_count=1 keeps words that occur only once.
fasttext_model = FastText(
    sentences=sentWords,
    vector_size=4,
    window=3,
    min_count=1,
    epochs=10,
)

fasttext_model

<gensim.models.fasttext.FastText at 0x7fe7b2ade6a0>

In [22]:
# 'karim' never occurs in the training sentences, yet a vector is returned
# -- presumably composed from character n-grams, which is how FastText
# handles out-of-vocabulary words.
fasttext_model.wv['karim']

array([-0.03808689, -0.01358647,  0.02698918,  0.08353347], dtype=float32)

In [23]:
# Vector of a word that does occur in the training sentences
fasttext_model.wv['document']

array([ 0.02423025,  0.00086855, -0.00287357,  0.01489758], dtype=float32)