In [1]:
# %pip install "gensim<4"  # this notebook relies on the gensim 3.x API (size=, iter=, index2word)

import os

import pandas as pd

from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

## Prepare Corpus

In [2]:
# Read the corpus and discard rows whose "email" text is missing, in one chain.
df = (
    pd.read_csv("datasets/spam_or_not_spam.csv")
    .dropna(subset=["email"])
)
df.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [3]:
# Tokenise: lowercase every email and split it into a list of word tokens,
# producing one token list per email (tqdm shows progress over the column).
sentences = []
for email in tqdm(df["email"]):
    sentences.append(word_tokenize(email.lower()))
sentences[:5]

HBox(children=(FloatProgress(value=0.0, max=2999.0), HTML(value='')))




[['date',
  'wed',
  'number',
  'aug',
  'number',
  'number',
  'number',
  'number',
  'number',
  'from',
  'chris',
  'garrigues',
  'cwg',
  'dated',
  'number',
  'numberfanumberd',
  'deepeddy',
  'com',
  'message',
  'id',
  'number',
  'number',
  'tmda',
  'deepeddy',
  'vircio',
  'com',
  'i',
  'can',
  't',
  'reproduce',
  'this',
  'error',
  'for',
  'me',
  'it',
  'is',
  'very',
  'repeatable',
  'like',
  'every',
  'time',
  'without',
  'fail',
  'this',
  'is',
  'the',
  'debug',
  'log',
  'of',
  'the',
  'pick',
  'happening',
  'number',
  'number',
  'number',
  'pick_it',
  'exec',
  'pick',
  'inbox',
  'list',
  'lbrace',
  'lbrace',
  'subject',
  'ftp',
  'rbrace',
  'rbrace',
  'number',
  'number',
  'sequence',
  'mercury',
  'number',
  'number',
  'number',
  'exec',
  'pick',
  'inbox',
  'list',
  'lbrace',
  'lbrace',
  'subject',
  'ftp',
  'rbrace',
  'rbrace',
  'number',
  'number',
  'sequence',
  'mercury',
  'number',
  'number',
  'n

## Train the Word2Vec Model

In [4]:
# Training hyperparameters, gathered in one place so they are easy to tune.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`.
w2v_params = dict(
    size=128,      # embedding dimensionality
    window=5,      # context window
    min_count=3,   # drop tokens that occur fewer than 3 times
    workers=4,     # training threads
    iter=800,      # passes over the corpus
    sg=0,          # 0 selects CBOW rather than skip-gram (per gensim docs)
    hs=0,          # 0 disables hierarchical softmax (per gensim docs)
)
model = Word2Vec(sentences, **w2v_params)

## Save Model

In [5]:
# Make sure the target directory exists, then persist the trained model there.
output_dir = "model/w2v/"
output_file = "model/w2v/email_spam.w2v"
os.makedirs(output_dir, exist_ok=True)
model.save(output_file)

## Load Model

In [6]:
# Reload the model from the file written in the save step; this rebinds
# `model`, so everything below works from the on-disk copy.
model = Word2Vec.load("model/w2v/email_spam.w2v")

## Model Information

In [7]:
# Keep a short handle on the model's `wv` attribute; the cells below use it
# for vocabulary, vector and similarity lookups.
w2v = model.wv

In [8]:
# Show the vocabulary list. The recorded output starts with very common tokens
# ('number', 'the', 'to'), so it is presumably ordered by descending frequency
# (confirm in the gensim docs).
# NOTE(review): this displays the whole vocabulary; slice it (e.g. `[:20]`) for
# a shorter preview. `index2word` is the gensim 3.x attribute name.
w2v.index2word

['number',
 'the',
 'to',
 'and',
 'of',
 'a',
 'in',
 'i',
 'url',
 'is',
 'that',
 'you',
 'it',
 'for',
 'this',
 'on',
 's',
 'with',
 'be',
 'not',
 'have',
 'are',
 'from',
 'as',
 't',
 'or',
 'your',
 'at',
 'by',
 'if',
 'but',
 'we',
 'can',
 'was',
 'an',
 'all',
 'will',
 'list',
 'my',
 'they',
 'so',
 'has',
 'one',
 'do',
 'more',
 'there',
 'get',
 'our',
 'no',
 'just',
 'out',
 'about',
 'what',
 'which',
 'people',
 'time',
 'their',
 'up',
 'use',
 'would',
 'only',
 'like',
 'new',
 'he',
 'who',
 'any',
 'free',
 'email',
 'now',
 'some',
 'me',
 'when',
 'don',
 'other',
 'mail',
 'here',
 'been',
 'm',
 'than',
 'how',
 'them',
 'wrote',
 'also',
 'rpm',
 'e',
 'then',
 'make',
 'mailing',
 'date',
 'message',
 'world',
 'said',
 'had',
 'us',
 'his',
 'because',
 'hyperlink',
 'into',
 're',
 'over',
 'spamassassin',
 'its',
 'd',
 'way',
 'were',
 'money',
 'first',
 'could',
 'users',
 'information',
 'these',
 'think',
 'should',
 've',
 'work',
 '__________

In [9]:
# Display the embedding matrix held on the word vectors (the recorded output
# is a 2-D float32 array, abbreviated by numpy).
w2v.vectors

array([[-0.48425394, -2.4420595 ,  0.57668823, ..., -3.209725  ,
         1.1528778 , -1.0570666 ],
       [-0.9712879 , -0.11089331, -1.4091527 , ..., -0.78666264,
        -0.25651205,  2.0329738 ],
       [-1.3585579 , -2.0334811 , -2.9978566 , ..., -1.1884505 ,
         0.36884072,  0.89595515],
       ...,
       [-1.2237101 ,  1.1278837 ,  0.06294397, ...,  0.9311315 ,
        -0.22662179, -4.587529  ],
       [-0.1028017 ,  1.4032881 , -1.01217   , ...,  1.4370338 ,
        -0.0281599 , -3.6900573 ],
       [-0.11113238, -0.07932117, -1.0202459 , ...,  0.8643712 ,
        -0.5111585 , -5.549954  ]], dtype=float32)

In [10]:
# Embedding dimensionality; should match the `size=128` used at training time.
w2v.vector_size

128

In [11]:
# Look up the learned embedding vector for the token "online".
w2v["online"]

array([ 3.1251576 ,  1.6283026 ,  1.3168931 ,  1.0806856 ,  0.6165383 ,
        1.4555733 , -2.9434695 , -2.6267276 , -0.7347846 , -0.63291854,
       -0.42770213, -1.6969575 , -0.7420905 ,  3.5092707 , -5.3976665 ,
        3.4823084 ,  2.223181  , -0.4290541 ,  0.8436355 , -6.3392367 ,
       -0.20925023,  3.296211  ,  2.2208211 , -0.27603784, -0.89867795,
       -1.2270781 , -0.9215378 ,  4.2275267 , -2.0149496 ,  1.6046672 ,
        0.70325816,  1.3176095 ,  0.36537904,  1.5368588 ,  4.462055  ,
        3.673685  ,  4.5297174 ,  0.47667426,  4.273858  ,  1.3359423 ,
        0.18882078,  0.19374351, -2.792836  , -1.4162123 ,  0.08573525,
        0.4580659 ,  0.83232737, -0.08674087, -3.3137245 ,  1.0465537 ,
        2.4803734 , -1.8778602 ,  2.735989  , -0.5030345 , -1.3576709 ,
       -3.789288  , -0.647946  , -0.84428805, -2.2344751 , -1.7749156 ,
       -2.4141164 , -2.754965  ,  4.9356523 , -3.4935641 ,  4.2265286 ,
        4.783513  ,  0.01864129, -1.326966  , -4.048335  , -1.74

## Sanity Check
### Similar words

In [12]:
# Sanity check: the 5 words closest to "me" in the embedding space, returned
# as (word, similarity score) pairs. Related pronouns are expected near the top.
w2v.similar_by_word("me", topn=5)

[('us', 0.5087110996246338),
 ('it', 0.41770100593566895),
 ('him', 0.40021273493766785),
 ('i', 0.39329099655151367),
 ('you', 0.380694180727005)]

### Higher order visualization

In [13]:
from umap import UMAP
import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
X