<a href="https://colab.research.google.com/github/stevengregori92/LearnWord2Vec/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import os
import pandas as pd

from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

#Prepare Corpus

In [3]:
!gdown https://drive.google.com/uc?id=10OnDq1OAByUxcQEPCDZ19jxU0W8QV6hu
!unzip /content/word2vec.zip

Downloading...
From: https://drive.google.com/uc?id=10OnDq1OAByUxcQEPCDZ19jxU0W8QV6hu
To: /content/word2vec.zip
  0% 0.00/184k [00:00<?, ?B/s]100% 184k/184k [00:00<00:00, 84.2MB/s]
Archive:  /content/word2vec.zip
   creating: data/
  inflating: data/kompas.txt         
   creating: __MACOSX/
   creating: __MACOSX/data/
  inflating: __MACOSX/data/._kompas.txt  
  inflating: data/.DS_Store          
  inflating: __MACOSX/data/._.DS_Store  
  inflating: data/gojek_review_sentiment.csv  
  inflating: __MACOSX/data/._gojek_review_sentiment.csv  
  inflating: data/gojek_review.csv   


In [4]:
df = pd.read_csv('data/gojek_review.csv')
df.head()

Unnamed: 0,review,rate
0,Sangat kecewa. Kecewa sekali. Udh top up. Mau ...,1
1,Ga niat ngasih promo sialan temen udh pake ref...,1
2,Kalau sistemnya rata begini apa bedanya yg raj...,1
3,"Ongkosnya da mahal, minimal 16rb..... Sekarang...",1
4,Tolol anjing..!!!! Aplikasi yang katanga karya...,1


In [5]:
sentences = [word_tokenize(review.lower()) for review in tqdm(df.review)]
sentences[:5]

  0%|          | 0/1277 [00:00<?, ?it/s]

[['sangat',
  'kecewa',
  '.',
  'kecewa',
  'sekali',
  '.',
  'udh',
  'top',
  'up',
  '.',
  'mau',
  'di',
  'transaksi',
  'in',
  'malah',
  'kasih',
  'nomor',
  'yang',
  'salah',
  '.',
  'padahal',
  'nomor',
  'di',
  'profil',
  'benar',
  '.',
  'buat',
  'ap',
  'ah',
  'q',
  'top',
  'up',
  'tp',
  'ga',
  'bisa',
  'digunain',
  '.',
  'balikin.',
  '!',
  'ga',
  'becus',
  '.',
  'kasih',
  'nomer',
  'saalah',
  '.',
  'saya',
  'mau',
  'masukin',
  'kode',
  'transaksi',
  'gimana.',
  '!',
  '!',
  '!',
  '!'],
 ['ga',
  'niat',
  'ngasih',
  'promo',
  'sialan',
  'temen',
  'udh',
  'pake',
  'reff',
  'ga',
  'pake',
  'cara',
  'curang',
  'malah',
  'di',
  'stop',
  '.',
  'kapitalis',
  '.',
  'daerah',
  'ku',
  'ongkir',
  'ga',
  'ada',
  'subsidi',
  'di',
  'daerah',
  'lain',
  'masih',
  'ada',
  '.',
  'ampas',
  'gojek',
  'skrng',
  '.',
  'strategi',
  'baru',
  'ga',
  'bakar',
  'duit',
  'gimana',
  '.',
  'yg',
  'ada',
  'transaksi',
  'h

#Train Word2Vec Model

In [6]:
model = Word2Vec(sentences, vector_size=128, window=5, min_count=3, workers=4, epochs=1000, sg=0, hs=0)

#Save

In [7]:
os.makedirs("model/w2v/", exist_ok=True)

In [8]:
model.save('model/w2v/gojek_review.w2v')

#Load

In [9]:
model = Word2Vec.load('model/w2v/gojek_review.w2v')

#Continue Training

In [10]:
contoh_data = [
    ['aplikasinya', 'ok', 'kok', '.', 'sangat', 'membantu', 'untuk', 'keseharian', 'saya'],
    ['aplikasinya', 'gak', 'nyaman', 'dipakai', 'bule']
]

In [11]:
model.train(contoh_data, total_examples = len(contoh_data), epochs=1)



(10, 14)

In [12]:
model.save('model/w2v/gojek_review.w2v')

#Model Information

In [13]:
w2v = model.wv

In [14]:
w2v.index_to_key

[',',
 '.',
 'saya',
 '!',
 'di',
 'gojek',
 '..',
 'yg',
 'bisa',
 '?',
 'ada',
 'nya',
 'driver',
 'dan',
 'ini',
 'aplikasi',
 'tidak',
 'yang',
 'gak',
 'mau',
 'lagi',
 'ga',
 '...',
 'tolong',
 'aja',
 'sudah',
 'tapi',
 'untuk',
 'voucher',
 'padahal',
 'gopay',
 'ke',
 'sama',
 'udah',
 'order',
 'lebih',
 'akun',
 'sangat',
 'dengan',
 'ya',
 'buat',
 'sekarang',
 'kenapa',
 'jadi',
 'kasih',
 'malah',
 'lama',
 'dari',
 'gofood',
 'itu',
 'promo',
 'kalo',
 'jauh',
 'juga',
 'dapat',
 'pake',
 'karena',
 'masuk',
 'banyak',
 'terus',
 'kalau',
 'apa',
 'baru',
 'go',
 'dong',
 'kok',
 'gk',
 'kecewa',
 'pakai',
 'selalu',
 'harus',
 'pesan',
 'orderan',
 'sistem',
 'hari',
 'cuma',
 'makin',
 'tp',
 "''",
 'saldo',
 'banget',
 'dulu',
 'atau',
 'anak',
 'beli',
 'sekali',
 'mohon',
 'sampai',
 'makanan',
 'sering',
 'mahal',
 'setiap',
 'grab',
 'masih',
 'klo',
 'pas',
 'jangan',
 'baik',
 'saja',
 '1',
 'drivernya',
 'sy',
 'update',
 'kali',
 'membantu',
 'perbaiki',
 '(',

In [15]:
w2v.vectors

array([[ 0.03834431,  0.19737446,  1.5746331 , ..., -0.24893184,
        -1.2714243 ,  0.7045421 ],
       [ 0.47159183,  0.9920001 ,  1.8186547 , ..., -1.5711766 ,
        -1.3462093 , -0.5467184 ],
       [-0.23808195,  1.6348968 , -0.1357832 , ...,  0.2739658 ,
        -2.1487014 , -0.45402798],
       ...,
       [-1.1425531 , -0.8045286 ,  0.11660334, ..., -2.1053054 ,
        -2.9483645 ,  1.7326711 ],
       [ 2.4282458 ,  1.7900873 , -0.81434923, ...,  1.2435498 ,
        -1.6615396 ,  1.6834487 ],
       [ 1.4906496 , -1.6972607 , -0.62908584, ..., -1.2329481 ,
        -0.6993661 , -0.7414474 ]], dtype=float32)

In [16]:
w2v.vector_size

128

In [17]:
w2v['sepertinya']

array([ 0.76833177,  1.1094381 ,  0.133505  , -1.1274123 ,  1.9300805 ,
       -0.78012   ,  0.05615833, -0.79901135,  3.0788934 ,  0.8870458 ,
        0.79384387, -3.2046099 , -1.7834723 , -1.8499963 ,  0.24075627,
        3.2859387 ,  1.3676164 ,  1.9708302 , -0.2314091 , -3.029348  ,
       -0.38174322, -3.0621772 , -1.4423974 ,  3.7645247 ,  2.737375  ,
       -0.5082192 , -2.7026832 , -0.93534434,  1.590301  ,  2.9525247 ,
        3.4516602 , -2.4428782 , -1.1483067 , -0.18216507, -0.35854962,
       -1.814619  , -0.46979484,  0.470524  , -0.25099593, -0.68367624,
       -1.2579013 ,  1.4763334 ,  1.9010266 , -3.9073598 ,  1.2268602 ,
        1.6215059 , -0.7460377 , -2.5657167 ,  0.80725026,  0.7474756 ,
       -0.04583569,  2.7801256 ,  0.19732949,  1.1312993 , -0.7957213 ,
        1.9160415 ,  4.6669    , -1.8121775 , -2.436151  , -1.2830954 ,
       -0.3760371 , -1.2716372 , -2.244738  ,  0.11568068, -0.739354  ,
        2.7568247 ,  0.2985564 , -0.8226016 ,  1.348325  , -0.58