**IMPORT LIBRARY**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import tensorflow as tf

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

**DATA ACQUISITION**

In [2]:
# Download dataset
!wget https://raw.githubusercontent.com/ksnugroho/klasifikasi-spam-sms/master/data/dataset_sms_spam_v1.csv

--2023-10-29 23:01:12--  https://raw.githubusercontent.com/ksnugroho/klasifikasi-spam-sms/master/data/dataset_sms_spam_v1.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 128896 (126K) [text/plain]
Saving to: ‘dataset_sms_spam_v1.csv’


2023-10-29 23:01:12 (4.07 MB/s) - ‘dataset_sms_spam_v1.csv’ saved [128896/128896]



In [3]:
data = pd.read_csv('dataset_sms_spam_v1.csv')
data.head()

Unnamed: 0,teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2


**TEXT PREPROCESSING**

In [5]:
# melakukan preprocessing karena kita tidak ingin mengubah makna (konteks) dari suatu kata yang menyusun suatu kalimat

import re

def text_preprocessing(text):
  text = text.lower()                               # Mengubah teks menjadi lower case
  text = re.sub(r'https?://\S+|www\.\S+', '', text) # Menghapus URL
  text = re.sub(r'[-+]?[0-9]+', '', text)           # Menghapus angka
  text = re.sub(r'[^\w\s]','', text)                # Menghapus karakter tanda baca
  text = text.strip()                               # Menghapus whitespaces
  return text

In [6]:
%time data['clean_teks'] = data['teks'].apply(text_preprocessing)

# Perhatikan waktu komputasi ketika proses text preprocessing, bandingkan dengan langkah text preprocessing pada pertemuan sebelumnya

CPU times: user 26.4 ms, sys: 1.62 ms, total: 28 ms
Wall time: 35.8 ms


In [7]:
data.head()

Unnamed: 0,teks,label,clean_teks
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,promo beli paket flash mulai gb di my telkomse...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,gb hari hanya rp ribu spesial buat anda yang ...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,plg yth sisa kuota flash anda kb download myte...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,plg yth sisa kuota flash anda kb download myte...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,gb hari hanya rp ribu spesial buat anda yang ...


**WORD EMBEDING**

In [8]:
# Import library
import gensim

# Lihat versi Gensim yang digunakan
gensim.__version__

'4.3.2'

In [9]:
# Tokenize kata pada setiap kalimat
def tokenization(text):
  text = re.split('\W+', text)
  return text

sentences = data['clean_teks'].apply(lambda x: tokenization(x.lower()))
sentences

0       [promo, beli, paket, flash, mulai, gb, di, my,...
1       [gb, hari, hanya, rp, ribu, spesial, buat, and...
2       [plg, yth, sisa, kuota, flash, anda, kb, downl...
3       [plg, yth, sisa, kuota, flash, anda, kb, downl...
4       [gb, hari, hanya, rp, ribu, spesial, buat, and...
                              ...                        
1138    [yooo, sama, oke, nanti, aku, umumin, di, grup...
1139    [sebelumnya, ga, ad, nulis, kerudung, kirain, ...
1140                                [mba, mau, kirim, ya]
1141    [nama, beaok, bwrangkat, pagimau, cas, atay, t...
1142                    [no, bri, atas, nama, kamu, mana]
Name: clean_teks, Length: 1143, dtype: object

In [10]:
# Defenisikan parameter training Word2Vec

EMBEDDING_SIZE = 100    # Dimensi word vektor / neuron pada projection (hidden) layer
WINDOW_SIZE = 5         # Window size. Jarak maksimum antara kata saat ini dan yang diprediksi dalam sebuah
MIN_WORD  = 1           # Model akan mengabaikan semua kata dengan frekuensi total lebih rendah dari ini (opsional)
EPOCH = 10              # Jumlah iterasi (epoch).
SG = 1                  # Strategi algoritma pelatihan: 1 untuk skip-gram, 0 untuk CBOW
NEGATIVE = 5            # Negative sampling. Jika 0, negative sampling tidak digunakan

In [14]:
%%time
# Proses training Word2Vec
from gensim.models import Word2Vec

model_word2vec = Word2Vec(sentences, vector_size=EMBEDDING_SIZE, sg=SG, min_count=MIN_WORD, window=WINDOW_SIZE, negative=NEGATIVE, epochs=EPOCH)


CPU times: user 1.36 s, sys: 14.1 ms, total: 1.37 s
Wall time: 850 ms


In [15]:
# Cari nilai vektor dari kata tertentu
model_word2vec.wv['kuota']

array([ 0.24411376, -0.01263912, -0.05993753, -0.32596415,  0.08765812,
       -0.15162742,  0.25806516,  0.65170914, -0.25549203, -0.13530655,
        0.02114179, -0.07087318,  0.21364853,  0.22406279, -0.0629739 ,
       -0.46499294,  0.03859901, -0.33678436, -0.5071769 , -0.57295555,
        0.45570347, -0.1869036 ,  0.08123992, -0.05913262, -0.04679316,
       -0.18949053, -0.0530128 , -0.39285833, -0.3440436 ,  0.15931547,
        0.21821137, -0.4487441 ,  0.13375813, -0.32924992,  0.04452154,
        0.2702818 , -0.6135232 ,  0.28342366,  0.10221034, -0.4509163 ,
       -0.23934641, -0.02171874, -0.4468141 ,  0.02523799,  0.5085053 ,
       -0.19954169, -0.31195405,  0.02343066, -0.2936841 ,  0.12303311,
        0.30055302, -0.24665155, -0.42836636, -0.23672077,  0.348198  ,
       -0.02711496, -0.02471961,  0.20822302, -0.5443058 ,  0.20925057,
       -0.3906779 ,  0.2735098 , -0.11698335,  0.06192494, -0.02340549,
       -0.28943166,  0.0411907 ,  0.23590295,  0.00619035,  0.10

In [16]:
# Menemukan kata kata teratas yang paling mirip dari kata terentu
# Menghitung kesamaan dari vektor bobot proyeksi dari kata-kata yang diberikan dan vektor untuk setiap kata dalam model.

model_word2vec.wv.most_similar('kuota')

[('paket', 0.9924838542938232),
 ('mb', 0.9912590980529785),
 ('flash', 0.9877873063087463),
 ('mulai', 0.9845399856567383),
 ('extra', 0.9836029410362244),
 ('nikmati', 0.9825679659843445),
 ('internetan', 0.9818686842918396),
 ('cuma', 0.9808180928230286),
 ('hr', 0.9804248213768005),
 ('hanya', 0.9799321293830872)]

In [17]:
# Kata-kata yang ingin kita plot vektornya
word_list = ['paket', 'flash', 'hanya', 'cuma', 'extra', 'promo', 'anda', 'buruan', 'hanya']

# daftar vektor dari kata-kata tersebut
word_vectors = np.array([model_word2vec.wv[w] for w in word_list])
print(word_vectors)

[[ 2.05151826e-01  2.76848674e-02 -1.54141504e-02 -2.46195346e-01
   5.09769544e-02 -1.25613272e-01  2.22080484e-01  6.64640486e-01
  -1.82436928e-01 -8.77740011e-02  3.63425910e-03 -5.48662283e-02
   1.84074253e-01  1.62879467e-01 -7.63297603e-02 -4.53295350e-01
   8.20766091e-02 -3.09257686e-01 -4.66736227e-01 -4.84286904e-01
   4.25417930e-01 -1.80746391e-01  8.99982676e-02 -6.00868985e-02
  -3.84445935e-02 -1.47731692e-01 -1.22831292e-01 -3.33091229e-01
  -2.73645759e-01  1.81637749e-01  2.41808280e-01 -4.09397990e-01
   1.38801277e-01 -3.06129217e-01  3.42752300e-02  2.66651779e-01
  -5.44341683e-01  2.17699081e-01  8.83530080e-02 -4.20977205e-01
  -2.33199582e-01 -4.46175151e-02 -3.47725391e-01  1.31426491e-02
   4.43673134e-01 -2.18659252e-01 -3.04891974e-01 -2.27620695e-02
  -2.50746965e-01  1.17059961e-01  2.90024340e-01 -2.33300820e-01
  -3.49739283e-01 -1.98481813e-01  2.83401132e-01 -4.71962132e-02
   3.69816385e-02  2.00217932e-01 -5.34715235e-01  2.08583415e-01
  -3.53784