In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [2]:
#load dataset

df = pd.read_csv("data/sms.csv")

In [3]:
#menghitung jumlah baris dan kolom

df.shape

(20829, 2)

In [4]:
#tipe
# 0 = Penipuan (Spam)
# 1 = Pinjaman Online (Spam)
# 2 = Judi Online (Spam)
# 3 = Lain-lain (Ham)

df.head()

Unnamed: 0,tipe,pesan
0,0,10.10 festival selamat anda m-dapatkan hadiah ...
1,0,surat keputusan dari pt.shopee slamat anda m-d...
2,0,info pemenang slamat!!! no.anda t-pilih m-dapa...
3,0,no.and4 terpilih mndptkn rp.175jt program thun...
4,1,oktober untung !! kartu super bagus dan jackpo...


In [5]:
#total data setiap tipe

print((df.tipe == 0).sum())
print((df.tipe == 1).sum())
print((df.tipe == 2).sum())
print((df.tipe == 3).sum())

11786
1651
5629
1763


In [6]:
#preprocessing
import re
import string

## fungsi untuk menghapus url didalam teks
def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"",text)

## fungsi untuk menghapus tanda baca didalam teks
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

## fungsi untuk mengubah teks menjadi huruf kecil
def case_folding(text):
    return text.casefold()

In [7]:
## stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

stopwords = StopWordRemoverFactory().create_stop_word_remover()

def remove_stopwords(text):
    return stopwords.remove(text)

In [51]:
## stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = StemmerFactory().create_stemmer()

def stemming(text):
    return stemmer.stem(text)

In [52]:
df["pesan"] = df.pesan.map(remove_URL)
df["pesan"] = df.pesan.map(remove_punct)
df["pesan"] = df.pesan.map(case_folding)
df["pesan"] = df.pesan.map(remove_stopwords)
df["pesan"] = df.pesan.map(stemming)

In [10]:
from collections import Counter

## fungsi untuk menghitung total setiap kata
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1;
    return count

counter = counter_word(df.pesan)

In [11]:
#total kata

len(counter)

35632

In [12]:
counter

Counter({'1010': 2,
         'festival': 4,
         'selamat': 1980,
         'mdapatkan': 185,
         'hadiah': 3895,
         'ke2': 1043,
         'cek': 3595,
         'tunai': 1305,
         'rp175jt': 282,
         'pin': 3419,
         'j7k2b59': 19,
         'info': 3841,
         'klik': 4212,
         'sidprogramhadiah77': 1,
         'surat': 335,
         'putus': 220,
         'ptshopee': 79,
         'slamat': 169,
         'pemenang25f4777': 1,
         'uinfo': 2123,
         'bitlyundianshopee75': 1,
         'menang': 2999,
         'noanda': 848,
         'tpilih': 70,
         'rp175000000': 13,
         'dri': 758,
         'sh0pee': 20,
         '2020': 29,
         '25f4777': 20,
         'noand4': 1,
         'pilih': 1561,
         'mndptkn': 27,
         'program': 335,
         'thunan': 1,
         'rejeki': 10,
         'id': 543,
         'sidpemenangresmi': 1,
         'oktober': 10,
         'untung': 222,
         'kartu': 1035,
         'super': 56,

In [13]:
counter.most_common(5)

[('klik', 4212), ('hadiah', 3895), ('info', 3841), ('cek', 3595), ('no', 3422)]

In [14]:
num_unique_words = len(counter)

In [15]:
#membagi data set menjadi data training & data testing

train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
test_df = df[train_size:]

#pecah pesan dan tipe
train_pesan = train_df.pesan.to_numpy()
train_tipe = train_df.tipe.to_numpy()
test_pesan = test_df.pesan.to_numpy()
test_tipe = test_df.tipe.to_numpy()

In [16]:
#one hot encoding
from tensorflow.keras.utils import to_categorical

train_tipe = to_categorical(train_tipe)
test_tipe = to_categorical(test_tipe)

train_tipe

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]], dtype=float32)

In [17]:
## tokenize

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_pesan)

#fungsi untuk memberikan key untuk setiap kata
word_index = tokenizer.word_index

word_index

{'klik': 1,
 'info': 2,
 'hadiah': 3,
 'cek': 4,
 'pin': 5,
 'no': 6,
 'menang': 7,
 'hub': 8,
 'resmi': 9,
 'dgn': 10,
 'uinfo': 11,
 'selamat': 12,
 'bpkb': 13,
 'wa': 14,
 'bunga': 15,
 'kode': 16,
 'dana': 17,
 'pilih': 18,
 'bi': 19,
 'pulsa': 20,
 'pesan': 21,
 '1': 22,
 'yg': 23,
 'yth': 24,
 'langgan': 25,
 'tunai': 26,
 'bri': 27,
 'cepat': 28,
 'undi': 29,
 'proses': 30,
 'bantu': 31,
 'pinjam': 32,
 'sy': 33,
 'isi': 34,
 'promo': 35,
 'sms': 36,
 'mobil': 37,
 'ke2': 38,
 'main': 39,
 'ulang': 40,
 'kartu': 41,
 'bonus': 42,
 'dr': 43,
 'utk': 44,
 'bank': 45,
 'online': 46,
 'sd': 47,
 'indonesia': 48,
 'minat': 49,
 'hubung': 50,
 'jamin': 51,
 'tutup': 52,
 'uang': 53,
 'rek': 54,
 'nomor': 55,
 'cicil': 56,
 'chat': 57,
 'kta': 58,
 'dri': 59,
 'noanda': 60,
 'rp': 61,
 'juta': 62,
 'butuh': 63,
 'an': 64,
 'harga': 65,
 '2': 66,
 'aman': 67,
 '5': 68,
 'kredit': 69,
 'mudah': 70,
 'maaf': 71,
 'bayar': 72,
 'id': 73,
 'diskon': 74,
 'solusi': 75,
 'kirim': 76,
 'dapat'

In [18]:
#fungsi untuk mengubah setiap kata menjadi angka berdasarkan key tokenize
train_sequences = tokenizer.texts_to_sequences(train_pesan)
test_sequences = tokenizer.texts_to_sequences(test_pesan)

In [19]:
print(train_pesan[0])
print(train_sequences[0])

print(word_index['festival'])

1010 festival selamat mdapatkan hadiah ke2 cek tunai rp175jt pin j7k2b59 info klik sidprogramhadiah77
[6953, 5255, 12, 204, 3, 38, 4, 26, 154, 5, 1279, 2, 1, 11027]
5255


In [20]:
#fungsi untuk menyamakan panjang kata untuk semua urutan
from tensorflow.keras.preprocessing.sequence import pad_sequences

#jumlah maksimum kata dalam satu urutan
max_length = 50

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

In [21]:
print(train_pesan[1])
print(train_sequences[1])
print(train_padded[1])

surat putus ptshopee slamat mdapatkan cek tunai rp175jt pin pemenang25f4777 uinfo klik bitlyundianshopee75
[165, 266, 434, 241, 204, 4, 26, 154, 5, 11028, 11, 1, 11029]
[  165   266   434   241   204     4    26   154     5 11028    11     1
 11029     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [22]:
#fungsi untuk menukar posisi key dengan value
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

reverse_word_index

{1: 'klik',
 2: 'info',
 3: 'hadiah',
 4: 'cek',
 5: 'pin',
 6: 'no',
 7: 'menang',
 8: 'hub',
 9: 'resmi',
 10: 'dgn',
 11: 'uinfo',
 12: 'selamat',
 13: 'bpkb',
 14: 'wa',
 15: 'bunga',
 16: 'kode',
 17: 'dana',
 18: 'pilih',
 19: 'bi',
 20: 'pulsa',
 21: 'pesan',
 22: '1',
 23: 'yg',
 24: 'yth',
 25: 'langgan',
 26: 'tunai',
 27: 'bri',
 28: 'cepat',
 29: 'undi',
 30: 'proses',
 31: 'bantu',
 32: 'pinjam',
 33: 'sy',
 34: 'isi',
 35: 'promo',
 36: 'sms',
 37: 'mobil',
 38: 'ke2',
 39: 'main',
 40: 'ulang',
 41: 'kartu',
 42: 'bonus',
 43: 'dr',
 44: 'utk',
 45: 'bank',
 46: 'online',
 47: 'sd',
 48: 'indonesia',
 49: 'minat',
 50: 'hubung',
 51: 'jamin',
 52: 'tutup',
 53: 'uang',
 54: 'rek',
 55: 'nomor',
 56: 'cicil',
 57: 'chat',
 58: 'kta',
 59: 'dri',
 60: 'noanda',
 61: 'rp',
 62: 'juta',
 63: 'butuh',
 64: 'an',
 65: 'harga',
 66: '2',
 67: 'aman',
 68: '5',
 69: 'kredit',
 70: 'mudah',
 71: 'maaf',
 72: 'bayar',
 73: 'id',
 74: 'diskon',
 75: 'solusi',
 76: 'kirim',
 77: 'da

In [23]:
#fungsi untuk membaca kembali kata yang telah diubah menjadi angka
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [24]:
decoded_text = decode(train_sequences[2])

print(train_sequences[2])
print(decoded_text)

[2, 7, 241, 60, 539, 204, 4, 1731, 59, 1225, 1332, 5, 1226, 2, 3, 1]
info menang slamat noanda tpilih mdapatkan cek rp175000000 dri sh0pee 2020 pin 25f4777 info hadiah klik


In [30]:
# create model
from tensorflow.keras import layers
from tensorflow.keras.metrics import Precision, Recall, AUC, CategoricalAccuracy

def get_model():
    model = keras.models.Sequential([
        layers.Embedding(num_unique_words, 8, input_length=max_length), # 8 embedding dimension
        layers.Flatten(),
        layers.Dense(4, activation='softmax') # 4 probability output
    ])
    metrics = [
        CategoricalAccuracy(),
        Precision(),
        Recall(),
        AUC(),
    ]
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=metrics)
    return model

model = get_model()

In [45]:
#fungsi untuk melakukan training model

model.fit(train_padded, train_tipe, epochs=20, batch_size=512, validation_data=(test_padded, test_tipe))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2a0f8fac0>

In [49]:
predictions = model.predict(test_padded)

In [50]:
# menampilkan beberapa prediksi
for i in range(30, 60):
    pred_tipe = np.argmax(predictions[i])
    true_tipe = np.argmax(test_tipe[i])
    message = decode(test_sequences[i])
    confident_level = predictions[i, pred_tipe] * 100
    
    print('"{}"\n> Tipe: {}\n> Prediksi: {} ({:.2f}%)\n\n'.format(message.strip(), true_tipe, pred_tipe, confident_level))

"biaya asi fund tawar pinjam tunai proses cepat kredit kta multiguna takeover bunga saing 640 flat tahun info"
> Tipe: 2
> Prediksi: 2 (100.00%)


"satusatunya situs togel online main baik lengkap habis waktu santai main seru ayo bukti"
> Tipe: 1
> Prediksi: 1 (99.99%)


"bpkibu indira andriani dptkan kta sd 250 jt syarat ktpccnpwp hub anton phwa 085782561770 bantu proses tkq"
> Tipe: 2
> Prediksi: 2 (100.00%)


"rekening bank bri an ade agus salim no rek"
> Tipe: 0
> Prediksi: 0 (100.00%)


"bonus stiker aliando ambil gratis main jg game seru 123995 stop 12366 info 087877707891 xl 838 axis"
> Tipe: 1
> Prediksi: 1 (99.74%)


"pijat lemak dar darah promo 250rb sms free ongkir"
> Tipe: 3
> Prediksi: 3 (67.50%)


"uang bpkb solusi financial leasing ram 350 cab indonesia proses cepat hub lil info klik bitlyeasylifeind"
> Tipe: 2
> Prediksi: 2 (85.92%)


"biar mobiltruk nganggur jamin finance besar tp murah proses hub anton 082123138899081513377899"
> Tipe: 2
> Prediksi: 2 (100.00%)


"inv

In [112]:
#fungsi untuk menyimpan model

concrete_func = model.signatures[
  tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
concrete_func.inputs[0].set_shape([1, 416, 416, 3])
coverter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
tflite_model = coverter.convert()
open("./model/tensorf_lite.tflite", "wb").write(tflite_model)

AttributeError: 'Sequential' object has no attribute 'signatures'