# Membuat Model NLP dengan TensorFlow

Dataset dari kaggle : [Spam Text Message Classification](https://www.kaggle.com/team-ai/spam-text-message-classification)

In [1]:
import pandas as pd

df = pd.read_csv('/content/SPAM text message 20170820 - Data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
Category = pd.get_dummies(df.Category)
df_baru = pd.concat([df, Category], axis=1)
df_baru = df_baru.drop(columns='Category')
df_baru

Unnamed: 0,Message,ham,spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0
...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,0,1
5568,Will ü b going to esplanade fr home?,1,0
5569,"Pity, * was in mood for that. So...any other s...",1,0
5570,The guy did some bitching but I acted like i'd...,1,0


In [3]:
from sklearn.model_selection import train_test_split

pesan = df_baru['Message'].values
label = df_baru[['spam']].values


pesan_latih, pesan_test, label_latih, label_test = train_test_split(pesan, label, test_size=0.2)

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, oov_token='x')
tokenizer.fit_on_texts(pesan_latih)
tokenizer.fit_on_texts(pesan_test)

sekuens_latih = tokenizer.texts_to_sequences(pesan_latih)
sekuens_test = tokenizer.texts_to_sequences(pesan_test)

padded_latih = pad_sequences(sekuens_latih)
padded_test = pad_sequences(sekuens_test)

print(sekuens_latih)
print(sekuens_test)

[[2, 227, 157, 719, 35, 40, 227, 32, 134, 35, 1690, 517, 29, 2994, 9], [40, 55, 4427, 2995, 73, 2332, 16, 67, 25, 6, 248, 4428, 4429, 4430, 119, 2996, 61, 4431, 1500, 441, 53, 16, 211, 296, 4432, 20, 4433, 2997, 2998, 2333, 3, 1501, 106, 159, 2996, 1502, 9, 106, 44, 43, 902, 556, 42, 70, 58, 1503, 2998, 687, 557, 4434, 130, 6, 1328, 10, 6, 67, 6, 124, 4435], [1329, 93, 36, 582, 114], [6, 4436, 472, 44, 43, 149, 4437, 903, 2999, 374, 4438, 44, 43, 44, 64, 43, 63, 79, 535, 324, 8, 72, 4, 199, 151], [85, 12, 2334, 634, 1025, 15, 47, 35, 2, 60, 76, 22], [85, 79, 17, 104], [457, 39, 841, 4439, 19, 3, 842, 26, 153, 35, 431, 26, 3000, 7, 136], [4440, 1026, 961, 4441, 113, 107, 17, 196], [1691, 100, 33, 64], [26, 164, 536, 18, 39, 249, 31, 1941], [96, 2, 1692, 1693, 352, 391, 40, 29, 50, 42, 1330, 4442, 75, 797, 432, 3, 159, 392, 1331, 3, 3001, 42, 3002, 720, 29, 50, 1332, 797, 3003, 3, 20, 392, 1331, 3, 4443, 1106, 42, 1942, 1331, 30, 42], [558, 3004, 114, 583, 114, 7, 83, 26, 1027, 25, 2, 29

In [5]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=16),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    # tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [6]:
num_epochs = 8
history = model.fit(padded_latih,
                    label_latih,
                    epochs=num_epochs,
                    validation_data=(padded_test, label_test),
                    verbose=2)

Epoch 1/8
140/140 - 13s - loss: 0.2052 - accuracy: 0.9376 - val_loss: 0.0768 - val_accuracy: 0.9758
Epoch 2/8
140/140 - 5s - loss: 0.0312 - accuracy: 0.9915 - val_loss: 0.0734 - val_accuracy: 0.9794
Epoch 3/8
140/140 - 5s - loss: 0.0143 - accuracy: 0.9971 - val_loss: 0.0791 - val_accuracy: 0.9794
Epoch 4/8
140/140 - 5s - loss: 0.0085 - accuracy: 0.9982 - val_loss: 0.0774 - val_accuracy: 0.9803
Epoch 5/8
140/140 - 5s - loss: 0.0067 - accuracy: 0.9980 - val_loss: 0.0785 - val_accuracy: 0.9812
Epoch 6/8
140/140 - 5s - loss: 0.0063 - accuracy: 0.9973 - val_loss: 0.0952 - val_accuracy: 0.9848
Epoch 7/8
140/140 - 5s - loss: 0.0015 - accuracy: 0.9998 - val_loss: 0.1216 - val_accuracy: 0.9839
Epoch 8/8
140/140 - 5s - loss: 3.1330e-04 - accuracy: 1.0000 - val_loss: 0.1265 - val_accuracy: 0.9830
