# Natural Language Processing with Tensorflow

# Multiclass Text Classification

In [1]:
import pandas as pd

df = pd.read_csv('/content/imdb_indonesian_movies_2.csv')
df = df.drop(columns=['judul_film'])

In [2]:
df.head()

Unnamed: 0,ringkasan_sinopsis,genre
0,Raden Mas Said putra sulung Tumenggung Wilarik...,Drama
1,Soe Hok Gie adalah seorang aktivis yang hidup ...,Drama
2,Guru Bangsa Tjokroaminoto menceritakan tentang...,Drama
3,POL menceritakan kisah hidup yang luar biasa d...,Drama
4,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,Drama


In [3]:
category = pd.get_dummies(df.genre)
df_baru = pd.concat([df, category], axis=1)
df_baru = df_baru.drop(columns='genre')
df_baru

Unnamed: 0,ringkasan_sinopsis,Drama,Horor,Komedi,Laga,Romantis
0,Raden Mas Said putra sulung Tumenggung Wilarik...,1,0,0,0,0
1,Soe Hok Gie adalah seorang aktivis yang hidup ...,1,0,0,0,0
2,Guru Bangsa Tjokroaminoto menceritakan tentang...,1,0,0,0,0
3,POL menceritakan kisah hidup yang luar biasa d...,1,0,0,0,0
4,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,1,0,0,0,0
...,...,...,...,...,...,...
1000,Winter in Tokyo berpusat pada kehidupan Ishida...,0,0,0,0,1
1001,Markonah melarikan diri ke Jakarta karena akan...,0,0,0,0,1
1002,"Tempat aking lebih dari 36 jam, Last Night ada...",0,0,0,0,1
1003,Proyek baru ini adalah tentang seorang lelaki ...,0,0,0,0,1


In [4]:
sinopsis = df_baru['ringkasan_sinopsis'].values
label = df_baru[['Drama', 'Horor', 'Komedi', 'Laga', 'Romantis']].values

In [5]:
from sklearn.model_selection import train_test_split

sinopsis_latih, sinopsis_test, label_latih, label_test = train_test_split(sinopsis, label, test_size=0.2)

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
tokenizer = Tokenizer(num_words=5000, oov_token='x')
tokenizer.fit_on_texts(sinopsis_latih) 
tokenizer.fit_on_texts(sinopsis_test)
 
sekuens_latih = tokenizer.texts_to_sequences(sinopsis_latih)
sekuens_test = tokenizer.texts_to_sequences(sinopsis_test)
 
padded_latih = pad_sequences(sekuens_latih) 
padded_test = pad_sequences(sekuens_test)

In [7]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=16),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [8]:
num_epochs = 30
history = model.fit(padded_latih, 
                    label_latih, 
                    epochs=num_epochs, 
                    validation_data=(padded_test, label_test), 
                    verbose=2)

Epoch 1/30
26/26 - 12s - loss: 1.6104 - accuracy: 0.1803 - val_loss: 1.6096 - val_accuracy: 0.1940
Epoch 2/30
26/26 - 5s - loss: 1.6058 - accuracy: 0.2413 - val_loss: 1.6065 - val_accuracy: 0.1990
Epoch 3/30
26/26 - 5s - loss: 1.5517 - accuracy: 0.3035 - val_loss: 1.5739 - val_accuracy: 0.2836
Epoch 4/30
26/26 - 5s - loss: 1.3102 - accuracy: 0.4104 - val_loss: 1.5810 - val_accuracy: 0.2886
Epoch 5/30
26/26 - 5s - loss: 0.9891 - accuracy: 0.5634 - val_loss: 1.7664 - val_accuracy: 0.2985
Epoch 6/30
26/26 - 5s - loss: 0.6549 - accuracy: 0.6953 - val_loss: 2.1211 - val_accuracy: 0.2687
Epoch 7/30
26/26 - 5s - loss: 0.4788 - accuracy: 0.8209 - val_loss: 2.1490 - val_accuracy: 0.3284
Epoch 8/30
26/26 - 5s - loss: 0.2699 - accuracy: 0.9167 - val_loss: 2.5522 - val_accuracy: 0.3682
Epoch 9/30
26/26 - 5s - loss: 0.1139 - accuracy: 0.9789 - val_loss: 3.1018 - val_accuracy: 0.3632
Epoch 10/30
26/26 - 5s - loss: 0.0479 - accuracy: 0.9888 - val_loss: 3.3729 - val_accuracy: 0.3682
Epoch 11/30
26/26 