In [3]:
# Import Pandas
import pandas as pd

In [4]:
# Mengubah dataset menjadi dataframe dan hapus kolom judul_film
df = pd.read_csv("imdb_indonesian_movies_2.csv")
df = df.drop(columns=["judul_film"])

In [5]:
# Panggil 5 sampel teratas
df.head()

Unnamed: 0,ringkasan_sinopsis,genre
0,Raden Mas Said putra sulung Tumenggung Wilarik...,Drama
1,Soe Hok Gie adalah seorang aktivis yang hidup ...,Drama
2,Guru Bangsa Tjokroaminoto menceritakan tentang...,Drama
3,POL menceritakan kisah hidup yang luar biasa d...,Drama
4,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,Drama


In [6]:
# Melakukan one-hot-encoding
category = pd.get_dummies(df.genre)
df_baru = pd.concat([df, category], axis=1)
df_baru = df_baru.drop(columns="genre")
df_baru

Unnamed: 0,ringkasan_sinopsis,Drama,Horor,Komedi,Laga,Romantis
0,Raden Mas Said putra sulung Tumenggung Wilarik...,1,0,0,0,0
1,Soe Hok Gie adalah seorang aktivis yang hidup ...,1,0,0,0,0
2,Guru Bangsa Tjokroaminoto menceritakan tentang...,1,0,0,0,0
3,POL menceritakan kisah hidup yang luar biasa d...,1,0,0,0,0
4,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,1,0,0,0,0
...,...,...,...,...,...,...
1000,Winter in Tokyo berpusat pada kehidupan Ishida...,0,0,0,0,1
1001,Markonah melarikan diri ke Jakarta karena akan...,0,0,0,0,1
1002,"Tempat aking lebih dari 36 jam, Last Night ada...",0,0,0,0,1
1003,Proyek baru ini adalah tentang seorang lelaki ...,0,0,0,0,1


In [7]:
# Mengubah nilai-nilai dari dataframe ke dalam tipe data numpy array
sinopsis = df_baru["ringkasan_sinopsis"].values
label = df_baru[["Drama", "Horor", "Komedi", "Laga", "Romantis"]].values

In [8]:
# Membagi data menjadi training dan testing
from sklearn.model_selection import train_test_split
sinopsis_latih, sinopsis_test, label_latih, label_test = train_test_split(sinopsis, label, test_size=0.2)

In [9]:
# Ubah kata ke dalam numerik (tokenizer), lalu konversi sampel menjadi sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, oov_token="x")
tokenizer.fit_on_texts(sinopsis_latih)
tokenizer.fit_on_texts(sinopsis_test)

sekuens_latih = tokenizer.texts_to_sequences(sinopsis_latih)
sekuens_test = tokenizer.texts_to_sequences(sinopsis_test)

padded_latih = pad_sequences(sekuens_latih)
padded_test = pad_sequences(sekuens_test)

In [10]:
# Arsitektur
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=16),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(5, activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [11]:
num_epochs = 30
history = model.fit(padded_latih, label_latih, epochs=num_epochs,
                    validation_data=(padded_test, label_test), verbose=2)

Epoch 1/30
26/26 - 17s - loss: 1.6101 - accuracy: 0.1903 - val_loss: 1.6104 - val_accuracy: 0.1692 - 17s/epoch - 637ms/step
Epoch 2/30
26/26 - 11s - loss: 1.6059 - accuracy: 0.2090 - val_loss: 1.6072 - val_accuracy: 0.2537 - 11s/epoch - 441ms/step
Epoch 3/30
26/26 - 13s - loss: 1.5455 - accuracy: 0.3557 - val_loss: 1.5812 - val_accuracy: 0.2289 - 13s/epoch - 489ms/step
Epoch 4/30
26/26 - 11s - loss: 1.3041 - accuracy: 0.4502 - val_loss: 1.5988 - val_accuracy: 0.2935 - 11s/epoch - 432ms/step
Epoch 5/30
26/26 - 13s - loss: 0.9424 - accuracy: 0.6119 - val_loss: 1.8844 - val_accuracy: 0.3234 - 13s/epoch - 504ms/step
Epoch 6/30
26/26 - 12s - loss: 0.6423 - accuracy: 0.7774 - val_loss: 2.2803 - val_accuracy: 0.3234 - 12s/epoch - 458ms/step
Epoch 7/30
26/26 - 13s - loss: 0.3839 - accuracy: 0.8694 - val_loss: 2.6757 - val_accuracy: 0.3582 - 13s/epoch - 507ms/step
Epoch 8/30
26/26 - 12s - loss: 0.2319 - accuracy: 0.9502 - val_loss: 2.8469 - val_accuracy: 0.3632 - 12s/epoch - 463ms/step
Epoch 9/