In [1]:
# RAHMADI RIDWAN
# ridwan@blast.co.id

import numpy as np
import csv

In [2]:
# Dataset yang digunakan adalah dataset berita BBC News yang terdiri atas 2225 baris data

In [3]:
!wget --no-check-certificate \
    https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv \
    -O /tmp/bbc-text.csv

--2021-07-22 11:36:51--  https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.117.128, 74.125.142.128, 74.125.195.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.117.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [text/csv]
Saving to: ‘/tmp/bbc-text.csv’


2021-07-22 11:36:51 (211 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]



In [4]:
# Import the NLTK library used for text preprocessing

import nltk
from nltk.corpus import stopwords

# Fetch the 'stopwords' corpus, then keep the English stopword list as a set
# ("inggris" is Indonesian for "English").
nltk.download('stopwords')
inggris = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Read the CSV and collect, per row, the first column in `kelas` (the news
# category/topic) and the stopword-free second column in `teks` (the article).
#
# Stopwords are filtered token by token after splitting on whitespace, so a
# stopword is dropped wherever it occurs: at the start or end of the text, or
# right next to another stopword. Matching is exact and case-sensitive, so a
# token with attached punctuation (e.g. "the,") is kept. Runs of whitespace
# are collapsed to single spaces by the split/join round trip.

kelas = []
teks = []

# newline='' is the mode the csv module asks for, and the encoding is pinned
# so the result does not depend on the machine's locale.
with open("/tmp/bbc-text.csv", 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)  # discard the first row (assumed to be the header)
    for row in reader:
        kelas.append(row[0])
        kept_words = [word for word in row[1].split() if word not in inggris]
        teks.append(' '.join(kept_words))

In [6]:
# Split the dataset into a training portion and a test portion (test_size=0.2).
# A fixed random_state makes the shuffle, and therefore the split, repeatable
# across kernel restarts.

from sklearn.model_selection import train_test_split
teks_training, teks_uji, kelas_training, kelas_uji = train_test_split(
    teks, kelas, test_size=0.2, random_state=42
)

In [7]:
# memuatkan semua dependensi dan library tensorflow dan keras yang dibutuhkan

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional

In [8]:
# Tokenize the training texts.
# The vocabulary is fitted on the training split only; '<OOV>' is the
# placeholder token configured for words outside that vocabulary.

tokenizer = Tokenizer(
    oov_token='<OOV>',
    num_words=5000,
)
tokenizer.fit_on_texts(teks_training)
word_index = tokenizer.word_index

In [9]:
# Turn each text into a sequence of integer word indices
train_sequences = tokenizer.texts_to_sequences(teks_training)
validation_sequences = tokenizer.texts_to_sequences(teks_uji)

# Pad / truncate every sequence to the same length; both splits share the
# exact same settings, so they are kept in one place.
_pad_opts = dict(maxlen=200, padding='post', truncating='post')
train_padded = pad_sequences(train_sequences, **_pad_opts)
validation_padded = pad_sequences(validation_sequences, **_pad_opts)

In [10]:
# Tokenize the class labels (fitted on the labels of the whole dataset)
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(kelas)

# Convert the labels of each split into NumPy arrays of integer indices
training_label_seq, validation_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(label_split))
    for label_split in (kelas_training, kelas_uji)
)

In [11]:
# Assemble the sequential model, one layer per line
model = tf.keras.Sequential(
    [
        # 5000 matches the num_words passed to the text Tokenizer
        tf.keras.layers.Embedding(5000, 64),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        # NOTE(review): 6 output units, presumably the label indices plus the
        # unused index 0 (Keras Tokenizer indices start at 1) - confirm
        # against the number of categories in the dataset.
        tf.keras.layers.Dense(6, activation='softmax'),
    ]
)


In [12]:
# Compile the model with the chosen optimizer settings.
# NOTE(review): the `decay` keyword is not accepted by the optimizers in newer
# Keras releases - confirm against the installed TensorFlow version.
optimasi = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    decay=1e-6,
)
model.compile(
    optimizer=optimasi,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Train the model; the held-out split is evaluated after every epoch and
# verbose=2 logs one summary line per epoch.
la_histoire = model.fit(
    train_padded,
    training_label_seq,
    epochs=12,
    validation_data=(validation_padded, validation_label_seq),
    verbose=2,
)

Epoch 1/12
56/56 - 15s - loss: 1.5887 - accuracy: 0.2764 - val_loss: 1.3939 - val_accuracy: 0.3955
Epoch 2/12
56/56 - 11s - loss: 1.2418 - accuracy: 0.4792 - val_loss: 1.3301 - val_accuracy: 0.6135
Epoch 3/12
56/56 - 11s - loss: 0.8163 - accuracy: 0.7421 - val_loss: 0.5926 - val_accuracy: 0.8270
Epoch 4/12
56/56 - 11s - loss: 0.3008 - accuracy: 0.9079 - val_loss: 0.2580 - val_accuracy: 0.9281
Epoch 5/12
56/56 - 11s - loss: 0.2349 - accuracy: 0.9343 - val_loss: 0.3481 - val_accuracy: 0.9056
Epoch 6/12
56/56 - 11s - loss: 0.5563 - accuracy: 0.8511 - val_loss: 0.4208 - val_accuracy: 0.8944
Epoch 7/12
56/56 - 11s - loss: 0.2099 - accuracy: 0.9635 - val_loss: 0.3061 - val_accuracy: 0.9326
Epoch 8/12
56/56 - 11s - loss: 0.0949 - accuracy: 0.9882 - val_loss: 0.2069 - val_accuracy: 0.9371
Epoch 9/12
56/56 - 11s - loss: 0.0462 - accuracy: 0.9927 - val_loss: 0.1575 - val_accuracy: 0.9596
Epoch 10/12
56/56 - 11s - loss: 0.0236 - accuracy: 0.9989 - val_loss: 0.1440 - val_accuracy: 0.9596
Epoch 11/