In [1]:
from datasets import load_dataset

data_files = {
    "train": "indosum/train.04.jsonl",
    "validation": "indosum/dev.04.jsonl",
    "test": "indosum/test.04.jsonl",   
}

# Muat dataset
dataset = load_dataset("json", data_files=data_files)
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['category', 'gold_labels', 'id', 'paragraphs', 'source', 'source_url', 'summary'],
        num_rows: 14272
    })
    validation: Dataset({
        features: ['category', 'gold_labels', 'id', 'paragraphs', 'source', 'source_url', 'summary'],
        num_rows: 750
    })
    test: Dataset({
        features: ['category', 'gold_labels', 'id', 'paragraphs', 'source', 'source_url', 'summary'],
        num_rows: 3752
    })
})


In [2]:
train_data = dataset['train']
print(train_data[0])

validation_data = dataset['validation']
print(validation_data[0])

test_data = dataset['test']
print(test_data[0])

{'category': 'tajuk utama', 'gold_labels': [[True], [False], [False], [True], [False], [True, True], [False, False, False], [False, False, False], [False], [False, False]], 'id': '1512964800-dirut-bri-ingin-holding-bumn-jasa-keuangan-segera-', 'paragraphs': [[['Setelah', 'selesai', 'membentuk', 'holding', 'BUMN', 'pertambangan', ',', 'kini', 'pemerintah', 'tengah', 'menggarap', 'holding', '-', 'holding', 'lainnya', ',', 'salah', 'satunya', 'holding', 'BUMN', 'jasa', 'keuangan', '.']], [['PT', 'Danareksa', '(', 'Persero', ')', 'akan', 'menjadi', 'induk', 'holding', 'membawahi', '4', 'Bank', 'BUMN', ',', 'yaitu', 'BNI', ',', 'BRI', ',', 'BTN', ',', 'dan', 'Bank', 'Mandiri', '.']], [['Selain', 'itu', ',', 'perusahaan', 'pelat', 'merah', 'lain', 'yang', 'bergerak', 'di', 'sektor', 'jasa', 'keuangan', 'seperti', 'PT', 'Jalin', 'Pembayaran', 'Nusantara', '(', 'JPN', ')', ',', 'PT', 'Permodalan', 'Nasional', 'Madani', '(', 'Persero', ')', '(', 'PMN', ')', ',', 'dan', 'PT', 'Pegadaian', '(', '

In [3]:
def join_data(dataset):
    documents = []
    summaries = []
    for item in dataset:
        document = [" ".join(sent) for para in item["paragraphs"] for sent in para]
        summary = " ".join([" ".join(sent) for sent in item["summary"]])
        
        documents.append(document)
        summaries.append(summary)
    
    return documents, summaries

# Memproses data
train_docs, train_summaries = join_data(train_data)
val_docs, val_summaries = join_data(validation_data)
test_docs, test_summaries = join_data(test_data)

In [4]:
print(train_docs[0])
print(train_summaries[0])
len(train_docs)

['Setelah selesai membentuk holding BUMN pertambangan , kini pemerintah tengah menggarap holding - holding lainnya , salah satunya holding BUMN jasa keuangan .', 'PT Danareksa ( Persero ) akan menjadi induk holding membawahi 4 Bank BUMN , yaitu BNI , BRI , BTN , dan Bank Mandiri .', 'Selain itu , perusahaan pelat merah lain yang bergerak di sektor jasa keuangan seperti PT Jalin Pembayaran Nusantara ( JPN ) , PT Permodalan Nasional Madani ( Persero ) ( PMN ) , dan PT Pegadaian ( Persero ) juga akan bergabung di dalam holding BUMN jasa keuangan .', 'Direktur Utama PT Bank Rakyat Indonesia Tbk ( BBRI ) , Suprajarto , berharap pembentukan holding BUMN jasa keuangan segera terealisasi .', '" Kalau saya berharap lebih cepat lebih baik karena manfaatnya banyak , " kata Suprajarto di sela-sela acara BRI Run 2017 Bogor Series , Kebun Raya Bogor , Minggu ( 10 / 12 ) .', 'Orang nomor satu BRI ini mengatakan , jika holding jasa keuangan cepat terbentuk , pengadaan ATM bisa jauh lebih efisien secar

14272

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

input_tokenizer = Tokenizer()
output_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(train_docs)
output_tokenizer.fit_on_texts(train_summaries)

input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = len(output_tokenizer.word_index) + 1

# Mengubah teks menjadi sequence
X = input_tokenizer.texts_to_sequences(train_docs)
y = output_tokenizer.texts_to_sequences(train_summaries)

X_val = input_tokenizer.texts_to_sequences(val_docs)
y_val = output_tokenizer.texts_to_sequences(val_summaries)

X_test = input_tokenizer.texts_to_sequences(test_docs)
y_test = output_tokenizer.texts_to_sequences(test_summaries)

# max_length = max(max(len(seq) for seq in X), max(len(seq) for seq in y))
max_input_length = max(len(seq) for seq in X)
max_output_length = max(len(seq) for seq in y)

max_length = max(max_input_length, max_output_length)

X_train = pad_sequences(X, maxlen=max_input_length, padding='post')
y_train = pad_sequences(y, maxlen=max_output_length, padding='post')

X_val = pad_sequences(X_val, maxlen=max_input_length, padding='post')
y_val = pad_sequences(y_val, maxlen=max_output_length, padding='post')

X_test = pad_sequences(X_test, maxlen=max_input_length, padding='post')
y_test = pad_sequences(y_test, maxlen=max_output_length, padding='post')

print(X_train.shape)  # (14272, 86)
print(y_train.shape)  # (14272, 86)


2024-12-02 14:46:31.408156: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733125591.422181   19871 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733125591.426235   19871 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-02 14:46:31.442901: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(14272, 80)
(14272, 86)


In [8]:
from tensorflow.keras import layers, Model
import numpy as np

class PositionalEncoding(layers.Layer):
    def __init__(self, sequence_length, d_model):
        super(PositionalEncoding, self).__init__()
        self.sequence_length = sequence_length
        self.d_model = d_model
    
    def build(self, input_shape):
        # Create a matrix of shape (sequence_length, d_model)
        position = np.arange(self.sequence_length)[:, np.newaxis]
        div_term = np.exp(np.arange(0, self.d_model, 2) * -(np.log(10000.0) / self.d_model))
        pe = np.zeros((self.sequence_length, self.d_model))
        pe[:, 0::2] = np.sin(position * div_term)
        pe[:, 1::2] = np.cos(position * div_term)
        pe = pe[np.newaxis, ...]

        self.pe = tf.constant(pe, dtype=tf.float32)
    
    def call(self, inputs):
        # Add positional encoding to the input embeddings
        return inputs + self.pe[:, :tf.shape(inputs)[1], :]


def transformer_model(input_vocab_size, output_vocab_size, input_max_len, output_max_len, d_model=512, num_heads=8, num_layers=6):
    inputs = layers.Input(shape=(input_max_len,))
    targets = layers.Input(shape=(output_max_len,))

    # Encoder
    enc_emb = layers.Embedding(input_vocab_size, d_model)(inputs)
    enc_pos = PositionalEncoding(input_max_len, d_model)(enc_emb)
    enc_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(enc_pos, enc_pos)
    enc_output = layers.LayerNormalization()(enc_output)

    # Decoder
    dec_emb = layers.Embedding(output_vocab_size, d_model)(targets)
    dec_pos = PositionalEncoding(output_max_len, d_model)(dec_emb)
    dec_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(dec_pos, enc_output)
    dec_output = layers.LayerNormalization()(dec_output)
    
    # Final output layer
    output = layers.Dense(output_vocab_size, activation='softmax')(dec_output)

    model = tf.keras.Model(inputs=[inputs, targets], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# model = transformer_model(input_vocab_size, output_vocab_size, max_length, max_length)
model = transformer_model(input_vocab_size, output_vocab_size, max_input_length, max_output_length-1)
model.summary()


In [11]:
# Prepare the training data for the decoder (Shift the target sequences)
y_train_decoder_input = y_train[:, :-1]  # Remove the last token
y_train_decoder_output = y_train[:, 1:]  # Shift the sequence by 1 for teacher forcing

y_val_decoder_input = y_val[:, :-1]
y_val_decoder_output = y_val[:, 1:]

# y_train_decoder_input = pad_sequences(y_train_decoder_input, maxlen=max_length, padding='post')
# y_val_decoder_input = pad_sequences(y_val_decoder_input, maxlen=max_length, padding='post')

print(y_train.shape)  # Pastikan ukuran sesuai
print(y_train_decoder_input.shape)  # Pastikan ukuran sesuai
print(y_train_decoder_output.shape)  # Pastikan ukuran sesuai

print(y_val.shape)  # Pastikan ukuran sesuai
print(y_val_decoder_input.shape)  # Pastikan ukuran sesuai
print(y_val_decoder_output.shape)  # Pastikan ukuran sesuai

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # Nonaktifkan GPU


history = model.fit(
    [X_train, y_train_decoder_input], y_train_decoder_output,
    epochs=10, batch_size=64, validation_data=([X_val, y_val_decoder_input], y_val_decoder_output)
)

(14272, 86)
(14272, 85)
(14272, 85)
(750, 86)
(750, 85)
(750, 85)
Epoch 1/10


E0000 00:00:1733126017.728065   19989 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
E0000 00:00:1733126017.756538   19989 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
2024-12-02 14:53:37.763109: W tensorflow/core/framework/op_kernel.cc:1841] OP_REQUIRES failed at xla_ops.cc:577 : FAILED_PRECONDITION: DNN library initialization failed. Look at the er

FailedPreconditionError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/runpy.py", line 197, in _run_module_as_main

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/runpy.py", line 87, in _run_code

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/asyncio/base_events.py", line 601, in run_forever

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/asyncio/events.py", line 80, in _run

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3048, in run_cell

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3103, in _run_cell

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3308, in run_cell_async

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3490, in run_ast_nodes

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code

  File "/tmp/ipykernel_19871/568849640.py", line 20, in <module>

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/home/rakhargo/miniconda3/envs/cudaTF/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

DNN library initialization failed. Look at the errors above for more details.
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_one_step_on_iterator_5410]