In [1]:
import keras
import numpy as np
import MeCab
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

2025-01-18 11:33:54.319747: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-18 11:33:54.328853: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737167634.340264   74620 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737167634.343517   74620 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-18 11:33:54.357072: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [11]:
url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
batch_size = 32
validation_split = 0.2
seed=42
max_sequence_length = 100

## livedoorニュースコーパスのダウンロードと解凍

In [4]:
extracted_dir = keras.utils.get_file("ldcc-20140209", origin=url, extract=True)

## データセットの作成

In [5]:
data_dir = f"{extracted_dir}/text"

train_ds = keras.utils.text_dataset_from_directory(
    data_dir,
    batch_size=batch_size,
    validation_split=validation_split,
    subset="training",
    seed=seed,
)
val_ds = keras.utils.text_dataset_from_directory(
    data_dir,
    batch_size=batch_size,
    validation_split=validation_split,
    subset="validation",
    seed=seed,
)

Found 7376 files belonging to 9 classes.
Using 5901 files for training.


W0000 00:00:1737167965.419039   74620 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Found 7376 files belonging to 9 classes.
Using 1475 files for validation.


## トークン解析

In [6]:
class_names = train_ds.class_names
mecab = MeCab.Tagger()

def mecab_tokenize(text):
    node = mecab.parseToNode(text)
    tokens = []
    while node:
        if node.feature.split(',')[0] != 'BOS/EOS':
            tokens.append(node.surface)
        node = node.next
    return tokens

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

texts = [text.numpy().decode("utf-8") for text, label in train_ds.unbatch()]
trainer = WordLevelTrainer(vocab_size=20000, special_tokens=["[PAD]", "[UNK]"])
tokenizer.train_from_iterator(texts, trainer)

2025-01-18 11:40:28.845077: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## データの前処理

In [7]:
def preprocess_dataset(dataset, tokenizer, max_len):
    texts = [text.numpy().decode("utf-8") for text, label in dataset.unbatch()]
    labels = [label.numpy() for text, label in dataset.unbatch()]
    tokenized = [mecab_tokenize(text) for text in texts]
    tokenized_ids = [tokenizer.encode(" ".join(tokens)).ids for tokens in tokenized]
    padded = keras.utils.pad_sequences(tokenized_ids, maxlen=max_len, padding='post', truncating='post')
    return np.array(padded), np.array(labels)

X_train, y_train = preprocess_dataset(train_ds, tokenizer, max_sequence_length)
X_val, y_val = preprocess_dataset(val_ds, tokenizer, max_sequence_length)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")

2025-01-18 11:41:21.589402: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-01-18 11:41:36.832598: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


X_train shape: (5901, 100), y_train shape: (5901,)
X_val shape: (1475, 100), y_val shape: (1475,)


## モデルのトレーニング

In [8]:
num_classes = len(class_names)
model = keras.Sequential([
    keras.layers.Embedding(input_dim=20000, output_dim=128),
    keras.layers.LSTM(128, return_sequences=False),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(num_classes, activation='softmax'),
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=batch_size, validation_data=(X_val, y_val))


Epoch 1/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 50ms/step - accuracy: 0.1090 - loss: 2.1926 - val_accuracy: 0.1220 - val_loss: 2.1936
Epoch 2/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.1515 - loss: 2.1741 - val_accuracy: 0.1322 - val_loss: 2.1999
Epoch 3/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.2176 - loss: 2.1145 - val_accuracy: 0.1234 - val_loss: 2.2356
Epoch 4/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.3020 - loss: 1.9649 - val_accuracy: 0.1349 - val_loss: 2.3406
Epoch 5/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.3814 - loss: 1.7463 - val_accuracy: 0.1288 - val_loss: 2.4940
Epoch 6/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.4741 - loss: 1.5349 - val_accuracy: 0.1349 - val_loss: 2.6540
Epoch 7/10
[1m185/18

<keras.src.callbacks.history.History at 0x7f16d0983320>

## モデルの評価

In [10]:
text = "人工知能が進化して、社会や仕事に与える影響は計り知れません。"

tokenized = mecab.parse(text).split()
tokenized_ids = tokenizer.encode(" ".join(tokenized)).ids
padded = keras.utils.pad_sequences([tokenized_ids], maxlen=max_sequence_length, padding="post", truncating="post")
processed_text = np.array(padded)

prediction = model.predict(processed_text)
predicted_class = np.argmax(prediction, axis=-1)[0]

predicted_category = class_names[predicted_class]
print(f"予測されたカテゴリ: {predicted_category}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
予測されたカテゴリ: it-life-hack
