GoogleDriveをマウントして、自分のディレクトリとして扱う。

In [14]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


ディレクトリの中身を確認

In [15]:
# List the NLP folder on the mounted Drive (the path is relative to the current working directory).
!ls gdrive/My\ Drive/NLP

corpus


Chainerをインストールしましょう。

In [16]:
# Download the Chainer-for-Colab install script and pipe it straight into sh.
# NOTE(review): this executes an unpinned remote script; the captured output shows it ends up
# running `pip install -q cupy-cuda100 chainer` — consider pinning versions for reproducibility.
!curl https://colab.chainer.org/install | sh -

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  1580  100  1580    0     0  10748      0 --:--:-- --:--:-- --:--:-- 10748
+ apt -y -q install cuda-libraries-dev-10-0
Reading package lists...
Building dependency tree...
Reading state information...
cuda-libraries-dev-10-0 is already the newest version (10.0.130-1).
0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.
+ pip install -q cupy-cuda100  chainer 
+ set +ex
Installation succeeded!


ライブラリのインポート

In [0]:
import numpy as np
import codecs
import sys
import chainer
from chainer import training, datasets, iterators, optimizers
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions

RNNモデル定義

In [0]:
class Generate_RNN(chainer.Chain):
	"""Next-token model: EmbedID -> LSTM -> LSTM -> Linear.

	The two LSTM links keep their recurrent state between calls, so the model
	is fed one time step at a time and ``reset_state`` must be called before
	starting a new sequence.
	"""

	def __init__(self, words_size, nodes, embed_size=None):
		"""Build the links.

		Args:
			words_size: Vocabulary size (number of distinct token IDs).
			nodes: Number of units in each LSTM layer.
			embed_size: Width of the embedding vectors. ``None`` (the default)
				keeps the original behaviour of using ``words_size``, which
				allocates a ``words_size x words_size`` embedding table; pass
				a smaller value to reduce memory for large vocabularies.
		"""
		super(Generate_RNN, self).__init__()
		if embed_size is None:
			embed_size = words_size
		with self.init_scope():
			# Embed -> token ID to dense vector
			# LSTM  -> Long Short-Term Memory
			self.embed = L.EmbedID(words_size, embed_size)
			self.l1 = L.LSTM(embed_size, nodes)
			self.l2 = L.LSTM(nodes, nodes)
			self.l3 = L.Linear(nodes, words_size)

	def reset_state(self):
		"""Reset the recurrent state of both LSTM layers."""
		self.l1.reset_state()
		self.l2.reset_state()

	def __call__(self, x):
		"""Advance the network by one time step.

		Args:
			x: Array of token IDs, one per sequence in the batch.

		Returns:
			Output of the final Linear layer (one score per vocabulary entry
			for each batch element); no softmax is applied here.
		"""
		h0 = self.embed(x)
		h1 = self.l1(h0)
		h2 = self.l2(h1)
		y = self.l3(h2)
		return y

In [0]:
class RNNUpdater(training.StandardUpdater):
	"""Updater that feeds one mini-batch of equal-length token sequences through a stateful RNN.

	Each call to ``update_core`` resets the model state, steps through the
	sequences one position at a time, sums the per-step cross-entropy losses
	and performs a single optimizer update.
	"""

	def __init__(self, train_iter, optimizer, device, cp):
		"""Store the array module and forward the rest to ``StandardUpdater``.

		Args:
			train_iter: Iterator yielding batches (sequences of token-ID lists
				that all have the same length).
			optimizer: Optimizer whose ``target`` is the model to train.
			device: Device ID forwarded to ``StandardUpdater``.
			cp: Array module (``numpy`` or ``cupy``) used to build the input
				and label arrays.
		"""
		super(RNNUpdater, self).__init__(
			train_iter,
			optimizer,
			device=device,
		)
		self.cp = cp

	def update_core(self):
		"""Run one training iteration on the next mini-batch."""
		# Fetch the iterator, the optimizer and the model it optimizes.
		train_iter = self.get_iterator("main")
		optimizer = self.get_optimizer("main")
		model = optimizer.target

		# Next mini-batch of (already padded) sentences.
		x = next(train_iter)
		# Start every mini-batch from a clean recurrent state.
		model.reset_state()

		loss = 0
		steps = 0
		for i in range(len(x[0]) - 1):
			current = [s[i] for s in x]
			# Once every sequence in the batch is at the terminator/padding ID (1)
			# there is nothing left to learn from. The check is done on the host
			# lists so no array reduction is needed on each step.
			if all(token == 1 for token in current):
				break

			# Inputs for this time step and the tokens that should follow them.
			batch = self.cp.array(current, dtype=self.cp.int32)
			t = self.cp.array([s[i + 1] for s in x], dtype=self.cp.int32)

			# One RNN step, then accumulate its loss.
			y = model(batch)
			loss += F.softmax_cross_entropy(y, t)
			steps += 1

		if steps == 0:
			# No time step was executed (sequences of length <= 1 or an all-padding
			# batch): ``loss`` is still the int 0 and has no ``backward``.
			return

		# Back-propagate the summed loss and apply a single update.
		model.cleargrads()
		loss.backward()
		optimizer.update()


GPUの設定

In [0]:
batch_size = 10  # number of sentences per mini-batch
uses_device = 0  # device ID; a negative value skips the cupy import below, so cp stays numpy

# Array module used to build batches: numpy by default, cupy when a device ID >= 0 is set.
cp = np
if uses_device >= 0:
	import cupy as cp
	import chainer.cuda

読み込み、保存ファイル名の定義

In [0]:
# Paths on the mounted Drive, relative to the current working directory.
train_data = './gdrive/My Drive/NLP/corpus/all-sentence.txt'  # training corpus read in a later cell
conb_data = './gdrive/My Drive/NLP/corpus/all-words.txt'  # NOTE(review): not referenced in the visible cells — confirm it is used elsewhere
model_file = './gdrive/My Drive/NLP/normal.hdf5'  # where the trained model is saved

学習データの読み込み

In [0]:
# Open the training corpus as UTF-8 text; the next cell reads it and closes it.
s = codecs.open(train_data, mode="r", encoding="utf-8")
# One list of token IDs per sentence gets collected here.
sentence = []


In [0]:
# Turn each comma-separated line of token IDs into a list of ints,
# with a leading 0 and a trailing 1 (the end-of-sentence ID) added.
try:
	for line in s:
		fields = line.strip()
		if not fields:
			# A blank line (e.g. a trailing newline at the end of the file) has no
			# tokens, and int() on it would raise ValueError, so skip it.
			continue
		sentence.append([0] + [int(token) for token in fields.split(",")] + [1])
finally:
	# Release the file handle even when a malformed line raises.
	s.close()

In [0]:
# Vocabulary size: the largest token ID found in any sentence, plus one.
word_size = max(max(ids) for ids in sentence) + 1
# Length of the longest sentence.
l_max = max(map(len, sentence))

バッチ処理

In [0]:
# Right-pad every sentence in place with ID 1 so that all of them have length l_max
# (needed for batching).
for one_sentence in sentence:
	one_sentence.extend([1] * (l_max - len(one_sentence)))

モデルの生成(GPUに適用)

In [0]:
# Build the network with 200 units per LSTM layer.
model = Generate_RNN(word_size, 200)
if uses_device >= 0:
	# Fail early with a clear error if CUDA/cupy is unusable, before touching any device.
	chainer.cuda.check_cuda_available()
	# get_device() is deprecated; get_device_from_id() is its replacement for integer IDs.
	chainer.cuda.get_device_from_id(uses_device).use()
	# Move the parameters to the device selected above.
	model.to_gpu()

誤差逆伝播の設定

In [0]:
# Optimize the model parameters with Adam (default hyper-parameters).
optimizer = optimizers.Adam()
optimizer.setup(model)

# Serve the padded sentences in their stored order, batch_size at a time.
train_iter = iterators.SerialIterator(
	dataset=sentence,
	batch_size=batch_size,
	shuffle=False,
)

設定を適用

In [0]:
# Create the updater for the chosen device (CPU or GPU) and a trainer that stops after 30 epochs.
updater = RNNUpdater(
	train_iter,
	optimizer,
	device=uses_device,
	cp=cp,
)
trainer = training.Trainer(updater, stop_trigger=(30, "epoch"))

学習（しばらくお待ちください）

In [33]:
# Show a progress bar, registering it only once: re-running this cell on the same trainer
# would otherwise add a second ProgressBar and print every frame twice.
try:
	trainer.get_extension("ProgressBar")
except ValueError:
	# Not registered yet on this trainer.
	trainer.extend(extensions.ProgressBar(update_interval=1))

# Start training.
trainer.run()

[J     total [..................................................]  1.41%
this epoch [#####################.............................] 42.39%
        16 iter, 0 epoch / 30 epochs
   0.26223 iters/sec. Estimated time to finish: 1:10:40.101283.
[4A[J     total [..................................................]  1.41%
this epoch [#####################.............................] 42.39%
        16 iter, 0 epoch / 30 epochs
       inf iters/sec. Estimated time to finish: 0:00:00.
[4A[J     total [..................................................]  1.50%
this epoch [######################............................] 44.89%
        17 iter, 0 epoch / 30 epochs
    0.2759 iters/sec. Estimated time to finish: 1:07:22.318024.
[4A[J     total [..................................................]  1.50%
this epoch [######################............................] 44.89%
        17 iter, 0 epoch / 30 epochs
    1.2674 iters/sec. Estimated time to finish: 0:15:34.951197.
[4A[J    

学習済みモデルの保存

In [0]:
# Save the trained model to model_file in HDF5 format.
chainer.serializers.save_hdf5(model_file, model)