# 第8章: ニューラルネット

## 70.単語ベクトルの和による特徴量

In [1]:
import gensim
import numpy as np
import pandas as pd
import re

In [2]:
#configs
TRAIN_DATA_PATH ='./ch6_folder/save_data/train.csv'
VALID_DATA_PATH = './ch6_folder/save_data/valid.csv'
TEST_DATA_PATH = './ch6_folder/save_data/test.csv'
GOOGLE_MODEL_PATH = './ch7_folder/model/GoogleNews-vectors-negative300.bin'

In [3]:
model = gensim.models.KeyedVectors.load_word2vec_format(GOOGLE_MODEL_PATH, binary=True)

In [4]:
model_vocab_list = list(model.wv.vocab.keys())

  """Entry point for launching an IPython kernel.


記事の見出しの単語列 $(w_{i,1}, \ldots, w_{i,T_i})$ に対して、embeddingをおこない、以下のような特徴ベクトルを生成すると書いてあります。

$$
    {x}_i = \frac{1}{T_i}\sum_{1 \leq t \leq T_i} {emb}(w_{i,t}) 
$$

これは、各単語数に関する平均を特徴量にするということですね。

In [5]:
CATEGORY_TAG = {'b': 0, 't': 1, 'e': 2, 'm': 3}

In [6]:
df_train = pd.read_csv(TRAIN_DATA_PATH)
df_valid = pd.read_csv(VALID_DATA_PATH)
df_test = pd.read_csv(TEST_DATA_PATH)

In [7]:
train_titles_list = [title.split() for title in df_train['TITLE'].values.tolist()]
valid_titles_list = [title.split() for title in df_valid['TITLE'].values.tolist()]
test_titles_list = [title.split() for title in df_test['TITLE'].values.tolist()]

In [8]:
train_length = len(train_titles_list)
valid_length = len(valid_titles_list)
test_length = len(test_titles_list)
print(train_length, valid_length, test_length)

10672 1334 1334


In [9]:
def tokenize(word: str) -> str:
    """
    """
    word = word.replace('-', '')
    word = word.replace('.', '')
    word = word.replace('\'', '')
    word = word.replace(':', '')
    return word

In [10]:
def get_embedding_matrix(titles_list):
    feature_vecs_list = []
    
    for title in titles_list:
        tmp_vecs_list =[]
        for word in title:
            word = tokenize(word)
            try:
                vec = model[word]
                tmp_vecs_list.append(vec)
            except KeyError as e:
                pass
        
        if len(tmp_vecs_list) == 1:
            feature_vec = tmp_vecs_list[0]
        else:
            np_vecs = np.array(tmp_vecs_list)
            feature_vec = np.mean(np_vecs, axis=0)
        feature_vecs_list.append(feature_vec)
    return np.array(feature_vecs_list)

In [11]:
X_train = get_embedding_matrix(train_titles_list)
X_valid = get_embedding_matrix(valid_titles_list)
X_test = get_embedding_matrix(test_titles_list)

In [12]:
X_train.shape

(10672, 300)

In [13]:
X_valid.shape

(1334, 300)

In [14]:
X_test.shape

(1334, 300)

In [15]:
y_train= pd.read_csv(TRAIN_DATA_PATH)['CATEGORY'].map(CATEGORY_TAG).values
y_valid = pd.read_csv(VALID_DATA_PATH)['CATEGORY'].map(CATEGORY_TAG).values
y_test = pd.read_csv(TEST_DATA_PATH)['CATEGORY'].map(CATEGORY_TAG).values

In [16]:
Y_train = np.eye(4)[y_train]
Y_valid = np.eye(4)[y_valid]
Y_test = np.eye(4)[y_test]

In [17]:
y_train[1]

0

In [18]:
y_train.shape

(10672,)

In [19]:
Y_train.shape

(10672, 4)

In [20]:
y_valid.shape

(1334,)

In [21]:
Y_valid.shape

(1334, 4)

In [22]:
y_test.shape

(1334,)

In [23]:
Y_test.shape

(1334, 4)

## 71. 単層NNによる予測

個人的にこういう時にKerasを使うのが好きなのでKerasを使っていきます。
重みの初期化のところ、毎回同じ値を取りたいのでrandom seedみたいなことをしたいのですが、どうすればいいですかね。

In [24]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
import tensorflow as tf

In [25]:
def predict_nn():
    inputs = Input(shape=(300,))
    predicts = Dense(4, activation='softmax') (inputs)
    model = Model(inputs=inputs, outputs=predicts)
    return model

In [26]:
x_1 = X_train[:1]
X_1_4 = X_train[:4]

In [27]:
model = predict_nn()

In [28]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
dense (Dense)                (None, 4)                 1204      
Total params: 1,204
Trainable params: 1,204
Non-trainable params: 0
_________________________________________________________________


In [29]:
y_1 = model.predict(x_1)
Y_1_4 = model.predict(X_1_4)

In [30]:
y_1

array([[0.23747101, 0.26101652, 0.25859946, 0.24291295]], dtype=float32)

In [31]:
Y_1_4

array([[0.23747101, 0.26101652, 0.25859946, 0.24291295],
       [0.2698821 , 0.22152211, 0.22567463, 0.28292125],
       [0.23746988, 0.26058245, 0.23853041, 0.26341733],
       [0.26514286, 0.24195269, 0.25889996, 0.23400448]], dtype=float32)

## 72. 損失と勾配の計算

文面からは、実際自分でクロスエントロピー損失関数や、勾配を`numpy`や`math`を用いて作成して、作成した関数を用いて求めてほしいのか、71とかで使ったモジュールを用いてやって欲しいのかイマイチわからないのですが、自作はせずにkerasを使ってやります。

In [32]:
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.backend import gradients

In [33]:
y_train_1 = Y_train[:1]
Y_train_1_4 = Y_train[:4]

In [34]:
CategoricalCrossentropy?

[0;31mInit signature:[0m
[0mCategoricalCrossentropy[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfrom_logits[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlabel_smoothing[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreduction[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m=[0m[0;34m'categorical_crossentropy'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Computes the crossentropy loss between the labels and predictions.

Use this crossentropy loss function when there are two or more label classes.
We expect labels to be provided in a `one_hot` representation. If you want to
provide labels as integers, please use `SparseCategoricalCrossentropy` loss.
There should be `# classes` floating point values per feature.

In the snippet below, there is `# classes` floating pointing values per
example. The shape of both `y_pred` and `y_true

In [35]:
cce = CategoricalCrossentropy()

$\hat{y}_1$のクロスエントロピー損失は、

In [36]:
cce(y_train_1,y_1).numpy()

1.4377097

$\hat{Y}$のクロスエントロピー損失は、

In [37]:
cce(Y_train_1_4,Y_1_4).numpy()

1.3522456

損失を求められたので、次は勾配を求めます。実は、`keras`の関数である`gradient`を用いると`RuntimeError`がでます。
そのため、今回は`tf.GradientTape()`を用いて求めます。
ちなみに、先ほど定義したモデルから、weightの行列を取得し、そのweight行列と`numpy`の`gradient`を用いて計算して、gradientを求めようと思ったのですが、`numpy.gradient`の仕様がよくわからなくてやめました。誰か書ける人がいたら教えてください。

In [38]:
def calculate_grad(x,y):
    
    def _loss_fn(y_predict, y):
        return cce(y_predict, y)

    inputs = tf.convert_to_tensor(x)
    targets = tf.convert_to_tensor(y)
    with tf.GradientTape() as tape:
        y_predict = model(inputs)
        loss = _loss_fn(y_predict, targets)
    grads = tape.gradient(loss, model.variables)
    
    return grads[0].numpy()

In [39]:
calculate_grad(x_1, y_train_1)

array([[ 0.04970098, -0.01701283, -0.01685529, -0.01583286],
       [-0.01669423,  0.0057145 ,  0.00566158,  0.00531815],
       [ 0.01343682, -0.00459948, -0.00455688, -0.00428047],
       ...,
       [-0.1963863 ,  0.06722378,  0.06660128,  0.06256128],
       [-0.15187682,  0.05198802,  0.0515066 ,  0.04838224],
       [ 0.29698431, -0.10165887, -0.10071749, -0.09460802]],
      dtype=float32)

In [40]:
calculate_grad(x_1, y_train_1).shape

(300, 4)

In [41]:
calculate_grad(X_1_4, Y_train_1_4)

array([[-0.11603267,  0.04916372,  0.04986542,  0.01700349],
       [-0.11608543,  0.05008772,  0.05015508,  0.01584258],
       [ 0.02115218, -0.00044278, -0.00016743, -0.02054194],
       ...,
       [-0.08123177,  0.00707258,  0.00925068,  0.06490856],
       [-0.16651726,  0.06467234,  0.0660722 ,  0.03577271],
       [ 0.09841748, -0.00613205, -0.0088886 , -0.0833969 ]],
      dtype=float32)

In [42]:
calculate_grad(X_1_4, Y_train_1_4).shape

(300, 4)

## 73. 確率的勾配降下法による学習

>確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，行列Wを学習せよ.

とのことなので、さっきのモデルを用いて、行いたいと思います。

In [43]:
from tensorflow.keras.optimizers import SGD

In [None]:
opt = SGD(learning_rate=0.01)
model.compile(optimizer=opt, loss='categorical_crossentropy',metrics=['accuracy'])
history = model.fit(X_train, Y_train, epochs=100, validation_data=(X_valid, Y_valid))

Train on 10672 samples, validate on 1334 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
 2272/10672 [=====>........................] - ETA: 0s - loss: 0.5302 - accuracy: 0.8217

## 74.正解率の計測

In [None]:
score = model.evaluate(X_test, Y_test, verbose=0)

In [None]:
print('Test loss:{0}'.format(score[0]))
print('Test accuracy:{0}'.format(score[1]))