In [2]:
import numpy as np
from sklearn import preprocessing

raw_csv_data = np.loadtxt('Audiobooks-data.csv',delimiter = ',')
unscaled_inputs_all = raw_csv_data[:,1:-1]
targets_all = raw_csv_data[:,-1]

In [3]:
#バランシング(targetが0と1の量が揃うように調整、具体的には0の数が1の数を超えたらそれ以降の0のデータを全て消す。)
num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []

for i in range(targets_all.shape[0]):
    if targets_all[i]==0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)
            
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all,indices_to_remove,axis = 0)
targets_equal_priors = np.delete(targets_all,indices_to_remove,axis = 0)

In [4]:
#標準化(データの影響の大きさを同じにする)
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

In [5]:
#シャッフル(バッチ処理をするため、処理ごとに特定のデータが集中してしまうことを避ける。)
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

In [6]:
#訓練用、検証用、テスト用に分割_count
samples_count = shuffled_inputs.shape[0]
train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count -train_samples_count - validation_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]
#大体0と1が半々になっていると良い。
print(np.sum(train_targets),train_samples_count,np.sum(train_targets)/train_samples_count)
print(np.sum(validation_targets),train_samples_count,np.sum(validation_targets)/validation_samples_count)
print(np.sum(test_targets),test_samples_count,np.sum(test_targets)/test_samples_count)

1798.0 3579 0.502374965074043
226.0 3579 0.5055928411633109
213.0 448 0.47544642857142855


In [7]:
#データ保存
np.savez('Audiobooks_data_train',inputs = train_inputs,targets = train_targets)
np.savez('Audiobooks_data_validation',inputs = validation_inputs,targets = validation_targets)
np.savez('Audiobooks_data_test',inputs = test_inputs,targets = test_targets)

In [9]:
import tensorflow as tf

input_size = 10
output_size = 2
# 隠れ層のユニットの数を定義します
hidden_layer_size = 50
    
# モデルの定義を進めていきます
model = tf.keras.Sequential([    
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1番目の隠れ層
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 2番目の隠れ層
    tf.keras.layers.Dense(output_size, activation='softmax') # 出力層
])
#損失が上がったり下がったりしているのでアーリーストッピングが必要
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics = ['accuracy'])
batch_size = 100
max_epochs = 100
model.fit(train_inputs,
          train_targets,
          batch_size = batch_size,
          epochs =max_epochs,
          validation_data = (validation_inputs,validation_targets),
          verbose = 2)

Epoch 1/100
36/36 - 1s - loss: 0.5901 - accuracy: 0.6795 - val_loss: 0.5070 - val_accuracy: 0.7584
Epoch 2/100
36/36 - 0s - loss: 0.4736 - accuracy: 0.7583 - val_loss: 0.4335 - val_accuracy: 0.7919
Epoch 3/100
36/36 - 0s - loss: 0.4206 - accuracy: 0.7874 - val_loss: 0.3999 - val_accuracy: 0.8054
Epoch 4/100
36/36 - 0s - loss: 0.3942 - accuracy: 0.7932 - val_loss: 0.3826 - val_accuracy: 0.7919
Epoch 5/100
36/36 - 0s - loss: 0.3776 - accuracy: 0.7972 - val_loss: 0.3701 - val_accuracy: 0.8098
Epoch 6/100
36/36 - 0s - loss: 0.3663 - accuracy: 0.8100 - val_loss: 0.3632 - val_accuracy: 0.8121
Epoch 7/100
36/36 - 0s - loss: 0.3589 - accuracy: 0.8122 - val_loss: 0.3592 - val_accuracy: 0.7919
Epoch 8/100
36/36 - 0s - loss: 0.3532 - accuracy: 0.8125 - val_loss: 0.3470 - val_accuracy: 0.8166
Epoch 9/100
36/36 - 0s - loss: 0.3484 - accuracy: 0.8134 - val_loss: 0.3471 - val_accuracy: 0.8210
Epoch 10/100
36/36 - 0s - loss: 0.3453 - accuracy: 0.8097 - val_loss: 0.3435 - val_accuracy: 0.8031
Epoch 11/

36/36 - 0s - loss: 0.3065 - accuracy: 0.8298 - val_loss: 0.3160 - val_accuracy: 0.8188
Epoch 84/100
36/36 - 0s - loss: 0.3065 - accuracy: 0.8326 - val_loss: 0.3274 - val_accuracy: 0.8076
Epoch 85/100
36/36 - 0s - loss: 0.3069 - accuracy: 0.8343 - val_loss: 0.3109 - val_accuracy: 0.8255
Epoch 86/100
36/36 - 0s - loss: 0.3069 - accuracy: 0.8329 - val_loss: 0.3187 - val_accuracy: 0.7987
Epoch 87/100
36/36 - 0s - loss: 0.3061 - accuracy: 0.8312 - val_loss: 0.3258 - val_accuracy: 0.8054
Epoch 88/100
36/36 - 0s - loss: 0.3047 - accuracy: 0.8351 - val_loss: 0.3202 - val_accuracy: 0.8143
Epoch 89/100
36/36 - 0s - loss: 0.3072 - accuracy: 0.8332 - val_loss: 0.3201 - val_accuracy: 0.8143
Epoch 90/100
36/36 - 0s - loss: 0.3056 - accuracy: 0.8329 - val_loss: 0.3211 - val_accuracy: 0.8121
Epoch 91/100
36/36 - 0s - loss: 0.3052 - accuracy: 0.8340 - val_loss: 0.3210 - val_accuracy: 0.8143
Epoch 92/100
36/36 - 0s - loss: 0.3084 - accuracy: 0.8273 - val_loss: 0.3177 - val_accuracy: 0.8166
Epoch 93/100


<tensorflow.python.keras.callbacks.History at 0x7fcdc7020040>