In [2]:
import numpy as np
from sklearn import preprocessing

raw_csv_data = np.loadtxt('Audiobooks-data.csv',delimiter = ',')
unscaled_inputs_all = raw_csv_data[:,1:-1]
targets_all = raw_csv_data[:,-1]

In [3]:
#バランシング(targetが0と1の量が揃うように調整、具体的には0の数が1の数を超えたらそれ以降の0のデータを全て消す。)
num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []

for i in range(targets_all.shape[0]):
    if targets_all[i]==0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)
            
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all,indices_to_remove,axis = 0)
targets_equal_priors = np.delete(targets_all,indices_to_remove,axis = 0)

In [4]:
#標準化(データの影響の大きさを同じにする)
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

In [5]:
#シャッフル(バッチ処理をするため、処理ごとに特定のデータが集中してしまうことを避ける。)
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

In [6]:
#訓練用、検証用、テスト用に分割_count
samples_count = shuffled_inputs.shape[0]
train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count -train_samples_count - validation_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]
#大体0と1が半々になっていると良い。
print(np.sum(train_targets),train_samples_count,np.sum(train_targets)/train_samples_count)
print(np.sum(validation_targets),train_samples_count,np.sum(validation_targets)/validation_samples_count)
print(np.sum(test_targets),test_samples_count,np.sum(test_targets)/test_samples_count)

1809.0 3579 0.5054484492875104
211.0 3579 0.4720357941834452
217.0 448 0.484375


In [7]:
#データ保存
np.savez('Audiobooks_data_train',inputs = train_inputs,targets = train_targets)
np.savez('Audiobooks_data_validation',inputs = validation_inputs,targets = validation_targets)
np.savez('Audiobooks_data_test',inputs = test_inputs,targets = test_targets)

In [8]:
import tensorflow as tf

input_size = 10
output_size = 2
# 隠れ層のユニットの数を定義します
hidden_layer_size = 50
    
# モデルの定義を進めていきます
model = tf.keras.Sequential([    
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1番目の隠れ層
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 2番目の隠れ層
    tf.keras.layers.Dense(output_size, activation='softmax') # 出力層
])
#損失が上がったり下がったりしているのでアーリーストッピングが必要
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics = ['accuracy'])
batch_size = 100
max_epochs = 100
#損失関数が1回増えてしまっても繰り返す。
early_stopping  = tf.keras.callbacks.EarlyStopping(patience =2)
model.fit(train_inputs,
          train_targets,
          batch_size = batch_size,
          epochs =max_epochs,
          validation_data = (validation_inputs,validation_targets),
          callbacks = [early_stopping],
          verbose = 2)

Epoch 1/100
36/36 - 1s - loss: 0.5960 - accuracy: 0.6683 - val_loss: 0.4951 - val_accuracy: 0.7651
Epoch 2/100
36/36 - 0s - loss: 0.4659 - accuracy: 0.7614 - val_loss: 0.4240 - val_accuracy: 0.8076
Epoch 3/100
36/36 - 0s - loss: 0.4175 - accuracy: 0.7793 - val_loss: 0.3914 - val_accuracy: 0.7919
Epoch 4/100
36/36 - 0s - loss: 0.3910 - accuracy: 0.7857 - val_loss: 0.3786 - val_accuracy: 0.7830
Epoch 5/100
36/36 - 0s - loss: 0.3757 - accuracy: 0.7969 - val_loss: 0.3640 - val_accuracy: 0.8233
Epoch 6/100
36/36 - 0s - loss: 0.3630 - accuracy: 0.8044 - val_loss: 0.3509 - val_accuracy: 0.8098
Epoch 7/100
36/36 - 0s - loss: 0.3564 - accuracy: 0.8094 - val_loss: 0.3457 - val_accuracy: 0.8098
Epoch 8/100
36/36 - 0s - loss: 0.3484 - accuracy: 0.8153 - val_loss: 0.3380 - val_accuracy: 0.8300
Epoch 9/100
36/36 - 0s - loss: 0.3448 - accuracy: 0.8128 - val_loss: 0.3371 - val_accuracy: 0.8121
Epoch 10/100
36/36 - 0s - loss: 0.3437 - accuracy: 0.8108 - val_loss: 0.3364 - val_accuracy: 0.8389
Epoch 11/

<tensorflow.python.keras.callbacks.History at 0x7fea6fac7d60>

In [10]:
#モデルのテスト
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [11]:
#test_accuracy < val_accuracyなので若干過学習気味)
test_loss, test_accuracy

(0.3548872172832489, 0.8035714030265808)