# データの前処理

## import

In [230]:
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import tensorflow.keras.layers as layers
# tensorflowのversion
print("tensorflow version is {}".format(tf.__version__))

tensorflow version is 2.3.0


## データセットの生成

In [231]:
from data_generator import DataGenerator
inputfilename = "./input/questionnaire_latest.csv"
dg = DataGenerator(inputfilename)
dg.generate()
# データセットへのアクセス
#dg.df_static_info_binary   # 静的情報（バイナリベクトル）
#dg.df_static_info_weight   # 静的情報（重み付きベクトル）
#dg.df_dynamic_info         # 動的情報
#dg.df_human_info           # 性格情報

In [232]:
def split_label(df):
    culm = len(df.iloc[0])-1
    return df.iloc[:,:culm], df.iloc[:,culm]

def dataset(dg, test_size=0.1):
    # trainデータとtestデータに分ける
    train, test = train_test_split(dg, test_size=test_size)
    train = train.reset_index(drop=True)
    test  = test.reset_index(drop=True)
    
    # 特徴量と正解ラベルを分ける
    x_train, y_train = split_label(train)
    x_test, y_test = split_label(test)
    
    print(x_train.shape)
    # 標準化する
    x_train = scale(x_train, axis=0)
    x_test = scale(x_test, axis=0)
    
    # np.arrayに変換する
    y_train = np.array(y_train.tolist())
    y_test  = np.array(y_test.tolist())
    
    return x_train, y_train, x_test, y_test

In [233]:
x_train_b, y_train_b, x_test_b, y_test_b = dataset(dg.df_static_info_binary)

(39, 8)


In [234]:
x_train_w, y_train_w, x_test_w, y_test_w = dataset(dg.df_static_info_weight)

(39, 8)


In [292]:
dg.df_dynamic_info

Unnamed: 0,現在，身体的に疲れを感じていますか？（%の値は疲労の度合いです）,現在，精神的に疲れを感じていますか？（%の値は疲労の度合いです）,明後日に予定(会社や大学などの外せない予定)はありますか？（%の値は予定の無さ具合です）,あなたは明日遊びに行くとするならば，どの遊びを選択しますか？次のジャンルのうち，明後日の予定や現在の身体的・精神的状態を考慮した後，当てはまるものを1つ選択してください．
0,3,3,3,1
1,5,5,4,7
2,4,4,4,6
3,2,3,4,6
4,4,4,2,6
5,3,3,2,6
6,2,1,0,1
7,5,5,0,7
8,4,4,1,5
9,4,4,4,5


# 主成分分析

In [42]:
pca = PCA(n_components=6)
pca.fit(x_train)
x_train_pca = pca.transform(x_train)
sum(pca.explained_variance_ratio_)

0.8069409700061265

In [43]:
pca.fit(x_test)
x_test_pca = pca.transform(x_test)
sum(pca.explained_variance_ratio_)

0.9337145221758625

# MLP

## モデルの実装

In [282]:
def create_model():
    inputs = layers.Input((8,))
    x = layers.Dense(8, activation="sigmoid")(inputs)
    #x = layers.Dense(8, activation="sigmoid")(x)
    return tf.keras.models.Model(inputs=inputs, outputs=x)

def main():    
    model = create_model()
    loss = tf.keras.losses.MeanSquaredError()
    acc = tf.keras.losses.MeanSquaredError()
    optim = tf.keras.optimizers.Adam()

    # train
    model.compile(optimizer=optim, loss=loss, metrics=[acc])
    model.fit(x_train_w, y_train_w, validation_data=(x_test_w, y_test_w), epochs=300, batch_size=4)

    # eval
    val_loss, val_acc = model.evaluate(x_test_w, y_test_w, batch_size=4)
    print(val_loss, val_acc)
    
    return model 

if __name__ == "__main__":
    model = main()

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

## 結果の確認

In [289]:
raw = 31

In [290]:
print(model.predict(x_train_w[raw:raw+1,:]))

[[0.08175126
  0.22736073
  0.25331008
  0.87236786
  0.19683701
  0.14224696
  0.37981164
  0.04632583]]


In [291]:
print(y_train_w[raw,:])

[0.  0.  0.  1.
 0.4 0.  0.7 0. ]


# 実装(不採択)

## ランダムフォレスト

In [7]:
clf = RandomForestClassifier(max_depth=5, random_state=0)

In [8]:
# original
clf.fit(x_train,y_train)

# pca
#clf.fit(x_train_pca,y_train)

RandomForestClassifier(max_depth=5, random_state=0)

In [9]:
print(clf.feature_importances_)

[0.03545779 0.03583479 0.07081688 0.07663516 0.07443875 0.07018595
 0.1078693  0.0668443  0.0823463  0.05600769 0.08187283 0.07853895
 0.08160876 0.08154253]


In [10]:
# original
clf.score(x_test,y_test)

# pca
#clf.score(x_test_pca,y_test)

0.13333333333333333