In [1]:
import numpy as np
import pandas
import matplotlib.pyplot as plt

from keras import layers, optimizers, models
from keras import backend as K
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error
from IPython.display import SVG

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# 2. Adult dataset 的數值預測 (預測 hours-per-week)

## 1.1 資料集

In [4]:
# Read both Adult splits. The files have no header row, so pandas numbers the
# columns 0..14.
train_data = pandas.read_csv("data/adult.data", header=None, encoding="UTF-8")
# skiprows=1 drops the first physical line of adult.test before parsing.
test_data = pandas.read_csv(
    "data/adult.test",
    header=None,
    skiprows=1,
    encoding="UTF-8",
)

In [5]:
train_data.head(10)  # preview the first rows of the training split instead of dumping the whole frame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [6]:
test_data.head(10)  # preview the first rows of the test split instead of dumping the whole frame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K.
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K.
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K.
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K.


## 1.2 前置處理

In [8]:
# The class labels in the test split (column 14) end with a '.', unlike the
# training split. regex=False makes '.' a literal dot on every pandas version;
# interpreted as a regular expression it would match any character.
test_data[14] = test_data[14].str.replace('.', '', regex=False)

# Missing values are stored as ' ?' (with a leading space); mark them as NaN.
train_data = train_data.replace(' ?', np.nan)
test_data = test_data.replace(' ?', np.nan)

# Drop every row that has at least one missing value. The original notes
# report 30162 training rows and 15060 test rows remaining afterwards.
train_data = train_data.dropna()
test_data = test_data.dropna()

In [9]:
# Column labels of train_data as a plain Python list.
cols = list(train_data.columns)
# Take label 12 (the hours-per-week target) out of the list and re-insert it at
# position 14, i.e. move it to the end. This is a move, not a swap.
cols.insert(14, cols.pop(cols.index(12)))

In [10]:
# Same reordering, computed from test_data; this overwrites the list built from
# train_data in the previous cell.
cols = list(test_data.columns)
# Move label 12 (the hours-per-week target) to the last position.
cols.insert(14, cols.pop(cols.index(12)))
cols

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 12]

In [11]:
# Apply the column order held in `cols` (target column last) to the training frame.
train_data = train_data[cols]
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,13,14,12
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,United-States,<=50K,40
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,United-States,<=50K,13
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,United-States,<=50K,40
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,United-States,<=50K,40
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,Cuba,<=50K,40
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,United-States,<=50K,40
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,Jamaica,<=50K,16
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,United-States,>50K,45
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,United-States,>50K,50
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,United-States,>50K,40


In [None]:
# Apply the column order held in `cols` (target column last) to the test frame.
test_data = test_data[cols]
test_data

In [None]:
# Relabel the columns 0..14 by position, so the column that was moved to the
# end (the target) is now labelled 14.
train_data.columns = list(range(15))
train_data

In [None]:
# Relabel the columns 0..14 by position, so the column that was moved to the
# end (the target) is now labelled 14.
test_data.columns = list(range(15))
test_data

In [None]:
# Label encoding: replace string categories with integer codes.
def transform_label(full_data, transform_needed=None):
    """Convert a mixed numeric/categorical frame into a float32 matrix.

    Columns flagged in ``transform_needed`` are label-encoded with a fresh
    ``LabelEncoder`` fitted on that column of ``full_data``; all other columns
    are copied as they are.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Frame whose columns are addressed by position.
    transform_needed : sequence of bool, optional
        One flag per column; True means "label-encode this column". Defaults
        to the 15-entry mask used for the reordered Adult table in this
        notebook.

    Returns
    -------
    numpy.ndarray
        float32 array with the same shape as ``full_data``.

    Raises
    ------
    ValueError
        If the mask length differs from the number of columns. Previously a
        wider frame silently kept zeros in its trailing columns.
    """
    if transform_needed is None:
        transform_needed = [False, True, False, True, False,
                            True, True, True, True, True,
                            False, False, True, True, False]

    n_columns = full_data.shape[1]
    if len(transform_needed) != n_columns:
        raise ValueError(
            "transform_needed has %d entries but full_data has %d columns"
            % (len(transform_needed), n_columns)
        )

    result = np.zeros(shape=full_data.shape, dtype=np.float32)

    for i, needs_encoding in enumerate(transform_needed):
        column_values = full_data.iloc[:, i].tolist()
        if needs_encoding:
            # The encoder is fitted on this frame only, so the codes are
            # specific to the values present in `full_data`.
            result[:, i] = LabelEncoder().fit_transform(column_values)
        else:
            result[:, i] = column_values

    return result

In [None]:
# Label-encode train and test in one pass. transform_label fits its encoders on
# whatever frame it receives, so encoding the two splits separately gives the
# same category different integer codes whenever the splits do not contain
# exactly the same set of values in a column.
encoded_all = transform_label(pandas.concat([train_data, test_data], ignore_index=True))
trainingdata = encoded_all[:len(train_data)]
testingdata = encoded_all[len(train_data):]

# Min-max normalisation: the scaler is fitted on the training rows only and the
# same scaling is then applied to both splits (all 15 columns, target included).
scaler = MinMaxScaler().fit(trainingdata)

trainingdata = scaler.transform(trainingdata)
testingdata = scaler.transform(testingdata)


In [None]:
trainingdata.shape[0]  # number of training rows

In [None]:
testingdata.shape[0]  # number of test rows

In [None]:
# Print each scaled value of the first training row on its own line.
first_row = trainingdata[0]
for val in first_row:
    print(val)

## 2.1 資料集

## 2.2 前置處理

In [None]:
# Features are columns 0..12; the regression target is column 14 (the scaled
# hours-per-week column that was moved to the end).
# NOTE(review): column 13 is used neither as a feature nor as the target -
# confirm whether excluding it is intentional or whether the slice should be :14.
x_train, y_train = trainingdata[:, :13], trainingdata[:, 14]
x_test, y_test = testingdata[:, :13], testingdata[:, 14]

## 2.3 實驗設計

In [None]:
# Experiment design: a fully connected regression network,
# 64 -> 32 -> 32 -> dropout(0.05) -> 16 -> 1, with ReLU on the hidden layers
# and a linear single-unit output.
model = models.Sequential([
    layers.Dense(64, input_shape=(x_train.shape[1],), activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.05),
    layers.Dense(16, activation="relu"),
    layers.Dense(1),
])

def root_mean_squared_error(y_true, y_pred):
    """Keras metric: root of the mean squared error over the whole batch.

    Parameters
    ----------
    y_true, y_pred : backend tensors
        Targets and predictions for one batch.

    Returns
    -------
    Scalar backend tensor ``sqrt(mean((y_pred - y_true) ** 2))``.

    The mean is taken over every element of the batch. Reducing over
    ``axis=-1`` only, as before, takes the square root per sample; with the
    single-unit output layer used in this notebook that axis has length 1, so
    the result was the absolute error of each sample rather than an RMSE.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# MSE is the training loss; the custom RMSE function is tracked as a metric.
model.compile(loss='mse', optimizer='adam', metrics=[root_mean_squared_error])
# 33% of the training rows are held out for validation. A commented-out
# alternative of this call used epochs=100; this run uses 10.
history = model.fit(x_train, y_train, validation_split=0.33, epochs=10, batch_size=128)

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# File name for a saved model diagram; not used by any call in this cell.
model_plot_image_name = 'predict_hours_per_week_model_plot.png'
# Render the architecture inline as SVG (last expression, so it is displayed).
SVG(model_to_dot(model).create(prog='dot', format='svg'))

## 2.4 實驗結果

In [None]:
# evaluate() returns the loss followed by the compiled metrics. The model was
# compiled with loss='mse' and the root_mean_squared_error metric, so the second
# value is that metric; no MAE metric is compiled, hence the corrected label.
test_mse_score, test_rmse_score = model.evaluate(x_test, y_test)
print('test_mse_score :' + str(test_mse_score))
print('test_rmse_score :' + str(test_rmse_score))


# Training curves per epoch: the MSE loss and the root_mean_squared_error metric.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['root_mean_squared_error'], label='root_mean_squared_error')
plt.xlabel('epoch')
plt.legend()
plt.show()


In [None]:
len(x_test[1])  # number of feature values in one test row

In [None]:
# Column 14 of the unscaled test frame as a NumPy array.
a = test_data[14].to_numpy()
a.shape

In [None]:
a = a.reshape(-1, 1)  # make it a single-column 2-D array

In [None]:
a.shape  # shape after the reshape to one column

In [None]:
yhat = model.predict(x_test)
# Undo the min-max scaling of the target with the statistics of the scaler that
# produced y_train (fitted on the training matrix; the target is column 14).
# Fitting a new scaler on the test target, as before, inverts with different
# statistics and also overwrites the training-fitted `scaler`.
ryhat = yhat * scaler.data_range_[14] + scaler.data_min_[14]
ryhat

In [None]:
# Compare the un-scaled predictions with column 14 of the unscaled test frame.
actual_result = test_data[14].to_numpy().reshape(-1, 1)
predict_result = ryhat

# The MSE is computed once; the RMSE is its square root.
temp_mse = mean_squared_error(actual_result, predict_result)
temp_rmse = np.sqrt(temp_mse)
print('TEMP RMSE: %.3f' % temp_rmse)
print('TEMP MSE: %.3f' % temp_mse)