In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, regularizers
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

In [None]:
# Load the digit-recognizer training CSV into a DataFrame and preview the first rows.
df = pd.read_csv('/kaggle/input/mnist-digit-recognizer/train.csv')
df.head()

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, memory usage).
df.info()

In [None]:
# Split the frame by column position: every column after the first becomes the
# feature matrix, the first column becomes the target vector.
# NOTE(review): presumably column 0 is the digit label and the rest are pixel values — confirm against the CSV.
x_data = df.iloc[:, 1:].to_numpy()
y_data = df.iloc[:, 0].to_numpy()

In [None]:
# Sanity-check the extracted arrays: container types first, then shapes.
for arr in (x_data, y_data):
    print(type(arr))
for arr in (x_data, y_data):
    print(arr.shape)

In [None]:
# Cast to float32 and divide by 255.
# NOTE(review): assumes the values are 8-bit intensities so the result lies in [0, 1] — confirm.
nor_x_data = x_data.astype('float32') / 255.

In [None]:
# Hold out 20% of the samples as a validation / inspection set.
# A fixed random_state keeps the split (and every result derived from it)
# reproducible across "Restart & Run All"; 42 matches the seed used for the
# random forests later in this notebook.
x_train, x_test, y_train, y_test = train_test_split(nor_x_data, y_data,
                                                    test_size=0.2,
                                                    random_state=42)
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

In [None]:
# Render the first training row as a 28x28 image and print its target value.
plt.imshow(x_train[0].reshape(28, 28))
print (y_train[0])

# [AutoEncoder][1] 解析

### 目的：擁有如同PCA一樣的「降維」功能，從input擷取重要特徵，代表全體。當輸入新的測試資料，和所擷取的代表特徵進行比對，判斷是否有異常。

### AutoEncoder（AE）為一【非監督式學習】的神經網路，中間的 Internal Representation（或稱 Bottleneck）可解釋為對輸入的資料做壓縮（降維）或是加入雜訊。

### AutoEncoder（AE）稱為自動編碼器，可以幫助資料分類、視覺化、儲存。
### 架構中可細分為 Encoder（編碼器）和 Decoder（解碼器）兩部分，分別做【壓縮】與【解壓縮】的動作，讓輸出值（output）和輸入值（input）表示相同意義。
### 編碼器會建立一個隱藏層（或多個隱藏層）對所輸入的資料進行「降維」。
### 解碼器會將通過隱藏層的低維向量進行重建（Reconstruct），由低維向量還原出輸入值（input）。

[1]: https://medium.com/%E7%A8%8B%E5%BC%8F%E5%B7%A5%E4%BD%9C%E7%B4%A1/autoencoder-%E4%B8%80-%E8%AA%8D%E8%AD%98%E8%88%87%E7%90%86%E8%A7%A3-725854ab25e8
![autoencoder architecture](https://miro.medium.com/proxy/1*77dA0nWbQoGGxz7bSf5Qow.png)
![autoencoder architecture](https://miro.medium.com/max/1000/0*oTxwcZWFgM7ReT5j.png)
![autoencoder architecture](https://miro.medium.com/max/700/0*Y1AAl7zNstpSOD3S.jpg)

# 單層 AutoEncoder

In [None]:
n_features = x_train.shape[1]   # width of one flattened input row
code_dim = 64                   # size of the bottleneck representation

# Encoder: one dense layer compressing the input down to `code_dim` units.
input_layer = Input(shape=(n_features,))
encoded = Dense(code_dim, activation='relu')(input_layer)

# Decoder: one dense layer mapping the code back to the input width,
# i.e. the reconstruction of the input.
decoded = Dense(n_features, activation='sigmoid')(encoded)

# Full autoencoder: input -> reconstruction.
auto_encoder = Model(input_layer, decoded)

# Stand-alone encoder: input -> code.
encoder = Model(input_layer, encoded)

# Stand-alone decoder: applies the autoencoder's last layer (shared weights)
# to a fresh input of size `code_dim`.
encoded_input = Input(shape=(code_dim,))
decoder_layer = auto_encoder.layers[-1](encoded_input)
decoder = Model(encoded_input, decoder_layer)

In [None]:
# Print the layer-by-layer summary of the full autoencoder.
auto_encoder.summary()

In [None]:
# Print the layer-by-layer summary of the encoder sub-model.
encoder.summary()

In [None]:
# Print the layer-by-layer summary of the decoder sub-model.
decoder.summary()

In [None]:
# Early-stopping callback: monitors the validation loss, counts changes smaller
# than 1e-4 as "no improvement", allows 5 such epochs before stopping, and is
# configured to restore the best weights seen.
callbacks = EarlyStopping(monitor='val_loss',
                          min_delta=0.0001,
                          patience=5,
                          restore_best_weights=True)

In [None]:
# Train the autoencoder to reproduce its own input (target == input).
auto_encoder.compile(optimizer='adam', loss='binary_crossentropy')

# Pass the EarlyStopping callback created in the previous cell. Without it the
# callback has no effect and training always runs all 100 epochs.
history = auto_encoder.fit(x_train, x_train,
                           epochs=100, batch_size=256,
                           validation_data=(x_test, x_test),
                           shuffle=True,
                           callbacks=[callbacks])

In [None]:
# Plot training and validation loss per epoch on the same axes.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
plt.legend()
plt.show()

In [None]:
# Encode the held-out rows to their bottleneck codes, then feed the codes to
# the stand-alone decoder. This rebinds `encoded` / `decoded` (previously the
# symbolic layer outputs) to the prediction results.
encoded = encoder.predict(x_test)
decoded = decoder.predict(encoded)

In [None]:
# Grid of 10 samples, two samples per row; each sample takes three panels:
# original image | bottleneck code | decoder reconstruction.
fig = plt.figure(figsize=(16, 8))

n_plots = 10
n_rows = int(n_plots / 2)

for j in range(n_plots):
    panels = (
        x_test[j].reshape(28, 28),    # original data
        encoded[j].reshape(1, 64),    # low-dimensional code, shown as a 1x64 strip
        decoded[j].reshape(28, 28),   # decoder reconstruction
    )
    for offset, panel in enumerate(panels, start=1):
        fig.add_subplot(n_rows, 6, 3*j + offset)
        plot_tmp = plt.imshow(panel)
        plt.xticks([])
        plt.yticks([])

plt.show()

In [None]:
n_features = x_train.shape[1]   # width of one flattened input row
code_dim = 32                   # size of the bottleneck representation

input_layer = Input(shape=(n_features,))

# Encoder with an L1 activity regularizer on the code.
# `10e-5` is scientific notation for 10 x 10^-5 (= 1e-4). In general "e+N"
# means "times 10 to the power N" and "e-N" means "times 10 to the power -N":
#   1.e+5   = 100,000
#   2.e-6   = 0.000002
#   3.14e-4 = 0.000314
sparsity_penalty = regularizers.l1(10e-5)
encoded = Dense(code_dim,
                activation='relu',
                activity_regularizer=sparsity_penalty)(input_layer)

# Decoder: maps the code back to the input width.
decoded = Dense(n_features, activation='sigmoid')(encoded)

# Full autoencoder: input -> reconstruction.
auto_encoder = Model(input_layer, decoded)

# Stand-alone encoder: input -> code.
encoder = Model(input_layer, encoded)

# Stand-alone decoder: applies the autoencoder's last layer (shared weights)
# to a fresh input of size `code_dim`.
encoded_input = Input(shape=(code_dim,))
decoded_layer = auto_encoder.layers[-1](encoded_input)
decoder = Model(encoded_input, decoded_layer)

In [None]:
# Train the regularised autoencoder to reproduce its own input.
auto_encoder.compile(optimizer='adam', loss='binary_crossentropy')

# Reuse the EarlyStopping callback created earlier in the notebook
# (`callbacks`); it was defined but never handed to `fit`, so training always
# ran the full 100 epochs.
history = auto_encoder.fit(x_train, x_train,
                           epochs=100, batch_size=256,
                           validation_data=(x_test, x_test),
                           shuffle=True,
                           callbacks=[callbacks])

In [None]:
# Plot training and validation loss per epoch on the same axes.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
plt.legend()
plt.show()

In [None]:
# Encode the held-out rows to their bottleneck codes, then feed the codes to
# the stand-alone decoder. This rebinds `encoded` / `decoded` to the
# prediction results.
encoded = encoder.predict(x_test)
decoded = decoder.predict(encoded)

In [None]:
# Grid of 10 samples, two samples per row; each sample takes three panels:
# original image | bottleneck code | decoder reconstruction.
fig = plt.figure(figsize=(16, 8))

n_plots = 10
n_rows = int(n_plots / 2)

for j in range(n_plots):
    panels = (
        x_test[j].reshape(28, 28),    # original data
        encoded[j].reshape(1, 32),    # low-dimensional code, shown as a 1x32 strip
        decoded[j].reshape(28, 28),   # decoder reconstruction
    )
    for offset, panel in enumerate(panels, start=1):
        fig.add_subplot(n_rows, 6, 3*j + offset)
        plot_tmp = plt.imshow(panel)
        plt.xticks([])
        plt.yticks([])

plt.show()

# Deep AutoEncoder

In [None]:
n_features = x_train.shape[1]   # width of one flattened input row

input_layer = Input(shape=(n_features,))

# Encoder stack: 128 -> 64 -> 32 ReLU units; the last layer is the
# lower-dimensional representation.
encoded = input_layer
for width in (128, 64, 32):
    encoded = Dense(units=width, activation='relu')(encoded)

# Decoder stack mirrors the encoder: 64 -> 128 ReLU units, then a sigmoid
# output layer with the same width as the input.
decoded = encoded
for width in (64, 128):
    decoded = Dense(units=width, activation='relu')(decoded)
decoded = Dense(units=n_features, activation='sigmoid')(decoded)

# Full autoencoder: input -> reconstruction.
auto_encoder = Model(input_layer, decoded)

# Stand-alone encoder: input -> 32-unit code.
encoder = Model(input_layer, encoded)

# Stand-alone decoder: chain the autoencoder's last three layers (shared
# weights) on a fresh 32-unit input.
encoded_input = Input(shape=(32, ))
decoder_output = encoded_input
for shared_layer in auto_encoder.layers[-3:]:
    decoder_output = shared_layer(decoder_output)
decoder = Model(encoded_input, decoder_output)

In [None]:
# Train the deep autoencoder to reproduce its own input.
auto_encoder.compile(optimizer='adam',
                     loss='binary_crossentropy')

# Stop once the validation loss has not improved by at least 1e-4 for
# 5 consecutive epochs; configured to restore the best weights seen.
callbacks = EarlyStopping(monitor='val_loss',
                          min_delta=0.0001,
                          patience=5,
                          restore_best_weights=True)

# The callback must be passed to `fit`, otherwise it is never invoked and
# training always runs the full 100 epochs.
history = auto_encoder.fit(x_train, x_train,
                           epochs=100, batch_size=256,
                           validation_data=(x_test, x_test),
                           shuffle=True,
                           callbacks=[callbacks])

In [None]:
# Plot training and validation loss per epoch on the same axes.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
plt.legend()
plt.show()

In [None]:
# Encode the held-out rows, then decode the codes again; both results are
# wrapped in np.array and rebind `encoded` / `decoded`.
encoded = np.array(encoder.predict(x_test))
decoded = np.array(decoder.predict(encoded))

In [None]:
# Report the shapes of the codes and of the reconstructions.
for arr in (encoded, decoded):
    print(arr.shape)

In [None]:
# Grid of 10 samples, two samples per row; each sample takes three panels:
# original image | bottleneck code | decoder output.
fig = plt.figure(figsize=(16, 8))

n_plots = 10
n_rows = int(n_plots / 2)

for j in range(n_plots):
    panels = (
        x_test[j].reshape(28, 28),    # original data
        encoded[j].reshape(1, 32),    # low-dimensional code, shown as a 1x32 strip
        decoded[j].reshape(28, 28),   # decoder output
    )
    for offset, panel in enumerate(panels, start=1):
        fig.add_subplot(n_rows, 6, 3*j + offset)
        plot_tmp = plt.imshow(panel)
        plt.xticks([])
        plt.yticks([])

plt.show()

## 不管是「Encoder」還是「Decoder」，
## 他們的權重是可以調整的，
## 所以建立AutoEncoder（Encoder+Decoder）的結構後，
## 搭配Input當作Output的目標答案（x 與 y 相同），
## 在Training的過程中，AutoEncoder會試著找出最好的權重，使得資訊可以盡量完整還原回去。

![1](https://miro.medium.com/max/1000/0*l5pTmpzrTagcw3GC)

# [Convolutional AutoEncoder][1]
[1]:https://www.kaggle.com/br19920702/convolutional-autoencoder/

# Bank Marketing

## 預測客戶是否會簽訂 短/中/長期的 存款合約
## 影響客戶簽訂合約的重要因素（特徵工程）

In [None]:
# Load the bank-marketing CSV (semicolon-separated) and print a summary of its columns.
b_df = pd.read_csv('/kaggle/input/bank-marketing-analysis/bank-additional-full.csv', sep=';')
b_df.info()

In [None]:
# Show 5 randomly chosen rows (no seed is set, so the rows differ between runs).
b_df.sample(5)

In [None]:
# Binary target: compare the `y` column with 'yes' and turn the boolean
# result into integers.
#   False -> 0 : the client will not sign the contract
#   True  -> 1 : the client will sign the contract
y = b_df['y'].eq('yes').astype('int64')
print(type(y))
print(y.iloc[:20])

In [None]:
# Remove `duration` and the raw target column `y` from the feature frame
# (the target was already extracted into `y` in the previous cell).
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell re-runnable: a second execution no longer raises KeyError for
# columns that are already gone.
b_df = b_df.drop(columns=['duration', 'y'], errors='ignore')
b_df.columns

In [None]:
# One-hot encode every categorical column of the feature frame.
X = pd.get_dummies(b_df)
print (type(X))

# Request float64 explicitly: with pandas >= 2.0 the dummy columns are bool,
# and a frame mixing bool with numeric columns would otherwise be exported as
# an object-dtype array.
x_data = X.to_numpy(dtype='float64')
arr_cols = X.columns   # column labels, used later to select feature groups / label plots

print (x_data.shape)
print (arr_cols.shape)

In [None]:
# Fit a MinMaxScaler (default settings) on the feature matrix and apply it.
mms = MinMaxScaler()
x_data_sc = mms.fit_transform(x_data)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.inspection import permutation_importance

In [None]:
# Baseline: a shallow random forest evaluated with 10-fold cross-validation
# on the scaled original features.
scoring_metrics = ['balanced_accuracy',
                   'f1_weighted',
                   'roc_auc',
                   'average_precision']

rf = RandomForestClassifier(n_estimators=500, max_depth=2,
                            n_jobs=8, random_state=42)

dict_performance = cross_validate(rf, x_data_sc, y,
                                  scoring=scoring_metrics,
                                  cv=10, n_jobs=4,
                                  return_train_score=True)

# Average every entry returned by cross_validate over the 10 folds and store
# the result as the 'Original' column.
fold_means = [np.mean(values) for values in dict_performance.values()]
df_performance = pd.DataFrame({'Original': fold_means},
                              index=dict_performance.keys())

df_performance

In [None]:
# Fit the same shallow forest on the scaled original features and rank the
# features by mean permutation importance (computed on those same inputs).
rf = RandomForestClassifier(n_estimators=500, max_depth=2,
                            n_jobs=8, random_state=42)
rf.fit(x_data_sc, y)

perm_result = permutation_importance(rf, x_data_sc, y,
                                     n_repeats=10, n_jobs=8,
                                     random_state=42)
fi = perm_result.importances_mean

# argsort is ascending; reversing it gives indices from most to least
# important, of which the first 10 are kept.
top_idx = fi.argsort()[::-1][:10]
bar_positions = range(10, 0, -1)   # most important feature at the top

fig = plt.figure(figsize=(16,8))
plt.barh(y=bar_positions,
         width=fi[top_idx],
         alpha=0.9)                # alpha: opacity in [0, 1], larger = more opaque
plt.yticks(bar_positions, arr_cols[top_idx])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Original features importance')
plt.show()

In [None]:
# Symmetric dense autoencoder over the scaled bank-marketing features:
#   n -> 3n/4 -> n/2 (bottleneck) -> 3n/4 -> n
# Dense layers are linear; ReLU is applied as separate Activation layers.
n_features = x_data_sc.shape[1]

# `shape` must be a tuple. The original `Input(shape=n,)` had the trailing
# comma outside the parentheses and therefore passed a bare int, which recent
# Keras versions reject.
input_layer = Input(shape=(n_features,))
x = Activation('relu')(input_layer)
x = Dense(units=int(n_features*3/4))(x)
x = Activation('relu')(x)
bottleneck = Dense(units=int(n_features/2))(x)

x = Activation('relu')(bottleneck)
x = Dense(units=int(n_features*3/4))(x)
x = Activation('relu')(x)

# Linear output layer with the same width as the input.
output_layer = Dense(units=n_features)(x)

# Encoder: input -> bottleneck; autoencoder: input -> reconstruction.
encoder = Model(input_layer, bottleneck)
auto_encoder = Model(input_layer, output_layer)

In [None]:
# Early-stopping callback for the bank-marketing autoencoder: monitors the
# validation loss, counts changes smaller than 1e-4 as "no improvement",
# allows 5 such epochs before stopping, and is configured to restore the best
# weights seen.
callback = EarlyStopping(monitor='val_loss',
                         min_delta=0.0001,
                         patience=5,
                         restore_best_weights=True)

In [None]:
# Compile with the Adam optimizer and mean-squared-error reconstruction loss.
auto_encoder.compile(optimizer='adam',
                     loss='mse')

In [None]:
# Train the autoencoder to reproduce the scaled features (target == input),
# using validation_split=0.3 for the validation loss and the early-stopping
# callback defined above.
history = auto_encoder.fit(x_data_sc, x_data_sc,
                           epochs=100, batch_size=256,
                           validation_split=0.3,
                           shuffle=True,
                           callbacks=[callback])

In [None]:
# Run the encoder on the full scaled feature matrix and convert the result to a NumPy array.
encoded = np.array(encoder(x_data_sc))

# The encoder output is the lower-dimensional (bottleneck) representation of
# each row, not a reconstruction of the original data.

In [None]:
# Same forest and cross-validation protocol as the baseline, but trained on
# the encoder output, so its scores can be compared with the model trained on
# the original features.
scoring_metrics = ['balanced_accuracy',
                   'f1_weighted',
                   'roc_auc',
                   'average_precision']

rf = RandomForestClassifier(n_estimators=500, max_depth=2,
                            n_jobs=8, random_state=42)

dict_performance = cross_validate(rf, encoded, y,
                                  scoring=scoring_metrics,
                                  cv=10, n_jobs=4,
                                  return_train_score=True)

# Fold averages go into a new 'Encoded' column of the comparison table.
df_performance['Encoded'] = [np.mean(values) for values in dict_performance.values()]

In [None]:
df_performance

# Author's interpretation of the table above (not verifiable from the code alone):
# the model trained on the original features and the model trained on the
# encoder output perform about the same, which suggests that the encoded
# representation retains most of the information in the original data.

In [None]:
# Column names of the one-hot encoded frame, grouped by theme. Each group is
# later encoded by its own autoencoder.

# Group 1: age plus the marital / education / job dummy columns.
feat_pers = ['age', 'marital_divorced', 'marital_married', 'marital_single', 'marital_unknown', 'education_basic.4y', 'education_basic.6y', 'education_basic.9y',
             'education_high.school', 'education_illiterate', 'education_professional.course', 'education_university.degree', 'education_unknown','job_admin.',
             'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student',
             'job_technician', 'job_unemployed', 'job_unknown']

# Group 2: default / housing / loan dummy columns.
feat_fina = ['default_no', 'default_unknown', 'default_yes', 'housing_no', 'housing_unknown', 'housing_yes', 'loan_no', 'loan_unknown', 'loan_yes']

# Group 3: campaign counters plus contact / month / day_of_week / poutcome dummy columns.
feat_camp = ['campaign', 'pdays', 'previous', 'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
             'month_may', 'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed',
             'poutcome_failure', 'poutcome_nonexistent', 'poutcome_success']

# Group 4: the remaining numeric columns.
# NOTE(review): presumably macro-economic indicators — confirm against the dataset description.
feat_econ = ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

In [None]:
# Train one autoencoder per feature group and collect each group's bottleneck
# output; the per-group codes are finally concatenated column-wise.
feat_groups = [feat_pers, feat_fina, feat_camp, feat_econ]

# Every group uses the same hidden / bottleneck widths, independent of how
# many columns the group has.
dim_layer_1 = 64
dim_layer_2 = 16

encoded = []

for f in feat_groups:
    dim_input_layer = len(f)

    # n -> 64 -> 16 (bottleneck) -> 64 -> n, linear Dense layers with explicit
    # ReLU activations in between.
    input_layer = Input(shape=(dim_input_layer,))
    x = Activation('relu')(input_layer)
    x = Dense(units=dim_layer_1)(x)
    x = Activation('relu')(x)

    bottleneck = Dense(units=dim_layer_2)(x)

    x = Activation('relu')(bottleneck)
    x = Dense(units=dim_layer_1)(x)
    x = Activation('relu')(x)

    output_layer = Dense(units=dim_input_layer)(x)

    encoder = Model(input_layer, bottleneck)
    auto_encoder = Model(input_layer, output_layer)

    auto_encoder.compile(optimizer='adam',
                         loss='mse')

    # Select this group's columns from the scaled matrix (kept in `arr_cols` order).
    x_tmp = x_data_sc[:, arr_cols.isin(f)]

    # Create a fresh EarlyStopping for each group rather than relying on the
    # `callbacks` object left over from the MNIST section, so this cell does
    # not depend on unrelated earlier cells and no state is shared between fits.
    group_early_stopping = EarlyStopping(monitor='val_loss',
                                         min_delta=0.0001,
                                         patience=5,
                                         restore_best_weights=True)

    auto_encoder.fit(x_tmp, x_tmp,
                     epochs=100, batch_size=256,
                     shuffle=True,
                     validation_split=0.3,
                     callbacks=[group_early_stopping])

    encoded.append(np.array(encoder(x_tmp)))

x_encoded = np.hstack(encoded)

In [None]:
# Label every encoded column as "<group prefix>_<position within the group>",
# in the same order in which the group codes were stacked.
group_prefixes = ['pers_', 'fina_', 'camp_', 'econ_']

cols_name = np.array([prefix + str(j)
                      for prefix, group_code in zip(group_prefixes, encoded)
                      for j in range(group_code.shape[1])])

cols_name

In [None]:
# Wrap the stacked group codes in a labelled DataFrame and show 10 random rows.
encoded_df = pd.DataFrame(x_encoded, columns=cols_name)
encoded_df.sample(10)

In [None]:
# Same forest and cross-validation protocol as before, now trained on the
# concatenated per-group codes.
scoring_metrics = ['balanced_accuracy',
                   'f1_weighted',
                   'roc_auc',
                   'average_precision']

rf = RandomForestClassifier(n_estimators=500, max_depth=2,
                            n_jobs=8, random_state=42)

dict_performance = cross_validate(rf, x_encoded, y,
                                  scoring=scoring_metrics,
                                  cv=10, n_jobs=4,
                                  return_train_score=True)

# Fold averages go into a new 'group_encoded' column of the comparison table.
df_performance['group_encoded'] = [np.mean(values) for values in dict_performance.values()]

In [None]:
# Display the comparison table with all columns added so far.
df_performance

In [None]:
# Fit the shallow forest on the group-encoded features and rank the encoded
# columns by mean permutation importance (computed on those same inputs).
rf = RandomForestClassifier(n_estimators=500, max_depth=2,
                            n_jobs=8, random_state=42)
rf.fit(x_encoded, y)

perm_result = permutation_importance(rf, x_encoded, y,
                                     n_repeats=10, n_jobs=8,
                                     random_state=42)
fi = perm_result.importances_mean

# Indices of the 10 largest importances, most important first.
top_idx = fi.argsort()[::-1][:10]
bar_positions = range(10, 0, -1)   # most important feature at the top

fig = plt.figure(figsize=(16, 8))
plt.barh(y=bar_positions,
         width=fi[top_idx],
         alpha=0.9)
plt.yticks(bar_positions, cols_name[top_idx])
plt.ylabel('Feature')
plt.xlabel('Importance')
plt.title('Group-encoded features importance')
plt.show()