# 時間序列 GRU TensorFlow 入門筆記本
在這個筆記本中，我們展示了時間序列 GRU 模型的起始代碼和用於將 Kaggle 的 50GB CSV 文件處理成多個保存的 NumPy 文件的起始代碼。使用時間序列 GRU 允許我們使用所有提供的客戶數據，而不僅僅是客戶的最後一個數據點。我們在 [此處](https://www.kaggle.com/cdeotte/time-series-eda) 發布了時間序列數據圖。在這個筆記本中，我們
* 將數據幀中的訓練數據處理成維度為“num_of_customers x 13 x 188”的 3D NumPy 數組
* 將處理後的數組保存為磁盤上的多個 NumPy 文件
* 接下來我們從磁盤上的多個文件構建和訓練 GRU
* 我們計算驗證分數並達到 0.787
* 最後我們處理並保存測試數據，推斷測試，並創建提交

請務必注意，每次運行此筆記本時，您**不需要**重新處理訓練和測試文件。僅在您設計新特徵時才需要再次處理數據。否則，將您保存的 NumPy 數組上傳到 Kaggle 數據集（或使用我的 Kaggle 數據集 [此處](https://www.kaggle.com/datasets/cdeotte/amex-data-for-transformers-and-rnns)）。然後當你自定義和改進你的 GRU 模型時，設置變量 `PROCESS_DATA = False` 和 `PATH_TO_DATA = [the path to your kaggle dataset]`。

要查看可以幫助您直觀了解特徵工程和改進模型架構的時間序列 EDA，請參閱我的其他筆記本 [此處](https://www.kaggle.com/cdeotte/time-series-eda)。請注意，在下面的代碼中，我們將 GPU 劃分為 8GB 用於 RAPIDS（特徵工程）和 8GB 用於 TensorFlow（模型構建和訓練）。

In [None]:
import os
# Expose only GPU 0 to this process; this is set before TensorFlow is imported
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import tensorflow as tf
import tensorflow.keras.backend as K
print('Using TensorFlow version',tf.__version__)

# Restrict TensorFlow to 8GB of GPU RAM
# so that 8GB of GPU RAM is left for RAPIDS
LIMIT = 8
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # LIMIT is in GB; memory_limit receives 1024*LIMIT
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
  except RuntimeError as e:
    # NOTE(review): presumably raised when the GPU was already initialized - confirm
    print(e)
# These messages are printed regardless of whether a GPU was found or configured
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# 16 is the assumed total GPU RAM in GB
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

# 處理訓練數據
我們分塊處理訓練和測試數據。 我們將訓練數據分成 10 個部分，分別處理每個部分並保存到磁盤。 我們將測試數據分為 20 個部分。 這使我們能夠避免處理過程中的內存錯誤。 我們還可以在比 CPU 更快的 GPU 上執行處理。 關於數據預處理的討論在 [這裡](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327828) 和 [這裡](https://www.kaggle.com/competitions/amex-default-prediction/discussion/328054)
​​​

In [None]:
# Loading only the first column (customer_ID) of the train or test CSV is slow.
# You can load that column from my dataset instead.
# Otherwise set this variable to None to load it from Kaggle's original CSV files.
PATH_TO_CUSTOMER_HASHES = '../input/amex-data-files/'

# After processing the data once, upload the result to a Kaggle dataset,
# then set PROCESS_DATA to False, attach the dataset to the notebook
# and put the dataset path in the second branch below.
PROCESS_DATA = True
# While processing, arrays are written to (and later read from) the working
# directory, because Kaggle's ../input/ tree is read-only.
PATH_TO_DATA = './data/' if PROCESS_DATA else '../input/amex-data-for-transformers-and-rnns/data/'

# After training the model once, upload it to a Kaggle dataset,
# then set TRAIN_MODEL to False, attach the dataset to the notebook
# and put the dataset path in the second branch below.
TRAIN_MODEL = True
# Same reasoning as PATH_TO_DATA: freshly trained weights need a writable directory.
PATH_TO_MODEL = './model/' if TRAIN_MODEL else '../input/amex-data-for-transformers-and-rnns/model/'

# Set to False to only compute validation scores and skip test inference
INFER_TEST = True

In [None]:
import cupy, cudf # GPU LIBRARIES
import numpy as np, pandas as pd # CPU LIBRARIES
import matplotlib.pyplot as plt, gc

if PROCESS_DATA:
    # Load targets; customer_ID is shrunk to an int64 built from the last 16 hex chars
    targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
    targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    print(f'There are {targets.shape[0]} train targets')
    
    # Get train column names (only one row is read)
    train = cudf.read_csv('../input/amex-default-prediction/train_data.csv', nrows=1)
    T_COLS = train.columns
    print(f'There are {len(T_COLS)} train dataframe columns')
    
    # Get train customer IDs
    if PATH_TO_CUSTOMER_HASHES:
        train = cudf.read_parquet(f'{PATH_TO_CUSTOMER_HASHES}train_customer_hashes.pqt')
    else:
        # Fall back to the competition CSV (same file as above), read with
        # pandas on the CPU to avoid a memory error
        train = pd.read_csv('../input/amex-default-prediction/train_data.csv', usecols=['customer_ID'])
        train['customer_ID'] = train['customer_ID'].apply(lambda x: int(x[-16:],16) ).astype('int64')
    customers = train.drop_duplicates().sort_index().values.flatten()
    print(f'There are {len(customers)} unique customers in train.')

In [None]:
# Calculate the size of each separate file
def get_rows(customers, train, NUM_FILES = 10, verbose = ''):
    """Count the rows of `train` that fall into each of NUM_FILES customer chunks.

    `customers` is sliced into NUM_FILES consecutive chunks of
    len(customers)//NUM_FILES entries; the last chunk also takes the remainder.
    For each chunk, the number of rows of `train` whose customer_ID is in the
    chunk is recorded.

    Args:
        customers: sliceable sequence of customer IDs.
        train: dataframe with a `customer_ID` column.
        NUM_FILES: number of chunks to split the customers into.
        verbose: label used in the progress messages; '' disables printing.

    Returns:
        List of NUM_FILES row counts, in chunk order.
    """
    per_file = len(customers)//NUM_FILES
    announce = verbose != ''
    if announce:
        print(f'We will split {verbose} data into {NUM_FILES} separate files.')
        print(f'There will be {per_file} customers in each file (except the last file).')
        print('Below are number of rows in each file:')

    final_part = NUM_FILES-1
    counts = []
    for part in range(NUM_FILES):
        begin = part*per_file
        # the final chunk absorbs whatever the integer division left over
        subset = customers[begin:] if part==final_part else customers[begin:begin+per_file]
        in_subset = train.customer_ID.isin(subset)
        counts.append( train.loc[in_subset].shape[0] )

    if announce:
        print( counts )
    return counts

if PROCESS_DATA:
    # Split the train customers into 10 files and record how many CSV rows each file gets
    NUM_FILES = 10
    rows = get_rows(customers, train, NUM_FILES = NUM_FILES, verbose = 'train')

# 預處理和特徵工程
下面的函數處理數據。 描述該過程的討論是 [這裡](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327828) 和 [這裡](https://www.kaggle.com/competitions/amex-default-prediction/discussion/328054)。 目前下面的代碼使用 [RAPIDS](https://rapids.ai/) 和 GPU 來
* 通過轉換為 int64 減少 customer_ID 列的內存使用
* 減少日期時間列的內存使用量（然後刪除該列）。
* 我們填寫 NAN
* 標籤對分類列進行編碼
* 我們減少列的內存使用 dtypes
* 將每個客戶轉換為序列長度為 13、特徵長度為 188 的 3D 數組

要改進此模型，請嘗試添加新特徵。 列已重新排列，使 11 個分類特徵排在最前面。 這使得以後構建 TensorFlow 模型更加容易。 我們也可以嘗試添加標準縮放器。 目前使用的數據保持 Kaggle 原始訓練數據的數值，尚未進行縮放。

In [None]:
def feature_engineer(train, PAD_CUSTOMER_TO_13_ROWS = True, targets = None):
    """Turn a raw chunk of the AMEX dataframe into model-ready rows.

    Steps, in order: shrink customer_ID to int64, split S_2 into int8
    year/month/day, label encode the 11 categorical columns, downcast
    int64/float64 columns, optionally pad every customer to 13 rows,
    optionally merge targets, fill remaining NaN with -0.5, sort by customer
    and date, and reorder the columns.

    Args:
        train: cudf dataframe chunk; its columns are modified in place before
            the frame is rebuilt by the later steps.
        PAD_CUSTOMER_TO_13_ROWS: when True, customers with fewer than 13 rows
            get extra rows (-1 in numeric columns, 0 in categorical columns).
        targets: optional dataframe with `customer_ID` and `target` columns,
            left-merged on `customer_ID`.

    Returns:
        Dataframe whose columns are `customer_ID`, then the 11 categorical
        columns, then every remaining column (including `target` when
        `targets` was given); year/month/day are dropped.
    """
        
    # Shrink the string columns
    # from 64 bytes to 8 bytes, and from 10 bytes to 3 bytes respectively
    train['customer_ID'] = train['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    train.S_2 = cudf.to_datetime( train.S_2 )
    train['year'] = (train.S_2.dt.year-2000).astype('int8')
    train['month'] = (train.S_2.dt.month).astype('int8')
    train['day'] = (train.S_2.dt.day).astype('int8')
    del train['S_2']
        
    # Label encode the categorical columns (and reduce them to 1 byte)
    # with 0: padding, 1: nan, 2,3,4,etc: values
    d_63_map = {'CL':2, 'CO':3, 'CR':4, 'XL':5, 'XM':6, 'XZ':7}
    train['D_63'] = train.D_63.map(d_63_map).fillna(1).astype('int8')

    d_64_map = {'-1':2,'O':3, 'R':4, 'U':5}
    train['D_64'] = train.D_64.map(d_64_map).fillna(1).astype('int8')
    
    CATS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68']
    OFFSETS = [2,1,2,2,3,2,3,2,2] # 2 minus the minimum value of the column in the full csv
    # so that 0 will be padding, 1 will be NaN, and 2,3,4 etc will be values
    for c,s in zip(CATS,OFFSETS):
        train[c] = train[c] + s
        train[c] = train[c].fillna(1).astype('int8')
    CATS += ['D_63','D_64']
    
    # Add new features here
    # Example: train['feature_189'] = etc etc etc
    # Example: train['feature_190'] = etc etc etc
    # If categorical, then add to CATS with: CATS += ['feature_190'] etc etc etc
    # Reduce memory dtypes
    SKIP = ['customer_ID','year','month','day']
    for c in train.columns:
        if c in SKIP: continue
        if str( train[c].dtype )=='int64':
            train[c] = train[c].astype('int32')
        if str( train[c].dtype )=='float64':
            train[c] = train[c].astype('float32')
            
    # Pad rows so that every customer has 13 rows
    if PAD_CUSTOMER_TO_13_ROWS:
        tmp = train[['customer_ID']].groupby('customer_ID').customer_ID.agg('count')
        # `more` collects each short customer's ID once per missing row
        more = cupy.array([],dtype='int64') 
        for j in range(1,13):
            i = tmp.loc[tmp==j].index.values
            more = cupy.concatenate([more,cupy.repeat(i,13-j)])
        # Reuse the first len(more) existing rows as a template for the pad rows
        # NOTE(review): assumes the chunk has at least len(more) rows - confirm
        df = train.iloc[:len(more)].copy().fillna(0)
        df = df * 0 - 1 #pad numerical columns with -1
        df[CATS] = (df[CATS] * 0).astype('int8') #pad categorical columns with 0
        df['customer_ID'] = more
        train = cudf.concat([train,df],axis=0,ignore_index=True)
        
    # Add targets (and reduce to 1 byte)
    if targets is not None:
        train = train.merge(targets,on='customer_ID',how='left')
        train.target = train.target.astype('int8')
        
    # FILL NAN
    train = train.fillna(-0.5) # this applies to the numeric columns
    
    # Sort by customer, then by date
    # (pad rows carry year/month/day of -1 from the padding step above)
    train = train.sort_values(['customer_ID','year','month','day']).reset_index(drop=True)
    train = train.drop(['year','month','day'],axis=1)
    
    # REARRANGE COLUMNS WITH 11 CATS FIRST
    # NOTE(review): columns[1:] assumes customer_ID is the first column - confirm
    COLS = list(train.columns[1:])
    COLS = ['customer_ID'] + CATS + [c for c in COLS if c not in CATS]
    train = train[COLS]
    
    return train

In [None]:
if PROCESS_DATA:
    # Create the output directory once up front (no-op if it already exists)
    os.makedirs(PATH_TO_DATA, exist_ok=True)

    # CREATE PROCESSED TRAIN FILES AND SAVE TO DISK
    for k in range(NUM_FILES):

        # READ CHUNK OF TRAIN CSV FILE
        # rows[:k] are the rows owned by earlier files; the plus one skips the header
        skip = int(np.sum( rows[:k] ) + 1)
        train = cudf.read_csv('../input/amex-default-prediction/train_data.csv', nrows=rows[k], 
                              skiprows=skip, header=None, names=T_COLS)

        # FEATURE ENGINEER DATAFRAME
        train = feature_engineer(train, targets = targets)

        # SAVE FILES: one parquet of (customer_ID, target) and one float32 array per chunk
        print(f'Train_File_{k+1} has {train.customer_ID.nunique()} customers and shape',train.shape)
        tar = train[['customer_ID','target']].drop_duplicates().sort_index()
        tar.to_parquet(f'{PATH_TO_DATA}targets_{k+1}.pqt',index=False)
        # Drop customer_ID (first column) and target (last column), then
        # reshape to (customers, 13, 188)
        data = train.iloc[:,1:-1].values.reshape((-1,13,188))
        cupy.save(f'{PATH_TO_DATA}data_{k+1}',data.astype('float32'))

    # CLEAN MEMORY
    del train, tar, data
    del targets
    gc.collect()

# 構建模型
我們只是將序列數據輸入到一個基本的 GRU 中。 之後接兩個全連接層，最後是一個 sigmoid 輸出來預測違約。 請嘗試改進模型架構。

In [None]:
# SIMPLE GRU MODEL
def build_model():
    """Build and compile the GRU classifier.

    The input has shape (13, 188). Each of the first 11 feature columns goes
    through its own Embedding(10, 4); the remaining columns are used as-is.
    Everything is concatenated and fed to GRU(128) -> Dense(64, relu) ->
    Dense(32, relu) -> Dense(1, sigmoid). The model is compiled with Adam
    (learning rate 0.001) and binary cross-entropy.

    Returns:
        The compiled tf.keras.Model.
    """
    seq_in = tf.keras.Input(shape=(13,188))

    # INPUT - FIRST 11 COLUMNS ARE CAT, NEXT 177 ARE NUMERIC
    # (layers are created in column order, one embedding per categorical column)
    cat_embeddings = [
        tf.keras.layers.Embedding(10,4)(seq_in[:,:,col])
        for col in range(11)
    ]
    numeric_part = seq_in[:,:,11:]
    hidden = tf.keras.layers.Concatenate()([numeric_part]+cat_embeddings)

    # SIMPLE RNN BACKBONE followed by the two dense layers
    hidden = tf.keras.layers.GRU(units=128, return_sequences=False)(hidden)
    for width in (64, 32):
        hidden = tf.keras.layers.Dense(width,activation='relu')(hidden)

    # OUTPUT
    prob = tf.keras.layers.Dense(1,activation='sigmoid')(hidden)

    # COMPILE MODEL
    model = tf.keras.Model(inputs=seq_in, outputs=prob)
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001),
        loss = tf.keras.losses.BinaryCrossentropy(),
    )
    return model

In [None]:
# CUSTOM LEARNING SCHEDULE
def lrfn(epoch):
    """Return the learning rate for a zero-based epoch index.

    The schedule is 1e-3 for epochs 0-4, 1e-4 for epochs 5-6 and 1e-5 from
    epoch 7 onward.

    Args:
        epoch: integer epoch index.

    Returns:
        The learning rate as a float.
    """
    lr = [1e-3]*5 + [1e-4]*2 + [1e-5]*1
    # Clamp the index so that training for more epochs than the schedule lists
    # keeps the final rate instead of raising IndexError
    return lr[min(epoch, len(lr)-1)]
# Keras callback that uses lrfn as the learning-rate schedule (passed to model.fit)
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = False)

# 競賽指標代碼
下面的代碼來自 Konstantin Yakovlev 的討論帖 [這裡](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534)

In [None]:
# Competition metric by Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the competition metric from labels and predictions.

    The result is 0.5 * (G + D), where D is the share of positives captured
    in the top 4% of weighted rows ranked by prediction, and G is the weighted
    gini of the prediction ranking divided by the weighted gini of the ideal
    (label) ranking. Rows with label 0 carry weight 20, all others weight 1.

    Args:
        y_true: sequence of labels.
        y_pred: sequence of predicted scores, same length as y_true.

    Returns:
        The metric as a float.
    """
    # column 0 holds the label, column 1 the prediction
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ranked_by(col):
        # rows ordered by the given column, largest first
        return pairs[pairs[:, col].argsort()[::-1]]

    def _row_weights(rows):
        return np.where(rows[:,0]==0, 20, 1)

    def _weighted_gini(col):
        rows = _ranked_by(col)
        w = _row_weights(rows)
        random_curve = np.cumsum(w / np.sum(w))
        positives_total = np.sum(rows[:, 0] * w)
        positives_seen = np.cumsum(rows[:, 0] * w)
        lorentz_curve = positives_seen / positives_total
        return np.sum((lorentz_curve - random_curve) * w)

    # share of positives captured within the top 4% of total weight
    by_pred = _ranked_by(1)
    w_pred = _row_weights(by_pred)
    within_cut = np.cumsum(w_pred) <= int(0.04 * np.sum(w_pred))
    top_four = np.sum(by_pred[within_cut][:,0]) / np.sum(by_pred[:,0])

    # gini of the prediction ranking, normalized by the gini of the label ranking
    gini_pred = _weighted_gini(1)
    gini_ideal = _weighted_gini(0)

    return 0.5 * (gini_pred/gini_ideal + top_four)

# 訓練模型
我們訓練 5 折，每折 8 個 epoch。 我們保存 5 折模型以供稍後進行測試推理。 如果您只想使用先前保存的模型進行推理而不重新訓練，請在本筆記本的開頭設置變量 `TRAIN_MODEL = False`。

In [None]:
if TRAIN_MODEL:

    def _load_saved_files(file_ids):
        """Load and concatenate the saved arrays and targets for the given file numbers."""
        arrays = [np.load(f'{PATH_TO_DATA}data_{k}.npy') for k in file_ids]
        frames = [pd.read_parquet(f'{PATH_TO_DATA}targets_{k}.pqt') for k in file_ids]
        return np.concatenate(arrays,axis=0), pd.concat(frames).target.values

    # SAVE TRUE AND OOF
    true = np.array([])
    oof = np.array([])
    VERBOSE = 2 # use 1 for interactive 

    # Create the model directory once up front (no-op if it already exists)
    os.makedirs(PATH_TO_MODEL, exist_ok=True)

    for fold in range(5):

        # INDICES OF TRAIN AND VALID FOLDS
        # each fold validates on two of the ten saved files and trains on the other eight
        valid_idx = [2*fold+1, 2*fold+2]
        train_idx = [x for x in [1,2,3,4,5,6,7,8,9,10] if x not in valid_idx]

        print('#'*25)
        print(f'### Fold {fold+1} with valid files', valid_idx)

        # READ TRAIN DATA FROM DISK
        X_train, y_train = _load_saved_files(train_idx)
        print('### Training data shapes', X_train.shape, y_train.shape)

        # READ VALID DATA FROM DISK
        X_valid, y_valid = _load_saved_files(valid_idx)
        print('### Validation data shapes', X_valid.shape, y_valid.shape)
        print('#'*25)

        # BUILD AND TRAIN MODEL
        K.clear_session()
        model = build_model()
        h = model.fit(X_train,y_train, 
                      validation_data = (X_valid,y_valid),
                      batch_size=512, epochs=8, verbose=VERBOSE,
                      callbacks = [LR])
        model.save_weights(f'{PATH_TO_MODEL}gru_fold_{fold+1}.h5')

        # INFER VALID DATA
        print('Inferring validation data...')
        p = model.predict(X_valid, batch_size=512, verbose=VERBOSE).flatten()

        print()
        print(f'Fold {fold+1} CV=', amex_metric_mod(y_valid, p) )
        print()
        # accumulate out-of-fold labels and predictions for the overall score
        true = np.concatenate([true, y_valid])
        oof = np.concatenate([oof, p])
        
        # CLEAN MEMORY
        del model, X_train, y_train, X_valid, y_valid, p
        gc.collect()

    # PRINT OVERALL RESULTS
    print('#'*25)
    print('Overall CV =', amex_metric_mod(true, oof) )
    K.clear_session()

# 處理測試數據
我們以與訓練數據相同的方式處理測試數據。

In [None]:
if PROCESS_DATA:
    # GET TEST COLUMN NAMES (only one row is read)
    test = cudf.read_csv('../input/amex-default-prediction/test_data.csv', nrows=1)
    T_COLS = test.columns
    print(f'There are {len(T_COLS)} test dataframe columns')
    
    # GET TEST CUSTOMER NAMES
    if PATH_TO_CUSTOMER_HASHES:
        test = cudf.read_parquet(f'{PATH_TO_CUSTOMER_HASHES}test_customer_hashes.pqt')
    else:
        # Fall back to the competition CSV (same file as above), read with
        # pandas on the CPU to avoid a memory error
        test = pd.read_csv('../input/amex-default-prediction/test_data.csv', usecols=['customer_ID'])
        test['customer_ID'] = test['customer_ID'].apply(lambda x: int(x[-16:],16) ).astype('int64')
    customers = test.drop_duplicates().sort_index().values.flatten()
    print(f'There are {len(customers)} unique customers in test.')

In [None]:
# Number of test files; set outside the PROCESS_DATA guard
NUM_FILES = 20
if PROCESS_DATA:
    # CALCULATE SIZE OF EACH SEPARATE FILE
    rows = get_rows(customers, test, NUM_FILES = NUM_FILES, verbose = 'test')

In [None]:
if PROCESS_DATA:
    # Make sure the output directory exists even when this cell is run
    # without the train-processing cell (no-op if it already exists)
    os.makedirs(PATH_TO_DATA, exist_ok=True)

    # SAVE TEST CUSTOMERS INDEX
    test_customer_hashes = cupy.array([],dtype='int64')
    
    # CREATE PROCESSED TEST FILES AND SAVE TO DISK
    for k in range(NUM_FILES):

        # READ CHUNK OF TEST CSV FILE
        # rows[:k] are the rows owned by earlier files; the plus one skips the header
        skip = int(np.sum( rows[:k] ) + 1)
        test = cudf.read_csv('../input/amex-default-prediction/test_data.csv', nrows=rows[k], 
                              skiprows=skip, header=None, names=T_COLS)

        # FEATURE ENGINEER DATAFRAME
        test = feature_engineer(test, targets = None)
        
        # SAVE TEST CUSTOMERS INDEX (customer order of this file, appended to the running index)
        cust = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
        test_customer_hashes = cupy.concatenate([test_customer_hashes,cust])

        # SAVE FILES
        print(f'Test_File_{k+1} has {test.customer_ID.nunique()} customers and shape',test.shape)
        # Drop customer_ID (first column) and reshape to (customers, 13, 188)
        data = test.iloc[:,1:].values.reshape((-1,13,188))
        cupy.save(f'{PATH_TO_DATA}test_data_{k+1}',data.astype('float32'))
        
    # SAVE CUSTOMER INDEX OF ALL TEST FILES
    cupy.save(f'{PATH_TO_DATA}test_hashes_data', test_customer_hashes)

    # CLEAN MEMORY
    del test, data
    gc.collect()

# 推斷測試數據
我們從保存的折疊模型中推斷出測試數據。 如果您不想推斷測試，但您只想讓您的筆記本計算驗證分數以評估模型更改，請在此筆記本的開頭設置變量“INFER_TEST = False”。 此外，如果您希望從先前訓練的模型中進行推斷，請在本筆記本開頭的變量“PATH_TO_MODEL”中添加 Kaggle 數據集的路徑。

In [None]:
if INFER_TEST:
    # INFER TEST DATA
    sub = cudf.read_csv('../input/amex-default-prediction/sample_submission.csv')

    # REARRANGE SUB ROWS TO MATCH PROCESSED TEST FILES
    # (the saved hash index lists customers in the order the test files were written)
    sub['hash'] = sub['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    test_hash_index = cupy.load(f'{PATH_TO_DATA}test_hashes_data.npy')
    sub = sub.set_index('hash').loc[test_hash_index].reset_index(drop=True)

    # start/end track the row window of `sub` that the current test file fills
    start = 0; end = 0
    for k in range(NUM_FILES):
        # BUILD MODEL
        K.clear_session()
        model = build_model()

        # LOAD TEST DATA
        print(f'Inferring Test_File_{k+1}')
        X_test = np.load(f'{PATH_TO_DATA}test_data_{k+1}.npy')
        end = start + X_test.shape[0]

        # INFER 5 FOLD MODELS and average their predictions
        p = None
        for fold_no in range(1,6):
            model.load_weights(f'{PATH_TO_MODEL}gru_fold_{fold_no}.h5')
            fold_pred = model.predict(X_test, batch_size=512, verbose=0).flatten()
            if p is None:
                p = fold_pred
            else:
                p = p + fold_pred
        p = p / 5.0

        # SAVE TEST PREDICTIONS
        sub.loc[start:end-1,'prediction'] = p
        start = end

        # CLEAN MEMORY
        del model, X_test, p, fold_pred
        gc.collect()

# 創建提交

In [None]:
if INFER_TEST:
    # Write the submission file, then show its shape and first rows
    sub.to_csv('submission.csv',index=False)
    print('Submission file shape is', sub.shape )
    # NOTE(review): display is not imported here; presumably the notebook (IPython) built-in - confirm
    display( sub.head() )

In [None]:
if INFER_TEST:
    # DISPLAY SUBMISSION PREDICTIONS
    # sub is converted to pandas before the prediction column is plotted as a 100-bin histogram
    plt.hist(sub.to_pandas().prediction, bins=100)
    plt.title('Test Predictions')
    plt.show()