In [1]:
import import_ipynb # ipynb load 위한 모듈
import config
from processingdata import get_modified_data
from DeepFM import DeepFM

import numpy as np
import pandas as pd
from time import perf_counter
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.metrics import BinaryAccuracy, AUC

importing Jupyter notebook from config.ipynb
importing Jupyter notebook from processingdata.ipynb
importing Jupyter notebook from DeepFM.ipynb
importing Jupyter notebook from FMlayer.ipynb


In [10]:



def get_data():
    file = pd.read_csv('data/adult_csv.csv', header=None)
    file = file.dropna()
    file = file.drop(file.index[0],axis = 0)
    file = file.reset_index(drop = True)
    X = file.loc[:, 0:13]
    Y = file.loc[:, 14].map({'<=50K': 0, '>50K': 1})
    
    
    X.columns = config.ALL_FIELDS
    
    field_dict, field_index, X_modified = \
        get_modified_data(X, config.ALL_FIELDS, config.CONT_FIELDS, config.CAT_FIELDS, False)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X_modified, Y, test_size=0.2, stratify=Y)
    
    train_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_train.values, tf.float32), tf.cast(Y_train, tf.float32))) \
        .shuffle(30000).batch(config.BATCH_SIZE)

    test_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_test.values, tf.float32), tf.cast(Y_test, tf.float32))) \
        .shuffle(10000).batch(config.BATCH_SIZE)
    return train_ds, test_ds, field_dict, field_index


# Batch 단위 학습
def train_on_batch(model, optimizer, acc, auc, inputs, targets):
    with tf.GradientTape() as tape:
        y_pred = model(inputs)
        loss = tf.keras.losses.binary_crossentropy(from_logits=False, y_true=targets, y_pred=y_pred)

    grads = tape.gradient(target=loss, sources=model.trainable_variables)

    # apply_gradients()를 통해 processed gradients를 적용함
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # accuracy & auc
    acc.update_state(targets, y_pred)
    auc.update_state(targets, y_pred)

    return loss


# 반복 학습 함수
def train(epochs):
    train_ds, test_ds, field_dict, field_index = get_data()
    model = DeepFM(embedding_size=config.EMBEDDING_SIZE, num_feature=len(field_index),
                   num_field=len(field_dict), field_index=field_index)

    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

    print("Start Training: Batch Size: {}, Embedding Size: {}".format(config.BATCH_SIZE, config.EMBEDDING_SIZE))
    start = perf_counter()
    for i in range(epochs):
        acc = BinaryAccuracy(threshold=0.8)
        auc = AUC()
        loss_history = []

        for x, y in train_ds:
            loss = train_on_batch(model, optimizer, acc, auc, x, y)
            loss_history.append(loss)

        print("Epoch {:03d}: 누적 Loss: {:.4f}, Acc: {:.4f}, AUC: {:.4f}".format(
            i, np.mean(loss_history), acc.result().numpy(), auc.result().numpy()))

    test_acc = BinaryAccuracy(threshold=0.5)
    test_auc = AUC()
    for x, y in test_ds:
        y_pred = model(x)
        test_acc.update_state(y, y_pred)
        test_auc.update_state(y, y_pred)

    print("테스트 ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
    print("Batch Size: {}, Embedding Size: {}".format(config.BATCH_SIZE, config.EMBEDDING_SIZE))
    print("걸린 시간: {:.3f}".format(perf_counter() - start))

In [11]:
train(10)

Data Prepared...
X shape: (45222, 104)
# of Feature: 104
# of Field: 14
Start Training: Batch Size: 256, Embedding Size: 5
Epoch 000: 누적 Loss: 0.6175, Acc: 0.7509, AUC: 0.5305
Epoch 001: 누적 Loss: 0.5287, Acc: 0.7520, AUC: 0.6720
Epoch 002: 누적 Loss: 0.5131, Acc: 0.7520, AUC: 0.7119
Epoch 003: 누적 Loss: 0.4982, Acc: 0.7524, AUC: 0.7442
Epoch 004: 누적 Loss: 0.4845, Acc: 0.7516, AUC: 0.7722
Epoch 005: 누적 Loss: 0.4697, Acc: 0.7520, AUC: 0.7941
Epoch 006: 누적 Loss: 0.4560, Acc: 0.7523, AUC: 0.8093
Epoch 007: 누적 Loss: 0.4440, Acc: 0.7526, AUC: 0.8204
Epoch 008: 누적 Loss: 0.4333, Acc: 0.7527, AUC: 0.8289
Epoch 009: 누적 Loss: 0.4246, Acc: 0.7523, AUC: 0.8350
테스트 ACC: 0.8046, AUC: 0.8341
Batch Size: 256, Embedding Size: 5
걸린 시간: 19.291
