In [4]:
# import tensorflow as tf

def train():
    """Train and evaluate a small dense classifier on the MNIST dataset.

    Loads MNIST through ``tf.keras.datasets``, divides the image arrays by
    255.0, and fits a Flatten -> Dense(128, relu) -> Dropout(0.2) ->
    Dense(10, softmax) network for 3 epochs with 20% of the training data
    held out for validation. The model summary, a "Training..." marker and
    the accuracy on the test split are printed to stdout.

    Takes no arguments and returns nothing.
    """
    # TensorFlow is imported at function scope rather than at module level.
    # NOTE(review): presumably so the import is carried along when this
    # function is serialized and executed remotely -- confirm.
    import tensorflow as tf

    keras = tf.keras
    layers = keras.layers

    (train_images, train_labels), (test_images, test_labels) = (
        keras.datasets.mnist.load_data()
    )
    train_images = train_images / 255.0
    test_images = test_images / 255.0

    network_layers = [
        layers.Flatten(input_shape=(28, 28)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(10, activation='softmax'),
    ]
    model = keras.models.Sequential(network_layers)

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.summary()

    print("Training...")
    model.fit(train_images, train_labels, epochs=3, validation_split=0.2)

    # Index 1 of the evaluation result is printed as the test accuracy
    # ('accuracy' is the only metric passed to compile()).
    evaluation = model.evaluate(
        test_images, test_labels, batch_size=128, verbose=0
    )
    print('Test accuracy: ', evaluation[1])

In [3]:
# Local training: call train() directly in this notebook's kernel.
# The cell that defines train() must have been executed first.
train()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               100480    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________
Training...
Train on 48000 samples, validate on 12000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test accuracy:  0.97


In [5]:
from kubeflow import fairing

# Registry that Fairing pushes the built image to.
PRIVATE_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'

# Options for the 'append' builder: the image is built on top of
# `base_image` and pushed to `registry`.
# Alternative base image tried earlier: f'{PRIVATE_REGISTRY}/kf-base:latest'
# (listed among the failing cases in the notes at the end of this notebook).
builder_options = {
    'base_image': 'tensorflow/tensorflow:2.0.3-gpu-py3',
    'registry': PRIVATE_REGISTRY,
    'push': True,
}
fairing.config.set_builder('append', **builder_options)

# Deploy the built image with the 'job' deployer.
fairing.config.set_deployer('job')

# Wrap train() with the Fairing configuration above, then invoke the wrapper.
remote_train = fairing.config.fn(train)
remote_train()

[I 201208 02:40:16 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.function.FunctionPreProcessor object at 0x7f71e75e2358>
[I 201208 02:40:16 config:136] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f715c0e64e0>
[I 201208 02:40:16 config:138] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x7f71e75e2b70>
[W 201208 02:40:16 append:50] Building image using Append builder...
[I 201208 02:40:16 base:107] Creating docker context: /tmp/fairing_context_bx0w3iaq
[W 201208 02:40:16 base:94] /usr/local/lib/python3.6/dist-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 201208 02:40:16 docker_creds_:234] Loading Docker credentials for repository 'tensorflow/tensorflow:2.0.3-gpu-py3'
[W 201208 02:40:17 append:54] Image successfully built in 0.660758693000389s.
[W 201208 02:40:17 append:94] Pushing image kubeflow-registry.default.svc.cluster.local:30000/fairing-job:B4EB8726...
[I 201208 0

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
2020-12-08 02:42:02.072948: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-12-08 02:42:02.073058: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: UNKNOWN ERROR (-1)
2020-12-08 02:42:02.073098: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (fairing-job-dq6m4-mn7w9): /proc/driver/nvidia/version does not exist
2020-12-08 02:42:02.073460: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-12-08 02:42:02.083136: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300000000 Hz
2020-12-08 02:42:02.083672: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x462dff0 executing computations on platform Host.

[W 201208 02:42:16 job:173] Cleaning up job fairing-job-dq6m4...


### curl로 프라이빗 레지스트리 확인

In [6]:
# Query the private registry's /v2/_catalog endpoint to list the repositories it holds.
! curl http://kubeflow-registry.default.svc.cluster.local:30000/v2/_catalog

{"repositories":["fairing-job"]}


In [7]:
# Query the tags/list endpoint to list the tags stored for the "fairing-job" repository.
! curl http://kubeflow-registry.default.svc.cluster.local:30000/v2/fairing-job/tags/list

{"name":"fairing-job","tags":["B4EB8726"]}


# Function Fairing 패키지 버전 이슈
Function Fairing 시 fairing SDK가 컨테이너 내부에서 아래와 같은 명령을 실행한다.
```bash
python /app/function_shim.py --serialized_fn_file /app/pickled_fn.p --python_version 3.6
```
베이스 이미지에 따라 이 명령이 에러로 실패한다.

다음과 같이 테스트하여 실행 성공과 실패 케이스를 정리함.

- 실행 실패
  - base_image = f'{PRIVATE_REGISTRY}/kf-base:latest', # 사전준비에서 마련한 Base Image  
  - base_image = 'tensorflow/tensorflow:latest-py3',   
  - base_image = 'tensorflow/tensorflow:2.1.2-gpu',    
  - Base Image에 pip install tensorflow-gpu   <- 최신 2.3.1 설치됨
- 실행 성공  
  - base_image = 'brightfly/kubeflow-jupyter-lab:tf2.0-gpu', <- 실마리
  - base_image = 'tensorflow/tensorflow:2.0.3-gpu-py3',
  - Base Image에 pip install tensorflow-gpu==2.0.0 