In [61]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.model_selection
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm
# Apply seaborn's default plotting theme for every figure in this notebook.
sns.set()
# Report how many GPUs TensorFlow can see. Uses the stable
# `tf.config.list_physical_devices` API instead of the older
# `tf.config.experimental` alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [62]:
!pip install lib/kaggle-rig-0.2.0.tar.gz
import krig
krig.seed_everything()

Processing ./lib/kaggle-rig-0.2.0.tar.gz
Building wheels for collected packages: kaggle-rig
  Building wheel for kaggle-rig (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle-rig: filename=kaggle_rig-0.2.0-py3-none-any.whl size=6539 sha256=2f2af3d66fb956b28ec3e387687ffc7a111c15780a8940f6e4b0cdbc2e44c501
  Stored in directory: /home/pankun/.cache/pip/wheels/03/99/72/07676e0abd65551d5261aee9360ad1c85fcaca1c409857a40a
Successfully built kaggle-rig
Installing collected packages: kaggle-rig
  Attempting uninstall: kaggle-rig
    Found existing installation: kaggle-rig 0.2.0
    Uninstalling kaggle-rig-0.2.0:
      Successfully uninstalled kaggle-rig-0.2.0
Successfully installed kaggle-rig-0.2.0


In [63]:
# Treat +/-inf as missing (NA) in pandas null checks, in addition to None/NaN.
# Empty strings are NOT affected by this option. The fully-qualified key is
# used so the lookup does not depend on pandas' partial-name matching.
# NOTE(review): this option is deprecated in recent pandas releases -- confirm
# the pinned pandas version still supports it.
pd.set_option('mode.use_inf_as_na', True)
# Show wide/long frames in full instead of truncating them with "...".
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)
# Switch the seaborn theme to the white-grid style for all later plots.
sns.set(style="whitegrid")

In [64]:
# --- Data -------------------------------------------------------------------
# Directory holding the processed parquet file and the image folders.
DATA = 'input/processed'
# Regression targets, in the column order the model's output head uses.
TARGET = [f'fvc_last_{k}' for k in (3, 2, 1)]
# Height, width and channel count of the images fed to the network.
INPUT_SHAPE = (224, 224, 3)

# --- Cross-validation -------------------------------------------------------
# Number of splits passed to the grouped K-fold splitter.
FOLDS = 10

# --- Training ---------------------------------------------------------------
BATCH_SIZE = 32
EPOCHS = 2
LEARNING_RATE = 0.001
# Drop rate of the Dropout layer in the regression head.
DROPOUT = 0.01

In [65]:
# Read the training table (one row per image, per the `img` column below)
# from the processed-data directory and print its schema summary.
train_parquet = f'{DATA}/train.parquet'
data = pd.read_parquet(train_parquet)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32684 entries, 0 to 32683
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pid          32684 non-null  object 
 1   age          32684 non-null  uint8  
 2   sex          32684 non-null  object 
 3   smoking      32684 non-null  object 
 4   week_1       32684 non-null  int16  
 5   fvc_1        32684 non-null  uint16 
 6   percent_1    32684 non-null  float32
 7   fvc_last_1   32684 non-null  uint16 
 8   fvc_last_2   32684 non-null  uint16 
 9   fvc_last_3   32684 non-null  uint16 
 10  week_last_1  32684 non-null  int16  
 11  week_last_2  32684 non-null  int16  
 12  week_last_3  32684 non-null  int16  
 13  img          32684 non-null  object 
dtypes: float32(1), int16(4), object(4), uint16(4), uint8(1)
memory usage: 1.7+ MB


In [66]:
# Preview the first five rows of the training table.
data.head(5)

Unnamed: 0,pid,age,sex,smoking,week_1,fvc_1,percent_1,fvc_last_1,fvc_last_2,fvc_last_3,week_last_1,week_last_2,week_last_3,img
0,ID00007637202177411956430,79,Male,Ex-smoker,-4,2315,58.253647,2057,2064,2000,57,41,29,ID00007637202177411956430/0.png
1,ID00007637202177411956430,79,Male,Ex-smoker,-4,2315,58.253647,2057,2064,2000,57,41,29,ID00007637202177411956430/1.png
2,ID00007637202177411956430,79,Male,Ex-smoker,-4,2315,58.253647,2057,2064,2000,57,41,29,ID00007637202177411956430/10.png
3,ID00007637202177411956430,79,Male,Ex-smoker,-4,2315,58.253647,2057,2064,2000,57,41,29,ID00007637202177411956430/11.png
4,ID00007637202177411956430,79,Male,Ex-smoker,-4,2315,58.253647,2057,2064,2000,57,41,29,ID00007637202177411956430/12.png


In [67]:
# Hold out one fold for validation, grouping by patient id so that all images
# of a given patient land on the same side of the split.
# NOTE: relies on `sklearn.model_selection` being imported explicitly in the
# import cell; a bare `import sklearn` may not load the submodule.
splitter = sklearn.model_selection.GroupKFold(n_splits=FOLDS)
x = data['img']
y = data[TARGET]
groups = data['pid']

# Only the first of the FOLDS splits is used; take it directly from the
# generator instead of looping with a manual counter and `break`.
train_indices, test_indices = next(splitter.split(x, y, groups))
train = data.iloc[train_indices]
val = data.iloc[test_indices]

# Sanity checks: the split must partition the data and must not leak any
# patient into both the training and validation sets.
assert len(train) + len(val) == len(data), 'split does not cover all rows'
assert set(train['pid']).isdisjoint(val['pid']), 'patient leakage between train and val'

print(f'len(data)={len(data)}, len(train)={len(train)}, len(val)={len(val)}')

len(data)=32684, len(train)=29410, len(val)=3274


In [68]:
# Transfer learning: use a pretrained EfficientNetB0 as a frozen feature
# extractor. `include_top=False` drops its classification layer and
# `pooling='max'` makes it emit one flat feature vector per image.
pretrained = keras.applications.EfficientNetB0(
    include_top=False,
    input_shape=INPUT_SHAPE,
    pooling='max',
)
pretrained.trainable = False  # freeze the backbone weights

# Initialiser and L2 weight penalty for the hidden dense layer of the head.
kernel_initializer = keras.initializers.he_normal()
kernel_regularizer = keras.regularizers.l2(0.01)

# Width of the hidden dense layer. b0 matches the 1280-wide feature vector
# reported for EfficientNetB0 in the model summary.
# NOTE(review): b2..b7 look like the feature widths of the larger
# EfficientNet variants and are unused in this cell -- confirm before relying
# on them.
b0 = 1280
b2 = 1408
b3 = 1536
b5 = 2048
b6 = 2304
b7 = 2560

# Regression head on top of the frozen backbone:
# batch norm -> dense(relu) -> dropout -> one linear unit per target column.
model = keras.models.Sequential([
    pretrained,
    keras.layers.BatchNormalization(),
    keras.layers.Dense(
        b0,
        activation="relu",
        kernel_initializer=kernel_initializer,
        kernel_regularizer=kernel_regularizer,
    ),
    keras.layers.Dropout(DROPOUT),
    keras.layers.Dense(len(TARGET), name='output'),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficientnetb0 (Functional)  (None, 1280)              4049571   
_________________________________________________________________
batch_normalization_4 (Batch (None, 1280)              5120      
_________________________________________________________________
dense_4 (Dense)              (None, 1280)              1639680   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1280)              0         
_________________________________________________________________
output (Dense)               (None, 3)                 3843      
Total params: 5,698,214
Trainable params: 1,646,083
Non-trainable params: 4,052,131
_________________________________________________________________


In [69]:
# Patience values, in epochs without `val_loss` improvement.
# Early stopping must wait longer than the LR scheduler: with equal patience
# both callbacks fire in the same epoch, so training stops before the reduced
# learning rate is ever used.
LR_PATIENCE = 2
EARLY_STOPPING_PATIENCE = 5

callbacks = [
    # Multiply the learning rate by 0.7 when val_loss stops improving.
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=LR_PATIENCE, verbose=1, factor=0.7),
    # Stop training if val_loss still has not improved after the LR reductions.
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=EARLY_STOPPING_PATIENCE),
    # Keep only the checkpoint with the best val_loss seen so far.
    keras.callbacks.ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)
]
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# Optimise mean squared logarithmic error; RMSE is tracked as a metric only.
loss = keras.losses.MeanSquaredLogarithmicError()
rmse = keras.metrics.RootMeanSquaredError()
model.compile(loss=loss, optimizer=optimizer, metrics=[rmse])

In [70]:
# Build the training and validation image generators.
# Images are resized to the network's spatial input size (height, width).
target_size = INPUT_SHAPE[:2]

# No augmentation or rescaling is configured on the generator.
idg = keras.preprocessing.image.ImageDataGenerator()

# Arguments shared by both generators.
# class_mode='raw' yields the TARGET columns as a single (batch, len(TARGET))
# array, which matches the model's single multi-unit output layer.
# 'multi_output' would instead yield a list of per-column arrays, which is
# meant for models with one output layer per target.
common_flow_args = dict(
    x_col='img',
    y_col=TARGET,
    directory=DATA,
    target_size=target_size,
    color_mode='rgb',
    batch_size=BATCH_SIZE,
    class_mode='raw',
)

# Training batches are reshuffled; validation order is kept fixed so that
# predictions line up with the rows of `val`.
train_generator = idg.flow_from_dataframe(dataframe=train, shuffle=True, **common_flow_args)
validation_generator = idg.flow_from_dataframe(dataframe=val, shuffle=False, **common_flow_args)

Found 29410 validated image filenames.
Found 3274 validated image filenames.


In [None]:
# Train the model on the training generator, evaluating on the held-out fold
# after every epoch; `history` keeps the returned training history object.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=EPOCHS,
    callbacks=callbacks,
)

Epoch 1/2
 10/920 [..............................] - ETA: 22:27 - loss: 65.0158 - root_mean_squared_error: 2888.8076