In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

%config Completer.use_jedi = False

In [None]:
# Print the versions of the core libraries used by this notebook
for library in (np, pd, tf):
    print(library.__version__)

In [None]:
# Paths of the original competition dataset
INPUT_BASE = '../input/'
INPUT_DATA_PATH = f'{INPUT_BASE}happy-whale-and-dolphin/'
TRAIN_IMAGES_PATH = f'{INPUT_DATA_PATH}train_images/'
TEST_IMAGES_PATH = f'{INPUT_DATA_PATH}test_images/'

# Path of the dataset that stores the crop boxes used for trimming
CROPPED_DATA_PATH = f'{INPUT_BASE}cropped-dataset/'

# Output directories
OUTPUT_DIR_PATH = '/kaggle/working/'
OUTPUT_TRAIN_IMAGES_PATH = f'{OUTPUT_DIR_PATH}train_images/'
OUTPUT_TEST_IMAGES_PATH = f'{OUTPUT_DIR_PATH}test_images/'

# Output file names (all written below OUTPUT_DIR_PATH)
#COPY_SAMPLE_CSV = 'train_sample.csv'
TRAIN_LABEL_CSV = 'train_label_box.csv'
TEST_LABEL_CSV = 'test_label_box.csv'
LABEL_MASTER_CSV = 'label_master.csv'

TRAIN_IMAGES_TFREC = 'train_images.tfrec'
TEST_IMAGES_TFREC = 'test_images.tfrec'

# Number of training images used for this trial run
# (the training label CSV is truncated to this many rows further down)
IMAGE_COUNT = 10

In [None]:
# Write the training CSV extended with label-encoded targets and crop boxes
def write_label_encode_train_csv():
    """Create ``TRAIN_LABEL_CSV`` from the competition's ``train.csv``.

    Adds three columns to the rows of ``train.csv``:

    * ``encoded_individual_id`` - label-encoded ``individual_id``
    * ``encoded_species``       - label-encoded ``species``
    * ``box``                   - the ``box`` column of ``train-cropped-dataset.csv``

    The result is written to ``OUTPUT_DIR_PATH`` without the pandas index, so
    re-reading the file does not produce a stray ``Unnamed: 0`` column.
    """
    df_train = pd.read_csv(f'{INPUT_DATA_PATH}train.csv')

    # These columns are targets, not explanatory variables, so plain label
    # encoding is sufficient. Each column gets its own encoder so that no
    # encoder is silently refit on unrelated data.
    for column in ('individual_id', 'species'):
        df_train[f'encoded_{column}'] = LabelEncoder().fit_transform(df_train[column])

    # Attach the crop boxes used for trimming.
    # NOTE(review): rows are matched by position/index, which assumes the cropped
    # CSV lists images in the same order as train.csv - confirm, or merge on the
    # image name if that file has such a column.
    df_train_cropped_info = pd.read_csv(f'{CROPPED_DATA_PATH}train-cropped-dataset.csv')
    df_train['box'] = df_train_cropped_info['box']

    # Write the CSV that carries the encoded labels
    df_train.to_csv(f'{OUTPUT_DIR_PATH}{TRAIN_LABEL_CSV}', index=False)

write_label_encode_train_csv()

In [None]:
# Write a CSV that lists the test images together with their crop boxes
def write_test_box_csv():
    """Create ``TEST_LABEL_CSV`` from ``sample_submission.csv``.

    The sample submission lists every test image, so it is used as the base.
    Its ``predictions`` column is dropped and the ``box`` column of
    ``test-cropped-dataset.csv`` is attached. The result is written to
    ``OUTPUT_DIR_PATH`` without the pandas index.
    """
    df_test = pd.read_csv(f'{INPUT_DATA_PATH}sample_submission.csv')
    df_test = df_test.drop(columns='predictions')

    # Attach the crop boxes used for trimming to the test rows.
    # NOTE(review): rows are matched by position/index, which assumes the cropped
    # CSV lists images in the same order as sample_submission.csv - confirm.
    df_test_cropped_info = pd.read_csv(f'{CROPPED_DATA_PATH}test-cropped-dataset.csv')
    df_test['box'] = df_test_cropped_info['box']

    print(df_test.head())

    # Write the test image / box CSV
    df_test.to_csv(f'{OUTPUT_DIR_PATH}{TEST_LABEL_CSV}', index=False)

write_test_box_csv()

In [None]:
# Create the label master CSV (label_master.csv)
def create_label_master():
    """Write the distinct (encoded id, individual_id) pairs of the training CSV.

    The pairs are obtained by grouping ``TRAIN_LABEL_CSV`` on both columns;
    the helper ``size`` column produced by the grouping is discarded and
    ``encoded_individual_id`` is renamed to ``index`` before writing.
    """
    label_pairs = (
        pd.read_csv(f'{OUTPUT_DIR_PATH}{TRAIN_LABEL_CSV}')
        .groupby(['encoded_individual_id', 'individual_id'], as_index=False)
        .size()
        .drop(columns='size')  # only the distinct key pairs are needed
        .rename(columns={'encoded_individual_id': 'index'})
    )
    label_pairs.to_csv(f'{OUTPUT_DIR_PATH}{LABEL_MASTER_CSV}', index=False)

create_label_master()

In [None]:
## (Provisional) Keep only the first IMAGE_COUNT training rows (trial-run logic).
# This overwrites the full label CSV in place, so it has to run after
# create_label_master(), which reads the complete file.
# index=False keeps re-runs from piling up "Unnamed: N" index columns.
df_train_label = pd.read_csv(f'{OUTPUT_DIR_PATH}{TRAIN_LABEL_CSV}')
df_train_label.head(IMAGE_COUNT).to_csv(f'{OUTPUT_DIR_PATH}{TRAIN_LABEL_CSV}', index=False)

In [None]:
## Copy the selected training images into the working directory
def copy_train_images():
    """Copy every image listed in ``TRAIN_LABEL_CSV`` to ``OUTPUT_TRAIN_IMAGES_PATH``.

    The destination directory is removed first if it exists and then created
    again, so it only ever contains the images listed in the current CSV.
    """
    import os
    import shutil

    # Start from an empty working directory for the trial images
    if os.path.exists(OUTPUT_TRAIN_IMAGES_PATH):
        shutil.rmtree(OUTPUT_TRAIN_IMAGES_PATH)
    os.makedirs(OUTPUT_TRAIN_IMAGES_PATH)

    # Image names come from the training label CSV
    train_df = pd.read_csv(f'{OUTPUT_DIR_PATH}{TRAIN_LABEL_CSV}')

    # Copy each training image into the working directory
    for image_name in train_df['image'].to_list():
        shutil.copy(f'{TRAIN_IMAGES_PATH}{image_name}', f'{OUTPUT_TRAIN_IMAGES_PATH}{image_name}')

copy_train_images()

In [None]:
# Constants used by the preprocessing / input pipeline
# AUTO is passed as the parallelism / prefetch buffer setting of the tf.data calls below
AUTO = tf.data.experimental.AUTOTUNE
# Images are resized to IMAGE_SIZE x IMAGE_SIZE pixels
IMAGE_SIZE = 512

In [None]:
# # https://www.kaggle.com/lextoumbourou/happywhale-tfrecords-with-bounding-boxes
# def read_bbox(bbox):
#     return np.array([int(i) for i in bbox.split()])

def _bytes_feature(value):
    """Wrap a string / bytes value in a ``tf.train.Feature`` holding a BytesList."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList cannot unpack a string from an EagerTensor, so take its numpy value.
    raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_value]))

# def _float_feature(value):
#   """Returns a float_list from a float / double."""
#   return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` holding an Int64List."""
    single_value_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=single_value_list)

def _bb_feature(bb):
    """Wrap a sequence of ints (a bounding box) in a ``tf.train.Feature`` holding an Int64List.

    Unlike ``_int64_feature`` the argument is passed through as the whole
    value list rather than being wrapped in a one-element list.
    """
    box_values = tf.train.Int64List(value=bb)
    return tf.train.Feature(int64_list=box_values)

# Serialize one image (builds a record holding the image plus its labels)
def serialize_image(image, image_name, target, species, box):
    """Serialize one image and its metadata as a ``tf.train.Example`` byte string.

    The record holds the features ``image`` and ``image_name`` (bytes),
    ``target`` and ``species`` (single int64) and ``box`` (int64 list).
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'image_name': _bytes_feature(image_name),
        'target': _int64_feature(target),
        'species': _int64_feature(species),
        'box': _bb_feature(box),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Convert the training images into a TFRecord file
def train_image_to_tfrec():
    """Write every row of ``TRAIN_LABEL_CSV`` as one record of ``TRAIN_IMAGES_TFREC``.

    For each row the image file is read from ``OUTPUT_TRAIN_IMAGES_PATH`` and
    serialized together with its encoded individual id, encoded species and
    crop box. Rows without a box get the sentinel box ``[-1, -1, -1, -1]``.
    """
    tfr_filename = f'{OUTPUT_DIR_PATH}{TRAIN_IMAGES_TFREC}'
    train_df = pd.read_csv(f'{OUTPUT_DIR_PATH}{TRAIN_LABEL_CSV}')

    with tf.io.TFRecordWriter(tfr_filename) as writer:
        for _, row in train_df.iterrows():
            image_id = row['image']
            # Target label (already label-encoded); int() guarantees a plain Python int
            target = int(row['encoded_individual_id'])
            # Dolphin / whale species (already label-encoded)
            species = int(row['encoded_species'])

            # Read the still-encoded image bytes
            image_encoded = tf.io.read_file(f'{OUTPUT_TRAIN_IMAGES_PATH}{image_id}')
            image_name = str.encode(image_id)

            # A missing box is read back from the CSV as NaN; pd.isna detects it
            # regardless of the concrete float type.
            raw_box = row['box']
            if pd.isna(raw_box):
                box = [-1, -1, -1, -1]
            else:
                box = [int(b) for b in raw_box.split()]

            writer.write(serialize_image(image_encoded, image_name, target, species, box))

train_image_to_tfrec()

In [None]:
# Convert the test images into a TFRecord file
def test_image_to_tfrec(image_count=10):
    """Write the first ``image_count`` rows of ``TEST_LABEL_CSV`` to ``TEST_IMAGES_TFREC``.

    Args:
        image_count: number of test images to convert, taken from the top of
            the CSV. Defaults to 10, the value that used to be hard-coded.

    The true labels of test images are unknown, so ``target`` and ``species``
    are stored as the fixed value -1. Rows without a box get the sentinel box
    ``[-1, -1, -1, -1]``.
    """
    tfr_filename = f'{OUTPUT_DIR_PATH}{TEST_IMAGES_TFREC}'
    test_df = pd.read_csv(f'{OUTPUT_DIR_PATH}{TEST_LABEL_CSV}').head(image_count)

    # Unknown for test data, so fixed placeholder values are stored
    target = -1
    species = -1

    with tf.io.TFRecordWriter(tfr_filename) as writer:
        for _, row in test_df.iterrows():
            image_id = row['image']

            # Read the still-encoded image bytes
            image_encoded = tf.io.read_file(f'{TEST_IMAGES_PATH}{image_id}')
            image_name = str.encode(image_id)

            # A missing box is read back from the CSV as NaN; pd.isna detects it
            # regardless of the concrete float type.
            raw_box = row['box']
            if pd.isna(raw_box):
                box = [-1, -1, -1, -1]
            else:
                box = [int(b) for b in raw_box.split()]

            writer.write(serialize_image(image_encoded, image_name, target, species, box))

test_image_to_tfrec()

In [None]:
# Encoded label values of the label master as a NumPy array
# (compared against each record's target when the TFRecords are parsed)
label_master = pd.read_csv(f'{OUTPUT_DIR_PATH}{LABEL_MASTER_CSV}')['index'].values

In [None]:
# Decode an image (preprocessing: crop + resize + normalize)
def decode_image(image_data, box):
    """Decode JPEG bytes into a float32 image of size IMAGE_SIZE x IMAGE_SIZE.

    Args:
        image_data: encoded JPEG bytes of one image.
        box: four values unpacked as (left, top, right, bottom), or a box whose
            first value is -1 (the sentinel written when no box is available).

    Returns:
        The image with 3 channels, resized to [IMAGE_SIZE, IMAGE_SIZE] and
        divided by 255.
    """
    # Crop the region given by the box out of the image while decoding
    if box is not None and box[0] != -1:
        left, top, right, bottom = box[0], box[1], box[2], box[3]
        # Crop window is passed as [y, x, height, width]
        # (see the tf.io.decode_and_crop_jpeg documentation)
        bbs = tf.convert_to_tensor([top, left, bottom - top, right - left])
        image = tf.io.decode_and_crop_jpeg(image_data, bbs, channels=3)
    else:
        # No usable box: decode the full image
        image = tf.image.decode_jpeg(image_data, channels=3)

    # Resize
    image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])
    # Normalize
    image = tf.cast(image, tf.float32) / 255.0
    return image


# Parse one TFRecord entry and return its contents
def read_labeled_tfrecord(data):
    """Parse a serialized example into ``(image_name, image, label_group)``.

    ``image`` is the decoded, cropped and resized picture; ``label_group`` is
    the element-wise comparison of the module-level ``label_master`` array
    with the record's ``target``.
    """
    feature_spec = {
        "image_name": tf.io.FixedLenFeature([], tf.string),
        "image": tf.io.FixedLenFeature([], tf.string),
        "target": tf.io.FixedLenFeature([], tf.int64),
        'box': tf.io.FixedLenFeature([4], tf.int64),
    }
    example = tf.io.parse_single_example(data, feature_spec)

    crop_box = tf.cast(example['box'], tf.int32)
    image = decode_image(example['image'], crop_box)

    # True at the position(s) where the label master equals this record's target
    target = tf.cast(example['target'], tf.int32)
    label_group = (label_master == target)

    return example['image_name'], image, label_group


# Load image TFRecord file(s) as a parsed dataset
def load_dataset(filenames, ordered = False):
    """Return a dataset of ``read_labeled_tfrecord`` results for ``filenames``.

    When ``ordered`` is false the dataset options disable deterministic
    ordering; otherwise the default options are used.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        .with_options(options)
        .map(read_labeled_tfrecord, num_parallel_calls=AUTO)
    )


# Load image TFRecord file(s), apply the required preprocessing and expose them as a tf.data dataset.
def get_training_dataset(filenames, train_flg=True, batch_size=2):
    """Build the batched ``(image, label_group)`` dataset for training or inference.

    Args:
        filenames: TFRecord file name(s) to read.
        train_flg: True for training (the dataset repeats indefinitely and
            element order is not guaranteed), False for inference (single
            pass, original record order preserved).
        batch_size: number of elements per batch. Defaults to 2, the value
            that used to be hard-coded.

    Returns:
        A batched, prefetched dataset of ``(image, label_group)`` pairs.
    """
    # For inference the predictions are matched to the input rows by position,
    # so the record order must be preserved; only training may be unordered.
    dataset = load_dataset(filenames, ordered=not train_flg)
    # The image name is not needed by the model
    dataset = dataset.map(lambda posting_id, image, label_group: (image, label_group))
    if train_flg:
        dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=AUTO)
    return dataset

In [None]:
# Read the training images (TFRecord file) and expose them as a tf.data dataset
train_ds = get_training_dataset(f'{OUTPUT_DIR_PATH}{TRAIN_IMAGES_TFREC}')

In [None]:
# Peek at the label part of the first 5 training batches
for _, label_group in train_ds.take(5):
    print(label_group)

In [None]:
# Define the VGG16 base model: ImageNet weights, without the classification top
IMG_SHAPE = (IMAGE_SIZE, IMAGE_SIZE, 3)
VGG16_MODEL = tf.keras.applications.VGG16(
    input_shape=IMG_SHAPE,
    include_top=False,
    weights='imagenet'
)

In [None]:
# Number of target labels = number of rows in the label master
df_label_master = pd.read_csv(f'{OUTPUT_DIR_PATH}{LABEL_MASTER_CSV}')
label_count = len(df_label_master)

# Freeze the VGG16 base and prepare the output layers:
# global average pooling followed by a softmax layer with one unit per label
VGG16_MODEL.trainable = False
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
prediction_layer = tf.keras.layers.Dense(label_count, activation='softmax')

In [None]:
# Define the training model: VGG16 base + pooling + softmax output
model = tf.keras.Sequential([
    VGG16_MODEL,
    global_average_layer,
    prediction_layer
])

In [None]:
# Compile with the Adam optimizer and categorical cross-entropy loss; track accuracy
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['acc']
)

In [None]:
# Print the model summary
model.summary()

In [None]:
# Train on the training images.
# train_ds repeats indefinitely, so steps_per_epoch defines the length of an epoch.
history = model.fit(
    train_ds,
    epochs=5,
    steps_per_epoch=5
#     validation_steps=validation_steps,
#     validation_data=test_ds
)

In [None]:
# Save the model
model.save(f'{OUTPUT_DIR_PATH}model_vgg16')

In [None]:
# Read the test images (TFRecord file) and expose them as a tf.data dataset
# (train_flg=False: single pass, no repeat)
test_ds = get_training_dataset(f'{OUTPUT_DIR_PATH}{TEST_IMAGES_TFREC}', train_flg=False)

In [None]:
# Predict on the test dataset; the result is indexed per test image further down
pred = model.predict(test_ds)

In [None]:
# Sanity check on the first test image: positions of its 5 most probable classes.
# argsort is ascending, so reverse it and keep the first 5 (most probable first).
# (np.argpartition(p, 5)[:5] would return the 5 *least* probable classes.)
preds = np.argsort(pred[0])[::-1][:5]
print(preds.shape)

label_master2 = pd.read_csv(f'{OUTPUT_DIR_PATH}{LABEL_MASTER_CSV}')
label_master2.head()

In [None]:
# Base of the submission: the first 10 rows of the sample submission.
# NOTE(review): the row count should match the number of test images written
# to the test TFRecord file - confirm when changing either.
test_df = pd.read_csv(f'{INPUT_DATA_PATH}sample_submission.csv')
test_df = test_df.head(10)
test_df

In [None]:
# Build the submission: for every predicted test image, list the individual_ids
# of its 5 most probable classes (most probable first) in the "predictions" column.
# Assumes the row position in label_master2 equals the class position of the model
# output (the training targets are built against the same label master file).
individual_ids = label_master2['individual_id'].to_numpy()

top5_predictions = []
for probabilities in pred:
    # argsort is ascending: reverse it and keep the first 5 -> highest probability first
    top5_positions = np.argsort(probabilities)[::-1][:5]
    top5_predictions.append(' '.join(individual_ids[top5_positions]))

# Work on an explicit copy and assign by position in one step; this avoids the
# chained assignment df[col].iloc[i] = value, which may silently write to a copy.
test_df = test_df.copy()
test_df.iloc[:len(top5_predictions), test_df.columns.get_loc('predictions')] = top5_predictions

# File name fixed from the misspelled 'submmission.csv'
test_df.to_csv(f'{OUTPUT_DIR_PATH}submission.csv', index=False)