# Kaggle playground: cat-in-the-dat-ii Resnet50

## Run Settings

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below /kaggle/input, then print
# them one per line (nothing is printed when the directory does not exist).
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
# ---------------------------------------------------------------------------
# Train data: regenerate the numeric array from the CSV (True) or load a
# previously saved array (False).
# ---------------------------------------------------------------------------
TRAIN_MAP_GEN = True
TRAIN_DATA_NAME = "train_data_ary.npy"
TRAIN_DATA_PATH = "../input/workingdata/" + TRAIN_DATA_NAME

# ---------------------------------------------------------------------------
# Test data: same switch / file naming scheme as the train data.
# ---------------------------------------------------------------------------
TEST_MAP_GEN = True
TEST_DATA_NAME = "test_data_ary.npy"
TEST_DATA_PATH = "../input/workingdata/" + TEST_DATA_NAME

# ---------------------------------------------------------------------------
# Train the model or load trained parameters, and the artifact file names.
# ---------------------------------------------------------------------------
TRAIN_ENB = True
MODEL_FILE = "model.yaml"
WEIGHTS_FILE = "resnet50.h5"
HISTORY_FILE = "history_dict"

# ---------------------------------------------------------------------------
# Fit settings
# ---------------------------------------------------------------------------
EPOCHS = 100
STEPS_PER_EPOCH = 100
POS_DATA_PER_STEP = 1000   # positive samples drawn per generator step
NEG_DATA_PER_STEP = 1000   # negative samples drawn per generator step
LR = 1e-2
POS_COEF = 2.0             # used only by the commented-out weighted loss
NEG_COEF = 1.0             # used only by the commented-out weighted loss

# ---------------------------------------------------------------------------
# Random seeds.
# The batch generator draws indices with `random.sample` AND shuffles with
# `np.random.permutation`, so both generators must be seeded for the batches
# to be reproducible.
# ---------------------------------------------------------------------------
import random
import numpy as np

SEED = 1
random.seed(SEED)
np.random.seed(SEED)


## data pre process

In [None]:
##########
# Read the competition CSV files into pandas DataFrames
##########
import pandas as pd

train = pd.read_csv("../input/cat-in-the-dat-ii/train.csv")
test = pd.read_csv("../input/cat-in-the-dat-ii/test.csv")
sample_submission = pd.read_csv("../input/cat-in-the-dat-ii/sample_submission.csv")

# Show the training frame for a quick visual check
display(train)

In [None]:
# Class balance check (no down-sampling happens in this cell):
# show the shape of the target == 0 rows, then of the target == 1 rows.
display(train.loc[train["target"] == 0].shape,
        train.loc[train["target"] == 1].shape)


In [None]:
##########
# load train data
##########
from IPython.display import display
import pandas as pd
import numpy as np

# Column layout used by the slices below:
#   column 0      -> id
#   columns 1..23 -> features
#   column 24     -> target
train_id = np.array(train.iloc[:, 0].astype('int32'))
train_labels = np.array(train.iloc[:, 24].astype('int32'))
train_data = train.iloc[:, 1:24]

# Quick look at the labels and their min / max / mean
display(train_labels)
display(train_labels.min(), train_labels.max(), train_labels.mean())

In [None]:
##########
# load test data
##########
# Column 0 is the id; columns 1..23 are kept as features (same feature slice
# as the training frame).
# NOTE(review): the original comment mentioned rejecting a target column, but
# this slice only drops column 0 - presumably test.csv has no target; verify.
test_id = np.array(test.iloc[:, 0].astype('int32'))
test_data = test.iloc[:, 1:24]


In [None]:
##########
# convert string to num mapping array
##########
from keras.preprocessing.text import Tokenizer
import numpy as np

##############
# Convert string columns to arbitrary integer ids.
# Missing values in float columns are replaced by -1.
# (String columns are passed through astype('str') by the caller, so a
#  missing value there is tokenized like any other string.)
##############
word_file = 'word_index.txt'


def to_int_ary(str_ary):
    """Map every string of one column to an integer id.

    A fresh Keras ``Tokenizer`` (splitting on newlines only) is fitted on
    ``str_ary`` and its ``word_index`` mapping is appended to ``word_file``
    for later inspection.

    Args:
        str_ary: iterable of strings (one entry per row).

    Returns:
        1-D numpy array with one integer id per entry of ``str_ary``.

    Raises:
        ValueError: if some entry does not tokenize to exactly one id
            (e.g. it is empty after the tokenizer's filtering), because the
            flattened result would no longer line up with the input rows.
    """
    tokenizer = Tokenizer(split="\n")
    tokenizer.fit_on_texts(str_ary)  # build the string -> id mapping

    with open(word_file, 'a') as f:
        print(tokenizer.word_index, file=f)

    seq = tokenizer.texts_to_sequences(str_ary)  # strings -> lists of ids

    # The flatten below assumes exactly one id per row; fail loudly otherwise
    # instead of silently returning a misaligned / ragged array.
    bad_row = next((i for i, tokens in enumerate(seq) if len(tokens) != 1), None)
    if bad_row is not None:
        raise ValueError(
            "row %d tokenized to %d ids, expected exactly 1"
            % (bad_row, len(seq[bad_row])))

    return np.array(seq).flatten()


def num_map_gen(data):
    """Convert a DataFrame of categorical features to a numeric 2-D array.

    ``word_file`` is truncated and then receives, per column, the column name
    followed (for string columns) by the tokenizer mapping written by
    ``to_int_ary``.

    Args:
        data: pandas DataFrame whose columns are either ``object`` (strings)
            or ``float64``.

    Returns:
        numpy array of shape (number of rows, number of columns).

    Raises:
        Exception: if a column has a dtype other than ``object``/``float64``.
    """
    with open(word_file, 'w') as f:
        print("# word index file", file=f)

    data_ary = []
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # long-standing equivalent and yields the same (name, Series) pairs.
    for column_name, item in data.items():
        # Re-open per column: to_int_ary appends to the same file with its
        # own handle, so keeping one handle open here could reorder output.
        with open(word_file, 'a') as f:
            print(column_name, file=f)

        if item.dtype == 'object':
            data_ary.append(to_int_ary(item.astype('str')))
        elif item.dtype == 'float64':
            data_ary.append(item.fillna(-1).astype('float64'))
        else:
            raise Exception("error, data type is unknown. %s" % item.dtype)

    # (columns, N) => (N, columns): one row of features per sample
    return np.array(data_ary).transpose(1, 0)

## Select Map Data

In [None]:
# Build the numeric training array, or load a previously generated one.
if TRAIN_MAP_GEN:
    # Regenerate from the CSV and keep a copy in the working directory.
    train_data_ary = num_map_gen(train_data)
    np.save(TRAIN_DATA_NAME, train_data_ary)
else:
    # Load from the input dataset path, matching how the test array is
    # loaded via TEST_DATA_PATH. The bare file name would only exist in the
    # working directory if it had been generated during this same session.
    train_data_ary = np.load(TRAIN_DATA_PATH)


In [None]:
def separate_pn(data, labels, ids):
    """Split samples into a positive (label 1) and a negative (label 0) group.

    Args:
        data: array indexed by sample along its first axis.
        labels: 1-D array of labels; compared against 1 and 0.
        ids: 1-D array of sample ids, aligned with ``labels``.

    Returns:
        ``[[pos_ids, pos_data, pos_labels], [neg_ids, neg_data, neg_labels]]``
    """
    def _subset(mask):
        # Apply one boolean mask to ids, data and labels alike
        return [ids[mask], data[mask], labels[mask]]

    return [_subset(labels == 1), _subset(labels == 0)]


In [None]:
# Split the encoded training samples into the label-1 and label-0 groups
pos_set, neg_set = separate_pn(data=train_data_ary,
                               labels=train_labels,
                               ids=train_id)

## Model Gen

In [None]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.utils import to_categorical
from keras.layers import Input, Dense
from keras.models import Model
from keras.layers import GlobalAveragePooling2D

# Per-sample input shape: a 32 x 32 grid with a single channel.
# (The batch dimension is not part of this tuple.)
input_shape = (32, 32, 1)
input_layer = Input(input_shape)

# ResNet50 backbone trained from scratch (weights=None) without its
# classification head. `classes` is not passed: per the Keras documentation
# it is only meaningful when include_top=True.
from keras.applications.resnet50 import ResNet50
resnet50_model = ResNet50(include_top=False,
                          weights=None,
                          input_tensor=input_layer,
                          input_shape=input_shape,
                          pooling=None)

# Head: global average pooling followed by a single sigmoid unit, giving one
# value in (0, 1) per sample for the binary target.
x = resnet50_model.output
x = GlobalAveragePooling2D()(x)
resnet50_prediction = Dense(1, activation='sigmoid')(x)

prediction = resnet50_prediction


## fit data proc

In [None]:
# (N, F) => (N, size, size, 1), e.g. (N, 23) => (N, 32, 32, 1)
def data_padding(data, size=32):
    """Embed each feature row into the first row of a zero-filled square grid.

    For every sample the features are written to ``out[i, 0, :F, 0]``; all
    other cells are 0.0. Rows longer than ``size`` are truncated at the end.
    For a rectangular 2-D input this yields the same values as the earlier
    two-step ``pad_sequences`` implementation, but in a single vectorised
    numpy assignment instead of per-row Python loops.

    Args:
        data: rectangular 2-D array-like of shape (N, F).
        size: side length of the square grid (default 32, matching the
            model input used in this notebook).

    Returns:
        float64 numpy array of shape (N, size, size, 1).

    Raises:
        ValueError: if ``data`` is not 2-dimensional.
    """
    data = np.asarray(data, dtype="float64")
    if data.ndim != 2:
        raise ValueError("data must be 2-D (N, F), got shape %s" % (data.shape,))

    n_samples, n_features = data.shape
    width = min(n_features, size)  # "post" truncation: keep the leading columns

    padded = np.zeros((n_samples, size, size, 1), dtype="float64")
    padded[:, 0, :width, 0] = data[:, :width]
    return padded



In [None]:
# Down-sampling: draw a fixed number of positive and negative samples
def data_shuffle(pos_set, neg_set, pos_per_step=None, neg_per_step=None):
    """Draw a random, shuffled mini-batch from the positive and negative sets.

    Args:
        pos_set: ``[ids, data, labels]`` arrays of the positive samples.
        neg_set: ``[ids, data, labels]`` arrays of the negative samples.
        pos_per_step: number of positive samples to draw; defaults to the
            module-level ``POS_DATA_PER_STEP``.
        neg_per_step: number of negative samples to draw; defaults to the
            module-level ``NEG_DATA_PER_STEP``.

    Returns:
        ``[ids, data, labels]`` of the drawn samples, jointly permuted.

    Raises:
        ValueError: (from ``random.sample``) if more samples are requested
            than a set contains.
    """
    if pos_per_step is None:
        pos_per_step = POS_DATA_PER_STEP
    if neg_per_step is None:
        neg_per_step = NEG_DATA_PER_STEP

    pos_ids, pos_data, pos_labels = pos_set[0], pos_set[1], pos_set[2]
    neg_ids, neg_data, neg_labels = neg_set[0], neg_set[1], neg_set[2]

    # Sample indices without replacement. Sampling straight from `range`
    # avoids materialising a full index list on every call and yields the
    # same draw for the same random state.
    pos_index = random.sample(range(len(pos_labels)), pos_per_step)
    neg_index = random.sample(range(len(neg_labels)), neg_per_step)

    # Positives first, then negatives ...
    selected_ids = np.concatenate([pos_ids[pos_index], neg_ids[neg_index]])
    selected_data = np.concatenate([pos_data[pos_index], neg_data[neg_index]])
    selected_labels = np.concatenate([pos_labels[pos_index], neg_labels[neg_index]])

    # ... then mix them with one shared permutation so ids / data / labels
    # stay aligned.
    p = np.random.permutation(len(selected_labels))
    return [selected_ids[p], selected_data[p], selected_labels[p]]

In [None]:
# pos_set / neg_set layout: [0] ids, [1] data, [2] labels
def data_callback(pos_set, neg_set, dc_call_num):
    """Endless batch generator for training.

    Every iteration draws a shuffled batch via ``data_shuffle``, pads it via
    ``data_padding`` and yields ``(padded_data, labels)``.

    ``dc_call_num`` is a one-element list used as a mutable batch counter.
    While the counter is below 5, the first row of the first three padded
    samples plus the first nine labels and ids are dumped to CSV files for
    inspection.
    """
    dump_prefixes = (('pad_data0_', 0), ('pad_data1_', 1), ('pad_data2_', 2))

    while True:
        ids, data, labels = data_shuffle(pos_set, neg_set)
        pad_data = data_padding(data)

        call_index = dc_call_num[0]
        if call_index < 5:
            tail = str(call_index) + '.csv'
            for prefix, sample_no in dump_prefixes:
                np.savetxt(prefix + tail, pad_data[sample_no][0])
            np.savetxt('pad_labels_' + tail, labels[0:9])
            np.savetxt('pad_ids_' + tail, ids[0:9])

        dc_call_num[0] = call_index + 1

        yield pad_data, labels

## Loss Function

In [None]:
from keras import backend as K

# -(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))
def binary_cost_cross_entropy(y_true, y_pred):
    """Mean binary cross-entropy between ``y_true`` and ``y_pred``.

    Predictions are clipped to ``[eps, 1 - eps]`` (``eps = K.epsilon()``)
    before the logarithms are taken, so a saturated prediction of exactly 0
    or 1 gives a large finite loss instead of inf / NaN.

    Args:
        y_true: tensor of target values (0 or 1).
        y_pred: tensor of predicted probabilities.

    Returns:
        Scalar tensor: the mean loss over all elements.
    """
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1.0 - eps)

    # Class-weighted variant, kept disabled:
    # L = - (y_true * K.log(y_pred) * POS_COEF + (1 - y_true) * K.log(1-y_pred) * NEG_COEF)
    L = -(y_true * K.log(y_pred) + (1 - y_true) * K.log(1 - y_pred))

    return K.mean(L)

## Learning

In [None]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.utils import to_categorical

# Assemble the full model from the input layer and the sigmoid head
model = Model(inputs=input_layer, outputs=prediction)
# summary() prints the layer table itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
model.summary()

from keras.optimizers import Adam
model.compile(optimizer=Adam(lr=LR),
              # loss=binary_cost_cross_entropy,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# One-element list: a mutable batch counter shared with the generator
dc_call_num = [0]
# Each step trains on POS_DATA_PER_STEP + NEG_DATA_PER_STEP freshly sampled
# rows; one epoch consists of STEPS_PER_EPOCH such steps.
history = model.fit_generator(data_callback(pos_set, neg_set, dc_call_num),
                              steps_per_epoch=STEPS_PER_EPOCH, epochs=EPOCHS)


## Save

In [None]:
import pickle

# Architecture -> YAML text file
with open(MODEL_FILE, "w") as model_out:
    model_out.write(model.to_yaml())

# Trained weights
model.save_weights(WEIGHTS_FILE)

# Per-epoch training metrics (the plain dict held by the History object)
with open(HISTORY_FILE, 'wb') as history_out:
    pickle.dump(history.history, history_out)

## History Plot

In [None]:
# Per-epoch metrics recorded during training
hist = history.history

import matplotlib.pyplot as plt
fig, (axL, axR) = plt.subplots(ncols=2, figsize=(10,4))

# Left panel: training loss per epoch
axL.plot(hist['loss'],label="loss for training")
axL.set_title('model loss')
axL.set_xlabel('epoch')
axL.set_ylabel('loss')
axL.legend(loc='upper right')

# Right panel: training accuracy per epoch
# (the legend label previously read "loss for training" by copy-paste)
axR.plot(hist['accuracy'],label="accuracy for training")
axR.set_title('model accuracy')
axR.set_xlabel('epoch')
axR.set_ylabel('accuracy')
axR.legend(loc='upper right')

display(fig)

# Keep a copy of the figure next to the other outputs, then free it
fig.savefig('./history.png')
plt.close()

## Release memory

In [None]:
# Drop every training-only object so its memory can be reclaimed before the
# test data is processed.
del (train, train_data, train_labels, train_id,
     train_data_ary, pos_set, neg_set)

## test

### test data map gen

In [None]:
# Build the numeric test array, or load a previously generated one.
# NOTE(review): num_map_gen fits a fresh Tokenizer per column on every call,
# so the integer ids produced here are not guaranteed to match the ids the
# model saw for the same category during training - TODO confirm / share the
# mapping between train and test.
if TEST_MAP_GEN:
    # num_map_gen already returns an ndarray; wrapping it in np.array() again
    # only created a second full-size copy kept alive under another name.
    test_data_ary = num_map_gen(test_data)
    np.save(TEST_DATA_NAME, test_data_ary)
else:
    test_data_ary = np.load(TEST_DATA_PATH)

### predict

In [None]:
import numpy as np

# Pad the encoded test rows to the model's input shape, then score them
test_data_pad = data_padding(data=test_data_ary)
predict = model.predict(x=test_data_pad)
display(predict)

### create submission file

In [None]:
# One row per test id with the predicted probability as `target`
submission = pd.DataFrame({'id': test_id, 'target': predict.flatten()})
display(submission)
# File name fixed from the misspelled "submision.csv"
submission.to_csv("submission.csv", index=False)

## Debug

In [None]:

# weight取得
# display(model.layers[2].name)
# display(model.layers[2].get_weights()[0]) # [0]: weights, [1]: bias
    
# from keras import backend as K
# l_in = 0
# l_out = 4
# display(model.layers[l_in].name)
# display(model.layers[l_out].name)


# get_layer_output = K.function([model.layers[l_in].input],
#                               [model.layers[l_out].output])
# layer_output = get_layer_output(train_data_pad[0:2])
# display(train_data_pad[0])
# display(layer_output)
# display(model.predict(train_data_pad[0:9]))

In [None]:
# binary
# predict = model.predict(data_padding(train_data_ary))

# categorical
#predict = model.predict(train_data_pad).argmax(axis=1)

In [None]:
# display(predict.shape)
# display(np.min(predict.flatten()))
# display(np.max(predict))
# display(np.mean(predict))

## Result Memo

## Simple fully connected network
* flatten/64 relu/64 relu/1softmax: 0.1872
* flatten/4  relu/4  relu/1softmax: 0.1872
* flatten/256relu/256relu/1softmax: 0.1872
### softmaxは出力の合計が1なので、2値分類では使えない


* flatten/256relu/256relu/1relu   : 0.8128
* flatten/256relu/256relu/1sigmoid: 0.7290
* flatten/256leaky/256leaky/1sig  : 0.7301
### sigmoidは0-1の実数


### binary_crossentropy, sigmoid, dense(1) だとpredict結果が0.xxxになるので、2値分類っぽくならない
### => categorical_crossentropy, softmax, dense(2) のcategoricalで[0,1]を出力し、argmaxで結果を得る
###    => sigmoidも0.5以上以下で分類すればよいだけだった、学習した結果、どの結果も0になりやすいのは学習のさせ方が悪いようだ。
###       データセットが0に偏ったデータになっていることによる影響がありそうだ。