In [153]:
%%time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_datetime64_any_dtype
from pandas.io.json import json_normalize
import json
import random
import os
import glob
import gc
import datetime
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold

import tensorflow as tf
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import BatchNormalization,Activation,Dropout,Dense,Input,Embedding,concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Flatten,Multiply,Add, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D,GRU,LSTM, Bidirectional
from tensorflow.keras.layers import GlobalMaxPool1D,GlobalAveragePooling1D, GlobalMaxPooling1D,SpatialDropout1D
from tensorflow.compat.v1.keras.layers import CuDNNGRU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Attention,LeakyReLU,PReLU,ELU
from tensorflow.keras import backend as K
from tensorflow.keras import initializers,optimizers
from tensorflow.keras import constraints
from tensorflow.keras import regularizers
from tensorflow.keras.layers import InputSpec, Layer
from tensorflow.keras.optimizers import *
from tensorflow.keras.metrics import AUC
from sklearn.metrics import average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from scipy import sparse
from scipy.sparse import hstack, csr_matrix
from sklearn.decomposition import NMF,LatentDirichletAllocation,TruncatedSVD
from gensim.models import Word2Vec,FastText
from gensim import corpora
import warnings 
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib_venn import venn2
%matplotlib inline

# Widen pandas' display limits so wide/long frames and long cell values are
# shown in full, and print floats with 8 digits of precision.
for _option_name, _option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
    ('display.width', 1000),
    ('display.max_colwidth', 1000),
    ("display.precision", 8),
):
    pd.set_option(_option_name, _option_value)

def seed_everything(seed):
    """Seed every random number source used by this notebook.

    Stores the seed in the ``PYTHONHASHSEED`` environment variable and seeds
    Python's ``random`` module, NumPy's global RNG and TensorFlow's global
    seed with the same value.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

def preprocess(train_df,test_df,feats):
    """Clean and standardise the numeric feature columns of train and test.

    Infinite values are turned into NaN and every NaN is then filled with 0.
    A ``StandardScaler`` is fitted on the cleaned training columns ``feats``
    and applied to both frames.

    Args:
        train_df: Training DataFrame containing the columns in ``feats``.
        test_df: Test DataFrame containing the columns in ``feats``.
        feats: List of column names to scale and return.

    Returns:
        Tuple ``(train_scaled, test_scaled)`` of DataFrames restricted to
        ``feats``.
    """
    def _sanitize(frame):
        # replace()/fillna() return new frames, so the caller's data is
        # left untouched.
        return frame.replace([np.inf, -np.inf], np.nan).fillna(0)

    train_clean = _sanitize(train_df)
    test_clean = _sanitize(test_df)

    # Fit on train only; test is transformed with the train statistics.
    scaler = StandardScaler()
    train_clean[feats] = scaler.fit_transform(train_clean[feats])
    test_clean[feats] = scaler.transform(test_clean[feats])

    return train_clean[feats], test_clean[feats]
    
# Fix every RNG seed before any stochastic step runs.
seed = 817
seed_everything(seed)

# Load the CSV tables from the working directory.
train, test, fitting = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'fitting__fixed.csv')
)

# NOTE(review): unpickling can execute arbitrary code; only load a
# 'spec_df.pkl' that was produced locally.
spec_df = pd.read_pickle('spec_df.pkl')

Wall time: 720 ms


In [3]:
# import pathlib
# p_temp = pathlib.Path('spectrum_raw')

# spec = []
# for file in p_temp.iterdir():
#     spec_df = pd.read_csv(file, sep='\t', header=None)[:511]
#     spec_df.columns = ['wavelength', 'intensity']
#     spec_df['spectrum_filename'] = file.stem + '.dat'
#     spec.append(spec_df)

# spec_df = pd.concat(spec, axis=0)
# display(spec_df.shape)
# display(spec_df.head())
# spec_df.to_pickle('spec_df.pkl')

In [142]:
from tqdm import tqdm
def load_dat(spectrum_file, directory=None, n_points=511):
    """Read one spectrum ``.dat`` file and return its signal-power column.

    The file is parsed as a header-less, tab-separated table whose two
    columns are named ``WaveLength`` and ``SignalPower``. The wavelength
    column is not used; only the light intensity is returned.

    Args:
        spectrum_file: File name, relative to ``directory``.
        directory: Folder that contains the file. When ``None`` (default) the
            module-level ``spectrum_dir`` is used, which keeps existing
            ``load_dat(name)`` calls working.
        n_points: Number of leading samples to keep (default 511).

    Returns:
        1-D NumPy array with at most ``n_points`` signal-power values.
    """
    if directory is None:
        # Fall back to the notebook-level setting for backward compatibility.
        directory = spectrum_dir

    # Read the .dat file.
    dat_df = pd.read_csv(os.path.join(directory, spectrum_file),
                         header=None,
                         sep="\t")

    # Rename the columns.
    dat_df.columns = ["WaveLength", "SignalPower"]

    # Wavelength data is not used; keep only the light intensity.
    spectrum = dat_df["SignalPower"][:n_points].values

    return spectrum

def min_max_scale(spectrum):
    """Rescale a spectrum linearly so its minimum is 0 and its maximum is 1.

    The waveform shape itself seems to be what matters, so each spectrum is
    normalised to the 0-1 range independently.

    Args:
        spectrum: Array-like of numeric intensities.

    Returns:
        Float NumPy array of the same shape. A constant (flat) spectrum has
        no range to scale by and is returned as all zeros instead of NaN.
    """
    spectrum = np.asarray(spectrum, dtype=float)
    lowest = np.min(spectrum)
    span = np.max(spectrum) - lowest

    if span == 0:
        # Avoid 0/0 -> NaN, which would silently poison model training.
        return np.zeros_like(spectrum)

    return (spectrum - lowest) / span

spectrum_dir = "spectrum_raw/"
SPECTRUM_LEN = 511  # samples kept per spectrum; matches load_dat's default cut


def _load_spectra(filelist):
    """Load and min-max scale each listed spectrum into one (n, SPECTRUM_LEN) array."""
    spectra = np.empty((len(filelist), SPECTRUM_LEN), dtype="float")
    # tqdm wraps the list itself (not enumerate) so the bar knows the total.
    for i, spectrum_file in enumerate(tqdm(filelist)):
        # Read the intensity column only, then normalise it to 0-1.
        spectra[i, :] = min_max_scale(load_dat(spectrum_file))
    return spectra


X_train_filelist = train["spectrum_filename"].tolist()
X_train = _load_spectra(X_train_filelist)
y_train = train["target"].values

print(X_train.shape, y_train.shape)

X_test_filelist = test["spectrum_filename"].tolist()
X_test = _load_spectra(X_test_filelist)

print(X_test.shape)

# Add a trailing channel axis for Conv1D and make the target a column vector.
X_train = X_train.reshape(-1, SPECTRUM_LEN, 1)
X_test = X_test.reshape(-1, SPECTRUM_LEN, 1)
y_train = y_train.reshape(-1, 1)

print(X_train.shape, X_test.shape, y_train.shape)

7436it [00:59, 124.20it/s]
12it [00:00, 117.72it/s]

(7436, 511) (7436,)


6952it [00:55, 125.74it/s]

(6952, 511)
(7436, 511, 1) (6952, 511, 1) (7436, 1)





In [156]:
import tensorflow.keras.backend as K
def average_precision(y_true, y_pred):
    """Precision of the rounded predictions, computed with the Keras backend.

    Despite the name, this is plain precision, not average precision: the
    number of entries where ``round(clip(y_true * y_pred, 0, 1))`` is 1,
    divided by the number of entries where ``round(clip(y_pred, 0, 1))`` is 1
    (plus ``K.epsilon()`` to avoid division by zero).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores.

    Returns:
        Scalar precision value.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())
    
def wave_block(x, filters, kernel_size, n):
    """Build a WaveNet-style stack of gated, dilated 1-D convolutions.

    The input is first projected to ``filters`` channels with a 1x1
    convolution. Then, for each dilation rate ``1, 2, 4, ..., 2**(n-1)``, a
    tanh branch and a sigmoid branch are multiplied (gated activation),
    passed through another 1x1 convolution, and added to a running residual
    sum.

    Args:
        x: Input Keras tensor.
        filters: Number of channels used by every convolution in the block.
        kernel_size: Kernel size of the dilated convolutions.
        n: Number of dilation levels. With ``n == 0`` only the 1x1 projection
            is applied.

    Returns:
        Keras tensor holding the projection plus the outputs of all ``n``
        gated levels.
    """
    x = Conv1D(filters=filters,
               kernel_size=1,
               padding='same')(x)
    res_x = x
    for dilation_rate in (2 ** i for i in range(n)):
        tanh_out = Conv1D(filters=filters,
                          kernel_size=kernel_size,
                          padding='same',
                          activation='tanh',
                          dilation_rate=dilation_rate)(x)
        sigm_out = Conv1D(filters=filters,
                          kernel_size=kernel_size,
                          padding='same',
                          activation='sigmoid',
                          dilation_rate=dilation_rate)(x)
        x = Multiply()([tanh_out, sigm_out])
        x = Conv1D(filters=filters,
                   kernel_size=1,
                   padding='same')(x)
        res_x = Add()([res_x, x])
    # The return must sit outside the loop; returning inside it would stop
    # after the first dilation level and ignore the remaining n - 1 levels.
    return res_x

def create_model(num_cols,input_len):
    """Build and compile the two-input binary classifier.

    The model takes a vector of numerical features and a single-channel
    sequence of length ``input_len``. The sequence passes through four
    ``wave_block`` stages (16, 32, 64, 128 filters), each followed by batch
    normalisation. Global average and max pooling of the third and fourth
    stages are concatenated with the numerical features and fed to a
    256-128-16 dense head (ELU, dropout 0.1) ending in one sigmoid unit.

    Args:
        num_cols: Sequence of numerical feature names; only its length is
            used, to size the numerical input.
        input_len: Number of time steps of the sequence input.

    Returns:
        A compiled Keras ``Model`` with inputs ``[numerical, sequence]``,
        binary cross-entropy loss, the Adam optimizer and a PR-AUC metric.
    """
    # numerical feature
    input_num = Input(shape=(len(num_cols),),name='numerical')

    # wavenet
    intensity_inp = Input(shape=(input_len, 1))
    x1 = wave_block(intensity_inp, 16, 12, 12)
    x1 = BatchNormalization()(x1)
    x2 = wave_block(x1, 32, 12, 8)
    x2 = BatchNormalization()(x2)
    x3 = wave_block(x2, 64, 12, 4)
    x3 = BatchNormalization()(x3)
    x4 = wave_block(x3, 128, 12, 1)
    x4 = BatchNormalization()(x4)

    # Only the two deepest stages feed the head.
    # NOTE(review): pooled features of stages 1 and 2 used to be computed here
    # but were never connected to the output; confirm the omission is intended.
    pooled = []
    for stage in (x3, x4):
        pooled.append(GlobalAveragePooling1D()(stage))
        pooled.append(GlobalMaxPooling1D()(stage))

    x = concatenate([input_num] + pooled)
    for units in (256, 128, 16):
        x = Dense(units, kernel_initializer='he_normal')(x)
        x = ELU()(x)
        x = Dropout(0.1)(x)
    output = Dense(1,activation='sigmoid')(x)

    # model
    model = Model([input_num, intensity_inp], output)
    model.compile(loss="binary_crossentropy",optimizer='adam',
                  metrics=[AUC(curve='PR', num_thresholds=1000)])
    return model

In [157]:
from sklearn.metrics import roc_auc_score,log_loss
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold,RepeatedStratifiedKFold
def nn_kfold(numerical_cols,
             train_df_num,test_df_num,
             train_df,test_df,
             train_df_y,target,folds):
    """Train one model per CV split and average OOF and test predictions.

    For every split produced by ``folds`` a fresh model is built with
    ``create_model``, trained with early stopping, learning-rate reduction
    and best-weights checkpointing (``"<fold>_best_model.hdf5"``), then used
    to predict the validation rows and the test set.

    Out-of-fold predictions are averaged per sample over however many times
    that sample was in a validation split, so the function works for plain
    K-fold as well as repeated K-fold with any number of splits or repeats.

    Args:
        numerical_cols: Names of the numerical features (passed to
            ``create_model``).
        train_df_num: DataFrame of numerical training features.
        test_df_num: DataFrame of numerical test features.
        train_df: Training sequence array, indexed positionally; its second
            dimension is used as the model's sequence length.
        test_df: Test sequence array.
        train_df_y: Training target array, indexed positionally.
        target: Unused; kept for interface compatibility.
        folds: scikit-learn style splitter providing ``split(X, y)`` and
            ``get_n_splits()``.

    Returns:
        Tuple ``(oof_preds, sub_preds, cv)``: the averaged out-of-fold
        predictions, the test predictions averaged over all trained models,
        and the average-precision score of the OOF predictions.
    """
    tf.keras.backend.clear_session()

    n_train = train_df.shape[0]
    oof_sum = np.zeros((n_train, 1))    # running sum of validation predictions
    oof_count = np.zeros((n_train, 1))  # times each row was validated
    sub_preds = np.zeros((test_df.shape[0], 1))

    n_models = folds.get_n_splits()
    input_len = train_df.shape[1]
    test_num_x = test_df_num.values
    test_x = test_df

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, train_df_y)):
        print ('FOLD:' + str(n_fold))
        train_num_x = train_df_num.iloc[train_idx].values
        valid_num_x = train_df_num.iloc[valid_idx].values

        train_x, train_y = train_df[train_idx], train_df_y[train_idx]
        valid_x, valid_y = train_df[valid_idx], train_df_y[valid_idx]

        print ('train_y shape:',train_y.shape,train_y.mean())
        print ('valid_y shape:',valid_y.shape,valid_y.mean())

        model = create_model(numerical_cols,input_len)
        if n_fold < 1:
            # Print the architecture once only.
            model.summary()
        filepath = str(n_fold) + "_best_model.hdf5"

        es = EarlyStopping(patience=8, mode='min', verbose=1)
        checkpoint = ModelCheckpoint(monitor='val_loss', filepath=filepath, save_best_only=True,mode='auto')
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, verbose=1)

        model.fit([train_num_x, train_x], train_y,
                  batch_size=64, epochs=100,
                  validation_data=([valid_num_x, valid_x], valid_y),
                  callbacks=[es, checkpoint, reduce_lr_loss],
                  verbose=1)

        # Restore the checkpointed weights with the best validation loss.
        model.load_weights(filepath)

        valid_pred = model.predict([valid_num_x, valid_x], batch_size=128,verbose=1)
        oof_sum[valid_idx] += valid_pred.reshape((-1,1))
        oof_count[valid_idx] += 1

        sub_preds += model.predict([test_num_x, test_x],
                                   batch_size=128).reshape((-1,1)) / n_models

    # Rows that were never validated keep a prediction of 0.
    oof_preds = oof_sum / np.maximum(oof_count, 1)

    cv = log_loss(train_df_y, oof_preds)
    print('Full log_loss %.6f' % cv)
    cv = average_precision_score(train_df_y, oof_preds)
    print('Full AP %.6f' % cv)

    return oof_preds,sub_preds,cv

# Tabular feature tables stored as pickles.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_df = pd.read_pickle('train_num.pkl')
test_df = pd.read_pickle('test_num.pkl')

target = 'target'

# numerical_cols: columns that are scaled and fed to the model's numerical input
numerical_cols = [
    'exc_wl', 'layout_a', 'layout_x', 'layout_y', 'pos_x',
    'params0', 'params1', 'params3', 'params4', 'params5', 'params6',
    'rms', 'beta',
    'intensity_min', 'intensity_max', 'intensity_mean', 'intensity_std',
    'intensity_minmax', 'intensity_q1', 'intensity_q5', 'intensity_q95',
    'intensity_q99',
    'diff_params1_params4', 'diff_params2_params5', 'diff_params3_params6',
    'ratio_params1_params4', 'ratio_params1_params3',
    'ratio_params3_params6', 'ratio_params4_params6',
    'exc_wl_count_target', 'layout_a_count_target', 'layout_x_count_target',
    'layout_y_count_target', 'pos_x_count_target',
    'layout_a_mean_ratio_intensity_minmax',
]

train_x_num, test_x_num = preprocess(train_df, test_df, numerical_cols)

# Label-encode the categorical column(s) in place and report cardinality + 1.
for c in ['exc_wl']:
    lbl = LabelEncoder().fit(train_df[c])
    train_df[c] = lbl.transform(train_df[c])
    test_df[c] = lbl.transform(test_df[c])
    print(train_df[c].nunique() + 1)

n_splits = 5
n_repeats = 5
# Single-pass alternative:
# folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
folds = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=seed,
)

# From here on `train_df` / `test_df` are rebound to the out-of-fold and test
# predictions returned by nn_kfold; the submission cell reads `test_df`.
train_df, test_df, cv = nn_kfold(
    numerical_cols,
    train_x_num, test_x_num,
    X_train, X_test,
    y_train, target, folds,
)



3
FOLD:0
train_y shape: (5948, 1) 0.03160726294552791
valid_y shape: (1488, 1) 0.03225806451612903
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 511, 1)]     0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 511, 16)      32          input_5[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 511, 16)      3088        conv1d[0][0]                     
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 511, 16)      3088        conv1d[0][0]                 

In [158]:
# `test_df` holds the test-set predictions returned by nn_kfold in the
# training cell; attach them to the test table and write that single column.
test['target'] = test_df
submission = test['target']
submission.to_csv('atmacup5_submission2.csv', index=False)