In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import gc
import random
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import pkg_resources as pkg
print( f"pandas_profiling version: {pkg.get_distribution('pandas_profiling').version}")

from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve, auc, cohen_kappa_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, RobustScaler

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.models import Sequential, Model

from tensorflow.keras.layers import InputLayer, SpatialDropout1D
from tensorflow.keras.layers import add, concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Activation, Conv1D, Flatten, MaxPooling1D
from tensorflow.keras.layers import Multiply, Add, Concatenate, Flatten, Average, Lambda
from tensorflow.keras.layers import Conv2D, MaxPool2D

from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, Callback, ModelCheckpoint, ReduceLROnPlateau

from tensorflow.keras import backend as K
from tensorflow_addons.optimizers import CyclicalLearningRate

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; a column named ``'time'`` and every non-numeric column are left
    untouched. Integer columns are cast to the first of int8/int16/int32/int64
    whose range contains the column's min and max (limits inclusive). Float
    columns are cast to float16 or float32 when the value *range* fits, else
    float64. Note that the float check looks at range only, so a float16
    downcast can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col == 'time':
            continue
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column that touches a dtype's exact limits
            # (e.g. max == 127) still fits that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # NaN/inf extremes or values beyond float32 end up here.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte baseline cannot produce NaN.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [None]:
# Seed handed to seeding() and used as random_state for the StratifiedKFold split.
RANDOM_SEED = 98765
# When True, training is limited to the first 100000 rows and every fold prints model.summary().
DEBUG = True
# Not read anywhere in the visible notebook.
# NOTE(review): presumably once gated a pandas_profiling report - confirm or remove.
PROFILE = False
CNN=False # picks create_model_cnn over create_model; reassigned to True in the training cell

def seeding(SEED, use_tf=False):
    """Seed the available random number generators for reproducible runs.

    Seeds NumPy's legacy global RNG and Python's ``random`` module, exports
    ``PYTHONHASHSEED`` and switches on deterministic cuDNN kernels through
    ``TF_CUDNN_DETERMINISTIC``. TensorFlow's global seed is only set when
    ``use_tf`` is true.

    Parameters
    ----------
    SEED : int
        Seed value shared by all generators.
    use_tf : bool, default False
        Also call ``tf.random.set_seed(SEED)``.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # This variable is an on/off switch, not a seed, so it must be '1'
    # rather than the seed value.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    if use_tf:
        tf.random.set_seed(SEED)
    print('seeding done!!!')

# Seed TensorFlow too: Keras weight initialisation draws from TensorFlow's RNG,
# which seeding() only touches when use_tf=True.
seeding(RANDOM_SEED, use_tf=True)

train = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/test.csv')
submission = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv')

if DEBUG:
    # Take an independent copy of the first 100000 rows so the column removal
    # below does not operate on a slice of the full frame.
    train = train.iloc[:100000].copy()

# Split the label column off the feature frame.
target = train.pop('target')
gc.collect()

In [None]:
# Report the (rows, columns) shape of both frames.
print(f'train: {train.shape}')
print(f'test: {test.shape}')

## Dealing with skewed data

In [None]:
def minmax_scale(df, cols, scalers=None):
    """Min-max scale the given columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their scaled values.
    cols : iterable of str
        Names of the columns to scale.
    scalers : dict, optional
        Mapping ``column -> fitted MinMaxScaler``. When omitted, a new scaler
        is fitted on each column of ``df``. When given, the supplied scalers
        are applied without refitting, so another frame receives exactly the
        same transformation; a column missing from the mapping raises
        ``KeyError``.

    Returns
    -------
    dict
        The per-column scalers that were used, keyed by column name.
    """
    fit_new = scalers is None
    used = {} if fit_new else scalers
    for col in cols:
        values = df[col].to_numpy().reshape(-1, 1)
        if fit_new:
            used[col] = MinMaxScaler().fit(values)
        df[col] = used[col].transform(values).ravel()
    return used

skewed_cols = ['f46', 'f59', 'f89']
# Fit on train only and reuse those scalers for test: fitting a second scaler
# on test would map the two sets onto different scales.
skew_scalers = minmax_scale(train, skewed_cols)
minmax_scale(test, skewed_cols, scalers=skew_scalers);

## New features

In [None]:
## https://www.kaggle.com/c/tabular-playground-series-nov-2021/discussion/286731
## Trying the chunk-based features suggested by Lukasz Borecki
## No improvement, so this is commented out for now
#def make_features(df):
#    features = df.columns
#    features=features.drop('id')
#    df['chunk']=df['id']//60000
#    for feature in features:
#        df['chunk_mean_'+str(feature)]=df.groupby(['chunk'])[feature].transform('mean')
#        df['chunk_std_'+str(feature)]=df.groupby(['chunk'])[feature].transform('std')
        
#    df.pop('chunk')
#    df.pop('id')
#    return df

#train = make_features(train)
#test = make_features(test)

#train.drop(['id'], axis=1, inplace=True)
#test.drop(['id'], axis=1, inplace=True)
#print('train:',train.shape)
#print('test:',test.shape)

#train=reduce_mem_usage(train)
#test=reduce_mem_usage(test)
#gc.collect()

In [None]:
# Fit the robust scaler on the training frame only, then apply the same
# transformation to both sets. From here on `train` and `test` hold the
# arrays returned by the scaler rather than the DataFrames.
# NOTE(review): no active cell drops the 'id' column (the drops in the cell
# above are commented out), so if the CSVs carry one it is scaled and fed to
# the model as a feature - confirm that is intended.
scaler = RobustScaler()
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

## Model and training

In [None]:

def create_model_cnn(units, seq_len, n_features, optimizer, kernel_initializer):
    """Build and compile a small Conv2D binary classifier.

    Each sample is expected with shape ``(seq_len, n_features, 1)``. The first
    convolution uses a ``(1, n_features)`` kernel and so collapses the second
    axis to width 1. Two ``(3, 1)`` convolutions, each followed by ``(2, 1)``
    max-pooling, then shrink the first axis before a single sigmoid unit.
    All convolutions use the default (valid) padding, so ``seq_len`` must be
    at least 10 for the last pooling layer to receive a non-empty input.

    Parameters
    ----------
    units : int
        Unused; kept so the signature parallels ``create_model``.
    seq_len : int
        Length of the first (sequence) axis of one sample.
    n_features : int
        Length of the second axis of one sample.
    optimizer
        Optimizer passed to ``model.compile``.
    kernel_initializer
        Initializer applied to every convolution and to the output layer.

    Returns
    -------
    Compiled ``Sequential`` model using binary cross-entropy loss and the
    accuracy metric.
    """
    n_filters = (8, 8, 8)
    model = Sequential([
        Input(shape=(seq_len, n_features, 1)),
        Conv2D(n_filters[0], kernel_size=(1, n_features), activation="relu",
               kernel_initializer=kernel_initializer),
        Conv2D(n_filters[1], kernel_size=(3, 1), activation="relu",
               kernel_initializer=kernel_initializer),
        MaxPool2D(pool_size=(2, 1)),
        Conv2D(n_filters[2], kernel_size=(3, 1), activation="relu",
               kernel_initializer=kernel_initializer),
        MaxPool2D(pool_size=(2, 1)),
        Flatten(),
        Dense(1, activation="sigmoid", kernel_initializer=kernel_initializer),
    ])
    model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model


In [None]:
def create_model(units, dim, optimizer, kernel_initializer):
    """Build and compile a fully connected binary classifier.

    The network stacks five ReLU ``Dense`` layers of ``units`` neurons each on
    a ``dim``-wide input and ends in a single sigmoid unit. Dropout(0.2)
    between the hidden layers was tried and is currently disabled.

    Parameters
    ----------
    units : int
        Width of every hidden layer.
    dim : int
        Number of input features per sample.
    optimizer
        Optimizer passed to ``model.compile``.
    kernel_initializer
        Initializer applied to every ``Dense`` layer.

    Returns
    -------
    Compiled ``Sequential`` model using binary cross-entropy loss and the
    accuracy metric.
    """
    n_hidden = 5
    model = Sequential()
    model.add(Input(shape=(dim,)))
    for _ in range(n_hidden):
        model.add(Dense(units = units, activation = 'relu', kernel_initializer = kernel_initializer))
    model.add(Dense(units = 1, activation = 'sigmoid', kernel_initializer = kernel_initializer))
    model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

# Halve the learning rate once val_loss has gone 5 epochs without improving.
lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=5,
    verbose=1,
)
# Stop after 60 epochs without a lower val_loss and roll back to the best weights.
# NOTE(review): the training cell runs EPOCHS = 40, which is below this
# patience, so with the current settings this callback cannot end a run early.
es = EarlyStopping(
    monitor="val_loss",
    patience=60,
    verbose=1,
    mode="min",
    restore_best_weights=True,
)

In [None]:
UNITS = 32
EPOCHS = 40
BATCH_SIZE = 256
TOTAL_SPLITS = 4
LEARNING_RATE = 0.00123

CNN=True
if CNN:
    # create_model_cnn declares Input(shape=(seq_len, n_features, 1)), so every
    # sample has to be 3-D. Each row's feature vector is treated as a sequence
    # of single-value steps: (rows, features) -> (rows, features, 1, 1).
    train = train.reshape(train.shape[0], train.shape[1], 1, 1)
    test = test.reshape(test.shape[0], test.shape[1], 1, 1)

# Plain array of labels so fold indices are applied positionally, whatever the
# index of the `target` Series happens to be.
labels = target.to_numpy()

models = []
histories = []
folds = StratifiedKFold(n_splits=TOTAL_SPLITS, shuffle=True, random_state=RANDOM_SEED)
for fold_n, (train_index, valid_index) in enumerate(folds.split(train, labels)):
    print('-'*15, '>', f'Fold {fold_n+1}', '<', '-'*15)
    X_train, X_valid = train[train_index], train[valid_index]
    y_train, y_valid = labels[train_index], labels[valid_index]

    if CNN:
        # seq_len is the per-sample sequence length (axis 1) and n_features the
        # per-step width (axis 2). Passing the row count here would build a
        # model whose input shape can never match a batch of samples.
        model = create_model_cnn(UNITS, train.shape[1], train.shape[2], Adam(learning_rate=LEARNING_RATE), 'glorot_uniform')
    else:
        model = create_model(UNITS, train.shape[1], Adam(learning_rate=LEARNING_RATE), 'glorot_uniform')
    if DEBUG:
        model.summary()
    history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size = BATCH_SIZE, epochs = EPOCHS,
                    verbose=1, shuffle=True, callbacks=[lr, es])
    models.append(model)
    histories.append(history)

## Plotting metrics recorded during training

In [None]:
# Plot the training and validation curves of the first fold, one figure per
# metric: accuracy first, then loss.
history = histories[0]
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='center right')
    plt.show()

## Prediction

In [None]:
# One prediction array per fold model.
predicted = [fold_model.predict(test) for fold_model in models]

# Flatten each fold's output, sum the folds in float64 and divide by the
# number of models to get the ensemble average.
stacked = np.stack([fold_pred.ravel() for fold_pred in predicted]).astype(np.float64)
avg_preds = stacked.sum(axis=0)
avg_pred = avg_preds / len(models)

In [None]:
# Store the fold-averaged predictions in the 'target' column, write the
# submission file with six decimals, and preview the first 20 rows.
submission = submission.assign(target=avg_pred)
submission.to_csv('submission.csv', float_format='%.6f', index=False)
submission.head(n=20)