# TPS November: simple Keras NN + GPU

This notebook trains a simple Keras neural network on a GPU.

Notebook plan:

1. Modules import.
2. Utils.
3. Data load and prepare
4. NN creation and training.
5. Results.

## 1. Modules

In [None]:
import os # operation system variables
import gc 

import numpy as np
import pandas as pd
import feather

from sklearn.preprocessing import StandardScaler, MinMaxScaler # data preprocessing
from sklearn.pipeline import make_pipeline # additionals modules
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

from tensorflow import keras # nn modeling
from tensorflow.keras import layers
import tensorflow as tf


Here we select the GPU and switch off warnings.

In [None]:
import warnings

# Silence Python-level warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Suppress TensorFlow's native (C++) log output.
# NOTE(review): TensorFlow is imported in an earlier cell; confirm this variable
# is still honoured when it is set after the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Expose the first GPU to this process. CUDA device indices start at 0, so the
# previous value '1' selected the *second* GPU and left a single-GPU machine
# with no visible GPU at all. Change the index on a multi-GPU host if a
# different card is wanted.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

Check GPU status.

In [None]:
from tensorflow.python.client import device_lib

# Every device TensorFlow can see locally.
local_devices = device_lib.list_local_devices()
print(local_devices)

# Count only the physical GPUs.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Ask TensorFlow to log the device each operation is placed on.
tf.debugging.set_log_device_placement(True)

# Multiply a 2x3 matrix by a 3x2 matrix as a tiny smoke test.
lhs = tf.constant([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])
rhs = tf.constant([[1.0, 2.0],
                   [3.0, 4.0],
                   [5.0, 6.0]])
product = tf.matmul(lhs, rhs)

print(product)

## 2. Utils

We use a single utility function to reduce the DataFrame's memory usage.

In [None]:
def _smallest_int_dtype(lo, hi, candidates):
    """Return the first dtype in `candidates` whose range contains [lo, hi], else None."""
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of a DataFrame to smaller dtypes to save memory.

    The frame is modified in place and also returned.

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness that holds their observed min and max;
    * float columns wider than 32 bits are cast to ``float32`` when their
      observed range fits (this can lose precision); float16 is never used
      as a target;
    * every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched.

    A column is only ever cast to a *smaller* dtype, never to a larger one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy int/uint/float columns are candidates; this skips
        # bool, datetime, categorical and pandas extension dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # NaN or infinite bounds fail these comparisons, so such columns
            # keep their current dtype.
            fits_float32 = float32_limits.min <= c_min and c_max <= float32_limits.max
            target = np.float32 if fits_float32 else None
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            # Inclusive bounds: a value equal to a type's limit still fits it.
            target = _smallest_int_dtype(c_min, c_max, candidates)

        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero starting size to avoid a division by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

## 3. Data load and prepare

First we load the data into memory, then reduce its memory usage.

In [None]:
%%time
# Directory holding the competition CSVs; defined once so that pointing the
# notebook at a different location needs a single edit instead of three.
DATA_DIR = '/kaggle/input/tabular-playground-series-nov-2021'

# train/test use their `id` column as the index; the sample submission is read as-is.
train = pd.read_csv(DATA_DIR + '/train.csv', index_col='id')
test = pd.read_csv(DATA_DIR + '/test.csv', index_col='id')
sub = pd.read_csv(DATA_DIR + '/sample_submission.csv')

In [None]:
%%time
# Downcast both frames, train first and then test, rebinding each name to the result.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# Print a summary of the training frame (column dtypes, non-null counts, memory usage).
train.info()

In [None]:
# Keep the test ids at hand.
submission_index = test.index

# Detach the label column: `pop` removes 'target' from `train` in place, so
# everything left in `train` afterwards is a feature.
y = train.pop('target')
features = train.columns.tolist()

Here we do the preprocessing: standardization followed by normalization.
The test data must go through exactly the same transformation as the training data.

In [None]:
# Scaling chain applied to every feature: standardize first, then rescale with MinMaxScaler.
scaling_steps = (StandardScaler(), MinMaxScaler())
numerical_transformer = make_pipeline(*scaling_steps)

# The same chain is applied to all feature columns (they are treated as numerical).
preprocessor = make_column_transformer((numerical_transformer, features))

# Fit the scalers on the training data only, then reuse them unchanged on the test data.
train = preprocessor.fit_transform(train)
test = preprocessor.transform(test)

In [None]:
# Hold out 33% of the training rows for validation. A fixed random_state makes
# the split, and therefore the reported validation metrics, repeatable across runs.
x_train, x_val, y_train, y_val = train_test_split(
    train, y, test_size=0.33, random_state=42
)

## 4. NN creation and training

We use a simple NN model with 3 hidden layers (166, 100 and 50 neurons).

In [None]:
# Early-stopping settings; only referenced by the disabled callback at the end of this cell.
PATIENCE = 10
MIN_DELTA = 0.0005

# One input per feature column.
input_shape = [x_train.shape[1]]

HIDDEN_UNITS = (166, 100, 50)
DROPOUT_RATE = 0.4

# Input batch-norm, then three identical Dense -> BatchNorm -> Dropout stages,
# then a single sigmoid output unit.
model = keras.Sequential()
model.add(layers.BatchNormalization(input_shape=input_shape))
for n_units in HIDDEN_UNITS:
    model.add(layers.Dense(units=n_units, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(DROPOUT_RATE))
model.add(layers.Dense(units=1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['AUC'],
)

# Early stopping is currently disabled; uncomment to stop training early.
#early_stopping = keras.callbacks.EarlyStopping(
#    patience=PATIENCE,
#    min_delta=MIN_DELTA,
#    restore_best_weights=True,
#)

In [None]:
# Print the model's layers and parameter counts.
model.summary()

In [None]:
%%time

EPOCHS = 40
BATCH_SIZE = 128

# Train on the training split and evaluate on the held-out split after each epoch.
# The early-stopping callback is not passed (callbacks=[early_stopping] is disabled).
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,  # keep per-epoch progress output so training can be monitored
    validation_data=(x_val, y_val),
)

## 5. Results

Predict our results and save them to `submission.csv`

In [None]:
%%time
# Predict on the preprocessed test set and flatten the result to a 1-D array.
y_pred_f = np.ravel(model.predict(test))

In [None]:
# Display the predictions as a quick sanity check.
y_pred_f 

In [None]:
#y_pred_f = np.array([1 if x>=0.5 else 0 for x in y_pred_f])

In [None]:
#y_pred_f 

In [None]:
# Put the predictions into the sample-submission frame and save it without
# the row index.
# NOTE(review): assumes the rows of `sub` are in the same order as `test` — confirm.
sub['target'] = y_pred_f
sub.to_csv('submission.csv', index=False)
sub