# TPS November: simple ensemble NN, LGBM and logreg

This notebook builds a simple ensemble of three models: a neural network, LightGBM and logistic regression.

Notebook plan:

1. Modules import.
2. Utils.
3. Data load and prepare
4. Models train
5. Results.

## 1. Modules import

In [None]:
import os # operation system variables
import gc 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
from sklearn.model_selection import train_test_split # creat train and test datasets to modeling
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score # report and metrics modules 
from sklearn.preprocessing import StandardScaler, MinMaxScaler # data preprocessing
from sklearn.pipeline import make_pipeline # additionals modules
from sklearn.compose import make_column_transformer

# ML models upload
from sklearn.linear_model import LogisticRegression

# Additional models
import xgboost as xgb, lightgbm as lgbm

from tensorflow import keras # nn modeling
from tensorflow.keras import layers
import tensorflow as tf

In [None]:
# Set TF_CPP_MIN_LOG_LEVEL to '3' with the aim of suppressing TensorFlow GPU warnings.
# NOTE(review): this cell runs after `import tensorflow` in the imports cell; the
# variable is normally expected to be set before TensorFlow is imported — confirm
# that it actually takes effect here, or move it above the TensorFlow imports.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

## 2. Utils

We use a single utility function, which downcasts column dtypes to reduce the memory usage of a DataFrame.

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Rules applied per column:
      * ``object`` columns are converted to ``category``.
      * NumPy signed / unsigned integer columns are cast to the narrowest
        integer type of the same signedness whose range contains the column's
        minimum and maximum (bounds inclusive, so e.g. a max of 127 fits int8).
      * NumPy float columns are cast to ``float32`` when their minimum and
        maximum lie inside the float32 range, otherwise to ``float64``.
        float16 is deliberately not used.
      * Every other dtype (bool, datetime, category, pandas extension
        dtypes, ...) is left untouched.

    The memory usage before and after the conversion is printed.

    Args:
        df: pandas DataFrame; its columns are replaced in place.

    Returns:
        The same DataFrame object with the downcast columns.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Bool columns
        # are already one byte wide, and min()/max() are either undefined or
        # not comparable with numeric limits for the remaining dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # An all-NaN (or empty) column fails both comparisons and ends up
            # as float64, as do columns containing +/-inf.
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
            continue

        if len(df[col]) == 0:
            # min()/max() of an empty integer column are NaN; nothing to size.
            continue

        lowest, highest = int(c_min), int(c_max)
        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= lowest and highest <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage is always well defined.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

## 3. Data load and prepare

First we load the data into memory, then we reduce its memory footprint.

In [None]:
%%time
# Directory holding the competition files; defined once so the notebook can be
# pointed at another location by editing a single line.
DATA_DIR = '/kaggle/input/tabular-playground-series-nov-2021'

# train / test are indexed by their `id` column; the sample submission is read
# as-is and later receives the ensemble predictions in its `target` column.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), index_col='id')
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), index_col='id')
sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
%%time
# Downcast both frames (train first, then test) with the memory utility.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# Detach the label column from the training frame (pop removes it in place),
# record the remaining feature column names, and keep the test ids.
y = train.pop('target')
features = train.columns.tolist()
submission_index = test.index

In [None]:
# Two-step scaling applied to every feature: standardization followed by
# min-max normalization.
# NOTE(review): both steps are per-feature affine rescalings, so the first one
# looks redundant before MinMaxScaler — confirm whether it is needed.
numerical_transformer = make_pipeline(StandardScaler(), MinMaxScaler())

# All features are treated as numerical/continuous, so one transformer covers
# the whole column list.
preprocessor = make_column_transformer((numerical_transformer, features))

# Fit on the training frame only, then reuse the fitted scalers on the test
# frame. Both names are rebound to the transformed outputs.
# NOTE(review): the fit happens before the train/validation split, so the
# validation rows contribute to the scaling statistics.
train = preprocessor.fit_transform(train)
test = preprocessor.transform(test)

In [None]:
# Hold out 33% of the training rows for validating the neural network.
# A fixed random_state makes the partition (and therefore the validation
# metrics) reproducible across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(
    train, y, test_size=0.33, random_state=42
)

## 4. Models train

In [None]:
# Logistic regression with the liblinear solver and verbose solver output,
# fitted on the full transformed training data (no validation hold-out).
lr_params = dict(solver='liblinear', verbose=1)
model_lr = LogisticRegression(**lr_params)
model_lr.fit(train, y)

In [None]:
# One input per feature column of the training matrix.
input_shape = [x_train.shape[1]]
# Early-stopping settings, kept as module-level constants for the training cell.
PATIENCE = 10
MIN_DELTA = 0.0005

# Architecture: input batch-norm, then three identical hidden blocks
# (Dense(128, relu) -> BatchNormalization -> Dropout(0.4)), then a single
# sigmoid unit. Layers are created in the same order as they are stacked.
nn_layers = [layers.BatchNormalization(input_shape=input_shape)]
for _ in range(3):
    nn_layers.extend([
        layers.Dense(units=128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
    ])
nn_layers.append(layers.Dense(units=1, activation='sigmoid'))

model = keras.Sequential(nn_layers)

# Binary cross-entropy objective, tracked with AUC.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['AUC'],
)

In [None]:
%%time

BATCH_SIZE = 128
EPOCHS = 20

# Stop once the validation loss has failed to improve by at least MIN_DELTA
# for PATIENCE consecutive epochs, and roll back to the best weights seen.
# PATIENCE and MIN_DELTA are the constants defined in the model-definition cell.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    min_delta=MIN_DELTA,
    restore_best_weights=True,
)

history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping],
    verbose=1,  # keep per-epoch progress visible so training can be monitored
)

In [None]:
# Gradient-boosted trees with 400 boosting rounds. `n_estimators` is the
# scikit-learn-API name for the round count; the original passed the
# `num_iterations` alias instead of the wrapper's own parameter.
model_lgbm = lgbm.LGBMClassifier(
    n_estimators=400,
    objective="binary",
    metric="auc",
)
# Fitted on the full transformed training data (no validation hold-out).
model_lgbm.fit(train, y)

## 5. Results

Predict on the test set with each model, average the predicted probabilities, and save the result to `submission.csv`.

In [None]:
%%time
# Neural-network predictions for the test set; ravel() flattens the model
# output (one sigmoid unit per row) into a 1-D array.
y_pred_f = model.predict(test).ravel()

In [None]:
%%time
# Logistic-regression class probabilities for the test set; column 1 is the
# one used when the ensemble is averaged.
y_pred_f_logref = model_lr.predict_proba(test)

In [None]:
# LightGBM class probabilities for the test set; column 1 is the one used
# when the ensemble is averaged.
y_pred_f_lgbm = model_lgbm.predict_proba(test)

In [None]:
# Unweighted mean of the three models' probabilities (neural network,
# logistic regression, LightGBM), written into the submission frame by
# position.
# NOTE(review): this assumes the rows of `sub` are in the same order as the
# rows of `test` — confirm.
ensemble_members = (y_pred_f, y_pred_f_logref[:, 1], y_pred_f_lgbm[:, 1])
sub['target'] = sum(ensemble_members) / len(ensemble_members)

In [None]:
# Preview the first rows of the submission frame before it is written to disk.
sub.head()

In [None]:
# Write the submission without the DataFrame index, then display the frame.
sub.to_csv('submission.csv', index=False)
sub