# **Tabular Playground Series - Nov 2021 with ANN**

## 1. Introduction

<div align='left'><font size="3" color="#000000"> The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with identifying spam emails via various features extracted from the email. Although the features are anonymized, they have properties relating to real-world features.
</font></div>

* The Artificial Neural Network consists of an input layer, a hidden layer, and an output layer.

> <center><img src="https://elogeel.files.wordpress.com/2010/05/050510_1627_multilayerp1.png" width="500px"></center>

Source and credit to https://www.kaggle.com/mirichoi0218/ann-making-model-for-binary-classification#7.-ANN-Model-Summary-&-Compare

## 2. Data Acquisition

### Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt
import matplotlib.pyplot as plt

# Keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
from tensorflow.keras.layers import Dropout
from keras import callbacks

# Scoring
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score,classification_report

# Removes warning
import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

### Load Dataset

In [None]:
# datatable's fread parses the CSVs; each result is converted to a pandas
# DataFrame straight away so the rest of the notebook works with pandas only.
train_df, test_df = (
    dt.fread('../input/tabular-playground-series-nov-2021/train.csv').to_pandas(),
    dt.fread('../input/tabular-playground-series-nov-2021/test.csv').to_pandas(),
)

### Memory Reduction

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Columns whose dtype name starts with ``int`` are cast to the narrowest of
    int8 / int16 / int32 / int64 whose range contains every value (bounds
    inclusive). Every other numeric column is cast to float32 when its values
    lie strictly inside the float32 range, otherwise to float64. Object
    columns and other non-numeric dtypes (datetime, category, string, ...)
    are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    A one-line before/after memory summary is printed.
    Bool and unsigned-integer columns take the float branch, exactly as they
    always have in this function; that behaviour is kept so existing callers
    see the same dtypes.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # min()/max() of non-numeric columns cannot be compared against the
        # numeric limits below (that used to raise TypeError), so skip them.
        # Object columns are non-numeric as well and are skipped here too.
        if not pd.api.types.is_numeric_dtype(col_type):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith('int'):
            # Try the integer widths from narrowest to widest. The bounds are
            # inclusive so a column holding e.g. exactly -128..127 still fits
            # int8. An empty column (min/max are NaN) matches nothing and is
            # left as it is.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            # float16 is intentionally not attempted (it was already disabled
            # in the original code); float32 is the smallest float used.
            f32 = np.finfo(np.float32)
            if c_min > f32.min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting size.
    decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease_pct))
    return df

In [None]:
# Downcast the numeric columns of both frames (train first, then test).
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)

## 3. EDA

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Print the column dtypes, non-null counts and memory footprint.
train_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
train_df.describe()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Total number of missing cells in each frame: isna() flags the cells, the
# first sum() counts per column and the second sum() adds the columns up.
missing_train = train_df.isna().sum().sum()
print('Total missing value in train dataset is:', missing_train)

missing_test = test_df.isna().sum().sum()
print('Total missing value in test dataset is:', missing_test)

In [None]:
# Class balance check: number of rows per distinct target value.
train_df['target'].value_counts()

## 4. Data Splitting

In [None]:
# Pull the label out first, then build the feature matrix from every column
# except "id" and the label itself.
y = train_df['target']
X = train_df.drop(columns=["id", "target"])

# The full training frame is no longer needed; release the reference.
del train_df

In [None]:
# Hold out 20% of the labelled rows (X_test / y_test); the fixed seed makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Scale the features with MinMaxScaler.

scaler = MinMaxScaler()

# Fit the scaler on the training split only and rescale that split ...
X_train = scaler.fit_transform(X_train)
# ... then apply the already-fitted scaler to the held-out split (no refit),
# so no statistics from the held-out rows go into the scaling.
X_test = scaler.transform(X_test)

## 5. Building the ANN

## 6. Training the ANN

In [None]:
# create model
# Dropout is the first layer, so it is applied directly to the input
# features; two 128-unit swish layers follow, then a single sigmoid unit
# that outputs a probability for the binary target.
model = Sequential()
model.add(Dropout(0.5))
model.add(Dense(128, activation='swish'))
model.add(Dense(128, activation='swish'))
model.add(Dense(1, activation='sigmoid'))

# Compile model
# Binary cross-entropy is the training loss; AUC is tracked as the metric.

model.compile(optimizer = "adam",loss = 'binary_crossentropy', metrics = ['AUC'])

# Stop training once val_loss has not improved for 83 consecutive epochs.
# `mode` must be the string 'min'; passing the builtin function `min` is not
# a recognised value and makes Keras fall back to 'auto' with a warning.
# NOTE(review): restore_best_weights is left at its default, so the model
# keeps the weights of the last epoch run, not the best one - confirm that
# this is intended.
earlystopping = callbacks.EarlyStopping(monitor='val_loss',
                                        mode='min',
                                        verbose=1,
                                        patience=83)

In [None]:
# Train the network. The held-out split is passed as validation data so the
# early-stopping callback can monitor it; `epochs` is only an upper bound.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=2048,
    epochs=2000,
    callbacks=[earlystopping],
)

In [None]:
# Plot the per-epoch curves recorded by model.fit: AUC first, then loss.
# Each entry: (train key, validation key, title, y-axis label, legend position).
curve_specs = (
    ('auc', 'val_auc', 'Model AUC', 'AUC', 'lower right'),
    ('loss', 'val_loss', 'Model loss', 'loss', 'upper right'),
)
for train_key, val_key, curve_title, y_label, legend_loc in curve_specs:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(curve_title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc=legend_loc)
    plt.show()

## 7. Submission

In [None]:
# Remove the "id" column so the test frame has the same columns as X.
test_df = test_df.drop(columns=["id"])

In [None]:
# Apply the scaler fitted on the training split; the result replaces the
# DataFrame with the transformed array.
test_df = scaler.transform(test_df)

In [None]:
# Run the trained network on the scaled test features.
pred = model.predict(test_df)

In [None]:
# Load the sample submission file to use as the template for the output.
sub=pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")
# Display it (notebook cell output).
sub

In [None]:
# Overwrite the template's target column with the model output.
# NOTE(review): pred presumably has one column per row (single sigmoid
# unit) - confirm the shape matches the number of rows in `sub`.
sub['target'] = pred

In [None]:
# Sanity-check the first rows of the submission before writing it out.
sub.head()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub.to_csv("submission.csv",index=False)