Notebook Sections

1. [EDA](#EDA)
2. [Dataset Preprocessing](#Dataset-Preprocessing)
3. [Build and Train the Deep Learning model](#Build-and-Train-the-Deep-Learning-model)
4. [Prediction and Submission](#Prediction-and-Submission)

In [None]:
import numpy as np
import pandas as pd
# Setup plotting
import matplotlib.pyplot as plt
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in Matplotlib 3.6
# and the old name was removed in 3.8. Use whichever name this installation
# provides, and fall back to the default style if neither is available.
plot_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)
# Set Matplotlib defaults
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import plot_model, to_categorical

# EDA

In [None]:
# Read the competition's train and test tables, using `row_id` as the index of each.
train_path = '../input/tabular-playground-series-feb-2022/train.csv'
test_path = '../input/tabular-playground-series-feb-2022/test.csv'

df, test_df = (
    pd.read_csv(csv_path, index_col='row_id')
    for csv_path in (train_path, test_path)
)
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame
df.describe()

The features are already on roughly the same scale, so they do not need to be normalized.

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame
df.info()

In [None]:
# (number of rows, number of columns) of the training frame
df.shape

In [None]:
# Total number of missing cells in each table (summed over all columns)
for label, frame in (('# of NaN in train_data : ', df),
                     ('# of NaN in test_data : ', test_df)):
    print(label, frame.isna().sum().sum())

In [None]:
# Distinct class labels, in order of first appearance in the training data
df['target'].unique()

In [None]:
# value_counts() returns the counts indexed by class label. Taking both the bar
# positions and the bar heights from that one Series keeps every label paired
# with its own count; unique() (order of appearance) and value_counts()
# (sorted by frequency) do not share an ordering.
counts = df['target'].value_counts()
targets = counts.index.to_numpy()

plt.figure(figsize = (10, 10))
plt.bar(targets, counts.to_numpy(),
        width = 0.4)
plt.title('Target Count')
plt.xlabel('Target')
plt.ylabel('Count')
# No plt.legend(): the bars carry no label, so a legend call would only emit a warning.
plt.xticks(rotation = 50);

Target counts are evenly distributed across the classes.

# Dataset Preprocessing

In [None]:
# Feature matrix: every column of the training frame except the label
X = df.drop(columns='target')

# Map the class labels to integer codes; `le` is kept so predictions can be
# mapped back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(df['target'])
le.classes_

In [None]:
# Fixed seed so the train/validation split (and therefore the reported
# validation metrics) is the same on every Restart & Run All.
SPLIT_SEED = 42

# stratify - keep the class proportions the same in both splits
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, train_size=0.75, random_state=SPLIT_SEED)

# Number of feature columns, in the list form passed to the first layer's input_shape
input_shape = [X_train.shape[1]]

# Build and Train the Deep Learning model

In [None]:
# Fully connected classifier: three 512-unit ReLU layers (dropout after the
# second and third) followed by a 10-unit softmax output.
hidden_units = 512
dropout_rate = 0.3

layer_stack = [
    Dense(hidden_units, activation='relu', input_shape=input_shape),
    Dense(hidden_units, activation='relu'),
    Dropout(dropout_rate),
    Dense(hidden_units, activation='relu'),
    Dropout(dropout_rate),
    Dense(10, activation='softmax'),
]
model = Sequential(layer_stack)

# Sparse categorical cross-entropy because the labels are integer class codes
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Stop training once the callback's monitored metric (Keras default: val_loss)
# has not improved by at least 0.001 for 10 consecutive epochs, then restore
# the weights from the best epoch.
early_stopping = EarlyStopping(
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

fit_options = dict(
    validation_data=(X_valid, y_valid),
    epochs=200,
    batch_size=512,
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Per-epoch metrics recorded by model.fit, one row per epoch
history_df = pd.DataFrame(history.history)

# Best values seen over all epochs (lowest loss, highest accuracy)
best_val_loss = history_df['val_loss'].min()
best_val_accuracy = history_df['val_accuracy'].max()
summary_template = "Best Validation Loss: {:0.4f}\nBest Validation Accuracy: {:0.4f}"
print(summary_template.format(best_val_loss, best_val_accuracy))

# Learning curves: training vs. validation
history_df[['loss', 'val_loss']].plot(title='Cross-entropy')
history_df[['accuracy', 'val_accuracy']].plot(title='Accuracy')

# Prediction and Submission

In [None]:
# Class probabilities for every test row -> index of the most likely class
# -> original class label via the fitted LabelEncoder.
class_probabilities = model.predict(test_df)
class_codes = class_probabilities.argmax(axis=1)
y_pred = le.inverse_transform(class_codes)
y_pred

In [None]:
# Start from the sample submission and overwrite its `target` column with the
# predicted labels (assigned positionally), then write the file to upload.
sample_path = '../input/tabular-playground-series-feb-2022/sample_submission.csv'
submission = pd.read_csv(sample_path).assign(target=y_pred)
submission.to_csv("submission.csv", index=False)
submission