In [None]:
# Import Packages
import numpy as np 
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
import os


In [None]:
# Locations of the competition files: one root directory, three CSVs inside it
root_path = '/kaggle/input/tabular-playground-series-jun-2021'
train_path, test_path, sample_sub_path = (
    os.path.join(root_path, file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [None]:
# Seed value shared by NumPy's global generator (set here) and by the
# `random_state` of the train/validation split further down.
# NOTE(review): only NumPy is seeded in this cell; Keras weight initialisation
# and batch shuffling presumably use a separate generator - confirm and seed it
# as well if run-to-run reproducibility of the model matters.
seed = 10
np.random.seed(seed=seed)

In [None]:
# Load the training table from train.csv into a DataFrame.
# The 'target' column is kept exactly as read: a previously drafted step that
# would have reduced each label to its last character (`.str[-1]`) is disabled.
df_train = pd.read_csv(filepath_or_buffer=train_path)

In [None]:
# Plot how many rows fall into each target value to check the class distribution
df_train.loc[:, 'target'].hist()

In [None]:
# Predictors: every column except the first and the last one
# (presumably the id and the target respectively - verify against train.csv).
feature_columns = df_train.columns[1:-1]
X = df_train.loc[:, feature_columns]

# Label: selected with a list so it stays a one-column DataFrame, not a Series
y = df_train.loc[:, ['target']]

# Report the dimensions of the feature matrix
print(f'Training set shape: {X.shape}')

So we've got 200,000 rows and 75 features!

In [None]:
# Are all feature columns stored as int64?
# Print the name of every feature column whose dtype is anything else;
# no output means every column passed the check.
for col, col_dtype in df_train.dtypes.loc[X.columns].items():
    if col_dtype != 'int64':
        print(col)

In [None]:
# One-hot encode the label frame with pandas' dummy encoding, then report the
# resulting shape (rows x indicator columns).
y = pd.get_dummies(data=y)
print(f'New shape: {y.shape}')

In [None]:
# Hold out 20% of the rows as a validation set.
# `stratify=y` asks scikit-learn to keep the label proportions the same in both
# parts; `random_state=seed` makes the shuffle repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)

# Checking split shapes (the held-out features are X_val, so label them as such)
print(f'X_train shape: {X_train.shape}\nX_val shape: {X_val.shape}')
print(f'y_train shape: {y_train.shape}\ny_val shape: {y_val.shape}')


In [None]:
# Input width and number of output classes are both read from the training data
# so the network always matches the feature matrix and the one-hot target.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

# Early stopping on the callback's default monitored quantity: stop after
# 3 epochs without improvement and roll the model back to the best weights
# seen, instead of keeping the weights of the last (worse) epoch.
early_stop = EarlyStopping(patience=3, restore_best_weights=True)

# Fully connected classifier: 350 (tanh) -> 150 (relu) -> 100 (relu) -> softmax
model = Sequential()
model.add(Dense(350,
                activation='tanh',
                input_shape=(n_features, )))
model.add(Dense(150,
                activation='relu'))
model.add(Dense(100,
                activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Keras sets aside 20% of X_train/y_train as its own validation data for the
# early-stopping callback; X_val/y_val are not seen during fitting.
model.fit(X_train, y_train, epochs=20, validation_split=0.2, callbacks=[early_stop])

# With a softmax output layer, predict() returns one probability per class
y_pred = model.predict(X_val)

# Evaluation on the held-out validation set
logloss = log_loss(y_val, y_pred)
print(f'Log loss: {logloss}')

In [None]:
# Load the test table from test.csv
df_test = pd.read_csv(filepath_or_buffer=test_path)

# Test features: every column after the first one
# (presumably the first column is the row id - verify against test.csv).
test_feature_columns = df_test.columns[1:]
X_test = df_test.loc[:, test_feature_columns]

In [None]:
# Predicted class probabilities for every test row
predictions = model.predict(X_test)

# One column per model output, named Class_1 ... Class_N in output order
class_columns = [f'Class_{i}' for i in range(1, predictions.shape[1] + 1)]
sub = pd.DataFrame(predictions, columns=class_columns)

# Put the test ids in front of the probabilities
sub = pd.concat([df_test['id'], sub], axis=1)

# Creating submission
sub.to_csv('submission.csv', index=False)

# Preview the submission; a notebook only renders the LAST expression of a
# cell, so this has to come after the write rather than in the middle.
sub.head()