In [None]:
from numpy.random import seed
seed(9)
import tensorflow
tensorflow.random.set_seed(9)
import pandas as pd       
import numpy as np
import matplotlib.pyplot as plt    
import seaborn as sns


In [None]:
# Kaggle mounts the competition data read-only under "/kaggle/input".
# Walk that tree and print the full path of every file found, so the exact
# dataset locations are visible before anything is loaded.

import os
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split as tts
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import callbacks
from keras.optimizers import Adam

In [None]:
# Read the competition's train and test tables from the Kaggle input mount.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jun-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jun-2021/test.csv',
    )
)

In [None]:
# Quick sanity check: dimensions first, then a peek at the leading rows.
display(train_data.shape, test_data.shape)
display(train_data.head(5), test_data.head(3))

In [None]:
# Move the row identifier into the index so it is not treated as a feature.
train_data, test_data = (
    frame.set_index('id') for frame in (train_data, test_data)
)

In [None]:
# Pairwise correlation between the numeric columns of the training data.
# `train_data` still contains the string-valued `target` column at this point;
# selecting numeric dtypes explicitly keeps this cell working on pandas >= 2.0,
# where DataFrame.corr() raises on non-numeric columns instead of silently
# dropping them.
corr = train_data.select_dtypes(include='number').corr()
corr

In [None]:
# Number of training rows per class label.
target_counts = train_data['target'].value_counts()
target_counts

In [None]:
# Separate the label column from the feature matrix.
y_train_data = train_data['target']
x_train_data = train_data.drop(columns='target')
x_train_data.head()

Let us cap the outliers: every value above its feature's 90% quantile is replaced with that feature's upper IQR fence (Q3 + 1.5 × IQR).

In [None]:
# Per-feature replacement value for outliers: Q3 + 1.5 * IQR,
# computed column by column on the training features.
q1 = x_train_data.quantile(.25)
q3 = x_train_data.quantile(.75)
adjustment = q3 + (q3 - q1)*1.5
adjustment

In [None]:
# Per-feature 90th percentile; values above it are treated as outliers below.
quantiles = x_train_data.quantile(q=0.9)

In [None]:
# Replace every training value that exceeds its column's 90th percentile
# with that column's adjustment value.
for feature in x_train_data.columns:
    above_cutoff = x_train_data[feature] > quantiles[feature]
    x_train_data.loc[above_cutoff, feature] = adjustment[feature]
x_train_data

In [None]:
# Largest feature value left after capping.
# NOTE(review): presumably checked because the Embedding layers below use
# input_dim=18 — confirm the value shown here stays below that.
x_train_data.max().max()

In [None]:
# Box plots of the capped training features in column positions 0 to 24.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(x="variable", y="value", data=pd.melt(x_train_data.iloc[:, :25]), ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
plt.show()

In [None]:
# Box plots of the capped training features in column positions 25 to 49.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(x="variable", y="value", data=pd.melt(x_train_data.iloc[:, 25:50]), ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
plt.show()

In [None]:
# Box plots of the capped training features from column position 50 onwards.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(x="variable", y="value", data=pd.melt(x_train_data.iloc[:, 50:]), ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
plt.show()

In [None]:
# Apply the same capping to the test features, reusing the thresholds and
# replacement values that were derived from the training data.
for feature in test_data.columns:
    above_cutoff = test_data[feature] > quantiles[feature]
    test_data.loc[above_cutoff, feature] = adjustment[feature]
display(test_data)

# Largest remaining value, then the frame's dimensions.
display(test_data.max().max())
test_data.shape

In [None]:
# Stop training once validation loss has failed to improve for 5 consecutive
# epochs, and roll the model back to its best-scoring weights.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5,
    min_delta=0.0000001, restore_best_weights=True, verbose=1)

# Halve the learning rate when validation loss stops improving.
# The keyword is `min_delta`; the earlier spelling `min_delt` is not a
# recognised argument, so the intended threshold was never applied.
# NOTE(review): with the same patience and min_delta as early stopping, the
# reduction fires on the epoch at which training stops — consider a smaller
# patience here if the learning-rate schedule is meant to have an effect.
plateau = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5,
    patience=5, min_delta=0.0000001, cooldown=0, verbose=1)

In [None]:
# Encode the string class labels as consecutive integers starting at 0.
cls_enc = {
    'Class_1': 0, 'Class_2': 1, 'Class_3': 2,
    'Class_4': 3, 'Class_5': 4, 'Class_6': 5,
    'Class_7': 6, 'Class_8': 7, 'Class_9': 8,
}
y_train_data = y_train_data.map(cls_enc).astype('int')
y_train_data.tail()

In [None]:
# One-hot encode the integer labels for training with categorical cross-entropy.
y_train_final = keras.utils.to_categorical(y_train_data)
y_train_final

In [None]:
# Hold out 20% of the rows for validation, stratified on the integer labels.
X_train, X_val, y_train, y_val = tts(
    x_train_data,
    y_train_final,
    test_size=0.2,
    stratify=y_train_data,
)

In [None]:
# Hyper-parameter grid for the neural-network search:
# embedding output width, Adam learning rate and mini-batch size.
output_dim = [4, 8, 16]
learn_rate = [0.01, 0.001, 0.0001]
batch_size = [128, 256, 512, 1024, 5120]
# Filled by the search loop: one [op_dim, lr, batch_size, val_loss] row per run.
valid_loss_results = []

In [None]:
# Grid search: train one network per (embedding width, learning rate,
# batch size) combination and record its loss on the held-out X_val / y_val.
for op_dim in output_dim:
    for lr in learn_rate:
        for b_size in batch_size:
            # Embedding over the 75 input features, then four dense blocks
            # (Dense -> BatchNorm -> Dropout) and a 9-way softmax output.
            train_model = keras.Sequential([
                layers.Input(shape=(75,)),
                layers.Embedding(18, op_dim, input_length=75),
                layers.Flatten(),
                layers.Dense(256, activation='relu'),
                layers.BatchNormalization(),
                layers.Dropout(0.4),
                layers.Dense(128, activation='relu'),
                layers.BatchNormalization(),
                layers.Dropout(0.4),
                layers.Dense(64, activation='relu'),
                layers.BatchNormalization(),
                layers.Dropout(0.3),
                layers.Dense(32, activation='relu'),
                layers.BatchNormalization(),
                layers.Dropout(0.2),
                layers.Dense(9, activation='softmax'),
            ])
            # `metrics` is passed as a list (as in the final-model cell);
            # newer Keras releases reject a bare string here.
            train_model.compile(
                loss='categorical_crossentropy',
                optimizer=keras.optimizers.Adam(learning_rate=lr),
                metrics=['accuracy'],
            )
            # A further 20% of X_train is split off by Keras to provide the
            # `val_loss` that the two callbacks monitor.
            train_model.fit(X_train, y_train, batch_size=b_size, epochs=50,
                            validation_split=.2, callbacks=[early_stopping, plateau],
                            verbose=0)
            # evaluate() returns the loss followed by the compiled metrics;
            # only the loss is kept.
            val_loss = train_model.evaluate(X_val, y_val, verbose=0)[0]
            valid_loss_results.append([op_dim, lr, b_size, val_loss])
            print('Loop ',op_dim,lr,b_size, ' is completed')

In [None]:
# Shallow copy of the grid-search rows, so later cells work on their own list.
results = list(valid_loss_results)

In [None]:
# Tabulate the grid-search runs, ordered from lowest to highest validation loss.
results_table = pd.DataFrame(
    results,
    columns=["op_dim", "learn_rate", "batch_size", "validation_loss"],
)
results_table.sort_values("validation_loss")

In [None]:
# Train the final network several times and accumulate its test-set class
# probabilities in `y_pred`; a later cell divides the sum by the number of runs.
# NOTE(review): embedding width 4, learning rate 1e-4 and batch size 256 are
# hard-coded — presumably taken from the grid-search table; confirm.

# Accumulator sized from the test frame rather than a hard-coded row count.
y_pred = np.zeros((len(test_data), 9))
loss = []
val_loss = []
k = range(1, 6)

for i in k:
    iter_model = keras.Sequential([
        # Shape is given as a tuple, matching the grid-search model.
        layers.Input(shape=(75,)),
        layers.Embedding(18, 4, input_length=75),
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.Dense(9, activation='softmax'),
    ])

    # Use the tensorflow.keras optimizer with `learning_rate`, as in the grid
    # search; the `lr` alias is deprecated and removed in newer Keras releases.
    iter_model.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=.0001),
        metrics=['accuracy'],
    )

    # Fit on all training rows; Keras holds out 10% for the monitored val_loss.
    history = iter_model.fit(x_train_data, y_train_final, epochs=50,
                             validation_split=.1,
                             callbacks=[early_stopping, plateau],
                             batch_size=256, verbose=0)
    y_pred += iter_model.predict(test_data)
    # Record the per-epoch average of training and validation loss for this run.
    loss.append(np.mean(history.history['loss']))
    val_loss.append(np.mean(history.history['val_loss']))
    print("Iteration %i completed" % i)

In [None]:
# Report the per-run average training and validation losses.
for template, values in (('Loss: {}', loss), ('Validation loss: {}', val_loss)):
    print(template.format(values))

In [None]:
# Turn the summed probabilities into a mean over the training runs.
preds_test = np.divide(y_pred, len(k))
preds_test

In [None]:
# Put the averaged neural-network probabilities into submission layout:
# an `id` column followed by one probability column per class.
predictions_dl = pd.DataFrame(
    preds_test,
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
             'Class_6', 'Class_7', 'Class_8', 'Class_9'],
)
# Take the ids from the test frame's index (set from its `id` column) instead
# of a hard-coded range, so they always match the rows that were predicted.
predictions_dl.insert(0, 'id', test_data.index.to_numpy())
predictions_dl

In [None]:
# Write the neural-network submission file, leaving out the DataFrame index.
predictions_dl.to_csv(path_or_buf='predictions_dl.csv', index=False)
print("Submission was successfully saved!")

Let us train and test using an XGBoost model.

In [None]:
# `y_train_data` is already integer-encoded via `cls_enc`
# (Class_1 -> 0 ... Class_9 -> 8). pd.factorize numbers values by order of
# first appearance, which would scramble that encoding and leave the
# predict_proba columns of the models trained on it out of step with the
# Class_1..Class_9 column names. Use the existing codes directly.
y_train_gbc = y_train_data.to_numpy()

In [None]:
# Create an XGBoost classifier with the library's default settings.
import xgboost as xgb
xgbc = xgb.XGBClassifier()

In [None]:
# Confirm the label array has one entry per training row.
np.shape(y_train_gbc)

In [None]:
# Draw a fresh stratified 80/20 split, this time with integer labels,
# and fit the XGBoost classifier on the training part.
# Note: this rebinds X_train / X_val / y_train / y_val from the earlier split.
X_train, X_val, y_train, y_val = tts(
    x_train_data,
    y_train_gbc,
    test_size=0.2,
    stratify=y_train_data,
)
xgbc.fit(X_train, y_train)

In [None]:
from sklearn.metrics import log_loss
# Multi-class log loss of XGBoost on the validation split.
# Stored under its own name: `y_pred` holds the accumulated neural-network
# test predictions and must not be overwritten by this validation output.
val_pred_xgb = xgbc.predict_proba(X_val)
log_loss(y_val, val_pred_xgb)

In [None]:
# Class probabilities for the test set from the fitted XGBoost model.
preds_test_xgb = xgbc.predict_proba(test_data)
print(preds_test_xgb)

In [None]:
# Label the XGBoost probability columns with the class names.
predictions_xgb = pd.DataFrame(
    preds_test_xgb,
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
             'Class_6', 'Class_7', 'Class_8', 'Class_9'],
)
predictions_xgb

In [None]:
# Build the XGBoost submission frame: `id` followed by the class probabilities.
# It is rebuilt from the raw probabilities so the cell can be re-run safely,
# instead of re-wrapping the existing DataFrame.
predictions_xgb = pd.DataFrame(
    preds_test_xgb,
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
             'Class_6', 'Class_7', 'Class_8', 'Class_9'],
)
# Take the ids from the test frame's index (set from its `id` column) instead
# of a hard-coded range.
predictions_xgb.insert(0, 'id', test_data.index.to_numpy())
predictions_xgb

In [None]:
# Write the XGBoost submission file, leaving out the DataFrame index.
predictions_xgb.to_csv(path_or_buf='predictions_xgb.csv', index=False)
print("Submission was successfully saved!")

Lastly, we train and test using a LogisticRegression model.

In [None]:
from sklearn.linear_model import LogisticRegression
# Multinomial logistic regression, fitted on the split that the XGBoost cell
# created (X_train with integer labels in y_train).
lrc = LogisticRegression(
    multi_class="multinomial",
    max_iter=500,
)
lrc.fit(X_train, y_train)

In [None]:
# Validation-split probabilities from logistic regression: show their shape,
# the values themselves, and the resulting multi-class log loss.
preds = lrc.predict_proba(X_val)
print(preds.shape)
print(preds)
print(log_loss(y_val, preds))

In [None]:
# Class probabilities for the test set from the fitted logistic regression.
preds_test_lr = lrc.predict_proba(test_data)
print(preds_test_lr)

In [None]:
# Build the logistic-regression submission frame: `id` followed by one
# probability column per class.
predictions_lr = pd.DataFrame(
    preds_test_lr,
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
             'Class_6', 'Class_7', 'Class_8', 'Class_9'],
)
# Take the ids from the test frame's index (set from its `id` column) instead
# of a hard-coded range.
predictions_lr.insert(0, 'id', test_data.index.to_numpy())
predictions_lr

In [None]:
# Write the logistic-regression submission file, leaving out the DataFrame index.
predictions_lr.to_csv(path_or_buf='predictions_lr.csv', index=False)
print("Submission was successfully saved!")

Our final predictions will be the ensembled predictions (the average of all three of our models).

In [None]:
# Ensemble: average the class probabilities of the three models row by row.
# Each frame is indexed on `id` first, so the identifier is carried through
# unchanged instead of being summed, divided by 3 and cast back to int.
probs_dl = predictions_dl.set_index('id')
probs_xgb = predictions_xgb.set_index('id')
probs_lr = predictions_lr.set_index('id')
comb_predictions = ((probs_dl + probs_xgb + probs_lr) / 3).reset_index()
comb_predictions

In [None]:
# Write the ensembled submission file, leaving out the DataFrame index.
comb_predictions.to_csv(path_or_buf='my_submission.csv', index=False)
print("Submission was successfully saved!")