In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
# Label encoder instance; used further down to convert the label column to integer classes
le = LabelEncoder()

In [None]:
# Load the survey CSV into a DataFrame and preview its first rows
csv_path = '/kaggle/input/gender-classification/Transformed Data Set - Sheet1.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Summary statistics for every column; the per-column count shows whether any values are missing
data.describe()

The table above shows that there are no missing values. However, because the data has only a few features and only 66 rows, I suspect it contains anomalies: rows that share the same feature values but carry different labels.

In [None]:
# List all the anomalies: feature combinations that occur with more than one distinct label.
# Every column except the last one is treated as a feature; 'Gender' is the label.
feature_cols = list(data.columns[:-1])
# Built-in grouped nunique instead of groupby.apply with a lambda (same values, no per-group Python call)
grouping = data.groupby(feature_cols)['Gender'].nunique()
# "> 1" rather than "== 2" so the check does not assume exactly two label values
grouping[grouping.gt(1)]

As the result shows, there are 2 combinations of feature values that appear with different labels. One of them is shown below.

In [None]:
# Show every row that matches one specific combination of the four answers
is_cool = data['Favorite Color'] == 'Cool'
is_rock = data['Favorite Music Genre'] == 'Rock'
is_vodka = data['Favorite Beverage'] == "Vodka"
is_cola = data['Favorite Soft Drink'] == 'Coca Cola/Pepsi'
data[is_cool & is_rock & is_vodka & is_cola]

In this case, there is nothing we can do for now. If the conflict is caused by a mistake during data entry (human error), we could drop those rows. However, we can't simply do that, because it would bias our model: in reality, it is perfectly reasonable for both genders to share the same interests. Alternatively, we could add more features (ask more questions in the questionnaire).

## Preprocess the Data

Because the data is still textual and categorical, we convert it to numeric values using one-hot encoding so that it can be used as input to our neural network.

In [None]:
# Split the features (every column but the last) from the labels (last column)
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# One-hot encode the features and map the labels to integer classes (0/1 for two genders).
# The dtype is pinned to float32: the default get_dummies dtype differs between pandas
# versions (bool in pandas 2.x), while the network below declares a float32 input.
X = pd.get_dummies(X, dtype='float32')
y = le.fit_transform(y)

In [None]:
# Preview the one-hot encoded feature matrix
X.head()

In [None]:
# (rows, columns) of the encoded feature matrix; the column count is the network's input width
print("Shape of new data: ", X.shape)

## Train the Model using KFold (Split = 5)

Since our data contains anomalies, I prefer to use K-fold cross-validation and look for the train-test split that gives the best accuracy on both the train and test sets.

In [None]:
def train_model(X_train, X_test, y_train, y_test):
  """Fit a small binary classifier and score it on both splits.

  Architecture: Input -> Dense(1024, relu) -> Dropout(0.4) -> Dense(1, sigmoid),
  compiled with Adam (learning rate 1e-4) and binary cross-entropy, trained for
  at most 1000 epochs.

  Note: (X_test, y_test) is passed to `fit` as the validation set, so it drives
  both the learning-rate reduction and early stopping. The "test" metrics
  returned here are therefore measured on data that influenced training.

  Args:
    X_train: 2-D feature matrix used for fitting (rows = samples).
    X_test: 2-D feature matrix used for validation and final scoring.
    y_train: binary labels matching X_train.
    y_test: binary labels matching X_test.

  Returns:
    Tuple (model, history, tr_loss, tr_acc, loss, accuracy): the fitted model,
    the Keras History object from `fit`, loss/accuracy on the training data and
    loss/accuracy on the test data.
  """
  # Take the input width from the data instead of hard-coding 20, and pass a
  # real tuple: `(20)` is just the integer 20, not a one-element shape.
  n_features = np.shape(X_train)[1]

  model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(n_features,), dtype='float32'),
        tf.keras.layers.Dense(units=1024, activation='relu'),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(units=1, activation='sigmoid')
  ])

  # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
  # newer Keras releases no longer accept.
  model.compile(optimizer=Adam(learning_rate=0.0001),
                loss='binary_crossentropy',
                metrics=['accuracy'])

  # Callback to reduce learning rate if no improvement in validation loss for certain number of epochs
  reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-8, verbose=0)
  # Callback to stop training if no improvement in validation loss for certain number of epochs
  early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0)

  history = model.fit(
    X_train, y_train,
    epochs=1000,
    validation_data=(X_test, y_test),
    callbacks=[reduce_lr, early_stop],
    verbose=0
  )

  # Score the final weights (those at the epoch where training stopped) on both splits
  tr_loss, tr_acc = model.evaluate(X_train, y_train)
  loss, accuracy = model.evaluate(X_test, y_test)

  return model, history, tr_loss, tr_acc, loss, accuracy

In [None]:
# 5-fold cross-validation: train a fresh model on every fold and remember the
# fold whose test accuracy is the highest.
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

# Per-fold metrics, appended in fold order
loss_arr = []
acc_arr = []
trloss_arr = []
tracc_arr = []

# Start below any attainable accuracy so the first fold is always recorded;
# with 0 and a strict ">", a run where every fold scored 0 would leave
# train_index / test_index / best_history undefined and save no model file.
temp_acc = float('-inf')

for train, test in kfold.split(data):
  # `train` / `test` are positional row indices into data (and therefore X and y)
  model, history, trloss_val, tracc_val, loss_val, acc_val = train_model(X.iloc[train], X.iloc[test], y[train], y[test])
  if acc_val > temp_acc:
    print("Model changed")
    temp_acc = acc_val
    # Persist the best model so far together with the split that produced it
    model.save('best_model.h5')
    train_index = train
    test_index = test
    best_history = history
  trloss_arr.append(trloss_val)
  tracc_arr.append(tracc_val)
  loss_arr.append(loss_val)
  acc_arr.append(acc_val)

In [None]:
# Compile the train and test accuracy of every K-fold iteration into one table.
# Rows are numbered from 1; the count follows the collected results instead of
# hard-coding five folds.
pd.DataFrame({
    'Train Accuracy': tracc_arr,
    'Test Accuracy': acc_arr},
    index=range(1, len(acc_arr) + 1))

We have saved our best model along with the indices of its train-test split, so let's load them and use them in a different scenario.

In [None]:
# Row positions of the train/test split stored by the K-fold loop for its best fold
print("Train Index (Best Split): ", train_index)
print("Test Index (Best Split): ", test_index)

## Import the Best Data from K-Fold Iteration

In [None]:
# Reload the model that the K-fold loop saved to 'best_model.h5'
modeltf = tf.keras.models.load_model('best_model.h5')

In [None]:
# Plot the architecture of the reloaded best model. `model` would be whatever the
# last K-fold iteration left behind, not necessarily the model that was saved.
plot_model(modeltf, show_shapes=True)

In [None]:
# Layer-by-layer summary of the reloaded model
modeltf.summary()

In [None]:
# Learning curves of the best fold: one figure per metric (accuracy, then loss),
# comparing the training data with the validation data (the test fold) per epoch.
for metric in ('accuracy', 'loss'):
    fig, ax = plt.subplots()
    ax.plot(best_history.history[metric])
    ax.plot(best_history.history['val_' + metric])
    ax.set_title('model ' + metric)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# Re-score the reloaded model on the rows of the best K-fold split
best_train_X, best_train_y = X.iloc[train_index], y[train_index]
best_test_X, best_test_y = X.iloc[test_index], y[test_index]

train_loss, train_acc = modeltf.evaluate(best_train_X, best_train_y)
test_loss, test_acc = modeltf.evaluate(best_test_X, best_test_y)

# Report both splits
print("\n==============================")
print("Train Accuracy: ", train_acc)
print("Train Loss: ", train_loss)
print("==============================")
print("Test Accuracy: ", test_acc)
print("Test Loss: ", test_loss)

In [None]:
# Sigmoid outputs for the best test fold; anything above 0.5 counts as the positive class
test_scores = modeltf.predict(X.iloc[test_index])
y_pred = test_scores > 0.5

In [None]:
# Actual vs. predicted class side by side, indexed by the original row position
actual_labels = y[test_index]
predicted_labels = y_pred.astype(int).reshape(-1)
compare_res = pd.DataFrame(
    {'Y test': actual_labels, 'Y pred': predicted_labels},
    index=test_index,
)
compare_res

Let's take a look at the data that is wrongly classified.

In [None]:
# Row positions where the prediction disagrees with the actual label
is_mismatch = compare_res['Y test'].ne(compare_res['Y pred'])
wrong_res_index = compare_res.index[is_mismatch].values

# Every selected row is a wrong prediction, so the predicted gender is simply
# the opposite of the recorded one.
wrong_rows = data.iloc[wrong_res_index, :]
gender_pred = np.where(wrong_rows['Gender'] == 'F', 'M', 'F')
wrong_res = wrong_rows.assign(Gender_Pred=gender_pred)
wrong_res

Index 22 above is wrongly classified (the model predicts the opposite of the actual value). If we look at the training rows that have the same feature values as index 22, we can see that it belongs to one of our data anomalies (the same feature values with 2 different labels). Because M appears more often than F among those training rows, the model predicts M (as shown above).

In [None]:
# Training rows (best split) that share the Cool / Rock / Vodka / Coca Cola-Pepsi profile
same_profile = (
    data['Favorite Color'].eq('Cool')
    & data['Favorite Music Genre'].eq('Rock')
    & data['Favorite Beverage'].eq("Vodka")
    & data['Favorite Soft Drink'].eq('Coca Cola/Pepsi')
)
in_train = data.index.isin(train_index)
data[in_train & same_profile]

If we look for training rows that have the same feature values as index 38 (the other wrongly classified row), we find none.

In [None]:
# Training rows (best split) that share the Cool / Pop / Whiskey / Fanta profile
same_profile = (
    data['Favorite Color'].eq('Cool')
    & data['Favorite Music Genre'].eq('Pop')
    & data['Favorite Beverage'].eq("Whiskey")
    & data['Favorite Soft Drink'].eq('Fanta')
)
in_train = data.index.isin(train_index)
data[in_train & same_profile]

In [None]:
# Confusion matrix for the best test fold, transposed so that columns are the
# actual labels and rows are the predicted labels (matching the axis titles).
cm = confusion_matrix(y[test_index], y_pred)
ax = sns.heatmap(cm.T, square=True, annot=True, fmt='d', cbar=False, cmap="YlGnBu")
ax.set_xlabel('Actual Label')
ax.set_ylabel('Predicted Label')

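This part is for fun only. Let's try to evaluate the model performance using a different split produced by train_test_split! :D Keep in mind that this new test set can contain rows the model already saw during K-fold training, so the score below is not a measure of generalization.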

In [None]:
# Alternative 80/20 split of the same data, independent of the K-fold split used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Score the already-trained model on the train_test_split scenario.
# Caution: the model was fitted on a K-fold training subset of these same rows,
# so this "test" set may contain rows it has already seen; do not read the
# result as performance on unseen data.
(train_loss, train_acc), (test_loss, test_acc) = (
    modeltf.evaluate(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print("\n==============================")
print("Train Accuracy: ", train_acc)
print("Train Loss: ", train_loss)
print("==============================")
print("Test Accuracy: ", test_acc)
print("Test Loss: ", test_loss)

## **OMG! WE GOT 1.0 ACCURACY SCORE IN THE TEST SET XD**