In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for file_path in input_file_paths:
    print(file_path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Plot styling: seaborn's 'whitegrid' is set first, then matplotlib's 'fivethirtyeight' style sheet is applied
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

In [None]:
# Read the competition data; the dataset folder is named once so both reads stay in sync
DATA_DIR = '../input/tabular-playground-series-apr-2021'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')


In [None]:
# Quick look at the training data: its dimensions, a blank line, then the first rows
print(train.shape, end='\n\n')
train.head()

In [None]:
# Count the missing values in each training column
train.isna().sum()

In [None]:
# Impute missing ages with the mean of the Age column
mean_age = train['Age'].mean()
train['Age'] = train['Age'].fillna(mean_age)
train.head()

In [None]:
# Impute missing fares with the mean of the Fare column
mean_fare = train['Fare'].mean()
train['Fare'] = train['Fare'].fillna(mean_fare)
train.head()

In [None]:
# Replace missing Embarked values with the value that occurs most often in the column
embarked_counts = train['Embarked'].value_counts()
most_common_port = embarked_counts.index[0]
train['Embarked'] = train['Embarked'].fillna(most_common_port)
train['Embarked'].value_counts()

In [None]:
# Reduce every Cabin entry to its first character
cabin_letter = train['Cabin'].str[0]

# Missing cabins take the most frequent first character
most_common_letter = cabin_letter.value_counts().index[0]
train['Cabin'] = cabin_letter.fillna(most_common_letter)
train['Cabin'].value_counts()

In [None]:
# Separate the predictors from the target
non_feature_cols = ['PassengerId', 'Name', 'Ticket']  # left out of the feature table
X = train.drop(columns=non_feature_cols).copy()
y = X.pop('Survived')  # pop removes the label column from X and returns it
X

In [None]:
# Integer-encode every object-dtype column with pandas' factorize
object_columns = list(X.select_dtypes('object').columns)
for colname in object_columns:
    codes, _ = X[colname].factorize()
    X[colname] = codes


# Boolean mask marking the integer-typed columns as discrete
# (double-check the dtypes before handing this mask to the MI estimator!)
discrete_features = X.dtypes == int

In [None]:
# calculate MI scores for our features
# 'Survived' is a binary class label, so the classification estimator is the right one here
from sklearn.feature_selection import mutual_info_classif

def make_mi_scores(X, y, discrete_features, random_state=0):
    """Return the mutual information between each feature and a class label.

    Parameters
    ----------
    X : DataFrame of numerically encoded features.
    y : discrete target (class labels).
    discrete_features : boolean mask (one entry per column of X) marking discrete features.
    random_state : seed forwarded to the estimator so repeated runs give the same scores.

    Returns
    -------
    pandas Series named "MI Scores", indexed by feature name, sorted from highest to lowest.
    """
    mi_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores[::3]  # show a few features with their MI scores

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the given scores, lowest bar at position 0."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    # Label each bar with the index entry it belongs to
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

In [None]:
# Rebuild the feature table, this time also leaving out Parch, Age and SibSp
excluded_cols = ['PassengerId', 'Name', 'Ticket', 'Parch', 'Age', 'SibSp']
X = train.drop(columns=excluded_cols).copy()
y = X.pop('Survived')

# Label encoding for categoricals
for colname in list(X.select_dtypes('object').columns):
    X[colname] = X[colname].factorize()[0]


from sklearn.preprocessing import MinMaxScaler

# Scale every feature into the range [0, 1]; the scaler is fitted on all rows of X
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit(X).transform(X)

rescaledX

In [None]:
# Values of the target class
survived_class = [1, 0]

# Names of the model inputs and of the target column
features = [
    'Sex',
    'Pclass',
    'Fare',
    'Embarked',
    'Cabin',
]
label = 'Survived'


In [None]:
from sklearn.model_selection import train_test_split


# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable
split_parts = train_test_split(rescaledX, y, test_size=0.30, random_state=42)
x_train, x_test, y_train, y_test = split_parts

print ('Training Set: %d, Test Set: %d \n' % (len(x_train), len(x_test)))

In [None]:
# Cast both feature arrays to 32-bit floats
x_train, x_test = (part.astype('float32') for part in (x_train, x_test))

# Turn each label set into a column vector of shape (n, 1)
y_train, y_test = (np.asarray(part).reshape((-1, 1)) for part in (y_train, y_test))

print('Ready...')

In [None]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import utils
from tensorflow.keras import optimizers

# Set TensorFlow's global random seed for reproducibility
tensorflow.random.set_seed(0)

# Report the library versions in use
print("Libraries imported.")
print('Keras version:',keras.__version__)
print('TensorFlow version:',tensorflow.__version__)

In [None]:
# Define a classifier network: two hidden ReLU layers and one sigmoid output unit
hl = 10 # Number of hidden layer nodes
n_inputs = x_train.shape[1]  # take the input width from the data rather than a hand-maintained list

model = Sequential()
model.add(Dense(hl, input_dim=n_inputs, activation='relu'))
# Later layers infer their input size from the previous layer
model.add(Dense(hl, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
#hyper-parameters for optimizer
learning_rate = 0.001
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated and rejected by newer Keras releases
opt = optimizers.Adam(learning_rate=learning_rate)

model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# Train the model over 50 epochs using 10-observation batches and using the test holdout dataset for validation
num_epochs = 50
history = model.fit(x_train, y_train, epochs=num_epochs, batch_size=10, validation_data=(x_test, y_test))

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

epoch_nums = range(1,num_epochs+1)
training_loss = history.history["loss"]
validation_loss = history.history["val_loss"]
plt.plot(epoch_nums, training_loss)
plt.plot(epoch_nums, validation_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['training', 'validation'], loc='upper right')
plt.show()

In [None]:
# Import here so this cell also runs on a fresh kernel (Restart & Run All)
from sklearn.metrics import confusion_matrix

# The network has a single sigmoid output, so threshold at 0.5 to get hard 0/1 predictions
class_probabilities = model.predict(x_test)
predictions = np.where(class_probabilities > 0.5, 1, 0)

# y_test already holds 0/1 labels
true_labels = y_test
cm = confusion_matrix(true_labels, predictions)
cm

In [None]:
# Scratch check: prints np.arange(2), i.e. the array [0 1]
print (np.arange(2))

In [None]:
# Tensorflow doesn't have a built-in confusion matrix metric, so we'll use SciKit-Learn
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline


class_probabilities = model.predict(x_test)
predictions = np.where(class_probabilities > 0.5, 1, 0)  # np.argmax(class_probabilities, axis=1)
true_labels = y_test  # np.argmax(y_test, axis=1)

# Plot the confusion matrix
cm = confusion_matrix(true_labels, predictions)
plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
plt.colorbar()
tick_marks = np.arange(2)
plt.xticks(tick_marks,  rotation=85)
plt.yticks(tick_marks)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()

In [None]:
# fill NaN with mean value
test['Age'] = test['Age'].fillna(test['Age'].mean())

# fill NaN with mean values
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

# fill NaN for Embarked column to the most frequent value in the column
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].value_counts().index[0])

# get first character of Cabin (applied to the test frame, like the training cell does for train)
test['Cabin'] = test['Cabin'].str[0]
# fill NaN to the most frequent value in the column
test['Cabin'] = test['Cabin'].fillna(test['Cabin'].value_counts().index[0])

# Encode the categorical model inputs with the SAME integer codes used for training.
# Training encoded these columns with factorize() (codes follow order of first appearance),
# so the category order is recovered from the train frame and reused here.
# A value never seen in training gets code -1.
for colname in ['Sex', 'Embarked', 'Cabin']:
    _, train_categories = train[colname].factorize()
    test[colname] = pd.Categorical(test[colname], categories=train_categories).codes



In [None]:
# Integer-encode whatever object-dtype columns remain in the test frame
remaining_object_cols = list(test.select_dtypes('object').columns)
for colname in remaining_object_cols:
    codes, _ = test[colname].factorize()
    test[colname] = codes

test.head()

In [None]:
# Keep only the columns the model was trained on
test_file = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Parch', 'Age', 'SibSp'])

# Reuse the scaler fitted on the training features: transform only, never refit on test data,
# otherwise the test values are mapped with different min/max than the model saw in training
rescaled_test = scaler.transform(test_file)
rescaled_test

In [None]:
# Count the missing values in each test column
test.isna().sum()

In [None]:
# Run the trained network on the scaled test features; the sigmoid output layer yields one value per row
final_predictions = model.predict(rescaled_test)

In [None]:
# Display the raw model outputs before thresholding
final_predictions

In [None]:
# Threshold the probabilities at 0.5 and flatten the result to a 1-D vector of 0/1 values
above_threshold = final_predictions > 0.5
final_predictions = np.where(above_threshold, 1, 0).ravel()

In [None]:
# Display the thresholded 0/1 predictions
final_predictions

In [None]:
# Tally how many rows were predicted 0 and how many were predicted 1
num_zeros, num_ones = ((final_predictions == outcome).sum() for outcome in (0, 1))
print (num_zeros)
print (num_ones)

In [None]:
# Pair each PassengerId with its predicted label, then count the rows per predicted class
prediction_table = {'PassengerId': test['PassengerId'], 'Survived': final_predictions}
output = pd.DataFrame(prediction_table)
output.groupby('Survived').count()

In [None]:
# Assemble the submission table and write it to CSV without the index column
submission_columns = {'PassengerId': test['PassengerId'], 'Survived': final_predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")