In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

First, we load the training data as "train" and "train_orig". "train_orig" is an untouched copy of "train", so if we do something wrong we can still access the original training data.

In [None]:
# Read the training set once and keep an independent deep copy as `train_orig`,
# so the raw data stays available after `train` is modified.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
train_orig = train.copy()

We look at the first five rows of our data. This way we can get familiar with the features.

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

We are looking at correlations. The important thing for us is the "Survived" column that we will predict.

The closer the absolute value of the correlation is to 1, the stronger the linear relationship between the two variables. (The sign does not affect the strength; it only indicates the direction of the relationship.)

In [None]:
# Pairwise correlations between the numeric columns only. The frame still holds
# string columns at this point, and recent pandas versions raise when asked to
# correlate non-numeric data instead of silently skipping it.
train.select_dtypes(include="number").corr()

In order for our model to work and perform better, we have to deal with missing data.

In [None]:
# Per column: does it contain at least one missing value?
train.isna().any()

Three features have missing values. Let's use the "pandas_profiling" library for a closer look. (It didn't work on my personal computer due to version problems, but I will share the file with you.)

In [None]:
from pandas_profiling import ProfileReport
# Build a profiling report for the training frame and write it to output.html.
# NOTE(review): pandas_profiling appears to be published as ydata-profiling in
# newer releases — confirm which package is installed in this environment.
prof = ProfileReport(train)
prof.to_file(output_file='output.html')

In [None]:
# Show the rows whose Age is missing.
train.loc[train["Age"].isna()]

In [None]:
import seaborn as sns

# Density histogram with a KDE overlay of Age.
# `sns.distplot` is deprecated and removed in recent seaborn releases;
# `histplot(..., kde=True, stat="density")` draws the equivalent figure.
sns.histplot(train["Age"], kde=True, stat="density")

In [None]:
# Largest value in the Age column.
train["Age"].max()

In [None]:
# Smallest value in the Age column.
train["Age"].min()

We fill the missing Age values with the median. Alternatively, we could fill them with the mean. Which one is better depends on the distribution, but below you can see that the distribution does not change much either way.

In [None]:
# Median of the Age column (the value used to fill the gaps in `train`).
train["Age"].median()

In [None]:
# Replace missing ages in `train` with the column median.
age_median = train["Age"].median()
train["Age"] = train["Age"].fillna(age_median)

In [None]:
# Age distribution after median imputation.
# `sns.distplot` is deprecated and removed in recent seaborn releases;
# `histplot(..., kde=True, stat="density")` draws the equivalent figure.
sns.histplot(train["Age"], kde=True, stat="density")

In [None]:
# Mean of the Age column in the untouched copy of the training data.
train_orig["Age"].mean()

In [None]:
# Mean imputation on the untouched copy. The mean must come from `train_orig`
# itself: `train["Age"]` has already been median-filled, so its mean is not the
# mean of the observed ages and would make the mean-vs-median comparison invalid.
train_orig["Age"] = train_orig["Age"].fillna(train_orig["Age"].mean())

In [None]:
# Age distribution after mean imputation, for comparison with the median version.
# `sns.distplot` is deprecated and removed in recent seaborn releases;
# `histplot(..., kde=True, stat="density")` draws the equivalent figure.
sns.histplot(train_orig["Age"], kde=True, stat="density")

We fill the missing Embarked values with "S". I prefer it because it is the most frequent value.

In [None]:
# Show the rows whose Embarked value is missing.
train.loc[train["Embarked"].isna()]

In [None]:
# Fill missing Embarked values with "S".
# Assigning the whole column avoids chained indexing (`df[col][mask] = ...`),
# which pandas may apply to a temporary copy and leave `train` unchanged.
train["Embarked"] = train["Embarked"].fillna("S")

In [None]:
# Re-check which columns still contain missing values after the fills.
train.isna().any()

We're dropping the Cabin column because it has too many missing values.

In [None]:
# Remove the Cabin column from the training frame.
train = train.drop(columns="Cabin")

In [None]:
# Remove the Name column from the training frame.
train = train.drop(columns="Name")

# We do the same for test data as we do on train data.

In [None]:
# Read the test set once and keep an independent deep copy as `test_orig`,
# so the raw data stays available after `test` is modified.
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test_orig = test.copy()
test.head()

In [None]:
# Per column: does the test frame contain at least one missing value?
test.isna().any()

In [None]:
# Density histogram with a KDE overlay of Age in the test set.
# `sns.distplot` is deprecated and removed in recent seaborn releases;
# `histplot(..., kde=True, stat="density")` draws the equivalent figure.
sns.histplot(test["Age"], kde=True, stat="density")

In [None]:
# Replace missing ages in `test` with the test column's own median.
test_age_median = test["Age"].median()
test["Age"] = test["Age"].fillna(test_age_median)

In [None]:
# Test-set Age distribution after median imputation.
# `sns.distplot` is deprecated and removed in recent seaborn releases;
# `histplot(..., kde=True, stat="density")` draws the equivalent figure.
sns.histplot(test["Age"], kde=True, stat="density")

In [None]:
# Show the test rows whose Fare is missing.
test.loc[test["Fare"].isna()]

In [None]:
# Median fare among passengers travelling in Pclass 3.
test.loc[test["Pclass"] == 3, "Fare"].median()

In [None]:
# Fill missing Fare values with the median fare of Pclass 3 passengers.
# Assigning the whole column avoids chained indexing (`df[col][mask] = ...`),
# which pandas may apply to a temporary copy and leave `test` unchanged.
third_class_fare_median = test.loc[test["Pclass"] == 3, "Fare"].median()
test["Fare"] = test["Fare"].fillna(third_class_fare_median)

In [None]:
# Remove the Cabin column from the test frame.
test = test.drop(columns="Cabin")

In [None]:
# Remove the Name column from the test frame.
test = test.drop(columns="Name")

In [None]:
# Column labels remaining in the training frame.
train.columns

In [None]:
# Column labels remaining in the test frame.
test.columns

In [None]:
# Distinct values of Sex in the training frame.
train["Sex"].unique()

In [None]:
# Distinct values of Sex in the test frame.
test["Sex"].unique()

In [None]:
# Data type of each training column; object columns still need encoding.
train.dtypes

We want to give numerical values to our model, so we convert the object-type columns to numeric values.
We use one-hot encoding for columns with few distinct values.
We use label encoding for columns with many distinct values.

In [None]:
# Columns grouped by the encoding applied to them:
# Ticket is label encoded, Sex and Embarked are one-hot encoded.
columns_to_label = ["Ticket"]
columns_to_onehot = ["Sex", "Embarked"]

In [None]:
# Look at the raw Ticket column before it is encoded.
train["Ticket"]

In [None]:
# One-hot encode the columns listed in `columns_to_onehot`.
# The dtype is pinned to uint8 because pandas >= 2.0 produces boolean dummy
# columns by default; mixed with the numeric columns these yield an
# object-dtype matrix when the frame is converted to an array for the model.
train = pd.get_dummies(train, columns=columns_to_onehot, dtype="uint8")

In [None]:
# Display the training frame after one-hot encoding.
train

In [None]:
# One-hot encode the columns listed in `columns_to_onehot`.
# The dtype is pinned to uint8 because pandas >= 2.0 produces boolean dummy
# columns by default; mixed with the numeric columns these yield an
# object-dtype matrix when the frame is converted to an array for the model.
test = pd.get_dummies(test, columns=columns_to_onehot, dtype="uint8")

In [None]:
# Display the test frame after one-hot encoding.
test

In [None]:
# Data type of each test column after one-hot encoding.
test.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder instance used for the Ticket column.
le= LabelEncoder()


In [None]:
# Label-encode Ticket with ONE mapping shared by train and test.
# Refitting the encoder on the test set would give the same ticket string
# different integer codes in the two frames, so the feature the model learned
# on would not mean the same thing at prediction time.
le.fit(pd.concat([train["Ticket"], test["Ticket"]]))
train["Ticket"] = le.transform(train["Ticket"])
test["Ticket"] = le.transform(test["Ticket"])

In [None]:
# Keep the test PassengerId column aside before it is dropped from the features.
pi = test["PassengerId"]

We keep PassengerId separately so that we can use it for the submission.

In [None]:
# Remove the PassengerId column from the training frame.
train = train.drop(columns="PassengerId")

In [None]:
# Remove the PassengerId column from the test frame.
test = test.drop(columns="PassengerId")

In [None]:
# Separate the target (Survived) from the training features;
# the test frame is used as the prediction features unchanged.
y_train = train["Survived"]
X_train = train.drop(columns="Survived")
X_test = test

In [None]:
# Display the training feature matrix.
X_train

In [None]:
# Display the training target (the Survived column).
y_train

In [None]:
# Display the test feature matrix.
X_test

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers as L

We build our neural network model. Going from a high neuron count to a low one usually improves performance.

We use dropout to help avoid overfitting.

In [None]:
# Fully connected binary classifier. The input width is read from X_train
# instead of being hard-coded, so the network keeps matching the feature
# matrix if columns are added or removed during preprocessing.
model = Sequential(name='titanic_model')

model.add(L.InputLayer(input_shape=(X_train.shape[1],))) # necessary to use model.summary()

# Wide-to-narrow hidden stack; each of these layers is followed by dropout.
for units in (2048, 1024, 512, 256, 128):
    model.add(L.Dense(units, activation='relu'))
    model.add(L.Dropout(0.4))

# The last two hidden layers are used without dropout.
for units in (64, 32):
    model.add(L.Dense(units, activation='relu'))

model.add(L.Dense(1, activation='sigmoid')) # output layer, use sigmoid for binary

model.summary()

In [None]:
# Binary cross-entropy loss, Adam with a 1e-4 learning rate, accuracy as the metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the whole training set: 500 epochs with mini-batches of 16.
# The returned History object holds the per-epoch metrics.
history = model.fit(x=X_train, y=y_train, epochs=500, batch_size=16)

In [None]:
import matplotlib.pyplot as plt
print(history.history.keys())
# Plot the per-epoch training loss and training accuracy on shared axes.
# Each curve carries its own label so the legend cannot mislabel the loss
# curve, and the title/y-label describe both series rather than accuracy only.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['accuracy'], label='accuracy')
plt.title('training loss and accuracy')
plt.ylabel('value')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()


In [None]:
# Predict survival probabilities for the prepared test features.
# X_test is the variable assigned for this purpose alongside X_train/y_train.
preds = model.predict(X_test)

In [None]:
# Display the raw sigmoid outputs returned by the model.
preds

In [None]:
# Build the submission from the PassengerId column saved from the test set, so
# every prediction is paired with the id of the row it was computed from
# instead of relying on the sample file having the same row order.
# Probabilities below 0.5 map to 0, everything else to 1.
submission = pd.DataFrame({
    "PassengerId": pi.to_numpy(),
    "Survived": np.where(np.ravel(preds) < 0.5, 0, 1),
})
submission.head(20)

In [None]:
from IPython.display import FileLink


# Write the submission without the index column and show a download link to it.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
FileLink(output_path)