In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt

Hey! This is the first neural network I have ever written. My goal was not to reach the highest possible accuracy; I just wanted to understand what neural networks are.

**I would be glad to receive constructive criticism! :)**

# **Data analysis and preparation**

In [None]:
# Read the train and test splits, then keep both frames in a list so that
# later cells can apply the same preprocessing step to each of them in a loop.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)
data = [train_data, test_data]

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Preview the first 15 rows of the raw training frame.
train_data.head(15)

In [None]:
# Preview the first 15 rows of the raw test frame.
test_data.head(15)

**The first thing that comes to mind is to change the data in the "Sex" column. I can replace "male" with 1 and "female" with 0:**

In [None]:
# Sanity check before encoding: list the distinct values of "Sex" in the
# training frame to confirm that there are only two of them.
train_data.Sex.unique()

In [None]:
# Encode "Sex" as an integer feature in both frames: male -> 1, female -> 0.
sex_codes = {'male' : 1, 'female' : 0}
for frame in data:
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

In [None]:
# Preview the training frame to check the encoded "Sex" column.
train_data.head()

In [None]:
# Preview the test frame to check the encoded "Sex" column.
test_data.head()

**Something similar can be done with the "Embarked" column:**

In [None]:
# List the distinct values of "Embarked" in the training frame.
train_data.Embarked.unique()

In [None]:
# As we can see, some cells of the "Embarked" column are not filled in (NaN).

In [None]:
# Count how often each embarkation port occurs in the training frame.
train_data.Embarked.value_counts()

In [None]:
# Fill missing embarkation ports with "S" in both frames (the author's
# chosen default for the NaN cells).
for frame in data:
    port = frame['Embarked']
    frame['Embarked'] = port.where(port.notna(), 'S')

In [None]:
# Encode the embarkation port as an integer in both frames: S -> 0, C -> 1, Q -> 2.
port_codes = {'S': 0, 'C': 1, 'Q' : 2}
for frame in data:
    frame['Embarked'] = frame['Embarked'].map(port_codes).astype(int)

In [None]:
# Preview the training frame to check the encoded "Embarked" column.
train_data.head()

In [None]:
# Preview the test frame to check the encoded "Embarked" column.
test_data.head()

**"SibSp" and "Parch" can be combined into one column and called "Family":**

In [None]:
# "Family" = siblings/spouses aboard plus parents/children aboard, element-wise.
for frame in data:
    frame['Family'] = frame['SibSp'].add(frame['Parch'])

In [None]:
# Preview the training frame to check the new "Family" column.
train_data.head()

In [None]:
# Preview the test frame to check the new "Family" column.
test_data.head()

**The values from the "Fare" column can be divided into 4 groups:**

In [None]:
# Replace missing fares with the *median* fare of the training frame.
# The same training statistic is used for both frames, so the test frame is
# imputed without looking at its own distribution. The median is computed
# once up front because it does not change between iterations.
train_fare_median = train_data['Fare'].median()
for frame in data:
    frame['Fare'] = frame['Fare'].fillna(train_fare_median)

In [None]:
# Split the training fares into 4 quantile-based intervals (quartiles).
# NOTE(review): the interval edges shown below are presumably where the
# hard-coded fare thresholds used later in the notebook come from - confirm.
train_data['FareGroups'] = pd.qcut(train_data['Fare'], 4)

In [None]:
# Preview the training frame to check the new "FareGroups" column.
train_data.head()

In [None]:
# List the distinct fare intervals produced by the quantile split.
train_data.FareGroups.unique()

In [None]:
# Replace each fare with the index (0-3) of the bucket it falls into.
# Buckets are right-closed: (-inf, 7.91], (7.91, 14.454], (14.454, 31], (31, inf).
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
for frame in data:
    fare_bucket = pd.cut(frame['Fare'], bins=fare_edges, labels=False)
    frame['Fare'] = fare_bucket.astype(int)

In [None]:
# Preview the training frame to check the binned "Fare" column.
train_data.head()

In [None]:
# Preview the test frame to check the binned "Fare" column.
test_data.head()

**I will do something similar with the "Age" column:**

In [None]:
# Impute missing ages with random integers drawn from
# [mean - std, mean + std), where mean and std are computed per frame.
# A locally seeded generator makes the imputation reproducible across runs.
age_rng = np.random.default_rng(42)
for frame in data:
    missing_age = frame['Age'].isna()
    mean_age = frame['Age'].mean()
    std_age = frame['Age'].std()
    # Integer bounds are passed explicitly rather than relying on how the
    # random routine treats float limits.
    filler = age_rng.integers(int(mean_age - std_age),
                              int(mean_age + std_age),
                              size=int(missing_age.sum()))
    # A single .loc assignment writes into the frame itself. The chained form
    # frame['Age'][mask] = ... may write to a temporary copy
    # (SettingWithCopyWarning) and leave the NaNs in place.
    frame.loc[missing_age, 'Age'] = filler
    frame['Age'] = frame['Age'].astype(int)

# Five equal-width age intervals on the training frame, for inspection only.
# Computed once, after the loop, instead of on every iteration.
train_data['AgeGroups'] = pd.cut(train_data['Age'], 5)

In [None]:
# Replace each age with the index (0-4) of the bucket it falls into.
# Buckets are right-closed: (-inf, 16], (16, 32], (32, 48], (48, 64], (64, inf).
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
for frame in data:
    frame['Age'] = pd.cut(frame['Age'], bins=age_edges, labels=False)

In [None]:
# Preview the training frame to check the binned "Age" column.
train_data.head()

In [None]:
# Preview the test frame to check the binned "Age" column.
test_data.head()

**Names are unlikely to give useful information to the neural network, but prefixes (Mr., Mrs., etc.) can help:**

In [None]:
# Extract the title from each name: the first run of letters that is preceded
# by a space and followed by a period (e.g. "Mr" from " Mr.").
# The pattern is a raw string so that `\.` is not treated as an (invalid)
# string escape, and expand=False makes extract() return a Series instead of
# a one-column DataFrame.
for frame in data:
    frame['Prefix'] = frame['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Count how often each extracted prefix occurs in the training frame.
train_data.Prefix.value_counts()

In [None]:
# Count how often each extracted prefix occurs in the test frame.
test_data.Prefix.value_counts()

In [None]:
# Collapse rare prefixes into a single "Other" category in both frames.
# 'Mlle' (Mademoiselle) was previously misspelled as 'Mile', so it was never
# replaced here and fell through to the 0 fallback in the numeric encoding.
rare_prefixes = ['Dr', 'Rev', 'Mlle', 'Col', 'Major', 'Jonkheer', 'Mme',
                 'Capt', 'Sir', 'Don', 'Ms', 'Lady', 'Countess', 'Dona']
for frame in data:
    frame['Prefix'] = frame['Prefix'].replace(rare_prefixes, 'Other')

In [None]:
# Encode the prefix as an integer in both frames. Any prefix that is not in
# the table (or is missing) becomes NaN after map() and is then set to 0.
prefix_codes = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
for frame in data:
    frame['Prefix'] = frame['Prefix'].map(prefix_codes).fillna(0).astype(int)

In [None]:
# Preview the training frame to check the encoded "Prefix" column.
train_data.head()

In [None]:
# Preview the test frame to check the encoded "Prefix" column.
test_data.head()

**All that remains is to remove the unnecessary columns:**

In [None]:
# Drop the columns that are not fed to the model. "PassengerId" is kept in
# the test frame because it is needed for the submission file.
# `columns=` is used because passing the axis positionally (`drop([...], 1)`)
# is deprecated and rejected by recent pandas versions.
train_data = train_data.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket',
                                      'Cabin', 'FareGroups', 'AgeGroups'])
test_data = test_data.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'])

In [None]:
# Preview the training frame after dropping the unused columns.
train_data.head()

In [None]:
# Preview the test frame after dropping the unused columns.
test_data.head()

# **Building and training the model**

In [None]:
# Separate the label ("Survived") from the features. The test features
# exclude "PassengerId", which is only an identifier for the submission.
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')
X_test = test_data.drop(columns="PassengerId").copy()

In [None]:
# Shapes of the training features, training labels and test features.
X_train.shape, y_train.shape, X_test.shape

In [None]:
# First two rows of the training features.
X_train.head(2)

In [None]:
# First rows of the training labels.
y_train.head()

In [None]:
# First two rows of the test features.
X_test.head(2)

In [None]:
# Hold out the first 220 rows for validation and train on the remainder.
# The split is positional; no shuffling is applied.
X_val, X_part = X_train.iloc[:220], X_train.iloc[220:]
y_val, y_part = y_train.iloc[:220], y_train.iloc[220:]

In [None]:
# Small fully-connected binary classifier: two hidden ReLU layers of 16 units
# and a single sigmoid output unit.
model = models.Sequential()
# One sample is a flat vector with X_part.shape[1] features. Passing the whole
# X_part.shape (n_rows, n_features) as the input shape would declare every
# sample to be a 2-D array, which does not match the data fed to fit()/predict().
model.add(layers.Input(shape=(X_part.shape[1],)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train for 50 epochs with a batch size of 1, evaluating on the held-out
# validation rows after every epoch. The returned History object feeds the
# loss/accuracy plots.
history = model.fit(
    x=X_part,
    y=y_part,
    batch_size=1,
    epochs=50,
    validation_data=(X_val, y_val),
)

In [None]:
# Names of the per-epoch metrics recorded during training; these are the
# dictionary keys read by the plotting cells.
history.history.keys()

In [None]:
# Plot training vs. validation loss per epoch (epochs numbered from 1).
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots(figsize=(20,10))
ax.plot(epochs, loss, 'b', label='Training loss')
ax.plot(epochs, val_loss, 'r', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Plot training vs. validation accuracy per epoch (epochs numbered from 1).
# The previous leading plt.clf() was removed: with no current figure it
# implicitly creates a new, empty one, which is then rendered as a blank
# figure next to the real plot.
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
epochs = range(1, len(accuracy) + 1)

fig, ax = plt.subplots(figsize=(20,10))
ax.plot(epochs, accuracy, 'b', label='Training accuracy')
ax.plot(epochs, val_accuracy, 'r', label='Validation accuracy')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# NOTE: despite its name, `y_test` holds the model's *predictions* for X_test
# (outputs of the final sigmoid unit), not ground-truth labels.
y_test = model.predict(X_test)
# Peek at the first ten raw outputs.
y_test[:10]

In [None]:
# Turn the raw outputs into hard 0/1 labels (1 when the output exceeds 0.5)
# and flatten them to one value per test row.
above_threshold = y_test > 0.5
y_test = above_threshold.astype(int).reshape(len(X_test))
y_test[:], len(y_test)


In [None]:
# Build the submission table (passenger id + predicted label) and write it
# to disk without the index column.
result = test_data[['PassengerId']].assign(Survived=y_test)
result.to_csv('submission.csv', index=False)