In [None]:
import numpy as np
import pandas as pd

import keras.layers as layers
from keras.models import Sequential
from sklearn.preprocessing import Normalizer, StandardScaler
import tensorflow as tf

import seaborn as sns
sns.set_theme(palette='magma')
import matplotlib.pyplot as plt

# EDA

In [None]:
# Read the Kaggle Titanic train/test CSVs and preview the first training rows.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
train.head()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
train.info()

At a quick glance, most of the columns are intact with no missing values, except for "Age", which has some missing entries, "Cabin", where the majority of the data is missing, and "Embarked", which has two empty rows that can be dropped.

In [None]:
# Drop the rows with no port of embarkation and the PassengerId column.
# Rebinding (instead of inplace=True) plus errors="ignore" keeps this cell
# re-runnable: a second run no longer raises KeyError on the missing column.
train = (
    train
    .dropna(subset=["Embarked"])
    .drop(columns=["PassengerId"], errors="ignore")
)

In [None]:
# Show up to ten distinct values per column to get a feel for each feature.
for column_name in train.columns:
    sample_values = train[column_name].unique()[:10]
    print(f"{column_name}\n{sample_values}")

Getting a slice of the unique values in each column shows us what the data looks like and how feature engineering can be addressed for each specific column. The ones with few distinct categories can be one-hot encoded, such as SibSp (siblings and spouses), Sex (M/F), Embarked (port of boarding), Pclass (ticket class), etc., whilst the broader categorical features can be refined and mined to create new and potentially useful features.

In [None]:
# Three side-by-side bar charts: survival by sex/class, survival by class,
# and fare by class.
f, axes = plt.subplots(1, 3, figsize=(25, 8))
panels = [
    dict(x="Sex", y="Survived", hue="Pclass"),
    dict(x="Pclass", y="Survived"),
    dict(x="Pclass", y="Fare"),
]
for ax, spec in zip(axes, panels):
    sns.barplot(data=train, ax=ax, **spec)

- From the above charts we infer that females had a much higher chance of survival than men, and that individuals from Pclass 1, the highest class tier aboard, had a much higher survival rate than the other two classes.

- Females in Pclass 1 and Pclass 2 have similar survival rates, the highest among all groups, whereas among men only those in Pclass 1 have some chance of surviving; men of Pclass 2 and Pclass 3 have the lowest chances of survival, approximately 1/8th of that of Pclass 1 women.

- Pclass 1 costs significantly more than Pclass 2 and Pclass 3, with the average ticket fare being more than triple that of the next class.

In [None]:
# Survival-coloured histograms of the numeric features, one panel each.
f, axes = plt.subplots(1, 4, figsize=(25, 8))
panels = [
    dict(x="Age"),
    dict(x="Fare", bins=50),
    dict(x="Parch"),
    dict(x="SibSp", bins=50),
]
for ax, spec in zip(axes, panels):
    sns.histplot(hue="Survived", data=train, ax=ax, **spec)

- Individuals below the age of 10 show the highest survival ratio, with more survivors by count than dead, and almost none above the age of 60 show any chance of surviving.

- Going by the fares, the lower-tier tickets below a fare of 20 have the worst survival ratio, with only around 1/3rd of the population making it; fares above 50 have a significantly better ratio, with more survivors than casualties.

- Individuals with no parents or children (Parch) have an equally likely chance of making it or not, whereas having any relative makes the chance much better; in the same way, having siblings or a spouse (SibSp) shows an increase in the chances of survival proportional to the count.

- The price range is very broad, with some extreme outliers above 500; these (3 rows) are removed to improve the distribution.

In [None]:
# Remove the extreme fare outliers (above 500) and re-check the distribution.
# The maximum fare drops to 263 once those 3 rows are gone.
fare_outliers = train.index[train["Fare"] > 500]
train.drop(index=fare_outliers, inplace=True)
train["Fare"].describe()

In [None]:
# "Cabin", "Ticket" and "Name" are the messier columns: many missing values,
# but possibly a lot of extractable information.
# "Age" also has many missing values.
# "Fare" might prove significant for inferring values for the cabin.
cols = ["Fare", "Cabin", "Ticket", "Name", "Age"]
train[cols].head(5)

In [None]:
# Show the rows that actually have a cabin recorded.
# NaN never compares equal to anything, so `!= np.nan` is True for every row
# and filters nothing; notna() is the correct missing-value test.
print(train[train["Cabin"].notna()].values[:200])
# Tickets shared by the most passengers; the largest group has 7 members.
train["Ticket"].value_counts()

- Cabin not only has a vast majority of values missing but also shows repeating cabin numbers and multiple cabin allocations on single entries, so this will be a complex situation to tackle. Each cabin entry has an alphabet letter followed by an integer; this can be associated with Pclass, and maybe something can be inferred from the ticket number it appears with, to reach an optimal way of engineering the missing values. Some entries do have a combination of different characters ('F G73'), though that will be a minor problem to handle.

- For the numeric Age and Ticket variables, binning will be suitable, after devising a method to fill in the NaN values of Age.

- Indexing by the ticket values that are repeated, we can get insight into the families that were onboard the Titanic with the same ticket numbers. Interestingly, looking at the largest families onboard we see that there are no survivors, whereas earlier in the charts having more family equated to a higher rate of survival.

In [None]:
# The CA. 2343 family, no one survived.
largest_family = train["Ticket"] == "CA. 2343"
train.loc[largest_family]

 # Feature Engineering

In [None]:
# extracting titles of individuals to make use of the names along with getting some idea for the age by inference
def nameExtract(x):
    """Return the lower-cased title found in a "Surname, Title. Given names" string.

    The title is the text between the first comma and the following period,
    with all spaces removed (so "the Countess" becomes "thecountess").
    Raises IndexError when the name contains no comma.
    """
    after_surname = x.lower().split(",")[1]
    title = after_surname.split(".")[0]
    return title.replace(" ", "")
    
# Derive a Title column from Name on both the training and the test frame.
train["Title"] = train["Name"].map(nameExtract)
test["Title"] = test["Name"].map(nameExtract)

In [None]:
# Top: passenger counts per title split by survival; bottom: mean age per title.
f, axes = plt.subplots(2, 1, figsize=(25, 10), sharex=True)
count_ax, age_ax = axes
sns.histplot(data=train, x="Title", hue="Survived", ax=count_ax)
sns.barplot(data=train, x="Title", y="Age", ax=age_ax);

- Looking closely at the survival differences, it is apparent that the share of men (mr) aboard is much higher than that of any other group, yet their survival ratio is also the worst. Despite there being almost twice as many men on board, the number of male survivors is about half that of women.

- Women (mrs, miss) show a high survival ratio with more survivors than dead, with adult women (mrs) showing comparatively more survivors.

- Now that we have the titles, we can take the average age of each title's non-missing entries to get a decent estimate for filling in the missing values.

In [None]:
# List the distinct titles extracted from the training names.
train["Title"].unique()

In [None]:
# Impute missing ages with the mean age of passengers sharing the same title,
# computed separately within each frame.
# Only the Age column is filled: a frame-wide fillna would also write the mean
# age into every other column with gaps (e.g. Cabin, Fare).
# Titles with no known age at all have a NaN mean and therefore stay NaN here.
for frame in (train, test):
    title_mean_age = frame.groupby("Title")["Age"].transform("mean")
    frame["Age"] = frame["Age"].fillna(title_mean_age)

f, axes = plt.subplots(1, 1, figsize=(25, 8), sharex=True)
sns.barplot(x="Title", y="Age", data=train, ax=axes);

All missing age values have been replaced with the mean age of their respective title group, thus we see that the graph's proportions do not change.

In [None]:
# Fill whatever numeric gaps remain in the test frame with training-set medians.
# A frame-wide fillna(40) wrote the constant 40 into every column with gaps,
# including the text Cabin column; restricting the fill to Age and Fare leaves
# unrelated columns untouched and avoids the magic number.
test = test.fillna({
    "Age": train["Age"].median(),
    "Fare": train["Fare"].median(),
})

In [None]:
def cabinExtract(x):
    """Return the first alphabetic character of a cabin string, lower-cased.

    Returns np.nan when ``x`` is not a string (e.g. a missing value stored as
    a float) or when the string contains no alphabetic character. Explicit
    checks replace the previous bare ``except``, which hid every kind of error.
    """
    if not isinstance(x, str):
        return np.nan
    for ch in x:
        if ch.isalpha():
            return ch.lower()
    return np.nan

# Reduce each training cabin entry to its deck letter (NaN where unavailable).
train["CabinLetter"] = train["Cabin"].map(cabinExtract)

In [None]:
# bins for plotting
train["FareBin10"] = train["Fare"].apply(lambda x:round(x/10)*10) # creating bins of 10 for fare
train["AgeBin5"] = train["Age"].apply(lambda x:round(x/5)*5) # creating bings of 5 for age
test["FareBin10"] = test["Fare"].apply(lambda x:round(x/10)*10)
test["AgeBin5"] = test["Age"].apply(lambda x:round(x/5)*5)

In [None]:
# Reducing each cabin to its first letter gives much simpler information to
# work with, consisting of 8 symbols.
train["CabinLetter"].value_counts()

In [None]:
# 2x2 grid relating the cabin letter to class, fare, sex and survival.
f, axes = plt.subplots(2, 2, figsize=(25, 8))
panels = [
    (sns.histplot, axes[0, 0], dict(hue="Pclass")),
    (sns.barplot, axes[0, 1], dict(y="Fare", hue="Pclass")),
    (sns.histplot, axes[1, 0], dict(hue="Sex")),
    (sns.barplot, axes[1, 1], dict(y="Fare", hue="Survived")),
]
for plot_fn, ax, extra in panels:
    plot_fn(x="CabinLetter", data=train, ax=ax, **extra)

- Plotting the simplified cabin letters labelled with the Pclass they are associated with shows a clear pattern for most of the letters: (c, a, b, t) all fall purely under Pclass 1, while CabinLetter (e, g, d, f) have mixed Pclasses, with (e, d) mostly in Pclass 1. (g, f) have no Pclass 1 members. (g) is purely Pclass 3.
- CabinLetter (g) consists only of women in the available data points, (a) consists almost entirely of males, and the rest of the letters have a similar Sex distribution.

- For now, let's say we can set the Cabin column aside and move on without much guilt.

# Modelling and Predictions

In [None]:
# Features fed to the model, the subsets that are categorical / numeric,
# and the survival labels as a float16 array.
cols = [
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked',
    'Fare', 'Age', 'Title',
]
cat_cols = [
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Title',
]
num_cols = ['Fare', 'Age']

training = train[cols]
testing = test[cols]
label = train["Survived"].to_numpy(dtype="float16")

In [None]:
# Stack train and test rows so both are encoded identically.
# ignore_index=True gives the stacked frame a clean 0..N-1 index; without it
# the train and test index labels overlap, and any later index-aligned
# assignment pairs rows up by label instead of by position.
combined = pd.concat([training, testing], ignore_index=True).astype("object")
combined.info()

In [None]:
# Standardise Age and Fare column-wise (zero mean, unit variance).
# sklearn's Normalizer rescales each *row* to unit norm, which ties Age and
# Fare together per passenger and discards their magnitudes; StandardScaler
# is the per-feature scaler.
# The scaler is fitted on the training rows only (the first len(label) rows of
# `combined`) and then applied to every row.
numeric = combined[["Age", "Fare"]].astype("float64")
transformer = StandardScaler().fit(numeric.iloc[:len(label)])
# Reuse combined's index so the frame lines up row-for-row with `combined`.
transformed = pd.DataFrame(
    transformer.transform(numeric),
    columns=["Age", "Fare"],
    index=combined.index,
)

- Applying normalization to values of both training and testing sets
- Getting dummies for categorical columns using pandas for both train and test sets

In [None]:

# One-hot encode the categorical columns as float32 so the final frame is
# purely numeric (bool/uint8 dummies mixed with float columns can come out as
# an object array when handed to Keras).
combined = pd.get_dummies(combined[cat_cols], dtype="float32")
# Copy the scaled numeric columns in by position. A plain DataFrame assignment
# aligns on index labels, which mismatches rows when `combined` and
# `transformed` do not share the same index.
for numeric_col in ["Age", "Fare"]:
    combined[numeric_col] = transformed[numeric_col].to_numpy(dtype="float16")
print(combined.shape)
combined.info()

In [None]:
# Disabled experiment: a tf.estimator / feature_column pipeline kept inside a
# string literal so that none of it executes (the trailing `;` suppresses the
# cell output).
# NOTE(review): inside the string, input_function reads the globals `training`
# and `label` rather than the data_df / label_df arguments of make_input_fn,
# and shuffles with a buffer of only 10 rows.
'''
CATEGORICAL_COLUMNS = cols[:6]
NUMERIC_COLUMNS = cols[6:]

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
  vocabulary = training[feature_name].unique()
  feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
  feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

label = train["Survived"].values.astype("float16")

def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  def input_function():
    ds = tf.data.Dataset.from_tensor_slices((dict(training), label))
    if shuffle:
      ds = ds.shuffle(10)
    ds = ds.batch(batch_size).repeat(num_epochs)
    return ds
  return input_function

train_input_fn = make_input_fn(training, label)
eval_input_fn = make_input_fn(training, label, num_epochs=1, shuffle=False)

linear_est = tf.estimator.DNNLinearCombinedClassifier(linear_feature_columns=feature_columns)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)
print(result)
''';

In [None]:
# Row/column counts of the train and test feature frames.
for split in (training, testing):
    print(split.shape)

In [None]:
# Split the encoded frame back into its train and test parts by position.
# The boundary comes from the label array rather than a hard-coded 886, so it
# stays correct if the row filtering earlier in the notebook changes.
n_train = len(label)
training = combined.iloc[:n_train]
testing = combined.iloc[n_train:]
assert len(testing) == len(test), "encoded test rows do not match the test frame"
print(training.shape)
print(testing.shape)

In [None]:
# Fully connected binary classifier.
# Same depth and width as before, now built in a loop and easy to tune.
N_HIDDEN_LAYERS = 20
HIDDEN_UNITS = 50

model = Sequential()
# Take the input width from the data instead of hard-coding 43 columns.
model.add(layers.Input(shape=(training.shape[1],)))
# Hidden layers need a non-linearity: a stack of Dense layers without
# activations collapses into a single linear map, however deep it is.
for _ in range(N_HIDDEN_LAYERS):
    model.add(layers.Dense(HIDDEN_UNITS, activation="relu"))
# Sigmoid output so the model emits probabilities, which is what
# BinaryCrossentropy(from_logits=False) and the later 0.5 threshold expect.
model.add(layers.Dense(1, activation="sigmoid"))

# label_smoothing and reduction are left at their defaults; the explicit
# reduction="auto" string is not accepted by every Keras version.
model.compile(
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.0005),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'])

In [None]:
# Train the model and plot the per-epoch training accuracy.
history = model.fit(training, label, batch_size=128, epochs=200, verbose=False)
plt.plot(history.history['accuracy'])

# The model is scored on the same rows it was fitted on, so these are
# training metrics, not test metrics; the printed labels now say so.
score = model.evaluate(training, label, verbose=0)
print("Train loss:", score[0])
print("Train accuracy:", score[1])

In [None]:
# Model outputs for the encoded test rows; indexed as preds[i][0] downstream.
preds = model.predict(testing)

In [None]:
# Turn the model outputs into hard 0/1 labels with a 0.5 threshold.
# round() on the raw outputs could produce values other than 0 or 1 when an
# output falls outside [0, 1], and may leave numpy floats in the list; the
# comparison always yields plain Python ints in {0, 1}.
out = (preds[:, 0] > 0.5).astype(int).tolist()

In [None]:
# Peek at the sample submission file to see the expected output columns.
# NOTE(review): this uses a relative "../input" path while the data cells use
# the absolute "/kaggle/input" path — confirm both resolve to the same folder.
pd.read_csv("../input/titanic/gender_submission.csv").head()

In [None]:
# Attach the predicted labels to the test frame, then keep only the two
# columns selected for the submission.
# `out` is rebound here from the list of labels to a DataFrame, so this cell
# depends on `out` still being the label list when it runs.
test["Survived"] = out
out = test[["PassengerId", "Survived"]]

In [None]:
# Preview the submission frame before writing it out.
out.head()

In [None]:
# Write the submission CSV to the working directory, without the index column.
out.to_csv("./out.csv", index = False)