In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Train a TensorFlow model that predicts Titanic survivors

In [None]:
#import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

**Load Dataset**

In [None]:
# Read the passenger table and drop the columns that are not required,
# in a single chained expression (no in-place mutation).
titanic = (
    pd.read_csv("/kaggle/input/titanic-dataset/Titanic-Dataset.csv")
    .drop(columns=["PassengerId", "Name", "Ticket"])
)


In [None]:
#convert every numeric column to float (astype(float) yields 64-bit floats)
numeric_columns = titanic.select_dtypes(include="number").columns
titanic = titanic.astype({column: float for column in numeric_columns})

In [None]:
#shift every numeric feature (everything except the "Survived" label) by a tiny
#constant so that later divisions never hit an exact zero
feature_columns = [
    column
    for column in titanic.select_dtypes(include="number").columns
    if column != "Survived"
]
for column in feature_columns:
    titanic[column] = titanic[column].add(1e-10)

In [None]:
#extract information from dataset
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
titanic.info()
print(titanic.describe())

# Joint frequency of the object-typed columns, then each one on its own.
print(titanic.select_dtypes(include="object").value_counts())
for column in ("Sex", "Cabin", "Embarked"):
    print(titanic[column].value_counts())

In [None]:
#visualize data: histograms of the frame's columns, 50 bins each
hist_options = {"bins": 50, "figsize": (12, 8)}
titanic.hist(**hist_options)
plt.show()

In [None]:
#check for correlations between the numeric columns and the "Survived" label
numeric_frame = titanic.select_dtypes(include="number")
corr_matrix = numeric_frame.corr()
survival_corr = corr_matrix["Survived"].sort_values(ascending=False)
print(survival_corr)

In [None]:
from pandas.plotting import scatter_matrix
#create scatter matrix for the label and the numeric features
attributes = ["Survived","Fare","Parch","SibSp", "Age", "Pclass"]
scatter_frame = titanic[attributes]
scatter_matrix(scatter_frame, figsize=(12,8))
plt.show()



# Create stratified train and test set based on Sex and Embarked

In [None]:
from sklearn.model_selection import train_test_split

# Build the stratification key (Sex + Embarked) as a standalone Series instead of a
# temporary "temp" column. `titanic` and the split results are then never mutated
# in place, and the cell can be re-run safely. A missing Embarked value becomes the
# literal string "nan" and therefore forms its own stratum.
stratify_key = titanic["Sex"].astype(str) + titanic["Embarked"].astype(str)

#train test split (same key values and seed as before, so the same rows are selected)
train_set, test_set = train_test_split(
    titanic, test_size=0.1, stratify=stratify_key, random_state=42
)

print(train_set.shape)
print(test_set.shape)

# Prepare Data For model

In [None]:
#divide train and test sets into inputs (every column except "Survived") and labels
train_inputs, train_labels = train_set.drop(columns="Survived"), train_set["Survived"].copy()

test_inputs, test_labels = test_set.drop(columns="Survived"), test_set["Survived"].copy()



In [None]:
#Write custom class to detect Cluster Similarity
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to their RBF similarity to k-means centres.

    ``fit`` runs KMeans on the input; ``transform`` returns one column per
    cluster holding ``rbf_kernel(X, centre, gamma)``.
    """

    def __init__(self, n_clusters=10, n_init=10,  gamma=1.0, random_state=None):
        # Hyper-parameters are stored under their constructor names.
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit KMeans on ``X`` (optionally weighted); ``y`` is not used."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init=self.n_init,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # fit returns the estimator itself

    def transform(self, X):
        """Return the RBF kernel between ``X`` and the fitted cluster centres."""
        centres = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centres, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster; ``names`` is not used."""
        feature_names = []
        for cluster_index in range(self.n_clusters):
            feature_names.append(f"Cluster {cluster_index} similarity")
        return feature_names

In [None]:
#custom functions for ratio pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.compose import make_column_selector

def column_ratio(X):
    """Divide the first column of 2-D array ``X`` by its second column.

    Both columns are selected with list indices, so the result keeps a
    trailing axis of length 1 (shape ``(n_rows, 1)``).
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio transformer.

    Both arguments are ignored; the result is always the single name "ratio".
    """
    feature_names_out = ["ratio"]
    return feature_names_out

def ratio_pipeline():
    """Build a new pipeline: median imputation, column-0 / column-1 ratio, standard scaling."""
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

#log pipeline: median imputation, natural log, then standard scaling
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

#cluster_simil: RBF similarity to 10 k-means cluster centres
cluster_simil = ClusterSimilarity(n_clusters=10, n_init=10, gamma=1.0, random_state=42)

#applied to whatever columns the transformers below leave over (the remainder)
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

#object-typed columns: fill gaps with the most frequent value, then one-hot encode
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

#(name, transformer, columns) triples, in the order their outputs are stacked
column_transformers = [
    ("FareByPclass", ratio_pipeline(), ["Fare", "Pclass"]),
    ("AgeBySibSp", ratio_pipeline(), ["Age", "SibSp"]),
    ("AgeByParch", ratio_pipeline(), ["Age", "Parch"]),
    ("log", log_pipeline, ["Age", "Fare"]),
    ("geo", cluster_simil, ["Fare"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
]

preprocessing_complex = ColumnTransformer(column_transformers, remainder=default_num_pipeline)

# Let's Transform the Data

In [None]:
#prepare training data: fit the preprocessing on the training inputs only,
#then apply the already-fitted transformers to the test inputs
train_inputs_processed = preprocessing_complex.fit_transform(train_inputs)
test_inputs_processed = preprocessing_complex.transform(test_inputs)

for processed in (train_inputs_processed, test_inputs_processed):
    print(processed.shape)
preprocessing_complex.get_feature_names_out()

**Convert to NumPy Array**

In [None]:
def _to_dense(matrix):
    """Return ``matrix`` as a dense NumPy array.

    Objects exposing ``toarray()`` (e.g. SciPy sparse matrices) are densified
    through it; anything else is passed to ``np.asarray``.
    """
    if hasattr(matrix, "toarray"):
        return matrix.toarray()
    return np.asarray(matrix)


# A ColumnTransformer can return either a sparse matrix or a dense array (see its
# `sparse_threshold` parameter), so `.toarray()` is only called when it exists.
train_inputs_array = _to_dense(train_inputs_processed)
train_labels_array = np.array(train_labels)

test_inputs_array = _to_dense(test_inputs_processed)
test_labels_array = np.array(test_labels)

print(train_inputs_array)
print(train_labels_array)

print(test_inputs_array)
print(test_labels_array)

# Define and Train TensorFlow Model

In [None]:
import tensorflow as tf

#Building Model using Subclassing API
class CustomModel(tf.keras.Model):
    """Keras model with a deep path and a skip connection from the input.

    Data flow in ``call``: Flatten -> Normalization -> Dense(units1) ->
    Dense(units2); the normalization output is concatenated with the second
    hidden layer's output and fed to the final Dense(units3) layer.

    NOTE(review): ``adapt()`` is never called on the Normalization layer in the
    visible code — confirm which statistics it ends up applying.
    """

    def __init__(self, units1=300, units2=100, units3=10, activation_hidden="relu", activation_output="softmax", **kwargs):
        super().__init__(**kwargs)  # forwards extra options such as `name` to tf.keras.Model
        keras_layers = tf.keras.layers
        # Layers are created in the same order as they are listed here.
        self.flatten_layer = keras_layers.Flatten()
        self.normalization_layer = keras_layers.Normalization()
        self.hidden_layer1 = keras_layers.Dense(units1, activation=activation_hidden)
        self.hidden_layer2 = keras_layers.Dense(units2, activation=activation_hidden)
        self.concat_layer = keras_layers.Concatenate()
        self.output_layer = keras_layers.Dense(units3, activation=activation_output)

    def call(self,inputs):
        """Run the forward pass and return the output layer's activations."""
        wide = self.normalization_layer(self.flatten_layer(inputs))
        deep = self.hidden_layer2(self.hidden_layer1(wide))
        merged = self.concat_layer([wide, deep])
        return self.output_layer(merged)
    
# Instantiate the network with one sigmoid output unit.
model3 = CustomModel(
    units1=300,
    units2=100,
    units3=1,
    activation_hidden="relu",
    activation_output="sigmoid",
    name="my_custom_model",
)

# `model` is the name used by the compile/fit/evaluate cells.
model = model3



In [None]:

#Compile and fit the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# With save_weights_only=True, Keras 3 only accepts a path ending in ".weights.h5".
# NOTE(review): older tf.keras releases should also accept this name (HDF5 weights) — confirm
# against the installed version.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("my_checkpoints.weights.h5", save_weights_only=True)
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# The model is built with a single sigmoid output unit and trained on 0/1 labels,
# so binary cross-entropy is the matching loss (mean squared error was used before).
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

# Early stopping restores the weights with the best validation loss, so the
# validation data must not be the test set that is later used for the final
# evaluation. validation_split holds out the last 10% of the training rows instead.
history = model.fit(
    train_inputs_array,
    train_labels_array,
    epochs=30,
    validation_split=0.1,
    callbacks=[checkpoint_cb, early_stopping_cb],
)



In [None]:
#Plot training and Validation loss
import matplotlib.pyplot as plt
import pandas as pd

# One line per metric recorded in the training history; styles are applied by column position.
history_frame = pd.DataFrame(history.history)
line_styles = ["r--","r--.","b-","b-."]
history_frame.plot(figsize=(8,5), grid=True, xlabel="Epoch", style=line_styles)

In [None]:
#Evaluate model on the test arrays; the result is unpacked as (loss, accuracy)
evaluation = model.evaluate(test_inputs_array, test_labels_array)
loss, accuracy = evaluation
print("Test Accuracy is %.2f" % (accuracy,))



In [None]:
# Raw model outputs for the test inputs (not yet thresholded).
test_labels_predicted_array = model.predict(x=test_inputs_array)
print(test_labels_predicted_array)

# Note: predictions from the TF model are floats between 0 and 1; they need to be converted to either 0 or 1 based on a threshold

In [None]:
#check precision recall curve
from sklearn.metrics import precision_recall_curve
# precisions and recalls are sliced with [:-1] below so that they line up with thresholds.
precisions, recalls, thresholds = precision_recall_curve(test_labels_array,test_labels_predicted_array)

# Decision threshold marked on the plot; the later cell binarises predictions with it.
threshold = 0.32

plt.figure(figsize=(12,8))
plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
plt.vlines(threshold,0,1.0,"k","dotted", label="threshold")
plt.xlabel("Threshold")
plt.ylabel("Precision and Recall")
plt.title("Precision Recall Curve")
plt.legend()
plt.grid()
# plt.show must be called; the bare name `plt.show` is an expression with no effect.
plt.show()

In [None]:
#Another plot of Precision Recall
# The x-axis of this plot is recall, so the decision threshold cannot be drawn at
# x=threshold. Instead, find the curve point that corresponds to it: thresholds is
# increasing and precisions[i]/recalls[i] describe predictions with
# score >= thresholds[i]; side="right" picks the first threshold strictly greater
# than `threshold`, matching a `score > threshold` decision rule. The index may equal
# len(thresholds), which selects the final (recall=0) point of the curve.
threshold_idx = np.searchsorted(thresholds, threshold, side="right")
threshold_recall = recalls[threshold_idx]
threshold_precision = precisions[threshold_idx]

plt.figure(figsize=(12,8))
plt.plot(recalls, precisions, linewidth=2, label="Precision/Recall curve")
plt.vlines(threshold_recall,0,1.0,"k","dotted", label="threshold")
plt.plot([threshold_recall], [threshold_precision], "ko")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision/Recall Curve")
plt.legend()
plt.grid()
# plt.show must be called; the bare name `plt.show` is an expression with no effect.
plt.show()

**Fixing the threshold at 0.32 based on the above observation**

In [None]:
# A prediction strictly above the threshold becomes 1, everything else 0.
above_threshold = test_labels_predicted_array > threshold
test_labels_predicted_binary = above_threshold.astype(int)
print(test_labels_predicted_binary)

In [None]:
#Now check other matrix

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
# Score the thresholded predictions against the true test labels.
y_true, y_pred = test_labels_array, test_labels_predicted_binary
cm = confusion_matrix(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

print("Confusion Matrix")
print(cm)
print(" ")

# Each template ends in " \n", so every score is followed by a blank line.
for template, score in (
    ("Precision: %.3f \n", precision),
    ("Recall: %.3f \n", recall),
    ("F1 Score: %.3f \n", f1),
    ("Accuracy: %.3f \n", accuracy),
):
    print(template % score)
