### Titanic - Machine Learning from Disaster

### Extract Data from CSV

In [3]:
!pip install imbalanced-learn



In [4]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf

# Read the training CSV into a DataFrame and show it as the cell output.
train_csv_path = 'train.csv'
raw_csv_data = pd.read_csv(train_csv_path)
raw_csv_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
# List the distinct values of the Sex column to see how it has to be encoded.
sex_values = raw_csv_data["Sex"].unique()
print(sex_values)

['male' 'female']


In [6]:
# Columns handed to get_dummies and columns treated as continuous features.
categorical_columns = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
numerical_columns = ['Age', 'Fare']

# drop_first=True drops one indicator per encoded column to avoid the dummy-variable trap.
categorical = pd.get_dummies(raw_csv_data[categorical_columns], drop_first=True)

# Replace missing Age/Fare values with the mean of the respective column.
column_means = raw_csv_data[numerical_columns].mean()
numerical = raw_csv_data[numerical_columns].fillna(column_means)

# Feature matrix: numerical columns first, then the encoded categorical ones.
inputs = pd.concat([numerical, categorical], axis=1)

# Label column: 1 = survived, 0 = did not survive (values as shown in the CSV preview).
targets = raw_csv_data["Survived"]


### Balance the dataset

In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

# Mean-impute any remaining NaNs. fit_transform returns a NumPy array, and the
# fitted `imputer` is kept so the same column means can be applied to other data.
imputer = SimpleImputer(strategy='mean')
inputs_imputed = imputer.fit_transform(inputs)

# Oversample with SMOTE so both classes have equal priors.
# A fixed random_state makes the synthetic samples identical on every run.
# NOTE(review): resampling happens before the train/validation split further
# down, so synthetic rows can be interpolated from rows that end up in the
# validation set — validation metrics may be optimistic.
smote = SMOTE(random_state=42)

unscaled_inputs_equal_priors, target_equal_priors = smote.fit_resample(inputs_imputed, targets)



### Standardize the Inputs

In [10]:
# Standardize every feature column (axis=0) to zero mean and unit variance.
scaled_inputs = preprocessing.scale(
    unscaled_inputs_equal_priors,
    axis=0,
    with_mean=True,
    with_std=True,
)

### Shuffle the Data

In [12]:
# Seeded generator so the shuffle, and therefore the train/validation split
# derived from it, is the same on every run.
shuffle_rng = np.random.default_rng(42)
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

shuffled_inputs = scaled_inputs[shuffled_indices]
# Convert to an ndarray first so the lookup is positional; indexing a pandas
# Series with an integer array is label-based and only coincides with
# positions when the index happens to be 0..n-1.
shuffled_targets = np.asarray(target_equal_priors)[shuffled_indices]

### Split the dataset into training and validation sets and prepare the test set

In [34]:
# 80/20 train/validation split of the shuffled, balanced data.
samples_count = shuffled_inputs.shape[0]
train_samples_count = int(0.8*samples_count)
validation_samples_count = samples_count - train_samples_count

# Splitting data
train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count:]
validation_targets = shuffled_targets[train_samples_count:]

train_columns = inputs.columns  # 'inputs' is from training preprocessing

# Preprocess test data with the same feature columns as the training data.
# Name and Ticket are not training features; encoding them only produced
# indicator columns that the reindex below discarded again.
test_data = pd.read_csv('test.csv')
test_categorical = pd.get_dummies(test_data[["Pclass", "Sex", "SibSp", "Parch", "Embarked"]], drop_first=True)
test_numerical = test_data[['Age', 'Fare']]
test_inputs = pd.concat([test_numerical, test_categorical], axis=1)

# Reindex to match the training columns exactly, filling missing columns with 0
test_inputs = test_inputs.reindex(columns=train_columns, fill_value=0)

# Fill missing test values with the column means learned from the training data
# (the `imputer` fitted in the balancing cell) instead of leaving NaNs behind.
test_inputs_imputed = imputer.transform(test_inputs)

# Standardize with the training statistics: fit the scaler on the same matrix
# that was standardized for training and apply it to the test data. Scaling the
# test set with its own mean/std would put it on a different scale from the one
# the model is trained on.
test_scaler = preprocessing.StandardScaler().fit(unscaled_inputs_equal_priors)
test_inputs = test_scaler.transform(test_inputs_imputed)


In [36]:
import numpy as np

def validate_and_clean_data(inputs, targets=None):
    """
    Sanitize a feature matrix and, optionally, its binary target labels.

    Inputs: NaN is replaced by 0.0, +inf by 1e6 and -inf by -1e6.
    Targets: NaN and -inf are replaced by 0, +inf by 1; the result is then
    thresholded so that values greater than 0.5 become 1 and all others 0.

    The previous post-threshold "sanity check" has been removed: after
    thresholding every value is already 0 or 1, so the check could never detect
    an invalid label and only raised a misleading ValueError for empty targets.
    Empty targets are now returned as an empty integer array.

    Parameters:
        inputs (array-like): Feature matrix.
        targets (array-like, optional): Target labels. Can be None for test sets.

    Returns:
        tuple: (cleaned_inputs, cleaned_targets), where cleaned_targets is an
        integer np.ndarray of 0/1 values, or None when no targets were given.
    """
    # Replace non-finite feature values with finite defaults.
    cleaned_inputs = np.nan_to_num(inputs, nan=0.0, posinf=1e6, neginf=-1e6)

    # Test sets carry no labels.
    if targets is None:
        return cleaned_inputs, None

    # Make the labels finite first, then force them to integer 0 or 1.
    finite_targets = np.nan_to_num(targets, nan=0.0, posinf=1, neginf=0)
    cleaned_targets = (finite_targets > 0.5).astype(int)

    return cleaned_inputs, cleaned_targets


In [38]:
# Sanitize every split before it is written to disk.
train_inputs, train_targets = validate_and_clean_data(inputs=train_inputs, targets=train_targets)
validation_inputs, validation_targets = validate_and_clean_data(inputs=validation_inputs, targets=validation_targets)
test_inputs, _ = validate_and_clean_data(inputs=test_inputs)  # the test set has no targets

# Write one .npz archive per split; the test archive holds inputs only.
archives_to_save = (
    ('titanic_data_train', {'inputs': train_inputs, 'targets': train_targets}),
    ('titanic_data_validation', {'inputs': validation_inputs, 'targets': validation_targets}),
    ('titanic_data_test', {'inputs': test_inputs}),
)
for archive_name, archive_arrays in archives_to_save:
    np.savez(archive_name, **archive_arrays)


### Load the Saved Data

In [40]:
# Reload the three Titanic datasets (train, validation, test) from the .npz
# archives written above. The archives hold plain numeric arrays, so pickle
# support is left disabled (the np.load default); `with` closes each archive
# once its arrays have been read.

# we extract the inputs using the keyword under which we saved them
# to ensure that they are all floats, let's also take care of that
with np.load('titanic_data_train.npz') as npz:
    train_inputs = npz['inputs'].astype(float)
    # targets are cast to int so the labels are plain 0/1 integers
    train_targets = npz['targets'].astype(int)

# validation data: inputs and targets can be loaded in the same line
with np.load('titanic_data_validation.npz') as npz:
    validation_inputs, validation_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

# test data: inputs only, no targets were saved for this split
with np.load('titanic_data_test.npz') as npz:
    test_inputs = npz['inputs'].astype(float)

In [42]:
# Print the dimensions of the training and validation arrays as a quick check.
shape_report = (
    ("Train inputs shape:", train_inputs),
    ("Train targets shape:", train_targets),
    ("Validation inputs shape:", validation_inputs),
    ("Validation targets shape:", validation_targets),
)
for caption, array in shape_report:
    print(caption, array.shape)


Train inputs shape: (878, 8)
Train targets shape: (878,)
Validation inputs shape: (220, 8)
Validation targets shape: (220,)


### Create the Model

In [44]:
# Input and output sizes
import tensorflow as tf

# Network: two ReLU hidden layers, each followed by dropout, and a single
# sigmoid output unit that yields a value in [0, 1] per passenger.
n_features = train_inputs.shape[1]
keras_layers = tf.keras.layers

model = tf.keras.Sequential([
    keras_layers.Input(shape=(n_features,)),
    keras_layers.Dense(128, activation='relu'),
    keras_layers.Dropout(0.3),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dropout(0.2),
    keras_layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is reported per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop once the callback's monitored metric has not improved for 5 epochs
# and roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

# Train for at most 100 epochs, evaluating on the validation split after each one.
model.fit(
    x=train_inputs,
    y=train_targets,
    validation_data=(validation_inputs, validation_targets),
    epochs=100,
    callbacks=[early_stopping],
    verbose=2,
)



Epoch 1/100
28/28 - 1s - 23ms/step - accuracy: 0.7084 - loss: 0.6021 - val_accuracy: 0.7727 - val_loss: 0.5241
Epoch 2/100
28/28 - 0s - 2ms/step - accuracy: 0.7904 - loss: 0.4888 - val_accuracy: 0.7773 - val_loss: 0.4782
Epoch 3/100
28/28 - 0s - 1ms/step - accuracy: 0.7916 - loss: 0.4567 - val_accuracy: 0.7955 - val_loss: 0.4713
Epoch 4/100
28/28 - 0s - 1ms/step - accuracy: 0.7984 - loss: 0.4352 - val_accuracy: 0.7909 - val_loss: 0.4636
Epoch 5/100
28/28 - 0s - 1ms/step - accuracy: 0.8052 - loss: 0.4321 - val_accuracy: 0.7955 - val_loss: 0.4584
Epoch 6/100
28/28 - 0s - 1ms/step - accuracy: 0.8166 - loss: 0.4215 - val_accuracy: 0.7909 - val_loss: 0.4601
Epoch 7/100
28/28 - 0s - 1ms/step - accuracy: 0.8144 - loss: 0.4131 - val_accuracy: 0.7955 - val_loss: 0.4552
Epoch 8/100
28/28 - 0s - 1ms/step - accuracy: 0.8200 - loss: 0.4183 - val_accuracy: 0.7955 - val_loss: 0.4552
Epoch 9/100
28/28 - 0s - 1ms/step - accuracy: 0.8121 - loss: 0.4232 - val_accuracy: 0.8000 - val_loss: 0.4520
Epoch 10/

<keras.src.callbacks.history.History at 0x14fa9cc40>

In [48]:
# Sigmoid output of the model for every row of the test set.
test_prediction_probabilities = model.predict(test_inputs)

# Outputs at or above the threshold are labelled 1, the rest 0.
decision_threshold = 0.5
is_predicted_positive = test_prediction_probabilities >= decision_threshold
test_predictions = is_predicted_positive.astype(int)

# Show the first ten predictions.
first_predictions = test_predictions[:10]
print(first_predictions)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step
[[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]]


In [50]:
# Build the submission table: one row per test passenger with its predicted label.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    # model output has one column per row; ravel() flattens it to 1D
    'Survived': test_predictions.ravel(),
}
submission = pd.DataFrame(submission_columns)

# Write without the DataFrame index so the file contains only the two columns.
submission.to_csv('titanic_predictions.csv', index=False)