## Imports

In [3]:
# usual imports 
from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# model imports
# logistic regression
from sklearn.linear_model import LogisticRegression
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Support Vector Machines
from sklearn.svm import SVC
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# K-Neighbors
from sklearn.neighbors import KNeighborsClassifier
# Gradient Boost
# Ada Boost
from sklearn.tree import DecisionTreeClassifier # requirement for ada gradient boost
from sklearn.ensemble import AdaBoostClassifier 
print('imports finished')

imports finished


## Load Data

In [4]:
# Read the labelled training file and the unlabelled test file from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

print('data loaded')

data loaded


In [5]:
# Hold out 30% of the labelled rows for validation; the fixed seed makes the split repeatable.
split_frames = train_test_split(train_data, random_state=42, test_size=0.3)
train_dataset, val_dataset = split_frames
print('data splitted')

data splitted


In [6]:
# Separate the target column from the features.
# pop() removes 'Survived' from each frame in place and returns it as a Series.
train_labels, val_labels = (
    frame.pop('Survived') for frame in (train_dataset, val_dataset)
)

print('train labels shape: ', str(train_labels.shape))
print('val labels shape: ', str(val_labels.shape))

print('labels extracted')

train labels shape:  (623,)
val labels shape:  (268,)
labels extracted


## EDA - Data Exploration

### Shapes

In [7]:
# Report (rows, columns) for each of the three frames.
print(f'train shape: {train_dataset.shape}')
print(f'val shape: {val_dataset.shape}')
print(f'test data shape: {test_data.shape}')

train shape: (623, 11)
val shape: (268, 11)
test data shape: (418, 11)


### Training data info

In [8]:
# Column dtypes and non-null counts of the training split; columns whose
# non-null count is below the row count contain missing values.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 445 to 102
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  623 non-null    int64  
 1   Pclass       623 non-null    int64  
 2   Name         623 non-null    object 
 3   Sex          623 non-null    object 
 4   Age          499 non-null    float64
 5   SibSp        623 non-null    int64  
 6   Parch        623 non-null    int64  
 7   Ticket       623 non-null    object 
 8   Fare         623 non-null    float64
 9   Cabin        139 non-null    object 
 10  Embarked     622 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 58.4+ KB


### Check categorical columns

In [9]:
# Collect the names of the object-typed (string) columns of the training split.
object_columns = train_dataset.select_dtypes(include='object').columns
cat_cols = object_columns.tolist()
print(cat_cols)
print(f'number of cat_cols: {len(cat_cols)}')

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
number of cat_cols: 5


## Data Preprocessing

### Removing Columns - from EDA 

In [10]:
# Drop the Cabin and PassengerId columns from every frame.
# The test-set PassengerId values are saved first because the submission file needs them.
passengerId = test_data['PassengerId']

columns_to_drop = ['Cabin', 'PassengerId']
train_dataset = train_dataset.drop(columns=columns_to_drop)
val_dataset = val_dataset.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)
print('Columns removed')

Columns removed


### Handle Missing Values

#### Training dataset

In [11]:
# Age: fill the gaps with the mean of the observed training ages.
# Series.mean() skips NaN by default, so no manual NaN filtering is needed.
train_ages_mean = train_dataset['Age'].mean()
train_dataset['Age'] = train_dataset['Age'].fillna(train_ages_mean)

# Embarked: treat blank strings as missing, then fill with the most frequent port.
# The result of replace() is assigned back instead of calling
# replace(..., inplace=True) on a column selection: that form is a chained
# assignment, which pandas warns about and which does not update the frame
# when Copy-on-Write is enabled.
train_embarked = train_dataset['Embarked'].replace(' ', np.nan)
train_embarked_mode = train_embarked.mode()
# Fall back to the previously hard-coded 'S' if the column has no observed value at all.
train_embarked_fill = train_embarked_mode.iloc[0] if len(train_embarked_mode) else 'S'
train_dataset['Embarked'] = train_embarked.fillna(train_embarked_fill)
print('training dataset missing values handled')

training dataset missing values handled


In [12]:
# Re-inspect the non-null counts of the training split after the imputation step.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 445 to 102
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    623 non-null    int64  
 1   Name      623 non-null    object 
 2   Sex       623 non-null    object 
 3   Age       623 non-null    float64
 4   SibSp     623 non-null    int64  
 5   Parch     623 non-null    int64  
 6   Ticket    623 non-null    object 
 7   Fare      623 non-null    float64
 8   Embarked  623 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 48.7+ KB


#### Val dataset

In [13]:
# Age: impute with the mean learned from the TRAINING split (train_ages_mean),
# not the validation split's own mean, so that no statistic of the held-out
# data leaks into preprocessing. This matches how the encoder and scaler are
# fitted on the training data only.
val_dataset['Age'] = val_dataset['Age'].fillna(train_ages_mean)

# Embarked: treat blank strings as missing, then fill with the training mode.
# replace() is assigned back rather than used with inplace=True on a column
# selection (chained assignment; a no-op under pandas Copy-on-Write).
val_embarked = val_dataset['Embarked'].replace(' ', np.nan)
# train_embarked_mode is the Series returned by Series.mode() on the training
# column; fall back to 'S' if it happens to be empty.
val_embarked_fill = train_embarked_mode.iloc[0] if len(train_embarked_mode) else 'S'
val_dataset['Embarked'] = val_embarked.fillna(val_embarked_fill)
print('val dataset missing values handled')

val dataset missing values handled


In [14]:
# Re-inspect the non-null counts of the validation split after the imputation step.
val_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 268 entries, 709 to 430
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    268 non-null    int64  
 1   Name      268 non-null    object 
 2   Sex       268 non-null    object 
 3   Age       268 non-null    float64
 4   SibSp     268 non-null    int64  
 5   Parch     268 non-null    int64  
 6   Ticket    268 non-null    object 
 7   Fare      268 non-null    float64
 8   Embarked  268 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 20.9+ KB


#### Test data

In [15]:
# Age: impute with the mean learned from the TRAINING split (train_ages_mean)
# so the test data is preprocessed exactly like the data the models were fitted on.
test_data['Age'] = test_data['Age'].fillna(train_ages_mean)

# Embarked: treat blank strings as missing, then fill with the training mode.
# replace() is assigned back rather than used with inplace=True on a column
# selection (chained assignment; a no-op under pandas Copy-on-Write).
test_embarked = test_data['Embarked'].replace(' ', np.nan)
# train_embarked_mode is the Series returned by Series.mode() on the training
# column; fall back to 'S' if it happens to be empty.
test_embarked_fill = train_embarked_mode.iloc[0] if len(train_embarked_mode) else 'S'
test_data['Embarked'] = test_embarked.fillna(test_embarked_fill)

# Fare: same blank-to-NaN normalisation, then a constant fill.
# NOTE(review): 7.75 is hard-coded; the original comment says it came from the
# mode but not of which dataset -- confirm, or derive it from the training split.
test_data['Fare'] = test_data['Fare'].replace(' ', np.nan).fillna(7.75)
print('test data missing values handled')

test data missing values handled


In [16]:
# Re-inspect the non-null counts of the test data after the imputation step.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      418 non-null    float64
 8   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 29.5+ KB


### Handle Categorical values (encoding)

In [17]:
# One-hot encoder; categories that were not seen during fit are encoded as
# all-zero rows instead of raising an error.
encoder = OneHotEncoder(handle_unknown='ignore')

# Every object-typed column of the training frame is treated as categorical.
# NOTE(review): this includes Name and Ticket, which look close to unique per
# row (the encoded frame ends up with more columns than training rows) --
# confirm that encoding them is intended.
cat_cols = [
    col_name
    for col_name, col_dtype in train_dataset.dtypes.items()
    if col_dtype == "object"
]

print('processing the following cat cols: ' + str(cat_cols))

# Learn the categories from the training split only, then apply the same
# mapping to the validation and test frames.
train_data_encoded = encoder.fit_transform(train_dataset[cat_cols])
val_data_encoded = encoder.transform(val_dataset[cat_cols])
test_data_encoded = encoder.transform(test_data[cat_cols])

# Densify the sparse matrices and wrap them in DataFrames with readable column names.
encoded_feature_names = encoder.get_feature_names_out(cat_cols)
train_data_encoded_df = pd.DataFrame(train_data_encoded.toarray(), columns=encoded_feature_names)
val_data_encoded_df = pd.DataFrame(val_data_encoded.toarray(), columns=encoded_feature_names)
test_data_encoded_df = pd.DataFrame(test_data_encoded.toarray(), columns=encoded_feature_names)

# Remove the raw categorical columns; only the numeric features remain.
train_dataset = train_dataset.drop(columns=cat_cols)
val_dataset = val_dataset.drop(columns=cat_cols)
test_data = test_data.drop(columns=cat_cols)

# Put numeric and encoded features side by side. The numeric frames still carry
# their pre-split row labels, so the index is reset to line up with the freshly
# built (0..n-1 indexed) encoded frames.
train_dataset_encoded = pd.concat(
    [train_dataset.reset_index(drop=True), train_data_encoded_df], axis=1
)
val_dataset_encoded = pd.concat(
    [val_dataset.reset_index(drop=True), val_data_encoded_df], axis=1
)
test_dataset_encoded = pd.concat(
    [test_data.reset_index(drop=True), test_data_encoded_df], axis=1
)

for encoded_frame in (train_dataset_encoded, val_dataset_encoded, test_dataset_encoded):
    print(encoded_frame.shape)

print('categorical data handled')

processing the following cat cols: ['Name', 'Sex', 'Ticket', 'Embarked']
(623, 1128)
(268, 1128)
(418, 1128)
categorical data handled


### Normalization

In [18]:
# Min-max scaling: the per-feature range is learned from the training frame
# and the same transformation is then applied to the validation and test frames.
scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(train_dataset_encoded)
val_data_scaled, test_data_scaled = (
    scaler.transform(frame) for frame in (val_dataset_encoded, test_dataset_encoded)
)

for scaled_array in (train_data_scaled, val_data_scaled, test_data_scaled):
    print(scaled_array.shape)

# Sanity check: print the overall maximum and minimum of the scaled training data.
print(train_data_scaled.max())
print(train_data_scaled.min())

print('data normalized')

(623, 1128)
(268, 1128)
(418, 1128)
data normalized


## Build the models

### Models Implementation

In [20]:
# Fit every candidate classifier on the scaled training features.
# Each estimator's fit() returns the estimator itself, so construction and
# fitting are chained into a single expression.

# Logistic regression with default settings.
logistic_regression_model = LogisticRegression().fit(train_data_scaled, train_labels)

# Random forest of 100 trees, seeded for repeatability.
random_forest_model = RandomForestClassifier(
    n_estimators=100, random_state=42
).fit(train_data_scaled, train_labels)

# Support vector classifier with a linear kernel.
svm_model = SVC(kernel="linear").fit(train_data_scaled, train_labels)

# Multinomial naive Bayes.
nb_model = MultinomialNB().fit(train_data_scaled, train_labels)

# k-nearest neighbours with k = 5.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(train_data_scaled, train_labels)

# AdaBoost over depth-1 decision trees (stumps), 50 boosting rounds, seeded.
adaboost_base_classifier = DecisionTreeClassifier(max_depth=1)
adaboost_model = AdaBoostClassifier(
    adaboost_base_classifier, n_estimators=50, random_state=42
).fit(train_data_scaled, train_labels)

print('models set')

models set


## Make Predictions

### Generate predictions for all models

In [1]:
def model_metrics(train_pred, true_labels=None):
    """Print accuracy, confusion matrix and classification report for a set of predictions.

    Parameters
    ----------
    train_pred : array-like
        Predicted class labels. Despite the name, callers in this notebook pass
        predictions made on the validation split.
    true_labels : array-like, optional
        Ground-truth labels to compare against. Defaults to the notebook-level
        ``val_labels`` so existing single-argument calls keep working.

    Returns
    -------
    float
        The accuracy score of ``train_pred`` against the true labels.
    """
    if true_labels is None:
        # Backward-compatible default: evaluate against the validation labels.
        true_labels = val_labels

    accuracy = accuracy_score(true_labels, train_pred)
    confusion = confusion_matrix(true_labels, train_pred)
    report = classification_report(true_labels, train_pred)

    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion)
    print("Classification Report:\n", report)

    return accuracy

#### Validation predictions

In [22]:
# Score every fitted model on the held-out validation split.
# The progress messages now say "val" because these predictions are made on
# val_data_scaled, not on the training data.

# Logistic regression
val_preds_regression_model = logistic_regression_model.predict(val_data_scaled)
regression_model_accuracy = model_metrics(val_preds_regression_model)
print('val preds made for regression model')

# Random forest
val_preds_random_forest_model = random_forest_model.predict(val_data_scaled)
random_forest_model_accuracy = model_metrics(val_preds_random_forest_model)
print('val preds made for random forest model')

# Linear SVM
val_preds_svm_model = svm_model.predict(val_data_scaled)
svm_model_accuracy = model_metrics(val_preds_svm_model)
print('val preds made for svm model')

# Multinomial naive Bayes
val_preds_nb_model = nb_model.predict(val_data_scaled)
nb_model_accuracy = model_metrics(val_preds_nb_model)
print('val preds made for nb model')

# k-nearest neighbours
val_preds_knn_model = knn_model.predict(val_data_scaled)
knn_model_accuracy = model_metrics(val_preds_knn_model)
print('val preds made for knn model')

# AdaBoost
val_preds_adaboost_model = adaboost_model.predict(val_data_scaled)
adaboost_model_accuracy = model_metrics(val_preds_adaboost_model)
print('val preds made for adaboost model')

Accuracy: 0.8097014925373134
Confusion Matrix:
 [[137  20]
 [ 31  80]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.87      0.84       157
           1       0.80      0.72      0.76       111

    accuracy                           0.81       268
   macro avg       0.81      0.80      0.80       268
weighted avg       0.81      0.81      0.81       268

train preds made for regression model
Accuracy: 0.8208955223880597
Confusion Matrix:
 [[147  10]
 [ 38  73]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.94      0.86       157
           1       0.88      0.66      0.75       111

    accuracy                           0.82       268
   macro avg       0.84      0.80      0.81       268
weighted avg       0.83      0.82      0.82       268

train preds made for random forest model
Accuracy: 0.8171641791044776
Confusion Matrix:
 [[141  16]
 [ 33  78]]
Classi

### Submission

#### Real predictions

In [23]:
# Predict survival for the unlabelled test set with the naive Bayes model.
# NOTE(review): nb_model is hard-coded here; confirm it is the intended choice
# given the validation accuracies computed earlier in the notebook.
preds = nb_model.predict(test_data_scaled)
# These are test-set predictions, so the message no longer says "train".
print('test preds made')

train preds made


#### Generate submission

In [24]:
# Assemble the submission table: the saved test-set PassengerId values next to
# the predicted Survived label for the same row.
submission = pd.DataFrame({"PassengerId": passengerId}).assign(Survived=preds)

print('submission set')

submission set


In [25]:
# Preview the first rows of the submission table before writing it to disk.
submission.head(20)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [27]:
# Write the submission file.
# to_csv() does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails on this cell.
submission_path = Path('./submissions/titanic_dissaster_submission_nb_first_attempt.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False, header=True)
print('submission file generated')

submission file generated
