## Imports

In [2]:
# standard library
from pathlib import Path

# usual imports 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# model imports
# logistic regression
from sklearn.linear_model import LogisticRegression
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Support Vector Machines
from sklearn.svm import SVC
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# K-Neighbors
from sklearn.neighbors import KNeighborsClassifier
# Gradient Boost
# Ada Boost
from sklearn.tree import DecisionTreeClassifier # requirement for ada gradient boost
from sklearn.ensemble import AdaBoostClassifier 
print('imports finished')

2023-12-21 12:13:40.870988: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


imports finished


## Load Data

In [3]:
# Read the training and test CSV files that sit next to the notebook.
train_dataset, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)
print('data loaded')

data loaded


In [None]:
# split data
# train_dataset, val_dataset = train_test_split(train_data, test_size=0.3, random_state=42)
# print('data splitted')

In [4]:
# Separate the target from the features: read the 'Survived' column, then
# delete it from the frame in place (the same two steps DataFrame.pop performs).
train_labels = train_dataset['Survived']
del train_dataset['Survived']

# print() stringifies the shape tuple itself, so no explicit str() is needed.
print('train labels shape: ', train_labels.shape)

print('labels extracted')

train labels shape:  (891,)
labels extracted


## EDA - Data Exploration

### Shapes

In [None]:
# Report the (rows, columns) shape of the training features and the test data.
print(f'train shape: {train_dataset.shape}')
print(f'test data shape: {test_data.shape}')

### Training data info

In [None]:
# train_dataset.info()

### Check categorical columns

In [None]:
# cat_cols = train_dataset.select_dtypes(include=('object')).columns.to_list()
# print(cat_cols)
# print('number of cat_cols: ' + str(len(cat_cols)))

## Data Cleaning

### Removing Columns - from EDA 

In [5]:
# Keep the test-set passenger ids before the column is dropped: they are
# needed again when the submission frame is assembled.
passengerId = test_data['PassengerId']

# Drop the 'Cabin' and 'PassengerId' columns from both datasets.
train_dataset = train_dataset.drop(['Cabin', 'PassengerId'], axis=1)
test_data = test_data.drop(['Cabin', 'PassengerId'], axis=1)
print('Columns removed')

Columns removed


### Handle Missing Values

#### Training dataset

In [6]:
# Age: impute missing values with the mean of the known ages.
# Series.mean() already skips NaN, so no explicit NaN mask is needed.
train_ages_mean = train_dataset['Age'].mean()
train_dataset['Age'] = train_dataset['Age'].fillna(train_ages_mean)

# Embarked: first turn blank-string placeholders into real NaN.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: with pandas Copy-on-Write
# that chained in-place call only updates a temporary object and the
# DataFrame is left unchanged.
train_dataset['Embarked'] = train_dataset['Embarked'].replace(' ', np.nan)

# Fill the gaps with the most frequent value computed from the column itself
# (mode() ignores NaN; .iloc[0] picks the first value if there is a tie).
train_embarked_mode = train_dataset['Embarked'].mode()
train_dataset['Embarked'] = train_dataset['Embarked'].fillna(train_embarked_mode.iloc[0])

print('training dataset missing values handled')
train_dataset.info()

training dataset missing values handled
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Embarked  891 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 62.8+ KB


#### Val dataset

In [None]:
# # handle Age column missing values
# val_ages = val_dataset['Age']
# val_non_nan_ages = val_ages[~np.isnan(val_ages)]
# val_ages_mean = val_non_nan_ages.mean() # handling missing values with the mean of the ages
# val_dataset['Age'] = val_dataset['Age'].fillna(val_ages_mean)
# # handle Embarked
# val_embarked = val_dataset['Embarked']
# val_embarked_mode = val_dataset.mode()
# val_dataset['Embarked'].replace(' ', np.nan, inplace=True) # Replacing non-visible missing values with NaN
# val_dataset['Embarked'] = val_dataset['Embarked'].fillna('S')
# print('val dataset missing values handled')

In [None]:
# val_dataset.info()

#### Test data

In [7]:
# Age: impute missing values with the mean of the known test ages
# (Series.mean() skips NaN, so no explicit NaN mask is needed).
# NOTE(review): the encoder and scaler are fitted on the training data only;
# consider imputing with the training mean as well for consistency.
test_ages_mean = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(test_ages_mean)

# Embarked and Fare: convert blank-string placeholders to NaN, then fill.
# The results are assigned back to the columns instead of calling
# replace(..., inplace=True) on a selected column: with pandas Copy-on-Write
# that chained in-place call does not modify the DataFrame.
test_data['Embarked'] = test_data['Embarked'].replace(' ', np.nan).fillna('S') # imputing 'S' from mode
test_data['Fare'] = test_data['Fare'].replace(' ', np.nan).fillna(7.75) # 7.75 from mode

print('test data missing values handled')
test_data.info()

test data missing values handled
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      418 non-null    float64
 8   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 29.5+ KB


### Handle Categorical values (encoding)

In [None]:
# One-hot encoder; with handle_unknown='ignore', categories that were not seen
# during fit do not raise an error at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')

# Columns stored with the generic "object" dtype are treated as categorical.
cat_cols = list(train_dataset.columns[train_dataset.dtypes == "object"])

print(f'processing the following cat cols: {cat_cols}')

# Learn the categories from the training rows only and reuse them for the test rows.
train_data_encoded = encoder.fit_transform(train_dataset[cat_cols])
test_data_encoded = encoder.transform(test_data[cat_cols])

# Turn the encoder output into dense DataFrames labelled with the generated
# one-hot column names.
encoded_feature_names = encoder.get_feature_names_out(cat_cols)
train_data_encoded_df = pd.DataFrame(train_data_encoded.toarray(), columns=encoded_feature_names)
test_data_encoded_df = pd.DataFrame(test_data_encoded.toarray(), columns=encoded_feature_names)

# The raw categorical columns are replaced by their encoded counterparts.
train_dataset = train_dataset.drop(columns=cat_cols)
test_data = test_data.drop(columns=cat_cols)

# Put the remaining columns and the one-hot columns side by side; the index is
# reset first so both frames line up on a fresh 0..n-1 index.
train_dataset_encoded = pd.concat(
    [train_dataset.reset_index(drop=True), train_data_encoded_df], axis=1
)
test_dataset_encoded = pd.concat(
    [test_data.reset_index(drop=True), test_data_encoded_df], axis=1
)

print(train_dataset_encoded.shape)
print(test_dataset_encoded.shape)

print('categorical data handled')

### Normalization

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted scaling to both the training and the test features.
scaler = MinMaxScaler().fit(train_dataset_encoded)

train_data_scaled = scaler.transform(train_dataset_encoded)
test_data_scaled = scaler.transform(test_dataset_encoded)

# Show the shape of each scaled matrix (train first, then test).
for scaled_matrix in (train_data_scaled, test_data_scaled):
    print(scaled_matrix.shape)

print('data normalized')

In [None]:
# Sanity check on the scaled training matrix: print its largest and smallest value.
print(train_data_scaled.max())
print(train_data_scaled.min())

## Build the models

In [None]:
# Put the labels back next to the numeric features before splitting.
train_labels_df = train_labels.to_frame(name='Survived')

# assign() overwrites an existing 'Survived' column instead of appending a
# second one, so running this cell again does not create duplicate label
# columns (pd.concat(..., axis=1) would add one more on every run).
train_dataset = train_dataset.assign(Survived=train_labels_df['Survived'])

# Preview only the first rows rather than rendering the whole frame.
train_dataset.head()

In [None]:
# Hold out 30% of the training rows for validation.
#
# The evaluation cells further down need `val_data_scaled` and `val_labels`,
# and the models are fitted on `train_data_scaled` / `train_labels`. Splitting
# only `train_dataset` left the validation names undefined, so the scaled
# feature matrix and the labels are split here as well. Passing all three
# containers to a single train_test_split call keeps their rows aligned.
#
# The full-size inputs are rebuilt from `scaler`, `train_dataset_encoded` and
# `train_labels_df` (none of which this cell reassigns), so re-running this
# cell does not split an already-split set.
train_data_scaled_full = scaler.transform(train_dataset_encoded)
train_labels_full = train_labels_df['Survived']

# From here on `train_data_scaled` / `train_labels` hold only the training part
# of the split, so the validation rows stay unseen during fitting.
# NOTE(review): the submission model is therefore trained on 70% of the rows;
# consider refitting the chosen model on the full data before submitting.
(train_data, val_data,
 train_data_scaled, val_data_scaled,
 train_labels, val_labels) = train_test_split(
    train_dataset, train_data_scaled_full, train_labels_full,
    test_size=0.3, random_state=42,
)

print('train rows:', train_data_scaled.shape[0], '| val rows:', val_data_scaled.shape[0])

### Models Implementation

In [None]:
# Every estimator below is trained on the same scaled features and labels.
# fit() returns the estimator itself, so construction and training are chained.

# logistic regression (default settings)
logistic_regression_model = LogisticRegression().fit(train_data_scaled, train_labels)

# random forest: 100 trees, fixed seed
random_forest_model = RandomForestClassifier(
    n_estimators=100, random_state=42
).fit(train_data_scaled, train_labels)

# support vector classifier with a linear kernel
svm_model = SVC(kernel="linear").fit(train_data_scaled, train_labels)

# multinomial naive bayes
nb_model = MultinomialNB().fit(train_data_scaled, train_labels)

# k-nearest neighbors, voting among the 5 closest points
knn_model = KNeighborsClassifier(n_neighbors=5).fit(train_data_scaled, train_labels)

# AdaBoost: 50 boosting rounds over depth-1 decision trees, fixed seed
adaboost_base_classifier = DecisionTreeClassifier(max_depth=1)
adaboost_model = AdaBoostClassifier(
    adaboost_base_classifier, n_estimators=50, random_state=42
).fit(train_data_scaled, train_labels)

print('models set')

## Make Predictions

### Generate validation predictions for each model

In [None]:
def model_metrics(train_pred, true_labels=None):
    """Print accuracy, confusion matrix and classification report for a set of predictions.

    Parameters
    ----------
    train_pred : array-like
        Predicted labels to score.
    true_labels : array-like, optional
        Ground-truth labels to compare against. When omitted, the notebook-level
        `val_labels` is used, which keeps existing single-argument calls working.

    Returns
    -------
    The accuracy score of `train_pred` against the true labels.
    """
    # Fall back to the global validation labels only when none were passed in,
    # so the dependency on notebook state is explicit and can be overridden.
    if true_labels is None:
        true_labels = val_labels

    accuracy = accuracy_score(true_labels, train_pred)
    confusion = confusion_matrix(true_labels, train_pred)
    report = classification_report(true_labels, train_pred)

    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion)
    print("Classification Report:\n", report)
    
    return accuracy

#### Validation predictions

In [None]:
def _evaluate_on_validation(model, label):
    """Predict the validation rows with `model`, print its metrics, and return (predictions, accuracy).

    `label` is only used in the progress message printed after the metrics.
    """
    val_preds = model.predict(val_data_scaled)
    accuracy = model_metrics(val_preds)
    # These are predictions on the validation set, so the message says "val"
    # rather than "train".
    print(f'val preds made for {label} model')
    return val_preds, accuracy


# One call per fitted model; the result names are the same ones used before.
val_preds_regression_model, regression_model_accuracy = _evaluate_on_validation(
    logistic_regression_model, 'regression')

val_preds_random_forest_model, random_forest_model_accuracy = _evaluate_on_validation(
    random_forest_model, 'random forest')

val_preds_svm_model, svm_model_accuracy = _evaluate_on_validation(
    svm_model, 'svm')

val_preds_nb_model, nb_model_accuracy = _evaluate_on_validation(
    nb_model, 'nb')

val_preds_knn_model, knn_model_accuracy = _evaluate_on_validation(
    knn_model, 'knn')

val_preds_adaboost_model, adaboost_model_accuracy = _evaluate_on_validation(
    adaboost_model, 'adaboost')

### Submission

#### Real predictions

In [None]:
# Predict the test set with the naive bayes model; these predictions are the
# ones written to the submission file.
preds = nb_model.predict(test_data_scaled)
# These are test-set predictions, so the message no longer says "train".
print('test preds made')

#### Generate submission

In [None]:
# Build the submission frame: start from the saved passenger ids as a
# one-column frame, then attach the predictions as the 'Survived' column.
submission = passengerId.to_frame(name="PassengerId").assign(Survived=preds)

print('submission set')

In [None]:
# Preview the first 20 rows of the submission.
submission.iloc[:20]

In [None]:
# Write the submission file.
submission_path = Path('./submissions/titanic_dissaster_submission_nb_first_attempt.csv')
# to_csv does not create missing folders, so make sure the target directory
# exists first; exist_ok=True makes this a no-op when it is already there.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False, header=True)
print('submission file generated')