## Imports

In [1]:
## Imports
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing  import MinMaxScaler, MaxAbsScaler

# model imports
# logistic regression
from sklearn.linear_model import LogisticRegression
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Support Vector Machines
from sklearn.svm import SVC
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# K-Neighbors
from sklearn.neighbors import KNeighborsClassifier
# Gradient Boost
# Ada Boost
from sklearn.tree import DecisionTreeClassifier # requirement for ada gradient boost
from sklearn.ensemble import AdaBoostClassifier

print('imports done')

imports done


## Load Data

In [2]:
# Read the pre-processed train and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_processed.csv', './test_processed.csv')
)
print('data loaded')

data loaded


In [3]:
# Hold out 30% of the training rows for validation; the fixed seed keeps the
# split identical across runs.
train_dataset, val_dataset = train_test_split(
    train_data,
    test_size=0.3,
    random_state=42,
)
print('data splitted')

data splitted


In [4]:
# Separate the target column from each split. `pop` removes 'Survived' from the
# frame in place, so this cell cannot be re-run without re-running the split.
train_labels, val_labels = (
    split.pop('Survived') for split in (train_dataset, val_dataset)
)

# print() stringifies its arguments, so the shapes are passed directly.
print('train labels shape: ', train_labels.shape)
print('val labels shape: ', val_labels.shape)

print('labels extracted')

train labels shape:  (623,)
val labels shape:  (268,)
labels extracted


In [5]:
# Keep the test-set ids for the submission file, then drop the id column from
# every feature table.
passengerId = test_data['PassengerId']

train_dataset, val_dataset, test_data = (
    frame.drop(columns=['PassengerId'])
    for frame in (train_dataset, val_dataset, test_data)
)
print('passengerId copied and removed')

passengerId copied and removed


## Normalization

When we first tried `MinMaxScaler`, it complained because `train_data_encoded` is sparse, meaning that most of its values are zero, so we decided to use `MaxAbsScaler()` instead. Note: the code cell below still instantiates `MinMaxScaler()`.

In [6]:
# Learn the per-feature min/max from the training split only, then apply that
# same mapping to every split. Because the range comes from the training rows,
# validation/test values may fall outside [0, 1].
scaler = MinMaxScaler().fit(train_dataset)

train_data_scaled = scaler.transform(train_dataset)
val_data_scaled = scaler.transform(val_dataset)
test_data_scaled = scaler.transform(test_data)

# Sanity checks: row/column counts of each scaled matrix ...
print('train shape: ', train_data_scaled.shape)
print('val shape: ', val_data_scaled.shape)
print('test shape: ', test_data_scaled.shape)

# ... and the largest scaled value in each.
print('max value train scaled: ', np.max(train_data_scaled))
print('max value val scaled: ', np.max(val_data_scaled))
print('max value test: ', np.max(test_data_scaled))

train shape:  (623, 12)
val shape:  (268, 12)
test shape:  (418, 12)
max value train scaled:  1.0
max value val scaled:  1.0666666666666667
max value test:  1.5


## Build and train the model

In [7]:
# Every estimator is fitted on the same scaled training matrix and labels.
# `fit` returns the estimator itself, so construction and fitting are chained.

# logistic regression (library defaults)
logistic_regression_model = LogisticRegression().fit(train_data_scaled, train_labels)

# random forest: 100 trees, seeded for repeatability
random_forest_model = RandomForestClassifier(
    n_estimators=100, random_state=42
).fit(train_data_scaled, train_labels)

# support vector classifier with a linear kernel
svm_model = SVC(kernel="linear").fit(train_data_scaled, train_labels)

# multinomial naive Bayes (expects non-negative features; the scaled training
# data lies in [0, 1])
nb_model = MultinomialNB().fit(train_data_scaled, train_labels)

# k-nearest neighbours, k = 5
knn_model = KNeighborsClassifier(n_neighbors=5).fit(train_data_scaled, train_labels)

# AdaBoost over depth-1 decision trees (stumps), 50 rounds, seeded
adaboost_base_classifier = DecisionTreeClassifier(max_depth=1)
adaboost_model = AdaBoostClassifier(
    adaboost_base_classifier, n_estimators=50, random_state=42
).fit(train_data_scaled, train_labels)

print('models set')

models set


## Make Predictions

### Function to show metrics

In [8]:
def model_metrics(train_pred, true_labels=None):
    """Print accuracy, confusion matrix and classification report; return accuracy.

    Parameters
    ----------
    train_pred : array-like
        Predicted class labels. Despite the name, the notebook passes
        predictions made on the validation split.
    true_labels : array-like, optional
        Ground-truth labels to score against. When omitted, the notebook-level
        ``val_labels`` is used, so existing single-argument calls behave
        exactly as before.

    Returns
    -------
    The value returned by ``accuracy_score`` for the given predictions.
    """
    # Fall back to the validation labels defined earlier in the notebook.
    if true_labels is None:
        true_labels = val_labels

    accuracy = accuracy_score(true_labels, train_pred)
    confusion = confusion_matrix(true_labels, train_pred)
    report = classification_report(true_labels, train_pred)

    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion)
    print("Classification Report:\n", report)

    return accuracy

### Evaluate and predict with different models

In [9]:
# Logistic regression: predict the validation and test sets, then report the
# validation metrics. (The log line says "train", but no training-set
# predictions are made here.)
val_preds_regression_model, logistic_regresion_preds = (
    logistic_regression_model.predict(features)
    for features in (val_data_scaled, test_data_scaled)
)
regression_model_accuracy = model_metrics(val_preds_regression_model)
print('train preds made for regression model')

Accuracy: 0.7947761194029851
Confusion Matrix:
 [[133  24]
 [ 31  80]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       157
           1       0.77      0.72      0.74       111

    accuracy                           0.79       268
   macro avg       0.79      0.78      0.79       268
weighted avg       0.79      0.79      0.79       268

train preds made for regression model


In [14]:
# Random forest: predict the validation and test sets, then report the
# validation metrics. (The log line says "train", but no training-set
# predictions are made here.)
val_preds_random_forest_model, random_forest_model_preds = (
    random_forest_model.predict(features)
    for features in (val_data_scaled, test_data_scaled)
)
random_forest_model_accuracy = model_metrics(val_preds_random_forest_model)
print('train preds made for random forest model')

Accuracy: 0.8134328358208955
Confusion Matrix:
 [[136  21]
 [ 29  82]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.87      0.84       157
           1       0.80      0.74      0.77       111

    accuracy                           0.81       268
   macro avg       0.81      0.80      0.81       268
weighted avg       0.81      0.81      0.81       268

train preds made for random forest model


In [17]:
# Support vector machine: predict the validation and test sets, then report
# the validation metrics. (The log line says "train", but no training-set
# predictions are made here.)
val_preds_svm_model, svm_model_preds = (
    svm_model.predict(features)
    for features in (val_data_scaled, test_data_scaled)
)
svm_model_accuracy = model_metrics(val_preds_svm_model)
print('train preds made for svm model')

Accuracy: 0.7910447761194029
Confusion Matrix:
 [[134  23]
 [ 33  78]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.85      0.83       157
           1       0.77      0.70      0.74       111

    accuracy                           0.79       268
   macro avg       0.79      0.78      0.78       268
weighted avg       0.79      0.79      0.79       268

train preds made for svm model


In [14]:
# Naive Bayes: predict the validation and test sets, then report the
# validation metrics. (The log line says "train", but no training-set
# predictions are made here.)
val_preds_nb_model, nb_model_preds = (
    nb_model.predict(features)
    for features in (val_data_scaled, test_data_scaled)
)
nb_model_accuracy = model_metrics(val_preds_nb_model)
print('train preds made for nb model')

Accuracy: 0.7910447761194029
Confusion Matrix:
 [[134  23]
 [ 33  78]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.85      0.83       157
           1       0.77      0.70      0.74       111

    accuracy                           0.79       268
   macro avg       0.79      0.78      0.78       268
weighted avg       0.79      0.79      0.79       268

train preds made for nb model


In [23]:
# K-nearest neighbours: predict the validation and test sets, then report the
# validation metrics. (The log line says "train", but no training-set
# predictions are made here.)
val_preds_knn_model, knn_model_preds = (
    knn_model.predict(features)
    for features in (val_data_scaled, test_data_scaled)
)
knn_model_accuracy = model_metrics(val_preds_knn_model)
print('train preds for knn model')

Accuracy: 0.8022388059701493
Confusion Matrix:
 [[139  18]
 [ 35  76]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.89      0.84       157
           1       0.81      0.68      0.74       111

    accuracy                           0.80       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268

train preds for knn model


In [26]:
# AdaBoost: predict the validation and test sets, then report the validation
# metrics. (The log line says "train", but no training-set predictions are
# made here.)
val_preds_adaboost_model, adaboost_model_preds = (
    adaboost_model.predict(features)
    for features in (val_data_scaled, test_data_scaled)
)
adaboost_model_accuracy = model_metrics(val_preds_adaboost_model)
print('train preds for adaboost model')

Accuracy: 0.7985074626865671
Confusion Matrix:
 [[135  22]
 [ 32  79]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.86      0.83       157
           1       0.78      0.71      0.75       111

    accuracy                           0.80       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268

train preds for adaboost model


## Submissions

In [15]:
# Assemble the submission table: the saved test-set passenger ids paired with
# the naive Bayes test predictions.
submission_columns = {"PassengerId": passengerId, "Survived": nb_model_preds}
submission = pd.DataFrame(submission_columns)

print('submission set')

submission set


In [16]:
# Write the submission CSV. The output directory is created first so the cell
# also works on a fresh checkout where ./submissions does not exist yet.
submission_path = './submissions/titanic_dissaster_nb_model_preds_2.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False, header=True)
print('submission file generated')

submission file generated
