## This is the original data from the Titanic competition, with some changes I applied to make it better suited for binary logistic regression:

* Merged the train and test data.

* Removed the 'ticket' and 'cabin' attributes.

* Moved the 'Survived' attribute to the last column.

* Added extra zero columns for categorical inputs to be better suited for One-Hot-Encoding.

* Substituted the values of 'Sex' and 'Embarked' attributes with binary and categorical values respectively.

* Filled the missing values in 'Age' and 'Fare' attributes with the median of the data.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train_and_test2.csv')
# Preview the first five rows (head()'s default).
data.head(n=5)

In [None]:
# Show the DataFrame's (rows, columns) dimensions.
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe()

In [None]:
# Print each column's name, non-null count and dtype, plus memory usage.
data.info()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Rename the '2urvived' column to 'survived'.
# rename() returns a new DataFrame by default, so the result is re-assigned.
data = data.rename(columns={'2urvived': 'survived'})
data.head()

In [None]:
# Replace every remaining missing value, in any column, with 0.
data = data.fillna(value=0)

In [None]:
# Re-count missing values per column to confirm the fill left none behind.
data.isnull().sum()

In [None]:
import seaborn as sns

In [None]:
# Pairwise correlation between columns (pandas uses Pearson by default).
data.corr()

In [None]:
# Render the pairwise correlation matrix as a colour-coded heatmap.
sns.heatmap(data=data.corr())

In [None]:
# Hexbin joint distribution of the 'Fare' and 'sibsp' columns.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.jointplot(x='Fare', y='sibsp', data=data, kind='hex')

In [None]:
# Bar chart of how many rows fall into each 'Sex' value.
# The column is named via x= because seaborn >= 0.12 treats the first positional
# argument as the whole dataset rather than as the variable to count.
sns.countplot(x='Sex', data=data)

In [None]:
# Scatter plot of 'Fare' against 'sibsp' with a fitted regression line and marginals.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.jointplot(x='Fare', y='sibsp', data=data, kind='reg')

In [None]:
# Distribution of 'Age': density-normalised histogram with a KDE curve on top.
# histplot replaces distplot, which seaborn has deprecated and scheduled for removal;
# kde=True and stat='density' reproduce distplot's default histogram-plus-KDE view.
sns.histplot(data=data, x='Age', kde=True, stat='density')

In [None]:
# Bar chart of how many rows fall into each 'survived' class.
# The column is passed as x=: seaborn >= 0.12 rejects it as a positional argument.
sns.countplot(x='survived', data=data)

In [None]:
# Mean 'Fare' (barplot's default estimator) for each 'survived' class.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x='survived', y='Fare', data=data)

In [None]:
# Box plot of 'Age' for each 'survived' class.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='survived', y='Age', data=data)

In [None]:
# Target vector: the 'survived' column.
y = data['survived']
# Feature matrix: every other column.
X = data.drop(columns='survived')

In [None]:
# (rows, feature columns) of the feature matrix.
X.shape

In [None]:
# Number of labels; this should equal the row count of X.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so repeated runs score the models on the same split;
# stratify=y keeps the class ratio of 'survived' equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Logistic Regression: y = 1 / (1 + e^(-z))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# max_iter is raised from the default of 100: the features are fed in unscaled,
# which tends to leave the default solver short of convergence.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_predlr = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); in that order the rows of the matrix
# are the true classes and the columns are the predicted classes.
cm = confusion_matrix(y_test, y_predlr)
cm

In [None]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
# Arguments are (y_true, y_pred); swapping them exchanges precision and recall.
cr = classification_report(y_test, y_predlr)
print(cr)

In [None]:
# Fraction of test rows whose predicted label equals the true label.
acc_lr = accuracy_score(y_true=y_test, y_pred=y_predlr)
acc_lr

In [None]:
from sklearn.metrics import matthews_corrcoef

# Matthews correlation coefficient of the logistic-regression predictions.
mc = matthews_corrcoef(y_true=y_test, y_pred=y_predlr)
mc

## ANN with Keras

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import matplotlib.pyplot as plt

In [None]:
# NOTE: this rebinds `model`, replacing the logistic-regression estimator above.
model = Sequential()
# First hidden layer. The input dimension is read from the training data so the
# network stays in sync with X if feature columns are added or removed.
model.add(Dense(units=32, activation='relu', kernel_initializer='he_uniform',
                input_dim=X_train.shape[1]))
# Second hidden layer.
model.add(Dense(units=16, activation='relu', kernel_initializer='he_uniform'))
# Output layer: a single sigmoid unit producing a value between 0 and 1.
model.add(Dense(units=1, activation='sigmoid', kernel_initializer='glorot_uniform'))

In [None]:
# Configure training: Adamax optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adamax',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Fit on the training split for 50 epochs in batches of 128 rows, logging progress.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=50,
    verbose=1,
)

In [None]:
# Score the network on the held-out split.
# NOTE: the model is compiled with metrics=['accuracy'], so per the Keras docs
# evaluate() returns a list [loss, accuracy] rather than a single number.
acc = model.evaluate(X_test, y_test)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# predict() yields one sigmoid output per row, shaped as an (n, 1) column.
y_ann = model.predict(X_test)
# Threshold at 0.5, then flatten to a 1-D integer array so the predictions have
# the same shape and label kind as y_test when handed to scikit-learn metrics.
y_ann = (y_ann > 0.5).astype(int).ravel()

In [None]:
from sklearn.metrics import confusion_matrix

# (y_true, y_pred) order: rows are the true classes, columns the ANN's predictions.
cm = confusion_matrix(y_test, y_ann)
cm

In [None]:
from sklearn.metrics import matthews_corrcoef

# Matthews correlation coefficient of the thresholded ANN predictions.
mc_ann = matthews_corrcoef(y_true=y_test, y_pred=y_ann)
mc_ann

## Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Support vector classifier with scikit-learn's default settings.
model_svm = SVC()
model_svm.fit(X_train, y_train)
y_svm = model_svm.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); in that order the confusion matrix
# has true classes on the rows and predicted classes on the columns.
acc_svm = accuracy_score(y_test, y_svm)
cm_svm = confusion_matrix(y_test, y_svm)
acc_svm

In [None]:
# Display the confusion matrix computed for the SVM predictions.
cm_svm

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the SVM predictions.
# Arguments are (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(y_test, y_svm))

In [None]:
# Matthews correlation coefficient of the SVM predictions.
mc_svm = matthews_corrcoef(y_true=y_test, y_pred=y_svm)
mc_svm

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the forest's bootstrap and feature sampling so the reported
# scores do not drift between runs.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_rf = model_rf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); in that order the confusion matrix
# has true classes on the rows and predicted classes on the columns.
acc_rf = accuracy_score(y_test, y_rf)
cm_rf = confusion_matrix(y_test, y_rf)
acc_rf

In [None]:
# Display the confusion matrix computed for the random-forest predictions.
cm_rf

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the random-forest predictions.
# Arguments are (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(y_test, y_rf))

In [None]:
# Matthews correlation coefficient of the random-forest predictions.
mc_rf = matthews_corrcoef(y_true=y_test, y_pred=y_rf)
mc_rf

## Score of all models

In [None]:
# Side-by-side test-set accuracy of the four models.
print("Logistic Regression Accuracy : ", acc_lr)
# acc holds evaluate()'s [loss, accuracy] list; index 1 is the accuracy itself.
print("Artificial Neural Network Accuracy : ", acc[1])
print("Support Vector Machine Accuracy : ", acc_svm)
print("Random Forest Accuracy : ", acc_rf)