In [1]:
# by SH

# titanic dataset from a Stanford course:
# https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv

# classification model that predicts whether a passenger would survive the sinking of the Titanic

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [3]:
# Loading and reviewing data

df = pd.read_csv("titanic.csv", engine='python')
df.shape

(887, 8)

In [4]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
Survived                   887 non-null int64
Pclass                     887 non-null int64
Name                       887 non-null object
Sex                        887 non-null object
Age                        887 non-null float64
Siblings/Spouses Aboard    887 non-null int64
Parents/Children Aboard    887 non-null int64
Fare                       887 non-null float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


In [7]:
df.tail(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.45
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0
886,0,3,Mr. Patrick Dooley,male,32.0,0,0,7.75


In [8]:
# Check for null values
df.isnull().values.any()

False

In [9]:
# Check class distribution
num_obs = len(df)
num_true = len(df.loc[df['Survived'] == 1])
num_false = len(df.loc[df['Survived'] == 0])
print("Number of True cases:  {0} ({1:2.2f}%)".format(num_true, (num_true/num_obs) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/num_obs) * 100))

Number of True cases:  342 (38.56%)
Number of False cases: 545 (61.44%)


In [10]:
# label encoding
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
df['Sex'] = labelEncoder.fit_transform(df['Sex'].astype(str))

df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,1,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,0,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,0,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,0,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,1,35.0,0,0,8.05


In [11]:
# Spliting the data
# 100 for testing, rest for training

from sklearn.model_selection import train_test_split

X = df.drop(['Survived', 'Name'], axis=1)
y = df['Survived']

#split_test_size = 100/887
split_test_size = 100/len(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=42) 

In [12]:
print("{0:0.2f}% in training set".format((len(X_train)/len(df.index)) * 100))
print("{0:0.2f}% in test set".format((len(X_test)/len(df.index)) * 100))

88.73% in training set
11.27% in test set


In [13]:
print("{0} in training set".format(len(X_train)))
print("{0} in test set".format(len(X_test)))

787 in training set
100 in test set


In [14]:
X_train.shape , y_train.shape

((787, 6), (787,))

In [15]:
X_test.shape , y_test.shape 

((100, 6), (100,))

# Naive Bayes

In [16]:
# Training Initial Algorithm - Naive Bayes
from sklearn.naive_bayes import GaussianNB

# create Gaussian Naive Bayes model object and train it with the data
nb_model = GaussianNB()

nb_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
# Performance on Training Data
# predict values using the training data
nb_predict_train = nb_model.predict(X_train)

# import the performance metrics library
from sklearn import metrics

# Accuracy
print("Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, nb_predict_train)))
print()

Training Accuracy: 0.8030



In [18]:
# Performance on Testing Data
nb_predict_test = nb_model.predict(X_test)

print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, nb_predict_test)))

Testing Accuracy: 0.6900


In [19]:
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, nb_predict_test)))
print("")

print("Classification Report")
print(metrics.classification_report(y_test, nb_predict_test))

Confusion Matrix
[[50 10]
 [21 19]]

Classification Report
              precision    recall  f1-score   support

           0       0.70      0.83      0.76        60
           1       0.66      0.47      0.55        40

    accuracy                           0.69       100
   macro avg       0.68      0.65      0.66       100
weighted avg       0.68      0.69      0.68       100



# Logistic Regression

In [20]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(C=0.7, solver='liblinear', random_state=42)
#lr_model = LogisticRegression(penalty='l2', solver='liblinear')
lr_model.fit(X_train, y_train.ravel())
lr_predict_test = lr_model.predict(X_test)

# metrics
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_predict_test)))
print(metrics.confusion_matrix(y_test, lr_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test))

Testing Accuracy: 0.7000
[[51  9]
 [21 19]]

Classification Report
              precision    recall  f1-score   support

           0       0.71      0.85      0.77        60
           1       0.68      0.47      0.56        40

    accuracy                           0.70       100
   macro avg       0.69      0.66      0.67       100
weighted avg       0.70      0.70      0.69       100



# SVC

In [21]:
from sklearn.svm import SVC

svc_model = SVC(kernel='rbf', gamma='scale')
svc_model.fit(X_train, y_train.ravel())
svc_predict_test = svc_model.predict(X_test)

# metrics
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, svc_predict_test)))
print(metrics.confusion_matrix(y_test, svc_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, svc_predict_test))

Testing Accuracy: 0.6100
[[57  3]
 [36  4]]

Classification Report
              precision    recall  f1-score   support

           0       0.61      0.95      0.75        60
           1       0.57      0.10      0.17        40

    accuracy                           0.61       100
   macro avg       0.59      0.53      0.46       100
weighted avg       0.60      0.61      0.52       100



# Random Forest

In [22]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
#rf_model = RandomForestClassifier(random_state=42, n_estimators=1000)      # Create random forest object
rf_model = RandomForestClassifier(n_estimators=100) 
rf_model.fit(X_train, y_train.ravel()) 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
# Predict Training Data
rf_predict_train = rf_model.predict(X_train)

print("Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_train)))

Training Accuracy: 0.9848


In [24]:
# Predict Test Data
rf_predict_test = rf_model.predict(X_test)

print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Testing Accuracy: 0.7700


In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kfold = KFold(n_splits=10, random_state=7, shuffle=True)

In [26]:
result = cross_val_score(rf_model, X_train, y_train.ravel(), cv=kfold, scoring='accuracy')
print("Mean Accuracy = %.2f%% - SD Accuracy = %.2f%%" % (result.mean()*100, result.std()*100))

Mean Accuracy = 80.95% - SD Accuracy = 3.53%


In [27]:
result = cross_val_score(rf_model, X_test, y_test.ravel(), cv=kfold, scoring='accuracy')
print("Mean Accuracy = %.2f%% - SD Accuracy = %.2f%%" % (result.mean()*100, result.std()*100))

Mean Accuracy = 74.00% - SD Accuracy = 12.00%


# KNeighborsClassifier

In [28]:
from sklearn.neighbors import KNeighborsClassifier


nn_model = KNeighborsClassifier(n_neighbors=5, metric="minkowski",p=2)      
nn_model.fit(X_train, y_train.ravel()) 

result = cross_val_score(nn_model, X_train, y_train.ravel(), cv=kfold, scoring='accuracy')
print("Mean Accuracy = %.2f%% - SD Accuracy = %.2f%%" % (result.mean()*100, result.std()*100))

result = cross_val_score(nn_model, X_test, y_test.ravel(), cv=kfold, scoring='accuracy')
print("Mean Accuracy = %.2f%% - SD Accuracy = %.2f%%" % (result.mean()*100, result.std()*100))

Mean Accuracy = 69.00% - SD Accuracy = 3.53%
Mean Accuracy = 72.00% - SD Accuracy = 8.72%


In [29]:
# Predict Training Data
nn_predict_train = nn_model.predict(X_train)

print("Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, nn_predict_train)))

Training Accuracy: 0.8069


In [30]:
# Predict Test Data
nn_predict_test = nn_model.predict(X_test)

print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, nn_predict_test)))

Testing Accuracy: 0.6800


# GradientBoostingClassifier

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

model_GB = GradientBoostingClassifier(n_estimators=1000)
model_GB.fit(X_train , y_train)
y_pred = model_GB.predict(X_test)
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, y_pred)))
target_names = ['Not-Surived', 'Survived']
print(metrics.classification_report(y_test, y_pred,target_names=target_names))

Testing Accuracy: 0.7600
              precision    recall  f1-score   support

 Not-Surived       0.78      0.83      0.81        60
    Survived       0.72      0.65      0.68        40

    accuracy                           0.76       100
   macro avg       0.75      0.74      0.75       100
weighted avg       0.76      0.76      0.76       100



In [32]:
from sklearn.model_selection import GridSearchCV

num_estimators = [100, 200, 500]
learn_rates = [0.01, 0.02, 0.05, 0.1]
max_depths = [4, 6, 8]

param_grid = {'n_estimators': num_estimators,
              'learning_rate': learn_rates,
              'max_depth': max_depths}

grid_search = GridSearchCV(GradientBoostingClassifier(min_samples_split=2),
                           param_grid, cv=3, return_train_score=True)
grid_search.fit(X_train, y_train)

grid_search.best_params_

{'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200}

In [33]:
model_GB = GradientBoostingClassifier(n_estimators=200)
#model_GB = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4)
model_GB.fit(X_train , y_train)
y_pred = model_GB.predict(X_test)
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, y_pred)))
target_names = ['Not-Surived', 'Survived']
print(metrics.classification_report(y_test, y_pred,target_names=target_names))

Testing Accuracy: 0.8100
              precision    recall  f1-score   support

 Not-Surived       0.83      0.87      0.85        60
    Survived       0.78      0.72      0.75        40

    accuracy                           0.81       100
   macro avg       0.80      0.80      0.80       100
weighted avg       0.81      0.81      0.81       100



# SMOTE

In [34]:
#SMOTE
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_sample(X_train, y_train)
print (y_train.value_counts() , np.bincount(y_train))

Using TensorFlow backend.


1    485
0    485
Name: Survived, dtype: int64 [485 485]


In [35]:
# random forest with SMOTE
rf_model.fit(X_train, y_train.ravel()) 
#Predict Training Data
rf_predict_train = rf_model.predict(X_train)

print("Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_train)))

#Predict Test Data
rf_predict_test = rf_model.predict(X_test)

print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Training Accuracy: 0.9876
Testing Accuracy: 0.8000
