In [0]:
import pandas as pd
import numpy as np

---
## Load Dataset

In [152]:
!wget -O titanic.csv https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv

--2019-07-29 03:00:16--  https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10305 (10K) [text/plain]
Saving to: ‘titanic.csv’


2019-07-29 03:00:16 (87.4 MB/s) - ‘titanic.csv’ saved [10305/10305]



The dataset used in this project is about the titanic incident. Berikut deskripsi mengenai dataset tersebut

1.   PassengerId = id of passenger
1.   Survived = Survival of passenger
2.   Pclass = Class in ship of passenger
1.   Name = Name of passenger
2.   Sex = Gender of passenger
1.   Age = Age of passenger
2.   SibSp = Number of siblings/spouses abroad
1.   Parch = Number of parents/children abroad
2.   Ticket = Ticket number
1.  Cabin = Cabin number
2.  Embarked = Port of Embarkation












In [153]:
df = pd.read_csv("titanic.csv", sep="\t")

# take a look at the dataset
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Exploratory Analysis and Preprocessing

## Statistical Information

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 12 columns):
PassengerId    156 non-null int64
Survived       156 non-null int64
Pclass         156 non-null int64
Name           156 non-null object
Sex            156 non-null object
Age            126 non-null float64
SibSp          156 non-null int64
Parch          156 non-null int64
Ticket         156 non-null object
Fare           156 non-null float64
Cabin          31 non-null object
Embarked       155 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 14.7+ KB


In [155]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,156.0,156.0,156.0,126.0,156.0,156.0,156.0
mean,78.5,0.346154,2.423077,28.141508,0.615385,0.397436,28.109587
std,45.177428,0.477275,0.795459,14.61388,1.056235,0.870146,39.401047
min,1.0,0.0,1.0,0.83,0.0,0.0,6.75
25%,39.75,0.0,2.0,19.0,0.0,0.0,8.00315
50%,78.5,0.0,3.0,26.0,0.0,0.0,14.4542
75%,117.25,1.0,3.0,35.0,1.0,0.0,30.37185
max,156.0,1.0,3.0,71.0,5.0,5.0,263.0


In [156]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             30
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          125
Embarked         1
dtype: int64

## Remove unneeded features

Five features will be removed with the following reasons :


*   PassengerId and Ticket: Both represent the identity of an object so they are redudant 
*   Cabin : Contains to many missing values
*   Name : Irrelevant towards survival
*   Fare : Redudant because shows the same information with Pclass feature

In [157]:
df.drop(["PassengerId", "Cabin", "Name", "Fare", "Ticket"], axis=1, inplace=True)
df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C


## Change categorical features to numeric

Sklearn algorithm requires the data to be numeric type more specifically integer. The following section will change Sex, Embarked, and Age to numeric

In [0]:
# change sex feature to numeric
for index, value in df["Sex"].items():
    if value == "male":
        df.iloc[index, 2] = 0 # male -> 0
    else:
        df.iloc[index, 2] = 1 # female -> 1

# change embarked feature to numeric
for index, value in df["Embarked"].items():
    if value == 'S':
        df.iloc[index, 6] = 0 # S -> 0
    elif value == 'C':
        df.iloc[index, 6] = 1 # C -> 1
    else:
        df.iloc[index, 6] = 2 # Q -> 1

In [159]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 7 columns):
Survived    156 non-null int64
Pclass      156 non-null int64
Sex         156 non-null int64
Age         126 non-null float64
SibSp       156 non-null int64
Parch       156 non-null int64
Embarked    156 non-null int64
dtypes: float64(1), int64(6)
memory usage: 8.6 KB


Age will be changed after removing null values

## Remove rows that contain null values

As the features that contain null values is Age, the method to handle null values is removal. Because that feature contains information that cannot be replaced

In [160]:
# Before removal
print(df.shape)

df.dropna(axis=0, inplace=True)

# After removal
print(df.shape)

(156, 7)
(126, 7)


Change age attribute to integer

In [0]:
# change age feature to integer
df["Age"] = df["Age"].astype('int')

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126 entries, 0 to 155
Data columns (total 7 columns):
Survived    126 non-null int64
Pclass      126 non-null int64
Sex         126 non-null int64
Age         126 non-null int64
SibSp       126 non-null int64
Parch       126 non-null int64
Embarked    126 non-null int64
dtypes: int64(7)
memory usage: 7.9 KB


## Normalize dataset

Logistic regression will be optimized using normalized data

In [163]:
from sklearn import preprocessing
X = np.asarray(df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]])
X = preprocessing.StandardScaler().fit(X).transform(X)
X[:5]

array([[ 0.8149405 , -0.7713214 , -0.41959477,  0.26047943, -0.49423044,
        -0.49913719],
       [-1.65933669,  1.2964764 ,  0.6804092 ,  0.26047943, -0.49423044,
         1.3506065 ],
       [ 0.8149405 ,  1.2964764 , -0.14459378, -0.62655862, -0.49423044,
        -0.49913719],
       [-1.65933669,  1.2964764 ,  0.47415846,  0.26047943, -0.49423044,
        -0.49913719],
       [ 0.8149405 , -0.7713214 ,  0.47415846, -0.62655862, -0.49423044,
        -0.49913719]])

In [164]:
y = np.asarray(df[["Survived"]])
y[:5]

array([[0],
       [1],
       [1],
       [1],
       [0]])

# Train Model

## Train/Test SplitData

The data will be converted to train and test dataset. With 0.8 portion for the trainset and the rest for testset

In [165]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("Train set : ", X_train.shape, y_train.shape)
print("Test set : ", X_test.shape, y_test.shape)

Train set :  (100, 6) (100, 1)
Test set :  (26, 6) (26, 1)


## Train model

Three classifier will be used in this project namely: 


1.   Logistic Regression
1.  KNearest Neighbor
2.   Decision Tree



In [167]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

logReg = LogisticRegression().fit(X_train, y_train)
decTree = DecisionTreeClassifier().fit(X_train, y_train)
kNN = KNeighborsClassifier().fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  


Predict x_test labels

In [0]:
yhat_logReg = logReg.predict(X_test)
yhat_decTree = decTree.predict(X_test)
yhat_kNN = kNN.predict(X_test)

# Evaluate Model

## Jaccard Index

In [174]:
from sklearn.metrics import jaccard_score

print("Logistic Regression :")
print(jaccard_score(y_test, yhat_logReg))
print()

print("Decision Tree :")
print(jaccard_score(y_test, yhat_decTree))
print()

print("KNN :")
print(jaccard_score(y_test, yhat_kNN))
print()

Logistic Regression :
0.7272727272727273

Decision Tree :
0.2857142857142857

KNN :
0.6



## Confusion Matrix

In [175]:
from sklearn.metrics import classification_report

print("Logistic Regression : \n")
print(classification_report(y_test, yhat_logReg))

print("Decision Tree : \n")
print(classification_report(y_test, yhat_decTree))

print("KNN: \n")
print(classification_report(y_test, yhat_kNN))

Logistic Regression : 

              precision    recall  f1-score   support

           0       0.88      0.94      0.91        16
           1       0.89      0.80      0.84        10

    accuracy                           0.88        26
   macro avg       0.89      0.87      0.88        26
weighted avg       0.88      0.88      0.88        26

Decision Tree : 

              precision    recall  f1-score   support

           0       0.67      0.75      0.71        16
           1       0.50      0.40      0.44        10

    accuracy                           0.62        26
   macro avg       0.58      0.57      0.58        26
weighted avg       0.60      0.62      0.61        26

KNN: 

              precision    recall  f1-score   support

           0       0.80      1.00      0.89        16
           1       1.00      0.60      0.75        10

    accuracy                           0.85        26
   macro avg       0.90      0.80      0.82        26
weighted avg       0.88  