In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams

from sklearn import tree, preprocessing, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report

from IPython.display import Image, display

In [2]:
url="https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
titanic = pd.read_csv(url)

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
tit_input = titanic.drop(titanic.columns[[0,1,3,8,10,11]], axis = 1) 

In [5]:
tit_input.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


In [6]:
tit_input.dtypes

Pclass      int64
Sex        object
Age       float64
SibSp       int64
Parch       int64
Fare      float64
dtype: object

In [7]:
tit_input.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [8]:
tit_input = tit_input.join(pd.get_dummies(tit_input['Sex'], prefix='Sex', drop_first=True))

In [9]:
tit_input.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Sex_male
0,3,male,22.0,1,0,7.25,1
1,1,female,38.0,1,0,71.2833,0
2,3,female,26.0,0,0,7.925,0
3,1,female,35.0,1,0,53.1,0
4,3,male,35.0,0,0,8.05,1


In [10]:
tit_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Sex_male    891 non-null uint8
dtypes: float64(2), int64(3), object(1), uint8(1)
memory usage: 42.7+ KB


In [11]:
tit_input[tit_input.Age.isnull() == True].head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Sex_male
5,3,male,,0,0,8.4583,1
17,2,male,,0,0,13.0,1
19,3,female,,0,0,7.225,0
26,3,male,,0,0,7.225,1
28,3,female,,0,0,7.8792,0


In [12]:
avg_age = round(tit_input.Age.mean(),0)
tit_input.Age = tit_input.Age.fillna(avg_age)

In [13]:
tit_input.Age.head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5    30.0
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [14]:
X = tit_input.drop(['Sex'], axis=1)
y = titanic['Survived']

In [15]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,0
4,3,35.0,0,0,8.05,1


In [16]:
# split data randomly into 70% training and 30% test
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=0)

## Train the model and make predictions

In [17]:
# train the decision tree with entropy
mydtree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=3)
mydtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=3,
            splitter='best')

In [18]:
# now use this model to make predictions with the test data
y_pred = mydtree.predict(X_test)

In [19]:
y_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0], dtype=int64)

## Evaluate the model's performance

In [20]:
# how did the model has performed
count_misclassified = (y_test !=y_pred).sum()
print('Misclassified sample: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified sample: 47
Accuracy: 0.82


In [21]:
from sklearn.model_selection import KFold
cv = KFold(n_splits = 5,
           random_state=12)

In [22]:
fold_accuracy = []

# titanic_train["Sex"] = encoded_sex

for train_fold, valid_fold in cv.split(X,y):
    train = X.loc[train_fold] # Extract train data with cv indices
    valid = X.loc[valid_fold] # Extract valid data with cv indices
    
    train_y = y.loc[train_fold]
    valid_y = y.loc[valid_fold]
    
    model = mydtree.fit(X = train, 
                           y = train_y)
    valid_acc = model.score(X = valid, 
                            y = valid_y)
    fold_accuracy.append(valid_acc)    

print("Accuracy per fold: ", fold_accuracy, "\n")
print("Average accuracy: ", sum(fold_accuracy)/len(fold_accuracy))

Accuracy per fold:  [0.8212290502793296, 0.8202247191011236, 0.797752808988764, 0.7752808988764045, 0.8595505617977528] 

Average accuracy:  0.8148076078086749


In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
scores = cross_val_score(estimator= mydtree,     # Model to test
                X= X,  
                y = y,      # Target variable
                scoring = "accuracy",               # Scoring metric    
                cv=cv)                              # Cross validation folds

print("Accuracy per fold: ")
print(scores)
print("Average accuracy: ", scores.mean())

Accuracy per fold: 
[0.82122905 0.82022472 0.79775281 0.7752809  0.85955056]
Average accuracy:  0.8148076078086749


In [25]:
metrics.confusion_matrix(y_test, y_pred)

# metrics.confusion_matrix(y_test, y_pred)

array([[148,  20],
       [ 27,  73]], dtype=int64)

In [26]:
metrics.accuracy_score(y_test, y_pred)

0.8246268656716418

In [27]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86       168
           1       0.78      0.73      0.76       100

   micro avg       0.82      0.82      0.82       268
   macro avg       0.82      0.81      0.81       268
weighted avg       0.82      0.82      0.82       268



In [28]:
probs = mydtree.predict_proba(X_test)
probs[:5]

array([[0.86111111, 0.13888889],
       [0.93975904, 0.06024096],
       [1.        , 0.        ],
       [0.01282051, 0.98717949],
       [0.51612903, 0.48387097]])

In [29]:
metrics.roc_auc_score(y_test,probs[:,1])

0.8998511904761904

- The model accuracy is above **80 %**
- with K-Fold and cross_val_score the average accuracy of the model is above **80 %**
- Also the roc_aur_score is **89 %**