In [950]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import graphviz

In [951]:
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [952]:
titanic.shape

(891, 15)

In [953]:
titanic['class'] = titanic['class'].cat.codes
titanic['deck'] = titanic['deck'].cat.codes

In [954]:
titanic['adult_male'] = titanic['adult_male'].map({True:1, False:0})
titanic['alone'] = titanic['alone'].map({True:1, False:0})
titanic['sex'] = titanic['sex'].map({'male':1, 'female':0})
titanic['alive'] = titanic['alive'].map({'no':0, 'yes':1})

In [955]:
titanic[titanic['embarked'].isna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
61,1,1,0,38.0,0,0,80.0,,0,woman,0,1,,1,1
829,1,1,0,62.0,0,0,80.0,,0,woman,0,1,,1,1


In [956]:
titanic = titanic.drop([61,829])

In [957]:
lb = LabelEncoder()
titanic['embarked'] = lb.fit_transform(titanic['embarked'])

In [958]:
titanic['who'] = lb.fit_transform(titanic['who'])

In [959]:
titanic['embark_town'] = lb.fit_transform(titanic['embark_town'])

In [960]:
titanic.dtypes

survived         int64
pclass           int64
sex              int64
age            float64
sibsp            int64
parch            int64
fare           float64
embarked         int32
class             int8
who              int32
adult_male       int64
deck              int8
embark_town      int32
alive            int64
alone            int64
dtype: object

In [961]:
titanic['sex'] = lb.fit_transform(titanic['sex'])

In [962]:
titanic.loc[titanic['age'].isna(), 'age'] = titanic['age'].mean()

In [963]:
titanic.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

# Modelling

In [964]:
X = titanic.iloc[:,1:]
y = titanic.iloc[:, 0]

In [965]:
X = X.drop('alive', axis = 1)

In [966]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [967]:
tree = DecisionTreeClassifier()

In [968]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [969]:
print('{0:.2f}'.format(accuracy_score(y_test, tree.predict(X_test))*100))

78.65


In [970]:
confusion_matrix(y_test, tree.predict(X_test))

array([[46, 12],
       [ 7, 24]], dtype=int64)

In [971]:
print('{0:.2f}'.format(cross_val_score(tree, X_train, y_train, scoring = 'accuracy', cv = 10).mean()*100))

78.37


In [972]:
img = export_graphviz(tree, filled = True, rounded = True, out_file='./tree.dot')

In [973]:
!dot -Tpng tree.dot -o tree.png

In [2]:
![](img src = './tree.png')

'[]' is not recognized as an internal or external command,
operable program or batch file.


# Bagging 

In [977]:
bag = BaggingClassifier(tree, n_estimators = 5000, bootstrap = True, bootstrap_features=True)

In [978]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=True, max_features=1.0,
         max_samples=1.0, n_estimators=5000, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [979]:
accuracy_score(y_test, bag.predict(X_test))*100

83.14606741573034

In [980]:
confusion_matrix(y_test, bag.predict(X_test))

array([[51,  7],
       [ 8, 23]], dtype=int64)

In [981]:
print('{0:.2f}'.format(cross_val_score(bag, X_train, y_train, scoring = 'accuracy', cv = 10).mean()*100))

83.00


> - <i><b>The accuracy went up to 83.14(Bagging) from 78.65(Decision Tree)</b></i><br>
- <i><b>Cross Val Score went up to 83(Bagging) from 78.37(Decision Tree)</b></i><br><br>
<b> Hence, bagging increases the performance for sure!</b>