# Bagging

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df.dropna(inplace=True)

# Data Preprocessing

In [5]:
X = df[['pclass','sex','age']]

In [6]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

In [7]:
X['sex'] = lb.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
X.head()

Unnamed: 0,pclass,sex,age
1,1,0,38.0
3,1,0,35.0
6,1,1,54.0
10,3,0,4.0
11,1,0,58.0


In [9]:
y = df[['survived']]

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=42)

In [13]:
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    if train:
        '''
        training performance
        '''
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))    
        

In [15]:
clf = DecisionTreeClassifier(random_state=42)

In [16]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [17]:
print_score(clf,X_train,y_train,X_test,y_test,train=True)

Train Result:

accuracy score: 0.9725

Classification Report: 
              precision    recall  f1-score   support

          0       0.92      1.00      0.96        33
          1       1.00      0.96      0.98        76

avg / total       0.97      0.97      0.97       109


Confusion Matrix: 
 [[33  0]
 [ 3 73]]

Average Accuracy: 	 0.8039
Accuracy SD: 		 0.1125


In [18]:
print_score(clf,X_train,y_train,X_test,y_test,train=False)

Test Result:

accuracy score: 0.7397

Classification Report: 
              precision    recall  f1-score   support

          0       0.65      0.58      0.61        26
          1       0.78      0.83      0.80        47

avg / total       0.73      0.74      0.74        73


Confusion Matrix: 
 [[15 11]
 [ 8 39]]



In [19]:
bag_clf  = BaggingClassifier(base_estimator=clf, n_estimators=1000,bootstrap=True, random_state=42, n_jobs=-1)

In [20]:
bag_clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=1000, n_jobs=-1, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

In [21]:
print_score(bag_clf,X_train,y_train,X_test,y_test,train=True)

Train Result:

accuracy score: 0.9725

Classification Report: 
              precision    recall  f1-score   support

          0       0.94      0.97      0.96        33
          1       0.99      0.97      0.98        76

avg / total       0.97      0.97      0.97       109


Confusion Matrix: 
 [[32  1]
 [ 2 74]]



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average Accuracy: 	 0.8039
Accuracy SD: 		 0.1125


In [22]:
print_score(bag_clf,X_train,y_train,X_test,y_test,train=False)

Test Result:

accuracy score: 0.7534

Classification Report: 
              precision    recall  f1-score   support

          0       0.68      0.58      0.62        26
          1       0.78      0.85      0.82        47

avg / total       0.75      0.75      0.75        73


Confusion Matrix: 
 [[15 11]
 [ 7 40]]



In [23]:
bag_clf = BaggingClassifier(base_estimator=clf, n_estimators=1000,n_jobs=-1,bootstrap=True,oob_score=True,random_state=42)

In [24]:
bag_clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=1000, n_jobs=-1, oob_score=True,
         random_state=42, verbose=0, warm_start=False)

In [25]:
bag_clf.oob_score_

0.8256880733944955

In [26]:
print_score(bag_clf,X_train,y_train,X_test,y_test,train=True)

Train Result:

accuracy score: 0.9725

Classification Report: 
              precision    recall  f1-score   support

          0       0.94      0.97      0.96        33
          1       0.99      0.97      0.98        76

avg / total       0.97      0.97      0.97       109


Confusion Matrix: 
 [[32  1]
 [ 2 74]]



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average Accuracy: 	 0.8039
Accuracy SD: 		 0.1125


In [27]:
print_score(bag_clf,X_train,y_train,X_test,y_test,train=False)

Test Result:

accuracy score: 0.7534

Classification Report: 
              precision    recall  f1-score   support

          0       0.68      0.58      0.62        26
          1       0.78      0.85      0.82        47

avg / total       0.75      0.75      0.75        73


Confusion Matrix: 
 [[15 11]
 [ 7 40]]



In [29]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
ra_clf = RandomForestClassifier(random_state=42)

In [32]:
ra_clf.fit(X_train,y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [33]:
print_score(ra_clf,X_train,y_train,X_test,y_test,train=True)

Train Result:

accuracy score: 0.9633

Classification Report: 
              precision    recall  f1-score   support

          0       0.94      0.94      0.94        33
          1       0.97      0.97      0.97        76

avg / total       0.96      0.96      0.96       109


Confusion Matrix: 
 [[31  2]
 [ 2 74]]



  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Average Accuracy: 	 0.8006
Accuracy SD: 		 0.1266


In [34]:
print_score(ra_clf,X_train,y_train,X_test,y_test,train=False)

Test Result:

accuracy score: 0.7945

Classification Report: 
              precision    recall  f1-score   support

          0       0.76      0.62      0.68        26
          1       0.81      0.89      0.85        47

avg / total       0.79      0.79      0.79        73


Confusion Matrix: 
 [[16 10]
 [ 5 42]]



In [35]:
from sklearn.ensemble import ExtraTreesClassifier

In [37]:
ex_clf = ExtraTreesClassifier(random_state=42)

In [38]:
ex_clf.fit(X_train,y_train)

  """Entry point for launching an IPython kernel.


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [41]:
print_score(ex_clf,X_train,y_train,X_test,y_test,train=True)

Train Result:

accuracy score: 0.9725

Classification Report: 
              precision    recall  f1-score   support

          0       0.92      1.00      0.96        33
          1       1.00      0.96      0.98        76

avg / total       0.97      0.97      0.97       109


Confusion Matrix: 
 [[33  0]
 [ 3 73]]

Average Accuracy: 	 0.7873
Accuracy SD: 		 0.1282


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
