In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

In [2]:
df=sns.load_dataset('titanic')

In [3]:
df.shape

(891, 15)

In [4]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Data preprocessing
# X as pclass,sex,age
# Y as survived

In [5]:
subset=df[['pclass','age','sex','survived']].copy()
subset.dropna(inplace=True)

In [6]:
X=subset[['pclass','age','sex']].copy()

In [7]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

In [8]:
X['sex']=le.fit_transform(subset['sex'])

In [9]:
X.head()

Unnamed: 0,pclass,age,sex
0,3,22.0,1
1,1,38.0,0
2,3,26.0,0
3,1,35.0,0
4,3,35.0,1


In [10]:
X.shape

(714, 3)

In [11]:
Y=subset[['survived']].copy()

In [12]:
Y.shape

(714, 1)

In [13]:
X.describe()

Unnamed: 0,pclass,age,sex
count,714.0,714.0,714.0
mean,2.236695,29.699118,0.634454
std,0.83825,14.526497,0.481921
min,1.0,0.42,0.0
25%,1.0,20.125,0.0
50%,2.0,28.0,1.0
75%,3.0,38.0,1.0
max,3.0,80.0,1.0


In [14]:
Y.describe()

Unnamed: 0,survived
count,714.0
mean,0.406162
std,0.49146
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


# Making of a general function 

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.1,random_state=0)

In [17]:
from sklearn import preprocessing
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score

In [18]:
from sklearn.model_selection import cross_val_score,cross_val_predict

In [19]:
import numpy as np
def print_score(model,X_train,X_test,Y_train,Y_test,train=True):
    lb=preprocessing.LabelBinarizer()
    lb.fit(Y_train)
    if train:
        pred=model.predict(X_train)
        print("TRAIN RESULT:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(Y_train, 
                                                                pred)))
        print("Classification Report: \n {}\n".format(classification_report(Y_train, 
                                                                            pred)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(Y_train, 
                                                                  pred)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(Y_train), 
                                                      lb.transform(pred))))

        res = cross_val_score(model, X_train, Y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(pred)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(pred)))
        
    elif train==False:
        '''
        test performance
        '''
        pred_test = model.predict(X_test)
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(Y_test, 
                                                                pred_test)))
        print("Classification Report: \n {}\n".format(classification_report(Y_test, 
                                                                            pred_test)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(Y_test, 
                                                                  pred_test)))   
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(Y_test), 
                                                      lb.transform(pred_test))))

# Fitting in decision tree classifier

In [20]:
model=DecisionTreeClassifier()

In [21]:
model.fit(X_train,Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [22]:
print_score(model,X_train,X_test,Y_train,Y_test,train=True)
print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
print_score(model,X_train,X_test,Y_train,Y_test,train=False)

TRAIN RESULT:

accuracy score: 0.8910

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.97      0.91       380
           1       0.95      0.77      0.85       262

    accuracy                           0.89       642
   macro avg       0.91      0.87      0.88       642
weighted avg       0.90      0.89      0.89       642


Confusion Matrix: 
 [[370  10]
 [ 60 202]]

ROC AUC: 0.8723

Average Accuracy: 	 0.3302
Accuracy SD: 		 0.4703
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Test Result:

accuracy score: 0.8194

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.84      0.85        44
           1       0.76      0.79      0.77        28

    accuracy                           0.82        72
   macro avg       0.81      0.81      0.81        72
weighted avg       0.82      0.82      0.82        72


Confusion Matrix: 
 [[37  7]
 [ 6 22]]

ROC AUC: 0.8133

# Bagging Classifier (oob_score=False)

In [23]:
model_2=BaggingClassifier(base_estimator=model,n_estimators=1000,n_jobs=-1,bootstrap=True,random_state=42)
model_2.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [24]:
print_score(model_2, X_train, X_test, Y_train, Y_test, train=True)
print("\n********************************\n")
print_score(model_2, X_train, X_test, Y_train, Y_test, train=False)

TRAIN RESULT:

accuracy score: 0.8910

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.94      0.91       380
           1       0.91      0.81      0.86       262

    accuracy                           0.89       642
   macro avg       0.90      0.88      0.89       642
weighted avg       0.89      0.89      0.89       642


Confusion Matrix: 
 [[359  21]
 [ 49 213]]

ROC AUC: 0.8789



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average Accuracy: 	 0.3645
Accuracy SD: 		 0.4813

********************************

Test Result:

accuracy score: 0.8194

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.82      0.85        44
           1       0.74      0.82      0.78        28

    accuracy                           0.82        72
   macro avg       0.81      0.82      0.81        72
weighted avg       0.83      0.82      0.82        72


Confusion Matrix: 
 [[36  8]
 [ 5 23]]

ROC AUC: 0.8198



In [25]:
model_3=BaggingClassifier(base_estimator=model,n_estimators=1000,n_jobs=-1,oob_score=True,random_state=42)

In [26]:
model_3.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [27]:
print_score(model_3,X_train,X_test,Y_train,Y_test,train=True)
print('vghdjdfshjhfgshfsghfshfs')
print_score(model_3,X_train,X_test,Y_train,Y_test,train=False)

TRAIN RESULT:

accuracy score: 0.8910

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.94      0.91       380
           1       0.91      0.81      0.86       262

    accuracy                           0.89       642
   macro avg       0.90      0.88      0.89       642
weighted avg       0.89      0.89      0.89       642


Confusion Matrix: 
 [[359  21]
 [ 49 213]]

ROC AUC: 0.8789



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average Accuracy: 	 0.3645
Accuracy SD: 		 0.4813
vghdjdfshjhfgshfsghfshfs
Test Result:

accuracy score: 0.8194

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.82      0.85        44
           1       0.74      0.82      0.78        28

    accuracy                           0.82        72
   macro avg       0.81      0.82      0.81        72
weighted avg       0.83      0.82      0.82        72


Confusion Matrix: 
 [[36  8]
 [ 5 23]]

ROC AUC: 0.8198



In [28]:
model_3.oob_score_

0.8006230529595015

# RandomClassifier

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
model_4=RandomForestClassifier(random_state=42,n_estimators=100)

In [31]:
model_4.fit(X_train,Y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [32]:
print_score(model_4,X_train,X_test,Y_train,Y_test,train=True)
print('vghdjdfshjhfgshfsghfshfs')
print_score(model_4,X_train,X_test,Y_train,Y_test,train=False)

TRAIN RESULT:

accuracy score: 0.8910

Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.93      0.91       380
           1       0.90      0.83      0.86       262

    accuracy                           0.89       642
   macro avg       0.89      0.88      0.89       642
weighted avg       0.89      0.89      0.89       642


Confusion Matrix: 
 [[355  25]
 [ 45 217]]

ROC AUC: 0.8812



  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Average Accuracy: 	 0.3769
Accuracy SD: 		 0.4846
vghdjdfshjhfgshfsghfshfs
Test Result:

accuracy score: 0.8333

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.80      0.85        44
           1       0.74      0.89      0.81        28

    accuracy                           0.83        72
   macro avg       0.83      0.84      0.83        72
weighted avg       0.85      0.83      0.84        72


Confusion Matrix: 
 [[35  9]
 [ 3 25]]

ROC AUC: 0.8442



# GridSearchCV

# ExtraTreeClassifier

https://colab.research.google.com/drive/1w_yfO52dY_pZbQ3aZf0RJAsojYEDuZuc

# AdaBoost

In [33]:
from sklearn.ensemble import AdaBoostClassifier

In [34]:
ada=AdaBoostClassifier(n_estimators=100,random_state=42)

In [35]:
ada.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=42)

In [36]:
print_score(ada,X_train,X_test,Y_train,Y_test,train=True)
print("############################################33")
print_score(ada,X_train,X_test,Y_train,Y_test,train=False)

TRAIN RESULT:

accuracy score: 0.8178

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.83      0.84       380
           1       0.76      0.80      0.78       262

    accuracy                           0.82       642
   macro avg       0.81      0.82      0.81       642
weighted avg       0.82      0.82      0.82       642


Confusion Matrix: 
 [[315  65]
 [ 52 210]]

ROC AUC: 0.8152



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average Accuracy: 	 0.4283
Accuracy SD: 		 0.4948
############################################33
Test Result:

accuracy score: 0.8472

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.84      0.87        44
           1       0.77      0.86      0.81        28

    accuracy                           0.85        72
   macro avg       0.84      0.85      0.84        72
weighted avg       0.85      0.85      0.85        72


Confusion Matrix: 
 [[37  7]
 [ 4 24]]

ROC AUC: 0.8490



# AdaBoost with Random Forest

In [37]:
ada_clf=AdaBoostClassifier(RandomForestClassifier(n_estimators=100),n_estimators=100)

In [38]:
ada_clf.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=RandomForestClassifier(bootstrap=True,
                                                         ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features='auto',
                                                         max_leaf_nodes=None,
                                                         max_samples=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                       

In [39]:
print_score(ada_clf,X_train,X_test,Y_train,Y_test,train=True)
print("############################################33")
print_score(ada_clf,X_train,X_test,Y_train,Y_test,train=False)

TRAIN RESULT:

accuracy score: 0.8910

Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.94      0.91       380
           1       0.90      0.82      0.86       262

    accuracy                           0.89       642
   macro avg       0.89      0.88      0.89       642
weighted avg       0.89      0.89      0.89       642


Confusion Matrix: 
 [[356  24]
 [ 46 216]]

ROC AUC: 0.8806



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average Accuracy: 	 0.3738
Accuracy SD: 		 0.4838
############################################33
Test Result:

accuracy score: 0.8333

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.82      0.86        44
           1       0.75      0.86      0.80        28

    accuracy                           0.83        72
   macro avg       0.82      0.84      0.83        72
weighted avg       0.84      0.83      0.83        72


Confusion Matrix: 
 [[36  8]
 [ 4 24]]

ROC AUC: 0.8377



In [40]:
from sklearn.ensemble import GradientBoostingClassifier

In [41]:
clf=GradientBoostingClassifier()

In [42]:
clf.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [43]:
print_score(clf,X_train,X_test,Y_train,Y_test,train=True)
print("############################################33")
print_score(clf,X_train,X_test,Y_train,Y_test,train=False)

TRAIN RESULT:

accuracy score: 0.8629

Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.96      0.89       380
           1       0.92      0.73      0.81       262

    accuracy                           0.86       642
   macro avg       0.88      0.84      0.85       642
weighted avg       0.87      0.86      0.86       642


Confusion Matrix: 
 [[363  17]
 [ 71 191]]

ROC AUC: 0.8421



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average Accuracy: 	 0.3240
Accuracy SD: 		 0.4680
############################################33
Test Result:

accuracy score: 0.8333

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.89      0.87        44
           1       0.81      0.75      0.78        28

    accuracy                           0.83        72
   macro avg       0.83      0.82      0.82        72
weighted avg       0.83      0.83      0.83        72


Confusion Matrix: 
 [[39  5]
 [ 7 21]]

ROC AUC: 0.8182



# XGBoost