## Ensemble Learning
https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/

In [272]:
import pandas as pd
from sklearn.model_selection import train_test_split , StratifiedKFold 
import numpy as np

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB ,MultinomialNB ,BernoulliNB
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score, confusion_matrix, classification_report,f1_score
from sklearn.preprocessing import OneHotEncoder
from prettytable import PrettyTable

In [3]:
df = pd.read_csv('golf.csv')

In [4]:
df.shape

(25, 5)

In [5]:
features = list(df.columns[:4])
X = df[features]
# X


In [6]:
pg = {'No':0,'Yes':1}
y = df['PLAY GOLF']
# y[0].map(pg)
y = y.map(pg)  
# y

## OneHot Encoding

In [7]:
X= pd.get_dummies(X)

In [8]:
X

Unnamed: 0,TEMPERATURE_Overcast,TEMPERATURE_Rainy,TEMPERATURE_Sunny,HUMIDITY_Cool,HUMIDITY_Hot,HUMIDITY_Mild,WINDY_High,WINDY_Normal,DAYOFWEEK_F,DAYOFWEEK_M,DAYOFWEEK_S,DAYOFWEEK_Su,DAYOFWEEK_T,DAYOFWEEK_W
0,0,1,0,0,1,0,1,0,0,1,0,0,0,0
1,0,1,0,0,1,0,1,0,0,0,0,0,1,0
2,1,0,0,0,1,0,1,0,0,1,0,0,0,0
3,0,0,1,0,0,1,1,0,0,0,0,0,1,0
4,0,0,1,1,0,0,0,1,1,0,0,0,0,0
5,0,0,1,1,0,0,0,1,0,0,0,0,0,1
6,1,0,0,1,0,0,0,1,0,0,0,1,0,0
7,0,1,0,0,0,1,1,0,1,0,0,0,0,0
8,0,1,0,1,0,0,0,1,0,0,0,0,0,1
9,0,0,1,0,0,1,0,1,0,0,1,0,0,0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)
# X_test
X_train

(18, 14) (18,)
(7, 14) (7,)


Unnamed: 0,TEMPERATURE_Overcast,TEMPERATURE_Rainy,TEMPERATURE_Sunny,HUMIDITY_Cool,HUMIDITY_Hot,HUMIDITY_Mild,WINDY_High,WINDY_Normal,DAYOFWEEK_F,DAYOFWEEK_M,DAYOFWEEK_S,DAYOFWEEK_Su,DAYOFWEEK_T,DAYOFWEEK_W
21,0,0,1,1,0,0,0,1,0,0,0,0,0,1
3,0,0,1,0,0,1,1,0,0,0,0,0,1,0
6,1,0,0,1,0,0,0,1,0,0,0,1,0,0
5,0,0,1,1,0,0,0,1,0,0,0,0,0,1
14,0,0,1,1,0,0,0,1,0,0,0,0,0,1
10,0,1,0,0,0,1,0,1,0,0,0,1,0,0
16,0,0,1,1,0,0,0,1,1,0,0,0,0,0
24,0,0,1,1,0,0,0,1,0,0,0,0,1,0
20,0,1,0,0,1,0,1,0,0,0,0,0,1,0
1,0,1,0,0,1,0,1,0,0,0,0,0,1,0


## Stacking

Create a third model, logistic regression, on the predictions of the decision tree and knn models.

In [10]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

In [13]:
# est = svm.SVC(kernel="rbf", C = 10 ,gamma=30)
# model = KNeighborsClassifier(n_neighbors=3,weights='distance')

estimators = [
    ('dt', tree.DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0,
                       random_state=None, splitter='best')),
    ('svc', (svm.SVC(kernel="poly", C = 10 ,gamma=30))),('knn',KNeighborsClassifier(n_neighbors=1,weights='distance'))
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

In [14]:
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

1.0

In [15]:
predicted_labels = clf.predict(X_test)
print(predicted_labels)

[1 0 1 1 0 1 0]


In [16]:
print ('Accuracy:', accuracy_score(y_test, predicted_labels))
print ('F1 score:', f1_score(y_test, predicted_labels))
print ('Recall:', recall_score(y_test, predicted_labels))
print ('Precision:', precision_score(y_test, predicted_labels))
print ('\n clasification report:\n', classification_report(y_test,predicted_labels))
print ('\n confussion matrix:\n',confusion_matrix(y_test, predicted_labels))


Accuracy: 0.7142857142857143
F1 score: 0.75
Recall: 0.75
Precision: 0.75

 clasification report:
               precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.75      0.75      0.75         4

    accuracy                           0.71         7
   macro avg       0.71      0.71      0.71         7
weighted avg       0.71      0.71      0.71         7


 confussion matrix:
 [[2 1]
 [1 3]]


# Bagging


In [154]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification

#X_train = np.array(X_train)
#y_train = np.array(y_train)

In [44]:
X_train.shape[1]

14

In [203]:
clf = BaggingClassifier(base_estimator=SVC(C=10,kernel='rbf',gamma=0.5),
                        n_estimators=5, random_state=0 ,verbose=0)
clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=SVC(C=10, break_ties=False, cache_size=200,
                                     class_weight=None, coef0=0.0,
                                     decision_function_shape='ovr', degree=3,
                                     gamma=0.5, kernel='rbf', max_iter=-1,
                                     probability=False, random_state=None,
                                     shrinking=True, tol=0.001, verbose=False),
                  bootstrap=True, bootstrap_features=False, max_features=1.0,
                  max_samples=1.0, n_estimators=5, n_jobs=None, oob_score=False,
                  random_state=0, verbose=0, warm_start=False)

In [204]:
predicted_labels =clf.predict(X_test)

In [205]:
print(predicted_labels)

[1 0 1 1 0 1 0]


In [206]:
print ('Accuracy:', accuracy_score(y_test, predicted_labels))
print ('F1 score:', f1_score(y_test, predicted_labels))
print ('Recall:', recall_score(y_test, predicted_labels))
print ('Precision:', precision_score(y_test, predicted_labels))
print ('\n clasification report:\n', classification_report(y_test,predicted_labels))
print ('\n confussion matrix:\n',confusion_matrix(y_test, predicted_labels))


Accuracy: 0.7142857142857143
F1 score: 0.75
Recall: 0.75
Precision: 0.75

 clasification report:
               precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.75      0.75      0.75         4

    accuracy                           0.71         7
   macro avg       0.71      0.71      0.71         7
weighted avg       0.71      0.71      0.71         7


 confussion matrix:
 [[2 1]
 [1 3]]


In [162]:
clf.n_classes_

2

## AdaBoost

In [207]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=1,base_estimator=SVC(C=30,kernel='rbf',gamma=1),algorithm='SAMME')

In [208]:
model.fit(X_train, y_train)
model.score(X_test,y_test)

0.8571428571428571

In [211]:
model.estimator_errors_

array([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

## Gradient Boosting (GBM)

In [242]:
from sklearn.ensemble import GradientBoostingClassifier
model= GradientBoostingClassifier(learning_rate=1.11,random_state=1,loss = 'exponential',subsample=.8)
model.fit(X_train, y_train)
model.score(X_train, y_train),model.score(X_test,y_test)


(1.0, 0.7142857142857143)

In [268]:


for i in np.around((np.arange (0,1,.1)),decimals=1):
    print( i)

0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9


In [275]:
for i in np.around((np.arange (.1,1,.1)),decimals=1):
    for j in np.around((np.arange (.1,1,.1)),decimals=1):
        model= GradientBoostingClassifier(learning_rate=j,random_state=1,loss = 'exponential',subsample=i,max_features=2)
        model.fit(X_train, y_train)
        train = round(model.score(X_train, y_train),2)
        test = round(model.score(X_test,y_test),2)
        t = PrettyTable(['Subsample', 'Learningrate','Train','Test'])
        t.add_row([i, j,train,test])
#         t.add_row(['j', j])
        print(t)
#         print('subsample:',i,'--','learning_rate:',j,'--','Train:',model.score(X_train, y_train),'--','Test:',model.score(X_test,y_test))

+-----------+--------------+-------+------+
| Subsample | Learningrate | Train | Test |
+-----------+--------------+-------+------+
|    0.1    |     0.1      |  0.5  | 0.57 |
+-----------+--------------+-------+------+
+-----------+--------------+-------+------+
| Subsample | Learningrate | Train | Test |
+-----------+--------------+-------+------+
|    0.1    |     0.2      |  0.5  | 0.57 |
+-----------+--------------+-------+------+
+-----------+--------------+-------+------+
| Subsample | Learningrate | Train | Test |
+-----------+--------------+-------+------+
|    0.1    |     0.3      |  0.5  | 0.57 |
+-----------+--------------+-------+------+
+-----------+--------------+-------+------+
| Subsample | Learningrate | Train | Test |
+-----------+--------------+-------+------+
|    0.1    |     0.4      |  0.5  | 0.57 |
+-----------+--------------+-------+------+
+-----------+--------------+-------+------+
| Subsample | Learningrate | Train | Test |
+-----------+--------------+----

In [221]:
model.get_params

<bound method BaseEstimator.get_params of GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=20.0, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=1, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)>

In [237]:
predicted_labels = clf.predict(X_test)
print(predicted_labels)

[1 0 1 1 0 1 0]


In [235]:
print ('Accuracy:', accuracy_score(y_test, predicted_labels))
print ('F1 score:', f1_score(y_test, predicted_labels))
print ('Recall:', recall_score(y_test, predicted_labels))
print ('Precision:', precision_score(y_test, predicted_labels))
print ('\n clasification report:\n', classification_report(y_test,predicted_labels))
print ('\n confussion matrix:\n',confusion_matrix(y_test, predicted_labels))

Accuracy: 0.7142857142857143
F1 score: 0.75
Recall: 0.75
Precision: 0.75

 clasification report:
               precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.75      0.75      0.75         4

    accuracy                           0.71         7
   macro avg       0.71      0.71      0.71         7
weighted avg       0.71      0.71      0.71         7


 confussion matrix:
 [[2 1]
 [1 3]]


## XGBoost
XGBoost (extreme Gradient Boosting) is an advanced implementation of the gradient boosting algorithm.  
XGBoost has proved to be a highly effective ML algorithm.  
XGBoost has high predictive power and is almost 10 times faster than the other gradient boosting techniques.  
It also includes a variety of regularization which reduces overfitting and improves overall performance.  
Hence it is also known as ‘regularized boosting‘ technique.

In [277]:
import xgboost as xgb
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
model.fit(X_train, y_train)
model.score(X_test,y_test)


0.7142857142857143

In [279]:
model.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)>

In [283]:
for i in np.around((np.arange (.1,1,.1)),decimals=1):
    for j in np.around((np.arange (.1,1,.1)),decimals=1):
        model= xgb.XGBClassifier(learning_rate=j,random_state=1,loss = 'exponential',subsample=0.9,gamma=i)
        model.fit(X_train, y_train)
        train = round(model.score(X_train, y_train),2)
        test = round(model.score(X_test,y_test),2)
        t = PrettyTable(['gamma', 'Learningrate','Train','Test'])
        t.add_row([i, j,train,test])
#         t.add_row(['j', j])
        print(t)

+-------+--------------+-------+------+
| gamma | Learningrate | Train | Test |
+-------+--------------+-------+------+
|  0.1  |     0.1      |  0.89 | 0.86 |
+-------+--------------+-------+------+
+-------+--------------+-------+------+
| gamma | Learningrate | Train | Test |
+-------+--------------+-------+------+
|  0.1  |     0.2      |  0.89 | 0.86 |
+-------+--------------+-------+------+
+-------+--------------+-------+------+
| gamma | Learningrate | Train | Test |
+-------+--------------+-------+------+
|  0.1  |     0.3      |  0.89 | 0.86 |
+-------+--------------+-------+------+
+-------+--------------+-------+------+
| gamma | Learningrate | Train | Test |
+-------+--------------+-------+------+
|  0.1  |     0.4      |  0.89 | 0.86 |
+-------+--------------+-------+------+
+-------+--------------+-------+------+
| gamma | Learningrate | Train | Test |
+-------+--------------+-------+------+
|  0.1  |     0.5      |  0.89 | 0.86 |
+-------+--------------+-------+------+
