
___
# Ensemble methods with Python

#Bagging #Boosting 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Get the Data


In [2]:
loan_df = pd.read_csv('loan_data.csv')

In [3]:
loan_df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
loan_df['not.fully.paid'].value_counts()

0    8045
1    1533
Name: not.fully.paid, dtype: int64

In [5]:
loan_df['purpose'].value_counts()

debt_consolidation    3957
all_other             2331
credit_card           1262
home_improvement       629
small_business         619
major_purchase         437
educational            343
Name: purpose, dtype: int64

In [6]:
loan_df = pd.get_dummies(loan_df,columns=['purpose'],drop_first=True)

In [7]:
loan_df.head(2)

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,1,0,0,0,0,0


In [8]:
predictors = loan_df.drop('not.fully.paid', axis = 1)
target = loan_df['not.fully.paid']

## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.30, random_state=101, stratify = target)

# Train the Decision Tree Classifier

In [11]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=None,
                              random_state=1)

dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [14]:
from sklearn.metrics import accuracy_score

# in-sample error

in_predictions = dt.predict(X_train)
print(accuracy_score(y_train, in_predictions))


1.0


In [15]:
predictions = dt.predict(X_test)

In [16]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [17]:
print(accuracy_score(y_test, predictions))

0.737647877523


## Let's try Bagging

In [18]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier


bagged_model = BaggingClassifier(base_estimator=None,
                        n_estimators=500, 
                        max_samples=1.0, 
                        max_features=1.0, 
                        bootstrap=True, 
                        bootstrap_features=False, 
                        n_jobs=1, 
                        random_state=1)


bagged_model.fit(X_train,y_train) 

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=500, n_jobs=1, oob_score=False, random_state=1,
         verbose=0, warm_start=False)

In [19]:
bagging_predictions = bagged_model.predict(X_test)

In [20]:
print(accuracy_score(y_test, bagging_predictions))

0.832289491997


## Let's try Boosting

In [21]:
from sklearn.ensemble import AdaBoostClassifier


ada = AdaBoostClassifier(base_estimator=None,
                         n_estimators=500, 
                         learning_rate=0.1,
                         random_state=0)

In [22]:
ada = ada.fit(X_train, y_train)
ada_predictions = ada.predict(X_test)

In [23]:
print(accuracy_score(y_test, ada_predictions))

0.839944328462


## Let's try Gradient Boosting

In [24]:
from sklearn.ensemble import GradientBoostingClassifier


gb = GradientBoostingClassifier(n_estimators=500)

gb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [25]:
gb_predictions = gb.predict(X_test)

In [26]:
print(accuracy_score(y_test, gb_predictions))

0.830549756437


## Voting classifier

In [27]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [28]:
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

vc = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

In [29]:
vc.fit(X_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFore...=False, random_state=1,
            verbose=0, warm_start=False)), ('gnb', GaussianNB(priors=None))],
         n_jobs=1, voting='hard', weights=None)

In [30]:
vc_predictions = vc.predict(X_test)

In [31]:
print(accuracy_score(y_test, vc_predictions))

0.835768963118


## More example notebooks on ensemble methods

##### https://github.com/knathanieltucker/bit-of-data-science-and-scikit-learn/blob/master/notebooks/EnsembleMethods.ipynb
##### https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/

### XGBoost


https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/