## Ensemble Methods Analysis on Pima Indians Diabetes Dataset

1. **Random Forest Classification**: Comparing Random Forest and Decision Tree classifiers.
2. **Voting Ensemble**: Combining predictions from multiple models (Decision Tree, MLP).
3. **Boosting**: Implementation of AdaBoost with different classifiers (Decision Tree, Random Forest).
4. **Bagging Classifier**: Comparison of Bagging with different base models (Random Forest, KNN, MLP).

Dataset sourced from the UCI Machine Learning Repository.

In [None]:
# Random Forest Classification
import pandas
from sklearn import tree
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Column labels for the header-less CSV; the last column ('class') is the target.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

dataframe = pandas.read_csv("pima-indians-diabetes.csv", names=names)

# Split the raw value matrix into the 8 feature columns and the label column.
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

seed = 7          # shared seed so both models and the folds are reproducible
num_trees = 27    # number of trees in the forest
max_features = 7  # features considered at each split of a forest tree

# 70/30 hold-out split; note that the cross-validation below runs on the full
# X/Y and does not use these four arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=seed)

# random_state only has an effect when shuffle=True; without it recent
# scikit-learn releases reject the combination and older ones ignore the seed.
kfold = model_selection.StratifiedKFold(
    n_splits=10, shuffle=True, random_state=seed)

# Seed the forest as well, so it is compared with the (already seeded)
# decision tree on equal, repeatable terms.
RF = RandomForestClassifier(
    n_estimators=num_trees, max_features=max_features, random_state=seed)
results = model_selection.cross_val_score(RF, X, Y, cv=kfold)
print("Random Forest: ", results.mean())

dtree = tree.DecisionTreeClassifier(criterion='entropy', random_state=seed)
results = model_selection.cross_val_score(dtree, X, Y, cv=kfold)
print("Decision Tree: ", results.mean())


In [None]:
# Voting Ensemble for Classification
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
import sys
import warnings
# Silence library warnings unless the user asked for them explicitly via -W.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Column labels for the header-less CSV; the last column ('class') is the target.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv("pima-indians-diabetes.csv", names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
seed = 7

# random_state only has an effect when shuffle=True; without it recent
# scikit-learn releases reject the combination and older ones ignore the seed.
kfold = model_selection.StratifiedKFold(
    n_splits=10, shuffle=True, random_state=seed)

# Create the sub models. Each estimator is registered under the variable that
# was just assigned (the original appended model2/model3, which were undefined
# at that point and raised NameError).
estimators = []
model1 = DecisionTreeClassifier()
estimators.append(('cart', model1))
model2 = MLPClassifier(hidden_layer_sizes=(20, 20), random_state=5)
estimators.append(('mlp', model2))

# Create the ensemble model and report its mean cross-validated score.
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())

In [None]:
# AdaBoostClassifier
import numpy as np
import pandas
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import tree
seed = 1075
np.random.seed(seed)  # seeds NumPy's global RNG, used by the unseeded estimators

# Column labels for the header-less CSV; the last column ('class') is the target.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv("pima-indians-diabetes.csv", names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# Create classifiers
dt = tree.DecisionTreeClassifier()
rf = RandomForestClassifier()

# For each base classifier, compare its stand-alone 10-fold CV score with the
# score obtained when it is used as the weak learner inside AdaBoost.
clf_array = [dt, rf]
for clf in clf_array:
    pima_scores = cross_val_score(clf, X, Y, cv=10)
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` across scikit-learn releases, while the
    # first positional slot works with both spellings.
    boosting_clf = AdaBoostClassifier(clf, n_estimators=15, random_state=seed)
    boosting_scores = cross_val_score(boosting_clf, X, Y, cv=10)
    print('Media solo ',pima_scores.mean(), 'Desvio solo',pima_scores.std(), 'Media AdaBoost',boosting_scores.mean(), 'Desvio AdaBoost ',boosting_scores.std())

In [None]:
# BaggingClassifier
import numpy as np
import pandas
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier
import sys
import warnings
import os  # required for os.environ below; the original cell never imported it

# Silence library warnings unless the user asked for them explicitly via -W.
# PYTHONWARNINGS is set as well so that Python processes started from this one
# inherit the same setting.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

seed = 1075
np.random.seed(seed)  # seeds NumPy's global RNG, used by the unseeded estimators

# Column labels for the header-less CSV; the last column ('class') is the target.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv("pima-indians-diabetes.csv", names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# Create classifiers
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
mlpc = MLPClassifier(hidden_layer_sizes=(8, 8), random_state=seed)

# For each base classifier, compare its stand-alone 10-fold CV score with the
# score of a bagged ensemble built on 80% of the samples and 5 features per member.
clf_array = [rf, knn, mlpc]
for clf in clf_array:
    pima_scores = cross_val_score(clf, X, Y, cv=10)
    bagging_clf = BaggingClassifier(clf, max_samples=0.8, max_features=5, random_state=seed)
    bagging_scores = cross_val_score(bagging_clf, X, Y, cv=10)
    print('Media Solo ',pima_scores.mean(), 'Desvio Solo',pima_scores.std(), 'Media Bagging',bagging_scores.mean(), 'Desvio ',bagging_scores.std())