In [15]:
# Bagged Decision Trees for Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

filename = '/content/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]

seed = 7
# kfold = KFold(n_splits=10,random_state=seed,shuffle=True) # evaluation method
# shuffle=True indicates that the data should be shuffled before splitting it into folds.
# Shuffling ensures that the data is randomly distributed across the folds.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

kfold = KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100

model = BaggingClassifier(estimator=cart, n_estimators=num_trees, random_state=seed) #Classifier, n_estimators: default=10
results = cross_val_score(model, X, Y, cv=kfold)

In [2]:
results

array([0.66233766, 0.84415584, 0.75324675, 0.64935065, 0.83116883,
       0.83116883, 0.83116883, 0.84415584, 0.68421053, 0.78947368])

In [3]:
print(results.mean())

0.7720437457279563


In [4]:
# Random Forest Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

array=dataframe.values
X = array[:,0:8]
Y = array[:,8]

num_trees = 100
max_features = 3 #Total features m=8, K=3 i.e. randomly select 3 features
kfold = KFold(n_splits=10)

model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features) # try random_state=2, n_estimators=10 by default, default=gini
results = cross_val_score(model, X, Y, cv=kfold)

In [5]:
results

array([0.67532468, 0.85714286, 0.68831169, 0.64935065, 0.76623377,
       0.77922078, 0.85714286, 0.83116883, 0.69736842, 0.81578947])

In [6]:
print(results.mean())

0.7617053998632946


In [16]:
# AdaBoost Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
filename = '/content/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
num_trees = 10 # try 20,30
seed=7
kfold = KFold(n_splits=10, random_state=seed,shuffle=True)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed) #n_estimators=50 by default
results = cross_val_score(model, X, Y, cv=kfold)

In [8]:
results

array([0.80519481, 0.74025974, 0.67532468, 0.81818182, 0.80519481,
       0.79220779, 0.71428571, 0.75324675, 0.78947368, 0.77631579])

In [9]:
print(results.mean())

0.7669685577580314


In [10]:
# Stacking Ensemble for Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC # Support Vector Classifier
from sklearn.ensemble import VotingClassifier

filename = '/content/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10)

# create the sub models
estimators = [] # create empty list for different algorithms

model1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model1))

model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))

model3 = SVC()
estimators.append(('svc', model3))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv=kfold)

In [11]:
estimators

[('logistic', LogisticRegression(max_iter=500)),
 ('cart', DecisionTreeClassifier()),
 ('svc', SVC())]

In [12]:
results

array([0.63636364, 0.80519481, 0.72727273, 0.64935065, 0.79220779,
       0.80519481, 0.84415584, 0.84415584, 0.76315789, 0.77631579])

In [13]:
print(results.mean())

0.7643369788106631


# **Additional Code: "random_state = any int"**

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Generate data for the example (values from 0 to 10)
data = np.arange(10)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data,data, test_size=0.3, random_state=30)

# Display the training and testing sets
print("Training set X:", X_train)
print("Testing set X:", X_test)
print("Training labels y:", y_train)
print("Testing labels y:", y_test)

'''
First time Output:
Training set X: [0 7 2 9 4 3 6]
Testing set X: [8 1 5]
Training labels y: [0 7 2 9 4 3 6]
Testing labels y: [8 1 5]

Second time Output
Same as above

TRY TO CHANGE random_state = ANY OTHER INTEGER
'''

Training set X: [3 6 4 7 8 9 5]
Testing set X: [0 1 2]
Training labels y: [3 6 4 7 8 9 5]
Testing labels y: [0 1 2]


'\nFirst time Output:\nTraining set X: [0 7 2 9 4 3 6]\nTesting set X: [8 1 5]\nTraining labels y: [0 7 2 9 4 3 6]\nTesting labels y: [8 1 5]\n\nSecond time Output\nSame as above\n\nTRY TO CHANGE random_state = ANY OTHER INTEGER\n'

# **If random_state is not passed as an argument**

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Generate data for the example (values from 0 to 99)
data = np.arange(10)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, data, test_size=0.3)

# Display the training and testing sets
print("Training set X:", X_train)
print("Testing set X:", X_test)
print("Training labels y:", y_train)
print("Testing labels y:", y_test)

'''
First time Output
Training set X: [4 3 7 9 1 6 8]
Testing set X: [0 2 5]
Training labels y: [4 3 7 9 1 6 8]
Testing labels y: [0 2 5]

Second time Output
Training set X: [6 8 2 3 7 1 9]
Testing set X: [5 0 4]
Training labels y: [6 8 2 3 7 1 9]
Testing labels y: [5 0 4]
'''

Training set X: [8 1 3 0 7 4 2]
Testing set X: [5 6 9]
Training labels y: [8 1 3 0 7 4 2]
Testing labels y: [5 6 9]


'\nFirst time Output\nTraining set X: [4 3 7 9 1 6 8]\nTesting set X: [0 2 5]\nTraining labels y: [4 3 7 9 1 6 8]\nTesting labels y: [0 2 5]\n\nSecond time Output\nTraining set X: [6 8 2 3 7 1 9]\nTesting set X: [5 0 4]\nTraining labels y: [6 8 2 3 7 1 9]\nTesting labels y: [5 0 4]\n'