In [141]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

In [5]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv('pima-indians-diabetes.data (1).csv', names=names)

In [7]:
array = df.values
array

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [13]:
X = array[:,0:8]
Y = array[:,8]

In [144]:
#Bagging Tree Classifier
num_trees = 100
cart = DecisionTreeClassifier()
kfold = KFold(n_splits=10)

model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees,random_state=0)
results = cross_val_score(model,X,Y, cv=kfold)
results

array([0.62337662, 0.83116883, 0.71428571, 0.61038961, 0.83116883,
       0.83116883, 0.83116883, 0.83116883, 0.71052632, 0.77631579])

In [145]:
results.mean()

0.7590738209159261

In [165]:
#Random Forest Classifier
num_trees = 100
kfold = KFold(n_splits=10, shuffle=True)
max_features = 3
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features,random_state=0)
result = cross_val_score(model, X, Y, cv=kfold)
result

array([0.81818182, 0.68831169, 0.79220779, 0.66233766, 0.79220779,
       0.71428571, 0.79220779, 0.77922078, 0.84210526, 0.77631579])

In [166]:
result.mean()

0.7657382091592618

In [147]:
#AdaBoost Classifier
num_tress = 10
kfold = KFold(n_splits=10)
cart = DecisionTreeClassifier()
model = AdaBoostClassifier(n_estimators=num_trees, random_state=0)
rusults = cross_val_score(model, X,Y, cv=kfold)
results

array([0.62337662, 0.83116883, 0.71428571, 0.61038961, 0.83116883,
       0.83116883, 0.83116883, 0.83116883, 0.71052632, 0.77631579])

In [140]:
rusults.mean()

0.7421565276828435

In [87]:
# Stacking
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
 
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

kfold = KFold(n_splits=10)

estimators = []
model1 = LogisticRegression(max_iter=500)
estimators.append(('logistic',model1))
model2 = SVC()
estimators.append(('svm',model2))
model3 = DecisionTreeClassifier()
estimators.append(('cart', model3))

ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X,Y, cv=kfold)
results

array([0.64935065, 0.79220779, 0.72727273, 0.66233766, 0.79220779,
       0.80519481, 0.84415584, 0.84415584, 0.75      , 0.77631579])

In [89]:
results.mean()

0.7643198906356801

# XGboost

In [91]:

import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [92]:
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [93]:
Y

array([1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0.

In [97]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=0)
model = XGBClassifier(n_estimators=100, max_depth=3)
model.fit(x_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [100]:
y_pred = model.predict(x_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1])

In [102]:
predictions = [round(value) for value in y_pred]

In [116]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy: %.2f%%" % (accuracy*100))

Accuracy: 77.92%


In [170]:
num_trees = 100
kfold = KFold(n_splits=10, shuffle=True)

model = XGBClassifier(n_estimators=num_trees,max_depth=3, random_state=0)
result = cross_val_score(model, X, Y, cv=kfold)
result

array([0.74025974, 0.76623377, 0.72727273, 0.72727273, 0.71428571,
       0.80519481, 0.76623377, 0.64935065, 0.72368421, 0.71052632])

In [173]:
result.mean()

0.7330314422419686

# Light GBM

In [176]:

!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 225.3 kB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5


In [179]:
import lightgbm as lgb
x_tran, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=0)
d_train = lgb.Dataset(x_train, label=y_train)

In [180]:
d_train

<lightgbm.basic.Dataset at 0x22488bdd0d0>

In [181]:
params = {}
params['learning_rate'] = 0.002
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['num_leaves'] = 10
params['max_depth'] = 10

In [182]:
params

{'learning_rate': 0.002,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'binary_logloss',
 'num_leaves': 10,
 'max_depth': 10}

In [183]:
clf = lgb.train(params, d_train, 1000)#1000 times repeatation
clf

[LightGBM] [Info] Number of positive: 194, number of negative: 343
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 614
[LightGBM] [Info] Number of data points in the train set: 537, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.361266 -> initscore=-0.569872
[LightGBM] [Info] Start training from score -0.569872


<lightgbm.basic.Booster at 0x22488bde730>

In [185]:
y_pred = clf.predict(x_test)

In [186]:
predictions = [round(value) for value in y_pred]

In [188]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%"%(accuracy*100))

Accuracy: 78.79%
