### 0.读入数据

## 模型融合

In [2]:
import pandas as pd

In [3]:
url = "pima-indians-diabetes.txt"

In [4]:
df = pd.read_csv(url, header=None)

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [7]:
df.columns = names

In [8]:
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### 1.投票器分类器

In [9]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

In [10]:
data = df.values

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
test     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [12]:
data[:3,:]

array([[  6.   , 148.   ,  72.   ,  35.   ,   0.   ,  33.6  ,   0.627,
         50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   ,  29.   ,   0.   ,  26.6  ,   0.351,
         31.   ,   0.   ],
       [  8.   , 183.   ,  64.   ,   0.   ,   0.   ,  23.3  ,   0.672,
         32.   ,   1.   ]])

In [13]:
X = data[:, :8]
Y = data[:, 8]

In [14]:
seed = 2018
kfold = model_selection.KFold(n_splits=12, random_state=seed)

In [15]:
# 创建投票器子模型
estimators = []
model_1 = LogisticRegression()
model_2 = DecisionTreeClassifier()
model_3 = SVC()
model_4 = RandomForestClassifier()

estimators.append(('logistic', model_1))
estimators.append(('dt', model_2))
estimators.append(('svm', model_3))
estimators.append(('nb', model_4))

In [16]:
estimators

[('logistic',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)),
 ('dt',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, presort=False, random_state=None,
              splitter='best')),
 ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)),
 ('nb',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
           

In [17]:
# 构建投票器融合
ensemble = VotingClassifier(estimators=estimators)

In [18]:
result = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)

In [19]:
result

array([0.65625 , 0.75    , 0.734375, 0.6875  , 0.625   , 0.78125 ,
       0.78125 , 0.78125 , 0.84375 , 0.796875, 0.71875 , 0.75    ])

In [60]:
result.mean()

0.7239234449760765

### Bagging

In [61]:
from sklearn.ensemble import BaggingClassifier

In [66]:
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, \
                          n_estimators=num_trees,\
                         random_state=2017)

In [67]:
result = model_selection.cross_val_score(model, X, Y, cv=kfold)

In [68]:
result

array([0.67532468, 0.81818182, 0.74025974, 0.62337662, 0.80519481,
       0.81818182, 0.83116883, 0.85714286, 0.69736842, 0.80263158])

In [69]:
result.mean()

0.7668831168831168

### 随机森林

In [70]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
model_rf = RandomForestClassifier(n_estimators=num_trees, \
                                  max_features=5)

In [84]:
results = model_selection.cross_val_score(model_rf, X, Y, cv=kfold)

In [85]:
results

array([0.71428571, 0.77922078, 0.75324675, 0.63636364, 0.79220779,
       0.80519481, 0.85714286, 0.85714286, 0.68421053, 0.78947368])

In [86]:
results.mean()

0.7668489405331511

### Adaboost

In [87]:
from sklearn.ensemble import AdaBoostClassifier

In [96]:
model_ada = AdaBoostClassifier(n_estimators=28, random_state=2018)

In [97]:
results = model_selection.cross_val_score(model_ada, X, Y, cv=kfold)

In [98]:
results

array([0.68831169, 0.81818182, 0.7012987 , 0.68831169, 0.75324675,
       0.80519481, 0.79220779, 0.83116883, 0.73684211, 0.80263158])

In [99]:
results.mean()

0.7617395762132604