In [None]:
!pip install xlrd

In [None]:
# 第 1 部分
# 載入函式庫
import matplotlib.pyplot as plt
import numpy as np
import openensembles as oe 
import pandas as pd
from sklearn import metrics

# Load the datasets (both workbooks are read relative to the working directory)
data = pd.read_excel('WHR.xls')
regs = pd.read_excel('REG.xls')

# Keep only the 2017 rows, drop columns that are entirely empty for that year,
# then fill the remaining gaps with each column's median.
# numeric_only=True restricts the median to numeric columns: without it,
# pandas >= 2.0 raises TypeError as soon as the frame contains a text column,
# whereas older releases silently skipped such columns. Non-numeric columns
# have no entry in the median Series and are therefore left unfilled.
recents = data[data.Year == 2017]
recents = recents.dropna(axis=1, how="all")
recents = recents.fillna(recents.median(numeric_only=True))

# Features used for clustering
columns = ['Log GDP per capita',
           'Social support',
           'Healthy life expectancy at birth',
           'Freedom to make life choices',
           'Generosity',
           'Perceptions of corruption',
           'Positive affect',
           'Negative affect',
           'Confidence in national government',
           'Democratic Quality',
           'Delivery Quality']

# Feature normalization: centre every feature on its mean and scale it by
# its standard deviation
normalized = recents[columns]
normalized = normalized - normalized.mean()
normalized = normalized / normalized.std()

# NOTE(review): the openensembles container below is built from the raw
# (un-normalized) features; `normalized` is not consumed anywhere in this
# cell. Confirm whether clustering on the raw features is intended.
cluster_data = oe.data(recents[columns], columns)

In [None]:
# Part 2
# Build the ensembles
np.random.seed(123456)  # seed NumPy's global RNG before any clustering runs
results = {'K':[], 'size':[], 'silhouette': []}
# Candidate numbers of clusters
Ks = [2, 4, 6, 8, 10, 12, 14]
# Candidate numbers of base learners per ensemble
sizes = [5, 10, 20, 50]
for K in Ks:
    for ensemble_size in sizes:
        # Start a fresh ensemble for every (K, size) combination
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            # Train one K-means base learner under a unique solution name
            name = f'kmeans_{ensemble_size}_{i}'
            ensemble.cluster('parent', 'kmeans', name, K)

        # Combine the base learners' outputs with co-occurrence linkage
        preds = ensemble.finish_co_occ_linkage(threshold = 0.5)
        print(f'K: {K}, size {ensemble_size}:', end=' ')
        # Silhouette coefficient of the combined labels, evaluated on the
        # same feature columns that were handed to the ensemble
        silhouette = metrics.silhouette_score(recents[columns],
                                              preds.labels['co_occ_linkage'])
        print('%.2f' % silhouette)
        results['K'].append(K)
        results['size'].append(ensemble_size)
        results['silhouette'].append(silhouette)

# Arrange the scores as a K x size table. Every (K, size) pair was recorded
# exactly once, so the 'first' reducer simply places that single score in its
# cell. The previous identity lambda was not a reduction and only worked
# because pandas unwraps length-1 group results.
results_df = pd.DataFrame(results)
cross = pd.crosstab(results_df.K,
                    results_df['size'],
                    results_df['silhouette'],
                    aggfunc='first')
print(cross)