In [None]:
!pip install xlrd

In [None]:
# Part 1
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import openensembles as oe 
import pandas as pd
from sklearn import metrics

# Load the datasets
data = pd.read_excel('WHR.xls')
regs = pd.read_excel('REG.xls')

# Keep only the 2017 rows, drop columns that have no values at all for that
# year, then fill the remaining gaps with each column's median.
recents = data[data.Year == 2017]
recents = recents.dropna(axis=1, how="all")
# numeric_only=True: the frame also carries text columns (e.g. 'Country name');
# pandas >= 2.0 raises TypeError when asked for the median of those, while
# older versions silently skipped them. Restricting to numeric columns gives
# the same fill values on every version.
recents = recents.fillna(recents.median(numeric_only=True))

# Features used as clustering inputs
columns = [
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Positive affect',
    'Negative affect',
    'Confidence in national government',
    'Democratic Quality',
    'Delivery Quality',
]

# Standardise the features: subtract each column's mean, then divide by its
# standard deviation, so that no feature dominates purely because of its scale.
normalized = recents[columns]
normalized = normalized - normalized.mean()
normalized = normalized / normalized.std()

# Hand the standardised features to OpenEnsembles. Previously the raw
# `recents[columns]` frame was passed here, which left `normalized` unused and
# made the normalisation step above a no-op for the clustering.
cluster_data = oe.data(normalized, columns)

In [None]:
# Part 2
# Build the ensemble: register ten k-means runs on the 'parent' data source,
# each stored under its own label. The trailing 10 is the fourth positional
# argument of ensemble.cluster (presumably K, the number of clusters - confirm
# against the OpenEnsembles documentation).
# NOTE(review): the label has an unbalanced '(' and mentions t-SNE although no
# t-SNE transform is applied in this notebook; it is kept unchanged here in
# case other cells look the solutions up by name.
ensemble = oe.cluster(cluster_data)
for i in range(10):
    ensemble.cluster('parent', 'kmeans', f'kmeans({i}-tsne', 10)

# Combine the outputs of all base learners through co-occurrence linkage
preds = ensemble.finish_co_occ_linkage(threshold=0.5)

In [None]:
# Part 3
# Analyse the results

# Columns to summarise per cluster: the clustering features plus 'Life Ladder'
columns = [
    'Life Ladder',
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Positive affect',
    'Negative affect',
    'Confidence in national government',
    'Democratic Quality',
    'Delivery Quality',
]

# Attach the ensemble's cluster label to every 2017 row
recents['Cluster'] = preds.labels['co_occ_linkage']
grouped = recents.groupby('Cluster')

# Per-cluster mean of the selected columns. Select the columns BEFORE
# aggregating: grouped.mean() on the whole frame would also try to average
# text columns such as 'Country name', which raises TypeError on pandas >= 2.0.
means = grouped[columns].mean()

# Draw one bar chart
def create_bar(col, nc, nr, index):
    """Plot the per-cluster mean of `col` as a bar chart in one subplot.

    Reads the module-level `means` frame; bars are ordered by ascending
    'Life Ladder' mean.

    Parameters:
        col: name of the column of `means` to plot; its first 18 characters
            are used as the subplot title.
        nc, nr: forwarded, in this order, as the first and second arguments
            of plt.subplot. Note that matplotlib interprets those as the
            number of rows and the number of columns respectively, despite
            what the parameter names suggest.
        index: position of the subplot inside the grid, passed to plt.subplot.
    """
    ax = plt.subplot(nc, nr, index)
    ordered = means.sort_values('Life Ladder')
    ordered[col].plot(kind='bar', ax=ax)
    ax.set_title(col[:18])
    
# Plot every column in its own subplot; the 3 x 4 grid has one slot for each
# of the twelve entries in `columns`. Subplot positions are 1-based.
plt.figure(figsize=(20, 60))
plt.subplots_adjust(hspace=0.4)
for position, col in enumerate(columns, start=1):
    create_bar(col, 3, 4, position)
plt.show()

# Print each country together with the cluster it was assigned to
for country, cluster in zip(recents['Country name'], recents['Cluster']):
    print(country, cluster)