In [35]:
'''
集群 (Clustering)
'''
from linear_algebra import squared_distance, vector_mean, distance
import math, random
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# 計算 k-mean的集群
class KMeans:
    """k-means clustering.

    Relies on ``squared_distance`` and ``vector_mean`` being available in the
    enclosing namespace (this notebook imports them from ``linear_algebra``).

    Attributes:
        k: number of clusters to fit.
        means: list of the ``k`` cluster means, or ``None`` before ``train``.
    """

    def __init__(self, k):
        self.k = k
        self.means = None

    def classify(self, input):
        """Return the index of the cluster whose mean is closest to ``input``.

        Closeness is measured with ``squared_distance``; when several means
        are equally close the lowest index wins.
        """
        def distance_to_mean(index):
            return squared_distance(input, self.means[index])

        return min(range(self.k), key=distance_to_mean)

    def train(self, inputs):
        """Fit the ``k`` means to ``inputs`` in place; returns ``None``.

        Starts from ``k`` elements picked by ``random.sample`` (which raises
        ``ValueError`` when ``k`` exceeds ``len(inputs)``), then alternates
        between assigning every point to its closest mean and recomputing the
        means, stopping as soon as an assignment pass changes nothing.
        """
        self.means = random.sample(inputs, self.k)

        previous = None
        current = [self.classify(point) for point in inputs]

        while current != previous:
            previous = current

            # Move each mean to the centre of the points assigned to it.
            for index in range(self.k):
                members = [point
                           for point, label in zip(inputs, previous)
                           if label == index]

                # A cluster that attracted no points keeps its old mean;
                # averaging an empty list would divide by zero.
                if members:
                    self.means[index] = vector_mean(members)

            current = [self.classify(point) for point in inputs]

ModuleNotFoundError: No module named 'linear_algebra'

In [65]:
# Example 1
from linear_algebra import squared_distance, vector_mean, distance
import math, random
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# Sample 2-D points used by every clustering example in this notebook.
inputs = [
    [-14, -5], [13, 13], [20, 23], [-19, -11], [-9, -16],
    [21, 27], [-49, 15], [26, 13], [-46, 5], [-34, -1],
    [11, 15], [-49, 0], [-22, -16], [19, 28], [-12, -8],
    [-13, -19], [-41, 8], [-11, -6], [-25, -9], [-18, -3],
]

# Fit k-means with 3 and then 2 clusters.  The generator is reseeded before
# each fit so the randomly chosen starting means are the same on every run.
for n_clusters, heading in ((3, "3-means:"), (2, "2-means:")):
    random.seed(0) # so you get the same results as me
    clusterer = KMeans(n_clusters)
    clusterer.train(inputs)
    print(heading)
    print(clusterer.means)
    print()

# Total squared error for every possible number of clusters.
# NOTE: squared_clustering_errors is defined in a later cell of this
# notebook; that cell has to be run before this one.
print("errors as a function of k")

num_points = len(inputs)
for k in range(1, num_points + 1):
    print(k, squared_clustering_errors(inputs, k))
print()


# NOTE: bottom_up_cluster, generate_clusters and get_values are defined in
# later cells of this notebook; those cells have to be run before this one.
def _print_cluster_values(tree, num_clusters=3):
    """Split `tree` into `num_clusters` clusters and print each one's values."""
    for cluster in generate_clusters(tree, num_clusters):
        print(get_values(cluster))


print("bottom up hierarchical clustering")

# Default aggregation (min): clusters are compared by their closest members.
base_cluster = bottom_up_cluster(inputs)
print(base_cluster)

print()
print("three clusters, min:")
_print_cluster_values(base_cluster)

# Same procedure, but clusters are compared by their farthest members.
print()
print("three clusters, max:")
base_cluster = bottom_up_cluster(inputs, max)
_print_cluster_values(base_cluster)

ModuleNotFoundError: No module named 'linear_algebra'

In [68]:
'''
k 的選擇
'''
def squared_clustering_errors(inputs, k):
    """Return the total squared error of a k-means fit of `inputs`.

    A new ``KMeans(k)`` is trained on `inputs`; the result is the sum, over
    all inputs, of the squared distance between the input and the mean of
    the cluster it is classified into.
    """
    model = KMeans(k)
    model.train(inputs)

    total = 0
    for point in inputs:
        nearest_mean = model.means[model.classify(point)]
        total += squared_distance(point, nearest_mean)
    return total

# 從 1 往上到 len(inputs)，畫出不同集群數量所對應的圖形

# One candidate cluster count for every value from 1 up to len(inputs).
ks = range(1, len(inputs) + 1)

# Total squared error of a k-means fit for each candidate k.
errors = []
for k in ks:
    errors.append(squared_clustering_errors(inputs, k))

plt.plot(ks, errors)
plt.xticks(ks)
plt.title("Total Error vs. # of Clusters")
plt.xlabel("k")
plt.ylabel("total squared error")
plt.show()

NameError: name 'KMeans' is not defined

In [71]:
'''
對顏色進行集群分析
'''
# Cluster the image's pixels into five groups in red-green-blue space.
# NOTE(review): absolute, machine-specific path; point it at your own image.
path_to_jpg_file = r"C:\Users\Zhong-Xun Yu\Pictures\tsmc_logo.jpg"
import matplotlib.image as mpimg
img = mpimg.imread(path_to_jpg_file)    # NumPy array, used here as a list of rows of pixels

# imread yields floats in [0, 1] only for PNG files; other formats such as
# JPEG come back as integer arrays (0-255 for 8-bit data).  Rescale those to
# floats in [0, 1]: the recoloured image is built from cluster means, which
# are generally not integers, and imshow clips float RGB data outside [0, 1].
# Working in floats also avoids unsigned wrap-around if the distance helpers
# subtract channel values directly (assumption: linear_algebra is not shown).
if img.dtype.kind == "u":
    img = img / float(2 ** (8 * img.dtype.itemsize) - 1)

# img[i][j] is the j-th pixel of row i; each pixel is a sequence of colour
# channel values, now in the range 0 to 1.
top_row = img[0]
top_left_pixel = top_row[0]
# red, green, blue = top_left_pixel

# Flatten the rows into a single list of pixels.
pixels = [pixel for row in img for pixel in row]

clusterer = KMeans(5)
clusterer.train(pixels)

# 按照相同格式，建立一個新的圖片
def recolor(pixel):
    """Return the mean of the cluster that the module-level `clusterer`
    classifies `pixel` into."""
    nearest = clusterer.classify(pixel)    # index of the closest cluster
    palette = clusterer.means
    return palette[nearest]

# Rebuild the picture row by row, replacing every pixel with the mean of
# the cluster it belongs to.
new_img = [list(map(recolor, row)) for row in img]

plt.imshow(new_img)
plt.axis('off')
plt.show()

NameError: name 'KMeans' is not defined

In [73]:
'''
由下而上階層式分群法
'''
# 往前回溯合併最接近的值，予以合併
# 「樹葉集群」(leaf clusters)
# 一元組 (1-tuples)
# A leaf cluster is a 1-tuple holding a single point.  The trailing comma is
# what makes each one a tuple; without it the parentheses would only group.
leaf1, leaf2 = ([10, 20],), ([30, -15],)

# A merged cluster is a 2-tuple: (merge order, [child clusters]).
merged = (1, [leaf1, leaf2])

In [77]:
# 建立輔助函數
from linear_algebra import squared_distance, vector_mean, distance
import math, random
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

def is_leaf(cluster):
    """Return True when `cluster` has exactly one element, i.e. is a leaf."""
    member_count = len(cluster)
    return member_count == 1

def get_children(cluster):
    """Return the list of child clusters of a merged cluster.

    Raises:
        TypeError: if `cluster` is a leaf, which has no children.
    """
    if not is_leaf(cluster):
        return cluster[1]
    raise TypeError("a leaf cluster has no children")
    
# 取出集群所包含的所有值
def get_values(cluster):
    """Return the values contained in `cluster`.

    A leaf is returned unchanged (it is already a 1-tuple holding its
    value).  For a merged cluster the result is a flat list with the values
    of every leaf beneath it, visiting the children in order.
    """
    if is_leaf(cluster):
        return cluster

    values = []
    for child in get_children(cluster):
        values.extend(get_values(child))
    return values
    
# 計算兩個集群之間的距離 (預設取最小距離，也可以改用最大距離)
def cluster_distance(cluster1, cluster2, distance_agg=min):
    """Aggregate the distances between the values of two clusters.

    `distance` is computed for every pair made of one value from `cluster1`
    and one value from `cluster2`; the resulting list is handed to
    `distance_agg` (``min`` by default) and its result is returned.
    """
    pairwise = []
    for point1 in get_values(cluster1):
        for point2 in get_values(cluster2):
            pairwise.append(distance(point1, point2))
    return distance_agg(pairwise)

# 樹葉群並非合併來的
# 將其合併順序設定為無限大
def get_merge_order(cluster):
    """Return the merge order of `cluster`.

    For a merged cluster this is the first element of its 2-tuple.  Leaves
    were never merged, so they get positive infinity.
    """
    return float('inf') if is_leaf(cluster) else cluster[0]
    
def bottom_up_cluster(inputs, distance_agg=min):
    """Build a bottom-up hierarchical clustering of `inputs`.

    Every input starts as its own leaf cluster (a 1-tuple).  The two clusters
    that are closest according to ``cluster_distance(..., distance_agg)`` are
    merged repeatedly until one cluster remains.  A merged cluster is the
    2-tuple ``(merge_order, [cluster_a, cluster_b])``, where ``merge_order``
    is the number of other clusters left at the moment of the merge, so the
    final merge has order 0.

    Args:
        inputs: sequence of points.
        distance_agg: function applied to the list of pairwise distances
            between two clusters (``min`` by default).

    Returns:
        The single cluster containing every input.

    Raises:
        IndexError: if `inputs` is empty.
    """
    clusters = [(point,) for point in inputs]

    while len(clusters) > 1:
        # Pick the closest pair by position.  Candidates are visited in the
        # order (1, 0), (2, 0), (2, 1), ... and the first minimum wins.
        first, second = min(
            ((i, j) for i in range(len(clusters)) for j in range(i)),
            key=lambda pair: cluster_distance(clusters[pair[0]],
                                              clusters[pair[1]],
                                              distance_agg))
        c1, c2 = clusters[first], clusters[second]

        # Remove exactly those two clusters.  Filtering by value equality
        # would also drop any other cluster equal to either of them
        # (duplicate input points), and fails on points whose == is
        # element-wise.
        clusters = [c for index, c in enumerate(clusters)
                    if index != first and index != second]

        # merge_order = number of clusters still in the list
        merged_cluster = (len(clusters), [c1, c2])
        clusters.append(merged_cluster)

    return clusters[0]

# Build the complete merge tree for `inputs` using the default aggregation (min).
base_cluster = bottom_up_cluster(inputs)

ModuleNotFoundError: No module named 'linear_algebra'

In [80]:
# 往前回溯的方式
# 按照所需的任何數量的集群
def generate_clusters(base_cluster, num_clusters):
    """Undo merges of `base_cluster` until there are `num_clusters` clusters.

    Starting from the list ``[base_cluster]``, the cluster with the lowest
    merge order (the most recent merge) is repeatedly replaced by its
    children until the list holds at least `num_clusters` clusters.

    Args:
        base_cluster: root of a tree as produced by ``bottom_up_cluster``.
        num_clusters: number of clusters wanted.

    Returns:
        A list of clusters (leaves and/or merged clusters).

    Raises:
        TypeError: propagated from ``get_children`` when only leaves remain
            and more clusters are still requested.
    """
    clusters = [base_cluster]

    while len(clusters) < num_clusters:
        # The most recent merge has the lowest merge order; leaves count as
        # infinity, so they are only picked once nothing else is left.
        next_cluster = min(clusters, key=get_merge_order)

        # Remove that exact object.  Comparing clusters with != would compare
        # their values, which raises for points whose == is element-wise
        # (e.g. NumPy arrays).
        clusters = [c for c in clusters if c is not next_cluster]

        # Put its children back in its place (un-merge).
        clusters.extend(get_children(next_cluster))

    return clusters

In [86]:
# 3個集群
# Cut the tree into three clusters and collect the points of each one.
# NOTE: the title below says "Min", so `base_cluster` should be the tree
# built with the default aggregation, i.e. bottom_up_cluster(inputs).
three_clusters = [get_values(cluster)
                  for cluster in generate_clusters(base_cluster, 3)]

for i, cluster, marker, color in zip([1, 2, 3],
                                     three_clusters,
                                     ['D', 'o', '*'],
                                     ['r', 'g', 'b']):
    xs, ys = zip(*cluster)   # un-zip the points into x and y coordinates
    # The keyword is `marker`; the former `maker=` is not a valid argument.
    plt.scatter(xs, ys, color=color, marker=marker)

    # Draw the cluster number at the position of the cluster mean.
    x, y = vector_mean(cluster)
    plt.plot(x, y, marker='$' + str(i) + '$', color='black')

plt.title("User Locations -- 3 Bottom-up Clusters, Min")
plt.xlabel("blocks east of city center")
plt.ylabel("blocks north of city center")
plt.show()

NameError: name 'base_cluster' is not defined