In [1]:
# https://github.com/Airwavess/NLP-practice/blob/master/2.%20kmeans/K-Means%2B%2B%20-%20natural%20language%20processing.ipynb

In [1]:
import json
import random
import re

import jieba
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Register the custom dictionary file with jieba before any lyrics are segmented.
jieba.load_userdict("dict.txt.big.txt")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\BIGDAT~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.520 seconds.
Prefix dict has been built succesfully.


In [2]:
class KMeans:
    """K-means clustering with k-means++ seeding on NumPy arrays.

    Randomness comes from the standard-library ``random`` module, so call
    ``random.seed(...)`` beforehand to obtain reproducible results.
    """

    def cal_dist(self, p0, p1):
        """Return the Euclidean distance between two points."""
        return np.sqrt(np.sum((p0 - p1) ** 2))

    def _distances_to_centers(self, datapoints, cluster_centers):
        """Return Euclidean distances as an array of shape (n_centers, n_points)."""
        return np.array([
            np.sqrt(np.sum((datapoints - center) ** 2, axis=1))
            for center in cluster_centers
        ])

    def nearest_cluster_center(self, point, cluster_centers):
        """Return the distance from ``point`` to its closest cluster center.

        Despite the name, the value returned is the distance, not the center
        itself. ``float("inf")`` is returned when there are no centers.
        """
        min_dist = float("inf")
        for center in cluster_centers:
            min_dist = min(min_dist, self.cal_dist(point, center))
        return min_dist

    def get_centroids(self, datapoints, k):
        """Pick ``k`` initial centers with the k-means++ seeding procedure.

        The first center is drawn uniformly at random. Every further center
        is drawn with probability proportional to the squared distance to the
        nearest center chosen so far.

        Args:
            datapoints: 2-D array-like, one row per point.
            k: number of centers to select.

        Returns:
            A float array with one row per selected center (always at least
            one row, matching the original behaviour for ``k <= 1``).
        """
        datapoints = np.asarray(datapoints, dtype=float)
        n_points = len(datapoints)

        # Index with randrange instead of random.choice: some CPython 3.11
        # releases evaluate the truth value of the sequence inside
        # random.choice, which raises for multi-element NumPy arrays.
        clusters = np.array([datapoints[random.randrange(n_points)]])

        for _ in range(k - 1):
            nearest = np.min(self._distances_to_centers(datapoints, clusters), axis=0)
            # k-means++ weights candidates by D(x)^2, not by D(x).
            cumulative = np.cumsum(nearest ** 2)
            threshold = random.random() * cumulative[-1]
            # side="right" skips zero-weight points (already chosen centers);
            # the clamp guarantees a pick even when every weight is zero.
            chosen = int(np.searchsorted(cumulative, threshold, side="right"))
            chosen = min(chosen, n_points - 1)
            clusters = np.append(clusters, [datapoints[chosen]], axis=0)

        return clusters

    def kmeans_plus_plus(self, datapoints, k=2, max_iterations=1000):
        """Cluster ``datapoints`` into ``k`` groups with Lloyd's algorithm.

        Args:
            datapoints: 2-D array-like, one row per point.
            k: number of clusters.
            max_iterations: hard upper bound on assignment/update rounds.

        Returns:
            A 1-D float array holding the cluster index of every point.
        """
        # Work in floating point so that centroid means are never truncated
        # when the caller passes integer data.
        datapoints = np.asarray(datapoints, dtype=float)

        cluster = np.zeros(datapoints.shape[0])
        cluster_centers = self.get_centroids(datapoints, k)

        for _ in range(max_iterations):
            prev_cluster = cluster.copy()

            # Assign every point to its closest center (ties go to the
            # lowest center index, as with a strict "<" comparison).
            distances = self._distances_to_centers(datapoints, cluster_centers)
            cluster = np.argmin(distances, axis=0).astype(float)

            # Move every center to the mean of its members. A cluster that
            # lost all of its members keeps its previous center instead of
            # collapsing to the origin.
            for center_idx in range(len(cluster_centers)):
                members = datapoints[cluster == center_idx]
                if len(members) > 0:
                    cluster_centers[center_idx] = members.mean(axis=0)

            # Converged: the assignment did not change in this round.
            if np.array_equal(cluster, prev_cluster):
                break

        return cluster

In [3]:
import csv
import json

# Convert the CSV export into a single JSON array so that json.load() can
# read it back. Writing one object per row followed by a comma does not
# produce a valid JSON document.
# NOTE(review): explicit fieldnames mean a header row in the CSV, if any,
# is treated as data - confirm against the file.
fieldnames = ("No","songName","singer","v1","v2","lyrics","type")
with open('woman_test_genre.csv', 'r', encoding="utf-8", newline='') as csvfile, \
        open('woman_test_genre.json', 'w', encoding="utf-8") as jsonfile:
    reader = csv.DictReader(csvfile, fieldnames)
    json.dump(list(reader), jsonfile)


In [4]:
# Input locations
DATASET_DIR = 'woman_test_genre.json'
STOP_WORDS_DIR = 'stopwords.txt'

# Parse the dataset file as JSON
with open(DATASET_DIR, encoding='utf8') as f:
    dataset = json.loads(f.read())

# Read the stop words, one entry per line
with open(STOP_WORDS_DIR, encoding='utf8') as f:
    stop_words = f.read().splitlines()

In [5]:
# Pull the fields used downstream out of every record.
# A missing lyric (None) is treated as an empty string so the regex below
# does not raise.
song_list = [d['lyrics'] or '' for d in dataset]
singer_list = [d['singer'] for d in dataset]
no_list = [d['No'] for d in dataset]


# Drop every character outside the matched CJK range (Latin letters,
# digits, punctuation), segment with jieba, remove stop words and re-join
# the remaining tokens with spaces.
rule = re.compile(r"[^\u4e00-\u9fa5]")
# A set gives O(1) membership tests; the list would be scanned per token.
stop_word_set = set(stop_words)
song_list = [
    ' '.join(word for word in jieba.cut(rule.sub('', song)) if word not in stop_word_set)
    for song in song_list
]

In [6]:
# Fit TF-IDF on the preprocessed lyrics and keep the result as a dense array.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(song_list).toarray()

In [7]:
k = 4
# K-means++ seeding draws from the `random` module; fix the seed so that
# Restart & Run All reproduces the same clusters.
random.seed(42)
Kmeans_cluster = KMeans()
song_cluster_result = Kmeans_cluster.kmeans_plus_plus(tfidf, k)

# Group the song numbers by the cluster index assigned to each song.
cluster = [[] for _ in range(k)]
for idx, c in enumerate(song_cluster_result):
    cluster[int(c)].append(no_list[idx])

for c, result in enumerate(cluster):
    # map(str, ...) keeps the join working even if the ids are not strings.
    print('Cluster {}: {}'.format(c, ' '.join(map(str, result))))

Cluster 0: 2 3 5 8 18 19 20 26 27 28 31 32 33 36 37 42 47 53 54 58 66 67 71 72 74 80 82 83 86 89 102 103 106 112 115 116 117 118 120 121 125 127 133 135 138 142 148 149 150 151 153 154 168 172 173 174 181 185 195 196 197 200 201 205 207 211 218 219 220 221 222 223 224 225 226 230 241 244 247 251 253 254 259 261 262 266 269 270 272 279 280 281 282 284 288 290 292 293 295 304 310 312 313 316 317 318 319 323 324 327 331 333 339 340 341 342 348 350 352 356 360 361 362 365 367 368 373 374 377 380 381 382 383 384 385 386 387 388 389 391 392 393 394 395 396 399 400 401 403 406 408 409 410 411 412 413 416 417 418 419 420 421 422 425 427 431 432 436 439 440 441 444 446 447 450 451 452 453 455 456 459 460 463 464 465 467 468 469 471 472 473 474 475 476 477 478 479 480 481 482 483 484 486 487 488 489 490 493 494 496 498 500 502 503 506 507 509 510 511 514 516 517 519 520 521 522 523 524 525 526 527 528 529 530 531 533 534 535 536 538 539 540 541 543 544 546 547 548 552 553 554 555 556 557 558 561

In [8]:
# Find the highest-weighted TF-IDF words of every song.
# NOTE(review): `tfidf` is rebound here to a sparse matrix; re-running the
# clustering cell afterwards would pass the sparse matrix to KMeans.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(song_list))
# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on old versions.
if hasattr(vectorizer, "get_feature_names_out"):
    bag_of_words = list(vectorizer.get_feature_names_out())
else:
    bag_of_words = vectorizer.get_feature_names()
weight = tfidf.toarray()

# NOTE(review): results are keyed by singer, so when a singer has several
# songs only the last one processed is kept - confirm this is intended.
news_most_related_words = {}
for i, row in enumerate(weight):
    # A stable descending argsort matches sorted(..., reverse=True): ties
    # keep vocabulary order. Only words with non-zero weight are kept.
    ranked = np.argsort(-row, kind="stable")[:10]
    news_most_related_words[singer_list[i]] = [bag_of_words[j] for j in ranked if row[j] > 0]

In [9]:
news_most_related_words

{'A-Lin': ['魔力', '加持', '戀愛', '慌了', '淡定', '喜歡', '不知所措', '忐忑', '左右', '靠近'],
 '丁噹': ['不能自已', '山窮水盡', '歎息', '風沙', '相依', '遠去', '千里', '想了又想', '千萬分之一', '相擁'],
 '于文文': ['體面', '俐落', '毀掉', '聲嘶力竭', '喝彩', '執念', '轟轟烈烈', '尊嚴', '尊重', '結尾'],
 '王心凌': ['醒著', '清晨', '陌生人', '手震', '逆時針', '苦撐', '失真', '陪襯', '心疼', '失戀'],
 '王菲': ['變幻', '孤傲', '掩藏', '匆匆忙忙', '天平', '總是', '孤單', '猜測', '衡量', '兩端'],
 '田馥甄': ['平凡', '靈魂', '肉身', '相襯', '困在', '筆尖', '齒輪', '弄堂', '選擇', '難得一見'],
 '江蕙': ['鑼聲', '愛人', '目睭', '船墘', '船燈', '落船', '袂出', '要開', '離水', '響袂'],
 '李千那': ['金手指', '電影片', '圍巾', '彼段', '少年', '相片', '拍見', '黑白', '叨去', '微微'],
 '李佳薇': ['力量', '遠走', '模樣', '發狂', '懸掛', '離場', '慌亂', '侵蝕', '放空', '心臟'],
 '李玟': ['接受', '隱隱約約', '狂戀', '真話', '摸索', '看不清楚', '說服', '急著', '明顯', '迷惑'],
 '那英': ['變幻', '年代', '救贖', '路上', '回望', '追回', '迷惘', '朋友', '是否', '改變'],
 '林憶蓮': ['面容', '相擁', '邪術', '莫名', '心痛', '每次', '承認', '夢便', '熱夢', '變鷹'],
 '邵雨薇': ['懶得', '討好', '打嘴巴', '流言蜚語', '胡鬧', '真相', '分神', '紛爭', '時間', '請別'],
 '范瑋琪': ['解不開', '在一起', '走下去', '解脫', '背影', '問題', '表情', '不愛', '無奈