## Preprocess

In [1]:
import pickle, random, re
# Fixed seed so the shuffle -- and therefore both subsets below -- is
# reproducible across kernel restarts.
SHUFFLE_SEED = 42

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open('saved/lyrics_filtered.pkl', 'rb') as pfile:
    lyrics = pickle.load(pfile)

# A private Random instance keeps the global `random` state untouched.
random.Random(SHUFFLE_SEED).shuffle(lyrics)

dev_lyrics = lyrics[:500]
# NOTE(review): this slice starts at index 0, so test_lyrics contains every
# entry of dev_lyrics -- confirm the overlap is intended.
test_lyrics = lyrics[:5000]

In [2]:
import numpy as np

from gensim.models import Word2Vec

import thulac

import many_stop_words

In [4]:
class Doc():
    '''A tokenized document plus its unit-normalized sum-of-word-vectors embedding.

    Every instance appends itself to the class-level ``Doc.corpus`` list, which
    is the collection searched by ``most_similar``.
    '''

    # Shared resources, created once when the class body is executed.
    model = Word2Vec.load('saved/word2vec_model')
    cut = thulac.thulac(seg_only=True)  # segmentation only, no part-of-speech tagging

    corpus = []  # every Doc created so far; reset by load_corpus()
    stop = many_stop_words.get_stop_words()

    def __init__(self, document, tokenizer = 'char'):
        '''Tokenize ``document``, embed it and register it in ``Doc.corpus``.

        @param document: A Chinese sentence.
        @param tokenizer: the tokenizer(char/word)
        @raise ValueError: if ``tokenizer`` is neither 'char' nor 'word'.
        '''
        if tokenizer not in ('char', 'word'):
            raise ValueError("tokenizer must be 'char' or 'word', got %r" % (tokenizer,))

        # Stop-word filtering is applied per character, before tokenization.
        document = ''.join([char for char in document if char not in Doc.stop])
        if tokenizer == 'char':
            self.bag_of_words = list(document)
        else:
            # Keep the first field of every entry returned by the segmenter.
            # Unlike list(zip(*entries))[0], this also works when the segmenter
            # returns no entries (e.g. the text was entirely stop characters).
            self.bag_of_words = [entry[0] for entry in Doc.cut.cut(document)]

        self.vec = self.to_vec()
        Doc.corpus.append(self)

    def to_vec(self):
        '''Return the L2-normalized sum of the vectors of all tokens.

        A token missing from the model falls back to the sum of the vectors of
        its individual characters; characters missing too contribute nothing.
        An all-zero sum is returned unnormalized.
        '''
        # NOTE(review): 512 is hard-coded and presumably matches the vector
        # size of the loaded model -- confirm.
        vec = np.zeros([512], dtype='float')
        for word in self.bag_of_words:
            if word in Doc.model.wv:
                vec += Doc.model.wv[word]
            else:
                for char in word:
                    if char in Doc.model.wv:
                        vec += Doc.model.wv[char]

        norm = np.linalg.norm(vec)
        return vec / norm if norm > 0 else vec

    def similarity(self, doc2):
        '''Return the cosine similarity between two documents.

        Both vectors are already unit-normalized (or all-zero), so the dot
        product is the cosine similarity; it is 0 if either vector is zero.
        '''
        return np.dot(self.vec, doc2.vec)

    def most_similar(self):
        '''Find the most similar document in the corpus.

        Similarity is cosine similarity. Candidates whose token list equals
        this document's are skipped, and only strictly positive similarities
        can win.

        @return: (similarity, text) of the winner, or (0, '') if there is none.
        '''
        # Track the winning tokens directly instead of creating a placeholder
        # Doc(''), which would append an empty document to Doc.corpus.
        most_simi, winner_words = 0, []

        for candidate in Doc.corpus:
            simi = Doc.similarity(self, candidate)
            if simi > most_simi and candidate.bag_of_words != self.bag_of_words:
                most_simi, winner_words = simi, candidate.bag_of_words

        return most_simi, ''.join(winner_words)

    @staticmethod
    def load_corpus(lyrics, doc_type='line', tokenizer='char'):
        '''Rebuild ``Doc.corpus`` from a list of lyrics (each a list of lines).

        @param lyrics: a list of lyrics, each one a list of lines.
        @param doc_type: 'line' makes one Doc per line, 'piece' makes one Doc
            per lyric (its lines concatenated).
        @param tokenizer: the tokenizer(char/word) passed on to every Doc.
        @raise ValueError: if ``doc_type`` is neither 'line' nor 'piece'.
        '''
        # Validate before discarding the existing corpus.
        if doc_type not in ('line', 'piece'):
            raise ValueError("doc_type must be 'line' or 'piece', got %r" % (doc_type,))

        Doc.corpus = []
        for idx, lyric in enumerate(lyrics):
            if idx % 100 == 0:
                print('Loading %d' % idx)
            if doc_type == 'line':
                for line in lyric:
                    Doc(line, tokenizer)  # the constructor registers the Doc
            else:
                Doc(''.join(lyric), tokenizer)

    @staticmethod
    def test():
        '''Unit test & usage.

        Note that the two demo documents created here are added to
        ``Doc.corpus`` like any other Doc.
        '''
        line1 = '天青色等烟雨'
        doc1 = Doc(line1)
        print('Tokenized and word vec[:10] of %s:' % line1)
        print(doc1.bag_of_words)
        print(doc1.to_vec()[:10])
        print('')
        print('Most similar word to 河流')
        # Query through the keyed vectors; the model-level most_similar is
        # deprecated and absent from recent gensim releases.
        print(Doc.model.wv.most_similar('河流'))
        print('')
        line2 = '而我在等你'
        doc2 = Doc(line2)
        print('Similarity between %s, %s' % (line1, line2))
        print(Doc.similarity(doc1, doc2))
        print('')
        print('Most similar to %s in corpus' % line1)
        print(doc1.most_similar())
        print('')

Model loaded succeed


In [5]:
Doc.load_corpus(dev_lyrics, doc_type='piece', tokenizer='word')
Doc.test()

Loading 0
Loading 100
Loading 200
Loading 300
Loading 400
Tokenized and word vec[:10] of 天青色等烟雨:
['天', '青', '色', '等', '烟', '雨']
[-0.01485717  0.0193655  -0.00730191 -0.03039845 -0.00163739  0.02942444
 -0.07133354 -0.07417747  0.03933919 -0.02917195]

Most similar word to 河流
[('奔腾', 0.675121545791626), ('稻香', 0.6594686508178711), ('原野', 0.6528366804122925), ('穿行', 0.6512413620948792), ('海洋', 0.6460176110267639), ('流淌啊', 0.6350507140159607), ('山间', 0.6339350938796997), ('旷野', 0.6277040839195251), ('江河', 0.6272122263908386), ('高原', 0.626851499080658)]

Similarity between 天青色等烟雨, 而我在等你
0.531282287307

Most similar to 天青色等烟雨 in corpus
(0.6912553735328979, '星星像眼泪坠落大海风处歌声飘传说中语失心女子悲伤系裙摆天天亲吻诺言等待夜夜月徘徊声声哼唱情深奈听都会伤怀晚风常叹气翻山越海找心回天天亲吻诺言等待夜夜月徘徊声声哼唱情深奈听都会伤怀晚风常叹气翻山越海找心回孤单相恋失心女子')



In [6]:
from gensim import corpora
import gensim

In [7]:
# Token lists of every document currently registered in Doc.corpus.
tokenized_docs = [registered.bag_of_words for registered in Doc.corpus]

# Build the vocabulary first, then encode each document as a bag of words with it.
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

In [9]:
# Seed for the LDA random state so repeated runs start from the same
# initialisation. NOTE(review): with several worker processes the result may
# still vary slightly between runs -- confirm if exact reproducibility matters.
LDA_SEED = 42

Lda = gensim.models.ldamulticore.LdaMulticore
# 10 topics, 10 passes over the bag-of-words corpus built above.
ldamodel = Lda(corpus, num_topics=10, id2word=dictionary, passes=10,
               random_state=LDA_SEED)

In [10]:
# Display settings for the topic listing: all 10 topics, 100 words per topic.
topic_display_options = {'num_topics': 10, 'num_words': 100}
ldamodel.print_topics(**topic_display_options)

[(0,
  '0.014*"就" + 0.011*" " + 0.010*"想" + 0.010*"爱" + 0.008*"没" + 0.007*"要" + 0.006*"过" + 0.006*"会" + 0.006*"里" + 0.006*"说" + 0.006*"永远" + 0.006*"怕" + 0.005*"能" + 0.005*"呀" + 0.005*"出" + 0.004*"像" + 0.004*"四川" + 0.004*"中" + 0.004*"心" + 0.004*"做" + 0.004*"都" + 0.004*"梦" + 0.004*"地" + 0.004*"对" + 0.004*"时" + 0.004*"走" + 0.003*"快乐" + 0.003*"故乡" + 0.003*"寂寞" + 0.003*"见" + 0.003*"爱情" + 0.003*"啊" + 0.003*"办法" + 0.003*"才" + 0.003*"宝贝" + 0.003*"天" + 0.003*"开" + 0.003*"间" + 0.003*"停" + 0.003*"家" + 0.002*"难" + 0.002*"次" + 0.002*"幸福" + 0.002*"回" + 0.002*"相" + 0.002*"吹" + 0.002*"多少" + 0.002*"离" + 0.002*"太" + 0.002*"希望" + 0.002*"痛" + 0.002*"山水" + 0.002*"花" + 0.002*"世界" + 0.002*"昨天" + 0.002*"离开" + 0.002*"偏偏" + 0.002*"演唱" + 0.002*"温暖" + 0.002*"种" + 0.002*"愿" + 0.002*"隆咚" + 0.002*"终点" + 0.002*"哈" + 0.002*"伤口" + 0.002*"美" + 0.002*"未" + 0.002*"总" + 0.002*"听" + 0.002*"时空" + 0.002*"爹娘" + 0.002*"真" + 0.001*"等" + 0.001*"守" + 0.001*"回忆" + 0.001*"进" + 0.001*"眼泪" + 0.001*"注定" + 0.001*"问" + 0.001*"应" + 0.001*

In [84]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans

In [107]:
# original version
def find_topic(texts, topic_model, n_topics, vec_model="tf", thr=1e-2, **kwargs):
    """Fit a topic model on ``texts`` and describe every topic by its keywords.

    Parameters
    ----------
    texts : iterable of str
        Documents handed to the vectorizer's ``fit_transform``.
    topic_model : str
        One of "nmf", "svd", "lda" or "kmeans"; any other value raises KeyError.
    n_topics : int
        Passed as the first positional argument to the chosen model.
    vec_model : str
        "tf" selects CountVectorizer; any other value selects TfidfVectorizer.
    thr : float
        A word is a keyword of a topic when the absolute value of its weight,
        after dividing the topic row by the row maximum, is at least ``thr``.
    **kwargs
        Extra keyword arguments forwarded to the model constructor.

    Returns
    -------
    str
        One "Topic <i>:<keywords>" line per topic. Keywords are joined with
        " | "; a keyword whose weight is not positive is prefixed with "^".
    """
    # 1. vectorization
    vectorizer = CountVectorizer() if vec_model == "tf" else TfidfVectorizer()
    text_vec = vectorizer.fit_transform(texts)

    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # its replacement and fall back to the old name on older versions.
    feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    words = np.array(feature_names())

    # 2. topic finding
    topic_models = {"nmf": NMF, "svd": TruncatedSVD, "lda": LatentDirichletAllocation, "kmeans": KMeans}
    topicfinder = topic_models[topic_model](n_topics, **kwargs).fit(text_vec)
    # Compare strings by value: `is not` tests identity and only worked by accident.
    topic_dists = topicfinder.components_ if topic_model != "kmeans" else topicfinder.cluster_centers_
    # Scale each topic row by its maximum; build a new array rather than
    # overwriting the fitted model's own attribute in place.
    topic_dists = topic_dists / topic_dists.max(axis=1).reshape((-1, 1))

    # 3. keywords for topics
    def _topic_keywords(topic_dist):
        keywords_index = np.abs(topic_dist) >= thr
        keywords_prefix = np.where(np.sign(topic_dist) > 0, "", "^")[keywords_index]
        return " | ".join(
            prefix + word for prefix, word in zip(keywords_prefix, words[keywords_index])
        )

    return "\n".join(
        "Topic %i:%s" % (i, _topic_keywords(dist)) for i, dist in enumerate(topic_dists)
    )

In [59]:
# vec model is tf

# print(find_topic(bag,"svd",20, vec_model = "tf"))
# print(find_topic(bag,"nmf",20, vec_model = "tf"))
# print(find_topic(bag,"lda",20, vec_model = "tf"))
# print(find_topic(bag,"kmeans",20, vec_model = "tf"))

# vec model is tfidf

#print(find_topic(bag,"svd",20, vec_model = "tfidf"))
# print(find_topic(bag, "nmf", 20, vec_model = "tfidf"))
#print(find_topic(bag,"lda",20, vec_model = "tfidf"))
# print(find_topic(bag,"kmeans",20, vec_model = "tfidf"))

Topic 0:之间 | 从前 | 回忆 | 彼此 | 忘记 | 慢慢 | 我们 | 拥有 | 故事 | 无法 | 明天 | 曾经 | 最后 | 朋友 | 相爱 | 相遇 | 祝福 | 约定 | 记得
Topic 1:关系 | 其实 | 发现 | 后悔 | 地方 | 感觉 | 改变 | 时候 | 没有 | 烦恼 | 理由 | 虽然 | 身边
Topic 2:一个 | 习惯 | 只是 | 地方 | 孤单 | 拥抱 | 最后 | 朋友 | 梦想 | 爱上 | 理由 | 生活 | 眼神 | 等待 | 角落 | 遇见
Topic 3:一切 | 其实 | 已经 | 心中 | 时候 | 现在 | 知道 | 需要
Topic 4:勇气 | 发现 | 只能 | 可以 | 告诉 | 悲伤 | 改变 | 放纵 | 无法 | 明白 | 最后 | 欺骗 | 相信 | 美丽 | 自己 | 觉得 | 迷失 | 面对
Topic 5:分开 | 分离 | 就是 | 心中 | 忘记 | 怀念 | 放弃 | 朋友 | 永远 | 直到 | 相信 | 等待 | 美丽 | 身边
Topic 6:世界 | 从此 | 只有 | 只要 | 尽头 | 属于 | 改变 | 整个 | 美丽 | 身边 | 这个
Topic 7:友情 | 可以 | 失去 | 已经 | 故事 | 曾经 | 游戏 | 爱情 | 甜蜜 | 相信 | 眼泪 | 美丽
Topic 8:一定 | 一生 | 只要 | 存在 | 安康 | 幸福 | 心中 | 感到 | 感觉 | 明天 | 欢迎 | 每天 | 浪漫 | 滋味 | 甜蜜 | 生活 | 痛苦 | 相信 | 美好
Topic 9:一起 | 何必 | 兄弟 | 可以 | 好好 | 时候 | 最后 | 朋友 | 白头 | 记得 | 走过 | 跟着 | 跳舞 | 这里
Topic 10:一切 | 今天 | 从前 | 其实 | 可以 | 听到 | 告诉 | 喜欢 | 回到 | 回忆 | 失去 | 好好 | 如果 | 就是 | 已经 | 得到 | 忘记 | 愿意 | 拥有 | 时间 | 明天 | 爱上 | 爱人 | 生命 | 相信 | 真心 | 眼泪 | 继续 | 能够 | 还有 | 遇见 | 重头
Topic 11:一生 | 一直 | 下去 | 为何 | 以为 | 伤心 | 分手 | 可以 | 告诉 |