## Preprocess

In [3]:
import pickle, random, re
# NOTE(security): pickle can execute arbitrary code while loading; only open
# files from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('saved/lyrics_filtered.pkl', 'rb') as pfile:
    lyrics = pickle.load(pfile)

# Shuffle in place so the slice below is a random sample of the entries.
random.shuffle(lyrics)

# The first 500 shuffled entries form a small development subset.
dev_lyrics = lyrics[:500]

In [6]:
import numpy as np

from gensim.models import Word2Vec

import thulac

In [7]:
class Doc():
    '''Vector representation of a Chinese sentence built from word2vec embeddings.

    Every instance appends itself to the shared ``Doc.corpus`` list, which is
    the search space used by ``most_similar``.
    '''

    # Shared resources, loaded once when the class body is executed.
    model = Word2Vec.load('saved/word2vec_model')
    cut = thulac.thulac(seg_only=True)  # segmentation only, no part-of-speech tagging

    # All Doc instances constructed so far.
    corpus = []

    def __init__(self, document, tokenizer = 'char'):
        '''Doc class, a representation of document.

        @param document: A Chinese sentence.
        @param tokenizer: the tokenizer(char/word)
        @raise ValueError: if tokenizer is neither 'char' nor 'word'.
        '''
        if tokenizer == 'char':
            self.bag_of_words = list(document)
        elif tokenizer == 'word':
            # Each item returned by the segmenter is indexed at [0] to get the
            # token (same element the previous zip-based code selected); the
            # comprehension also copes with an empty segmentation result.
            self.bag_of_words = [item[0] for item in Doc.cut.cut(document)]
        else:
            raise ValueError(
                "tokenizer must be 'char' or 'word', got %r" % (tokenizer,))

        self.vec = self.to_vec()

        Doc.corpus.append(self)

    def to_vec(self):
        '''Return the L2-normalised sum of the embeddings of all tokens.

        A token missing from the model vocabulary is broken into characters,
        and each character that is in the vocabulary contributes its vector.
        If nothing contributes, the all-zero vector is returned unnormalised.
        '''
        # NOTE(review): 512 must equal the embedding size of the loaded
        # model -- confirm against the saved word2vec model.
        vec = np.zeros(512, dtype='float')
        for word in self.bag_of_words:
            if word in Doc.model.wv:
                vec += Doc.model.wv[word]
            else:
                for char in word:
                    if char in Doc.model.wv:
                        vec += Doc.model.wv[char]

        norm = np.linalg.norm(vec)
        return vec / norm if norm > 0 else vec

    def similarity(doc1, doc2):
        '''Return the cosine similarity between two sentences.

        Both ``vec`` attributes are unit length (or all zero), so the dot
        product is the cosine of the angle between them.
        '''
        return np.dot(doc1.vec, doc2.vec)

    def most_similar(self):
        '''Find the most similar sentence in the corpus.

        Similarity is the cosine similarity of the sentence vectors.
        Candidates whose tokens equal this document's tokens are skipped.

        @return: (similarity, sentence); (0, '') when no candidate has a
                 similarity greater than zero.
        '''
        # Track only the winning tokens: building a placeholder Doc here would
        # append a junk empty document to Doc.corpus on every call.
        most_simi, winner_words = 0, []

        for candidate in Doc.corpus:
            if candidate.bag_of_words == self.bag_of_words:
                continue
            simi = Doc.similarity(self, candidate)
            if simi > most_simi:
                most_simi, winner_words = simi, candidate.bag_of_words

        return most_simi, ''.join(winner_words)

    @staticmethod
    def test():
        '''Unit test & usage'''
        sentence1 = '天青色等烟雨'
        doc1 = Doc(sentence1)
        print('Tokenized and word vec[:10] of %s:' % sentence1)
        print(doc1.bag_of_words)
        print(doc1.to_vec()[:10])
        print('')
        print('Most similar word to 河流')
        # Query through the keyed vectors (.wv), as the rest of the class does.
        print(Doc.model.wv.most_similar('河流'))
        print('')
        sentence2 = '而我在等你'
        doc2 = Doc(sentence2)
        print('Similarity between %s, %s' % (sentence1, sentence2))
        print(Doc.similarity(doc1, doc2))
        print('')
        print('Most similar to %s in corpus' % sentence1)
        print(doc1.most_similar())
        print('')

Model loaded succeed


In [8]:
def load_lyrics(lyrics):
    """Rebuild ``Doc.corpus`` from every sentence of every lyric.

    @param lyrics: iterable of lyrics, each an iterable of sentences.
    """
    # Start from an empty corpus so repeated calls do not accumulate entries.
    Doc.corpus = []
    all_lines = (line for song in lyrics for line in song)
    for line in all_lines:
        # Constructing a Doc is enough: its __init__ adds it to Doc.corpus.
        Doc(line)

load_lyrics(dev_lyrics)

In [19]:
# Show the sentence vector of a sample lyric line.
# Note: constructing a Doc also appends it to Doc.corpus.
Doc('天青色等烟雨').to_vec()

array([ -1.48571671e-02,   1.93655009e-02,  -7.30190808e-03,
        -3.03984456e-02,  -1.63738999e-03,   2.94244422e-02,
        -7.13335421e-02,  -7.41774667e-02,   3.93391895e-02,
        -2.91719492e-02,   1.86987652e-02,   8.03371543e-03,
        -9.36264681e-03,   8.38347216e-03,  -6.52779517e-02,
        -3.99518762e-02,   7.29505621e-02,   3.46279187e-03,
         6.94428955e-02,  -2.54271927e-02,  -2.16518580e-02,
         5.87576212e-03,  -2.85494016e-02,   5.47539963e-02,
         3.95933170e-02,   3.48851657e-02,  -4.00025828e-02,
         1.60664714e-02,   2.76331733e-02,   7.87805289e-02,
        -8.14829098e-02,  -7.25446614e-02,   2.62316425e-02,
        -2.40862873e-02,  -1.07238769e-02,   1.53676263e-02,
        -2.04014147e-02,  -7.99943282e-02,  -1.49576747e-02,
         2.96647055e-02,   8.25912472e-03,   1.14714220e-02,
         5.53795345e-03,  -3.53379884e-02,  -4.63778060e-03,
        -5.07312944e-02,   8.21440663e-03,   3.44553906e-02,
         5.61354592e-02,

In [9]:
# Run the built-in demo: prints tokens, a vector slice and similarity results.
Doc.test()

Tokenized and word vec[:10] of 天青色等烟雨:
['天', '青', '色', '等', '烟', '雨']
[-0.01485717  0.0193655  -0.00730191 -0.03039845 -0.00163739  0.02942444
 -0.07133354 -0.07417747  0.03933919 -0.02917195]

Most similar word to 河流
[('奔腾', 0.675121545791626), ('稻香', 0.6594686508178711), ('原野', 0.6528366804122925), ('穿行', 0.6512413024902344), ('海洋', 0.6460175514221191), ('流淌啊', 0.6350507140159607), ('山间', 0.6339350938796997), ('旷野', 0.6277040243148804), ('江河', 0.6272121667861938), ('高原', 0.6268514394760132)]

Similarity between 天青色等烟雨, 而我在等你
0.36849127679

Most similar to 天青色等烟雨 in corpus
(0.80305124424941721, '树叶飘落在十月 天色刺眼 我却写了阴雨天')



In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD,NMF,LatentDirichletAllocation
from sklearn.cluster import KMeans

In [26]:
def find_topic(texts, topic_model, n_topics, vec_model="tf", thr=1e-2, **kwargs):
    """Fit a topic model on ``texts`` and describe each topic by its keywords.

    @param texts: iterable of raw text documents.
    @param topic_model: one of "nmf", "svd", "lda" or "kmeans".
    @param n_topics: number of topics/clusters, passed as the first
                     positional argument of the chosen estimator.
    @param vec_model: "tf" for raw term counts; any other value selects TF-IDF.
    @param thr: a word is a keyword of a topic when the absolute value of its
                scaled weight is at least this threshold.
    @param kwargs: extra keyword arguments forwarded to the estimator.
    @return: one line per topic, "Topic <i>:<kw> | <kw> ...". Keywords whose
             weight is not positive are prefixed with "^".
    @raise KeyError: if ``topic_model`` is not a supported name.
    """
    # 1. vectorization
    vectorizer = CountVectorizer() if vec_model == "tf" else TfidfVectorizer()
    text_vec = vectorizer.fit_transform(texts)
    # get_feature_names() was removed from newer scikit-learn releases; use
    # its replacement when available and fall back for older versions.
    feature_names = (getattr(vectorizer, "get_feature_names_out", None)
                     or vectorizer.get_feature_names)
    words = np.array(feature_names())

    # 2. topic finding
    topic_models = {"nmf": NMF, "svd": TruncatedSVD,
                    "lda": LatentDirichletAllocation, "kmeans": KMeans}
    topicfinder = topic_models[topic_model](n_topics, **kwargs).fit(text_vec)
    # Compare by value: identity checks on string literals are unreliable.
    if topic_model == "kmeans":
        topic_dists = topicfinder.cluster_centers_
    else:
        topic_dists = topicfinder.components_
    # Scale each topic row by its largest entry. Done out of place so the
    # fitted estimator's own array is left untouched.
    topic_dists = topic_dists / topic_dists.max(axis=1).reshape((-1, 1))

    # 3. keywords for topics
    def _topic_keywords(topic_dist):
        """Join the words whose |weight| >= thr, marking non-positive ones with '^'."""
        keywords_index = np.abs(topic_dist) >= thr
        keywords_prefix = np.where(np.sign(topic_dist) > 0, "", "^")[keywords_index]
        return " | ".join(
            prefix + word
            for prefix, word in zip(keywords_prefix, words[keywords_index]))

    return "\n".join(
        "Topic %i:%s" % (i, _topic_keywords(dist))
        for i, dist in enumerate(topic_dists))
        
    

In [31]:
# Scratch check: a bag-of-words vectorizer with default settings.
vectorizer = CountVectorizer()

In [33]:
# Fit the vectorizer on a single sentence and get its document-term matrix.
text_vec = vectorizer.fit_transform(["we love burgers"])

In [35]:
# Inspect the first (and only) row of the document-term matrix.
text_vec[0]

<1x3 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [29]:
def generate_clearcut_topics():
    """Build a synthetic corpus of two sentences, each repeated 1000 times.

    @return: 1-D numpy array of 2000 strings; the first sentence fills the
             first half and the second sentence fills the second half.
    """
    sentences = ["we love bergers", "we hate sandwiches"]
    copies = [1000, 1000]
    return np.repeat(sentences, copies)

In [30]:
# Sanity check: extract 4 SVD topics from the synthetic corpus using raw term counts.
print(find_topic(generate_clearcut_topics(), "svd", 4, vec_model="tf"))

Topic 0:bergers | hate | love | sandwiches | we
Topic 1:bergers | ^hate | love | ^sandwiches
Topic 2:bergers | hate | love | sandwiches | ^we
Topic 3:bergers | ^hate | ^love | sandwiches
