## Preprocess

In [1]:
import pickle, random, re

# Number of (shuffled) lyrics kept as the development subset.
DEV_SIZE = 500
# Fixed seed so that Restart & Run All always yields the same dev subset.
SHUFFLE_SEED = 42

# NOTE(security): pickle can execute arbitrary code while loading; only use
# this on files produced by a trusted step of this project.
with open('saved/lyrics_filtered.pkl', 'rb') as lyrics_file:
    lyrics = pickle.load(lyrics_file)

# Shuffle in place with a private, seeded generator: the split becomes
# reproducible without touching the global `random` state.
random.Random(SHUFFLE_SEED).shuffle(lyrics)

dev_lyrics = lyrics[:DEV_SIZE]

In [2]:
import numpy as np

from gensim.models import Word2Vec

import thulac

In [3]:
class Doc():
    '''A sentence represented as a bag of tokens plus one unit-length vector.

    Every instance registers itself in the class-level ``Doc.corpus`` list,
    which ``most_similar`` searches.

    Class attributes:
        model:  Word2Vec model loaded once, when the class body is executed.
        cut:    thulac segmenter used by the ``'word'`` tokenizer.
        corpus: list of every ``Doc`` created so far (shared by all instances).
    '''

    model = Word2Vec.load('saved/word2vec_model')
    cut = thulac.thulac(seg_only=True)  # segmentation only, no POS tagging

    corpus = []

    def __init__(self, document, tokenizer = 'char'):
        '''Tokenize ``document``, embed it and append it to ``Doc.corpus``.

        @param document: A Chinese sentence.
        @param tokenizer: the tokenizer(char/word)
        @raise ValueError: if ``tokenizer`` is neither 'char' nor 'word'.
        '''
        if tokenizer == 'char':
            self.bag_of_words = list(document)
        elif tokenizer == 'word':
            # Each item returned by cut() is indexed as item[0] for the token
            # text.  A comprehension (rather than zip(*...)[0]) also copes
            # with an empty segmentation result.
            self.bag_of_words = [item[0] for item in Doc.cut.cut(document)]
        else:
            raise ValueError(
                "tokenizer must be 'char' or 'word', got %r" % (tokenizer,))

        self.vec = self.to_vec()

        Doc.corpus.append(self)

    def to_vec(self):
        '''Return the L2-normalised sum of the token vectors.

        A token missing from the model vocabulary is broken into characters,
        and each character known to the model contributes its own vector.
        If nothing is known to the model, the all-zero vector is returned
        unnormalised.
        '''
        # Size the accumulator from the loaded model instead of hard-coding it.
        vec = np.zeros(Doc.model.vector_size, dtype='float')
        for word in self.bag_of_words:
            if word in Doc.model.wv:
                vec += Doc.model.wv[word]
            else:
                for char in word:
                    if char in Doc.model.wv:
                        vec += Doc.model.wv[char]

        norm = np.linalg.norm(vec)
        return vec / norm if norm > 0 else vec

    def similarity(doc1, doc2):
        '''Return the cosine similarity between two sentences.

        Both ``vec`` attributes are unit length (or all zeros), so their dot
        product is the cosine of the angle between them.
        '''
        return np.dot(doc1.vec, doc2.vec)

    def most_similar(self):
        '''Find the most similar sentence in the corpus.

        Similarity is the cosine similarity computed by ``Doc.similarity``.
        Corpus entries whose tokens equal this document's tokens are skipped.

        @return: ``(score, text)`` of the best match; ``(0, '')`` when no
                 candidate scores above zero.
        '''
        # No placeholder Doc is built here: constructing one would append a
        # junk empty document to Doc.corpus on every call.
        most_simi, winner = 0, None

        for candidate in Doc.corpus:
            if candidate.bag_of_words == self.bag_of_words:
                continue
            simi = Doc.similarity(self, candidate)
            if simi > most_simi:
                most_simi, winner = simi, candidate

        if winner is None:
            return most_simi, ''
        return most_simi, ''.join(winner.bag_of_words)

    @staticmethod
    def test():
        '''Smoke test and usage example; prints its results.

        Note that the two ``Doc`` objects created here are appended to
        ``Doc.corpus`` like any other document.
        '''
        sentence1 = '天青色等烟雨'
        doc1 = Doc(sentence1)
        print('Tokenized and word vec[:10] of %s:' % sentence1)
        print(doc1.bag_of_words)
        print(doc1.to_vec()[:10])
        print('')
        print('Most similar word to 河流')
        # Query through .wv, like the rest of the class; the model-level
        # most_similar shortcut is deprecated in gensim.
        print(Doc.model.wv.most_similar('河流'))
        print('')
        sentence2 = '而我在等你'
        doc2 = Doc(sentence2)
        print('Similarity between %s, %s' % (sentence1, sentence2))
        print(Doc.similarity(doc1, doc2))
        print('')
        print('Most similar to %s in corpus' % sentence1)
        print(doc1.most_similar())
        print('')

Model loaded succeed


In [4]:
def load_lyrics(lyrics):
    '''Rebuild ``Doc.corpus`` from every sentence of every lyric.

    The corpus is reset to an empty list first; each ``Doc`` created here
    then adds itself to it from its constructor.

    @param lyrics: iterable of lyrics, each one an iterable of sentences.
    '''
    Doc.corpus = []
    # Flatten lyric -> sentence lazily, keeping the original order.
    all_lines = (line for song in lyrics for line in song)
    for line in all_lines:
        Doc(line)

load_lyrics(dev_lyrics)

In [5]:
# Run the class's built-in smoke test; it prints the tokenization, a sentence
# similarity and the nearest sentence in the corpus loaded above.
Doc.test()

Tokenized and word vec[:10] of 天青色等烟雨:
['天', '青', '色', '等', '烟', '雨']
[-0.01485717  0.0193655  -0.00730191 -0.03039845 -0.00163739  0.02942444
 -0.07133354 -0.07417747  0.03933919 -0.02917195]

Most similar word to 河流
[('奔腾', 0.675121545791626), ('稻香', 0.6594686508178711), ('原野', 0.6528366804122925), ('穿行', 0.6512413620948792), ('海洋', 0.6460176110267639), ('流淌啊', 0.6350507140159607), ('山间', 0.6339350938796997), ('旷野', 0.6277040839195251), ('江河', 0.6272122263908386), ('高原', 0.626851499080658)]

Similarity between 天青色等烟雨, 而我在等你
0.36849127679

Most similar to 天青色等烟雨 in corpus
(0.80197426198757493, '天黑黑 欲落雨')

