In [3]:
import json, pickle
from hanziconv import HanziConv

In [10]:
def read_from_crawler(filename='crawled.json', show=False):
    """Read crawled songs from a JSON file.

    The file is read as UTF-8 JSON and is indexed as a list of objects,
    each providing a 'title' and a 'lyric' key.

    Args:
        filename: Path of the JSON file to read.
        show: When True, also print the first song as an example.

    Returns:
        A ``(titles, lyrics)`` tuple of two parallel lists, or ``None``
        when ``filename`` does not exist.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            songs = json.load(f)
    except FileNotFoundError:
        # Only a missing file is reported here; malformed JSON or a missing
        # key propagates instead of being mislabelled as "not exist".
        print('crawled file not exist.')
        return None

    print('Read {0} lyrics from crawled.'.format(len(songs)))
    # Guard on `songs` so an empty crawl does not raise IndexError.
    if show and songs:
        print('Example:')
        print(songs[0]['title'])
        print(songs[0]['lyric'])

    titles = [song['title'] for song in songs]
    lyrics = [song['lyric'] for song in songs]
    return titles, lyrics

In [5]:
def dump_to_pickle(titles, lyrics):
    """Write titles and lyrics to pickle files.

    ``titles`` is written to 'titles.pkl' and ``lyrics`` to 'lyrics.pkl',
    both relative to the current working directory. Existing files are
    overwritten.

    Args:
        titles: Object to pickle into 'titles.pkl'.
        lyrics: Object to pickle into 'lyrics.pkl'.
    """
    # Context managers flush and close each file even if pickling fails,
    # instead of leaving the handles to the garbage collector.
    with open('titles.pkl', 'wb') as f:
        pickle.dump(titles, f)
    with open('lyrics.pkl', 'wb') as f:
        pickle.dump(lyrics, f)

In [6]:
def filter_lyrics(titles, lyrics):
    """Filter out bad lyrics and normalise the remaining ones.

    Each raw lyric string has '\\ufeff', '\\r' and '\\n' removed and is then
    split on '['. The final segment is always discarded, and a song is kept
    only when its cleaned text starts with '['. For kept songs every
    remaining segment gets its leading '[' restored, segments containing
    'lrcgc' are dropped, and the text is converted to Simplified Chinese.

    The input lists are not modified.

    Args:
        titles: List of song titles, parallel to ``lyrics``.
        lyrics: List of raw lyric strings.

    Returns:
        A ``(titles, lyrics)`` tuple of parallel lists, where each lyric is
        a list of sentence strings.
    """
    ori_len = len(titles)

    kept_titles = []
    kept_lyrics = []
    # A single pass over the zipped pairs keeps titles and lyrics aligned and
    # builds new lists, so the caller's `lyrics` is left untouched and the
    # function can safely be re-run on the same raw data.
    for title, raw in zip(titles, lyrics):
        # filter out junk
        cleaned = raw.replace('\ufeff', '').replace('\r', '').replace('\n', '')
        segments = cleaned.split('[')[:-1]

        # filter out invalid lyrics: nothing left, or text before the first '['
        if not segments or segments[0] != '':
            continue

        sentences = ['[' + sentence for sentence in segments[1:]
                     if 'lrcgc' not in sentence]

        # to Simplified Chinese
        kept_titles.append(title)
        kept_lyrics.append([HanziConv.toSimplified(sentence)
                            for sentence in sentences])

    # Avoid ZeroDivisionError when called with no songs at all.
    if ori_len:
        filtered_pct = 100 * (ori_len - len(kept_titles)) / ori_len
    else:
        filtered_pct = 0.0
    print('{0}% lyrics filtered.'.format(filtered_pct))
    return kept_titles, kept_lyrics

In [7]:
def load_from_pickle():
    """Load titles and lyrics from pickle files.

    Reads 'titles.pkl' and 'lyrics.pkl' relative to the current working
    directory.

    Returns:
        A ``(titles, lyrics)`` tuple, or ``None`` (after printing which file
        is missing) when either file does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that come
    # from a trusted source such as dump_to_pickle.
    try:
        with open('titles.pkl', 'rb') as f:
            titles = pickle.load(f)
    except FileNotFoundError:
        print('titles.pkl not found.')
        return None
    try:
        with open('lyrics.pkl', 'rb') as f:
            lyrics = pickle.load(f)
    except FileNotFoundError:
        # Report the file that is actually missing.
        print('lyrics.pkl not found.')
        return None
    return titles, lyrics

In [8]:
def preprocess():
    """Preprocess crawled data and store the result as pickle files.

    Reads the crawled songs (printing an example), filters them, and dumps
    the surviving titles and lyrics. Stops early when the crawled data
    could not be read.
    """
    crawled = read_from_crawler(show=True)
    if crawled is None:
        # read_from_crawler has already reported the problem; unpacking None
        # would only add a confusing TypeError on top of it.
        return
    titles, lyrics = crawled
    titles, lyrics = filter_lyrics(titles, lyrics)
    dump_to_pickle(titles, lyrics)

In [11]:
# Run the full preprocessing pipeline when this code is executed directly
# (as a script, or as a notebook cell where __name__ is '__main__').
if __name__ == '__main__':
    preprocess()

Read 134317 lyrics from crawled.
Example:
周杰伦-最后的战役(Live).lrc
﻿[al:The One 演唱会]
[00:46.89]机枪扫射声中我们寻找遮蔽的战壕
[00:50.33]儿时沙雕的城堡毁坏了重新盖就好
[00:54.34]可是你那件染血布满弹孔的军外套
[00:58.00]却就连祷告手都举不好
[01:02.38]硝烟中想起冰棒汽水的味道
[01:06.44]和那些无所事事一整个夏天的年少
[01:10.32]我放下枪回忆去年一起毕业的学校
[01:14.19]而眼泪一直都忘记要掉
[01:19.83]嘲笑的声音在风中不断被练习
[01:26.68]这树林间充满了敌意
[01:34.37]部队弃守阵地你坚持要我也离去
[01:38.40]我也离去
[01:41.72]我怎么能放弃
[01:46.89]我留着陪你
[01:49.65]强忍着泪滴
[01:51.35]有些事真的来不及回不去
[01:55.46]你脸在抽搐
[01:57.32]就快没力气
[01:59.39]家乡事不准我再提
[02:03.76]我留着陪你
[02:05.44]最后的距离
[02:07.09]是你的侧脸倒在我的怀里
[02:11.32]你慢慢睡去
[02:13.60]我摇不醒你
[02:15.76]泪水在战壕里决了堤
[02:35.68]硝烟中想起冰棒汽水的味道
[02:38.78]和那些无所事事一整个夏天的年少
[02:42.55]我放下枪回忆去年一起毕业的学校
[02:46.40]可眼泪一直都忘记要掉
[02:52.52]嘲笑的声音在风中不断被练习
[02:59.16]这树林间充满了敌意
[03:06.85]部队弃守阵地你坚持要
[03:10.62]我也离去
[03:14.30]我怎么能放弃
[03:20.08]我留着陪你
[03:21.81]强忍着泪滴
[03:23.53]有些事真的来不及回不去
[03:27.72]你脸在抽搐
[03:29.60]就快没力气
[03:31.67]家乡事不准我再提
[03:36.12]我留着陪你
[03:37.69]最后的距离
[03:39.63]是你的侧脸倒在我的怀里
[03:43.74]你慢慢睡去
[03:45.83]我摇不醒你
[03:48.04]泪水在战壕里决了堤

找歌词，上歌词千