In [1]:
import jieba
from zhon import hanzi as zh_hanzi
import srt

from cedict_utils.cedict import CedictParser

from dragonmapper import transcriptions
from dragonmapper import hanzi

from IPython.display import HTML, display, Markdown, clear_output

import ipywidgets as widgets

from pydub import AudioSegment

import cv2

import json
import urllib.request

import matplotlib.pyplot as plt

import numpy as np
import io

from PIL import Image

from pathlib import Path

import hashlib

from sys import platform

import datetime
import math

from collections import defaultdict

import requests
from bs4 import BeautifulSoup

from tqdm import tqdm

# from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

import spacy
nlp = spacy.load("en_core_web_sm")
import re

import csv

2022-12-12 12:44:01.182371: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
PUNCTUATION = set(zh_hanzi.punctuation)

parser = CedictParser()
# Can use a more up to date version of the dictionary
parser.read_file("resources/cedict_ts.u8")
entries = parser.parse()

CEDICT = dict()
for e in entries:
    CEDICT[e.simplified] = e
    
DECK = "deck:Shelley"

STOPWORDS = set(stopwords.words("english"))

In [3]:
# HSK = dict()
# words = set()
# resource_path = "/home/daniel/programming/chinese/chinese-tools/Mandarin corner/resources/hsk3.0-"
# levels = [str(i) for i in list(range(1,7))] + ["7-9"]

# for level in levels:
#     file_path = resource_path + level + '.txt'
#     with open(file_path) as f:
#         text = f.read()
#         text_words = text.split("\n")
#         text_words = set(text_words)
#         words.update(text_words)
#         HSK[level] = words.copy()

# HSK["1"].remove("")

# generate the HSK csv file
# HSK_csv = []
# seen_words = set()
# resource_path = "/home/daniel/programming/chinese/chinese-tools/Mandarin corner/resources/hsk3.0-"
# levels = [str(i) for i in list(range(1,7))] + ["7-9"]

# for level in levels:
#     file_path = resource_path + level + '.txt'
#     with open(file_path) as f:
#         text = f.read()
#         text_words = text.split("\n")
#         for word in text_words:
#             if word in seen_words:
#                 continue
#             else:
#                 HSK_csv.append([word, level])
#                 seen_words.add(word)

with open("./resources/hsk.csv") as f:
    reader = csv.reader(f)
    HSK = { row[0] : row[1] for row in reader }

In [4]:
def hsk_level_new(word: str) -> str:

#     for i in range(1,7):
#         if word in HSK[str(i)]:
#             return str(i)

#     if word in HSK["7-9"]:
#         return "7-9"
    
#     return "0"
    
    if word in HSK:
        return HSK[word]
    
    return "0"

In [5]:
class Sentence():
    
    def __init__(self, sentence, seg_list, english):
        self.sentence = sentence 
        self.seg_list = seg_list
        self.english = english
    
    def __str__(self):
        return " ".join(self.seg_list) + "\n" + self.english
    
    def score(self, level=3) -> int:
        level_dict = dict()
        score = 0
        for w in self.seg_list:
            w_level = hsk_level_new(w)
            level_dict[w] = w_level
            if w_level in ['4', '5']:
                score += 1
            elif w_level in ['6', '7-9']:
                score += 2
            elif w_level == "0":
                score += 3
        
        # should also penalize longer sentences 
#         LENGTH_THRESHOLD = 30
#         length_penalty = max(0, len(self.sentence) - LENGTH_THRESHOLD)
#         score += length_penalty
        
        return score

In [6]:
# Anki connect boilerplate

def request(action, **params):
    return {'action': action, 'params': params, 'version': 6}

def invoke(action, **params):
    requestJson = json.dumps(request(action, **params)).encode('utf-8')
#     print(type(requestJson))
#     print(requestJson)
#     print(urllib.request.Request('http://localhost:8765', requestJson))
    response = json.load(urllib.request.urlopen(urllib.request.Request('http://localhost:8765', requestJson)))
    if len(response) != 2:
        raise Exception('response has an unexpected number of fields')
    if 'error' not in response:
        raise Exception('response is missing required error field')
    if 'result' not in response:
        raise Exception('response is missing required result field')
    if response['error'] is not None:
        raise Exception(response['error'])
    return response['result']

# invoke('createDeck', deck='test1')
# result = invoke('deckNames')
# print('got list of decks: {}'.format(result))

In [7]:
def get_defn(word: str) -> str:
    if word in CEDICT:
        return ", ".join(CEDICT[word].meanings) 
    else:
#         if not in CEDICT, try and get the definition from Purple Culture 
        URL = "https://www.purpleculture.net/dictionary-details/?word=" + word
        response = requests.get(URL)

        if response.status_code != 200:
            print("ERROR:", card_word)

        soup = BeautifulSoup(response.text, 'html.parser')
        Sentences = list()
        defn = soup.findAll('div', attrs={'class':'en py-2'})
        if defn:
            return ", ".join([i.text for i in defn])

        return ""

In [8]:
def reduced_word_defn(word: str) -> set:
    if word in CEDICT:
        defn = CEDICT[word].meanings
    else:
        return set()
    
    defn_sent = " ".join(defn)
    doc = nlp(defn_sent)
    nlp_sentence = list(doc.sents)[0]
    nlp_list_sent = []
    for word in nlp_sentence:
        nlp_list_sent.append(word.lemma_)

    # filtered_defn = " ".join(nlp_list_sent)
    # print("join nlp_list_sent:", filtered_defn)
    filtered_defn = [i.translate(str.maketrans('', '', string.punctuation)) for i in nlp_list_sent]
#     print("translate:", filtered_defn)
    # filtered_defn = filtered_defn.translate(str.maketrans('', '', string.punctuation))
    # print("translate:", filtered_defn)
    # remove any chinese characters e.g. classifiers from the definition, "cl" string means classifier, remove empty string
    filtered_defn = [i.lower() for i in filtered_defn if i not in STOPWORDS and not re.search(u'[\u4e00-\u9fff]', i) and i != "cl" and i != ""]
#     print("remove stopwords:", filtered_defn)
    #     filtered_defn = [j.split(' ') for j in filtered_defn]
    #     print(filtered_defn)
    #     filtered_defn = [item for sublist in filtered_defn for item in sublist]
    #     print(filtered_defn)
    # filtered_defn = filtered_defn.split(" ")
    # print(filtered_defn)
    filtered_defn = set(filtered_defn)
#     print(filtered_defn)
    return filtered_defn

In [9]:
char_sent = "如果愿意继续工作，他们将不得不承认，薪水有可能上升也有可能下降。"
seg_list = list(jieba.cut(char_sent, cut_all=False))
for word in seg_list:
    if word not in PUNCTUATION:
        print(word, reduced_word_defn(word))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.382 seconds.
Prefix dict has been built successfully.


如果 {'case', 'event'}
愿意 {'want', 'ready', 'willing', 'sth', 'wish'}
继续 {'continue', 'proceed', 'go'}
工作 {'task', 'operate', 'work', 'job', 'machine'}
他们 set()
将 {'invite', 'desire', 'request'}
不得不 {'avoid', 'choice', 'help', 'option'}
承认 {'etc', 'admit', 'recognize', 'concede', 'diplomatic', 'recognition', 'acknowledge', 'artistic'}
薪水 {'salary', 'wage'}
有 {'exist'}
可能 {'maybe', 'possible', 'might', 'probable', 'perhaps', 'possibility', 'probability', 'happen'}
上升 {'go', 'rise', 'ascend'}
也 {'imply', 'also', 'final', 'affirmation', 'chinese', 'classical', 'particle'}
有 {'exist'}
可能 {'maybe', 'possible', 'might', 'probable', 'perhaps', 'possibility', 'probability', 'happen'}
下降 {'go', 'fall', 'decrease', 'drop', 'decline'}


In [10]:
def get_example_sentences(card_word, num=1) -> list[Sentence]:
    
    URL = "https://www.purpleculture.net/sample-sentences/?word=" + card_word
    response = requests.get(URL)

    if response.status_code != 200:
        print("ERROR:", card_word)
        
    soup = BeautifulSoup(response.text, 'html.parser')
    
    Sentences = list()

    sample_sentences = soup.findAll('span', attrs={'class':'sc samplesen'})
#     print(card_word, sample_sentences)
    en_sentences = soup.findAll('div', attrs={'class':'sample_en'})
#     print(en_sentences)
    
    if not sample_sentences:
        URL = 'https://www.purpleculture.net/dictionary-details/?word=' + card_word
        response = requests.get(URL)
        if response.status_code != 200:
            print("ERROR:", card_word)
        soup = BeautifulSoup(response.text, 'html.parser')    
        sample_sentences = soup.findAll('div', attrs = {'class' : 'samplesen'})
        en_sentences = soup.findAll('div', attrs={'class':'sample_en'})
#         print(en_sentences)

    MAX_NUM = len(sample_sentences)
    seg_map = dict()

    for i, sent in enumerate(sample_sentences):
        chars = sent.findAll('span', attrs={'class':'cnchar'})
        char_sent = "".join(char.text for char in chars)
#         print(char_sent)

        seg_list = list(jieba.cut(char_sent, cut_all=False))

        seg_map[char_sent] = seg_list
        Sentences.append(Sentence(char_sent, seg_list, en_sentences[i].text))
        
    filtered_defn = reduced_word_defn(card_word) 
#     print("filtered defn:", filtered_defn)
    filtered_sents = []
    for i in Sentences:
#         print("testing:", i.english)
        multiplier = 0
        for j in filtered_defn:
#             print("checking", j)
            if j in i.english:
#                 filtered_sents.append(i)
                multiplier -= 1
#                 continue
        if multiplier < 0:
            filtered_sents.append((i, multiplier))
#         print(i, multiplier)
    
#     filtered_sents.sort(key=Sentence.score)
    LENGTH_THRESHOLD = 30
    filtered_sents.sort(key=lambda x: x[0].score() * x[1] + max(0, len(x[0].sentence) - LENGTH_THRESHOLD))
    
    num = min(num, MAX_NUM)
    return [i[0] for i in filtered_sents[:num]]

In [11]:
for i in get_example_sentences("加息", 5):
#     print(i[1]*i[0].score(), i[0].sentence, i[0].english, len(i[0].sentence))
    print(i)

如果 美联储 为 维护 两家 公司 而 担忧 ， 那 为了 对抗 通货膨胀 而 进行 的 加息 他们 能 消受 吗 ？
And if the Fed has to worry about safeguarding Fannie and Freddie, can it afford to raise interest rates to combat inflation? 
当 被 问及 ， 美联储 何时 会 开始 加息 时 ， 伯南克 贫嘴 道 ： “ 将来 ” 。
"Asked when the Fed will start raising interest rates, Bernanke quipped "in the future. " 
货币政策 这一 受到 右翼 欢迎 的 、 以 市场 为 基础 的 政策 ， 通过 加息 这一 惩戒 手段 ， 以 价格 来 限制 支出 。
Monetary policy - a market-based policy favoured by the right - restricts spending by price through the discipline of higher interest rates. 
从 这方面 来说 ， 中国 近期 采取 加息 举措 是 个 好 迹象 ， 显示 出 政府 通过 为 信贷 定价 （ 而 不是 采用 容易 钻空子 的 贷款 限额 ） ， 在 资本 配置 方面 作出 了 新 的 努力 。
In this respect, recent efforts to nudge interest rates higher are a good sign, signaling a renewed effort to allocate capital by putting a price on it instead of an easily circumvented quota. 
他 在 5 月 参观 陕西 的 猪场 ， 8 月 参观 北京 的 农产品 市场 。 中国人民银行 今年 四次 加息 。
The People' s Bank of China, the central bank of China, increased interest rates four times this y

In [221]:
known_words_note_ids = invoke('findNotes', query=DECK)
known_words_notes = invoke("notesInfo", notes=known_words_note_ids)

for card in tqdm(known_words_notes):
    note = dict()
    note["id"] = card['noteId']
    note["fields"] = dict()
    word = card['fields']['Character']['value'].strip()
    sentence = card['fields']['Sentence']['value'].strip()
    pinyin = hanzi.to_pinyin(word)
#     english = ", ".join(CEDICT[word].meanings) if word in CEDICT else ""
    # get definition from Purple Culture if not in CEDICT; maybe break this all out into a function
    english = get_defn(word)
    seg_list = list(jieba.cut(sentence, cut_all=False))
#     sent_pinyin = " ".join([hanzi.to_pinyin(i) for i in seg_list if i not in PUNCTUATION])
    sent_pinyin = " ".join([hanzi.to_pinyin(i) for i in seg_list])
    hsk_level = hsk_level_new(word)
    processed = (card["fields"]["Processed"]['value'] == "1")

    
    note['fields']['Character'] = word 
    note['fields']['Sentence'] = sentence
    
    if not processed:
        if card['fields']['Pinyin']['value'] != pinyin:
            note['fields']['Pinyin'] = pinyin
        if not card['fields']['English']['value'] and english:
            note['fields']['English'] = english
        if not sentence:
            print(word)
            example_sentences = get_example_sentences(word)
            ex_sents = "\n".join([i.sentence for i in example_sentences])
            print(ex_sents)
            eng_sents = "\n".join([i.english for i in example_sentences])
            note['fields']['Sentence'] = ex_sents
            note['fields']['Sentence English'] = eng_sents
    #         sents_pinyin = " ".join([hanzi.to_pinyin(i.seg_list) for i in  if i not in PUNCTUATION])
        if sentence:
            pass
        if not card['fields']['Sentence Pinyin']['value']:
            sent_pinyin = " ".join([hanzi.to_pinyin(i) for i in seg_list])
            sent_pinyin.replace("< br >", "\n")
            note['fields']['Sentence Pinyin'] = sent_pinyin

        if not card['fields']['Sentence English']['value']:
            example_sentences = get_example_sentences(word)
            if example_sentences and card['fields']['Sentence']['value'] == example_sentences[0].sentence:
    #             print(card['fields']['Sentence']['value'], example_sentences[0].english)
                note['fields']['Sentence English'] = example_sentences[0].english
            elif not example_sentences:
                print("no example sentences found", word)
            else:
                print("example sentence not from Purple Culture:", card['fields']['Sentence']['value'])
        if not card['fields']['HSK Level']['value']:
            note['fields']['HSK Level'] = hsk_level
        
        note['fields']['Processed'] = "1"

        result = invoke('updateNoteFields', note=note)
    #     print(result)

100%|█████████████████████████████████████████████████████████████████████████████| 97/97 [00:16<00:00,  5.71it/s]


In [206]:
print(get_example_sentences("推出")[0])

东京 地面 ， 全日空 121 ， 停机位 65 ， 请求 推出 开车 。
Tokyo Ground, ANA 121, Spot 65, request push-back. 


In [23]:
test_card = invoke("notesInfo", notes=[1667355348498])
test_card[0]['fields']["Processed"]

{'value': '', 'order': 10}

In [17]:
if not test_card[0]['fields']['Sentence Pinyin']['value']:
    print("empty")

empty


In [8]:
known_words_notes[0]['fields']

{'Character': {'value': '放下', 'order': 0},
 'Pinyin': {'value': 'fàngxia', 'order': 1},
 'English': {'value': 'to lay down, to put down, to let go of, to relinquish, to set aside, to lower (the blinds etc)',
  'order': 2},
 'Character Audio': {'value': '', 'order': 3},
 'Sentence': {'value': '放下所有的事情，解决现在的问题', 'order': 4},
 'Sentence Audio': {'value': '', 'order': 5},
 'Sentence Pinyin': {'value': 'fàngxia suǒyǒu de shìqing jiějué xiànzài de wèntí',
  'order': 6},
 'Sentence English': {'value': '', 'order': 7},
 'Notes': {'value': '', 'order': 8}}

In [73]:
from ttskit import sdk_api

wav = sdk_api.tts_sdk('如果他决定变成成年女性，他也可以接受女性激素，这将会提高他的声调，使他胸部丰满并发育出其他女性的身体特征。', audio='24')

INFO:sdk_api:Synthesizing: 如果他决定变成成年女性，
INFO:sdk_api:Synthesizing: 他也可以接受女性激素，
INFO:sdk_api:Synthesizing: 这将会提高他的声调，
INFO:sdk_api:Synthesizing: 使他胸部丰满并发育出其他女性的身体特征。
INFO:sdk_api:processes: 1, cuda: True, device: None, time consumed: 0.51s, text length: 53, audio duration: 11.47s, RTF: 0.0447, FPS: 103.2687


In [74]:
# from pydub.playback import play
import io

recording = AudioSegment.from_file(io.BytesIO(wav), format="wav")
# recording.export('new.mp3', format='mp3') # for export 
# play(recording) # for play


In [75]:
recording

In [80]:
import zhtts

text = "2020年，这是一个开源的端到端中文语音合成系统"
tts1 = zhtts.TTS() # use fastspeech2 by default
tts = zhtts.TTS(text2mel_name="TACOTRON")

tts.text2wav(text, "demo.wav")
tts1.text2wav(text, "demo1.wav")
# >>> Save wav to demo.wav

# tts.frontend(text)
# >>> ('二零二零年，这是一个开源的端到端中文语音合成系统', 'sil ^ er4 #0 l ing2 #0 ^ er4 #0 l ing2 #0 n ian2 #0 #3 zh e4 #0 sh iii4 #0 ^ i2 #0 g e4 #0 k ai1 #0 ^ van2 #0 d e5 #0 d uan1 #0 d ao4 #0 d uan1 #0 zh ong1 #0 ^ uen2 #0 ^ v3 #0 ^ in1 #0 h e2 #0 ch eng2 #0 x i4 #0 t ong3 sil')

# tts.synthesis(text)
# >>> array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

index: 0, text: 2020年，
frontend info: ('二零二零年，', 'sil ^ er4 #0 l ing2 #0 ^ er4 #0 l ing2 #0 n ian2 sil')
index: 1, text: 这是一个开源的端到端中文语音合成系统
frontend info: ('这是一个开源的端到端中文语音合成系统', 'sil zh e4 #0 sh iii4 #0 ^ i2 #0 g e4 #0 k ai1 #0 ^ van2 #0 d e5 #0 d uan1 #0 d ao4 #0 d uan1 #0 zh ong1 #0 ^ uen2 #0 ^ v3 #0 ^ in1 #0 h e2 #0 ch eng2 #0 x i4 #0 t ong3 sil')
Save wav to demo.wav
index: 0, text: 2020年，
frontend info: ('二零二零年，', 'sil ^ er4 #0 l ing2 #0 ^ er4 #0 l ing2 #0 n ian2 sil')
index: 1, text: 这是一个开源的端到端中文语音合成系统
frontend info: ('这是一个开源的端到端中文语音合成系统', 'sil zh e4 #0 sh iii4 #0 ^ i2 #0 g e4 #0 k ai1 #0 ^ van2 #0 d e5 #0 d uan1 #0 d ao4 #0 d uan1 #0 zh ong1 #0 ^ uen2 #0 ^ v3 #0 ^ in1 #0 h e2 #0 ch eng2 #0 x i4 #0 t ong3 sil')
Save wav to demo1.wav


In [14]:
test_sentence = "有时候，问题还在于货币政策：欧洲中央银行应该改变近期的加息举措，降低利率。"

In [15]:
eng_sent = "Hello my name is Daniel, and I like to learn languages."

In [23]:
filtered_sent = remove_stopwords(eng_sent)
filtered_sent = filtered_sent.translate(str.maketrans('', '', string.punctuation))
filtered_sent = filtered_sent.split(' ')
filtered_sent = [i.lower() for i in filtered_sent]

In [24]:
print(filtered_sent)

['hello', 'daniel', 'i', 'like', 'learn', 'languages']


In [48]:
defn = CEDICT["加息"].meanings
filtered_defn = [i.translate(str.maketrans('', '', string.punctuation)) for i in defn]
filtered_defn = [remove_stopwords(i) for i in filtered_defn]
filtered_defn = [j.split(' ') for j in filtered_defn]
filtered_defn = [item for sublist in filtered_defn for item in sublist]
filtered_defn = set(filtered_defn)

ex_sents = get_example_sentences("加息", 10)

# " ".join([remove_stopwords(i) for i in CEDICT['衰退'].meanings])

In [54]:
print(filtered_defn)

{'rates', 'raise'}


In [49]:
for i in ex_sents:
    print(i)

现在 你 已经 研究 了 你 地图 ， 无论是 在 您 的 加息 的 过渡 地区 求贤若渴 。
Now that you've studied your map, be on the lookout for those transition areas on your hike. 
但 加息 将会 吸引 更多 热钱 ， 这 在 一定 程度 上 会 弄巧成拙 。
But higher rates could attract more hot money, which would be partly self-defeating. 
李 的 态度 是 首次 境外 官方 的 “ 加息 大合唱 ” 的 回应 。
Li's attitude is the first time outside official "rate hike Cantata" response. 
他 还 说 ， 在 这个 秋天 之前 加息 ， 这个 最 适度 的 价格 收益 给 了 中央银行 喘息 的 时间 。
He said the modest price gains give the central bank time to breathe before hiking interest rates this fall. 
有时候 ， 问题 还 在于 货币政策 ： 欧洲中央银行 应该 改变 近期 的 加息 举措 ， 降低利率 。
In some cases the fault lies with monetary policy: the European Central Bank should reverse its recent rate rises. 
如果 美联储 为 维护 两家 公司 而 担忧 ， 那 为了 对抗 通货膨胀 而 进行 的 加息 他们 能 消受 吗 ？
And if the Fed has to worry about safeguarding Fannie and Freddie, can it afford to raise interest rates to combat inflation? 
有 20 多人 等待 着 自己 的 梯级 下降 湲 ， 这 可能 需要 一些 时间 ， 但 也 有 机会 加息 ， 并 探讨 该 地区 。
With 20 or so peop

In [50]:
filtered_sents = []
for i in ex_sents:
    for j in filtered_defn:
        if j in i.english:
            filtered_sents.append(i)
            continue

In [51]:
len(filtered_sents)

5

In [52]:
filtered_sents.sort(key=Sentence.score)

In [53]:
for i in filtered_sents:
    print(i)

但 加息 将会 吸引 更多 热钱 ， 这 在 一定 程度 上 会 弄巧成拙 。
But higher rates could attract more hot money, which would be partly self-defeating. 
他 还 说 ， 在 这个 秋天 之前 加息 ， 这个 最 适度 的 价格 收益 给 了 中央银行 喘息 的 时间 。
He said the modest price gains give the central bank time to breathe before hiking interest rates this fall. 
如果 美联储 为 维护 两家 公司 而 担忧 ， 那 为了 对抗 通货膨胀 而 进行 的 加息 他们 能 消受 吗 ？
And if the Fed has to worry about safeguarding Fannie and Freddie, can it afford to raise interest rates to combat inflation? 
如果 美联储 为 维护 两家 公司 而 担忧 ， 那 为了 对抗 通货膨胀 而 进行 的 加息 他们 能 消受 吗 ？
And if the Fed has to worry about safeguarding Fannie and Freddie, can it afford to raise interest rates to combat inflation? 
他 在 5 月 参观 陕西 的 猪场 ， 8 月 参观 北京 的 农产品 市场 。 中国人民银行 今年 四次 加息 。
The People' s Bank of China, the central bank of China, increased interest rates four times this year alone. 


In [171]:
# # defn = CEDICT['加息'].meanings
# defn = CEDICT['加息'].meanings
# defn_sent = " ".join(defn)
# print(defn_sent)

# doc = nlp(defn_sent)
# nlp_sentence = list(doc.sents)[0]
# nlp_list_sent = []
# for word in nlp_sentence:
#     nlp_list_sent.append(word.lemma_)

# # filtered_defn = " ".join(nlp_list_sent)
# # print("join nlp_list_sent:", filtered_defn)
# filtered_defn = [i.translate(str.maketrans('', '', string.punctuation)) for i in nlp_list_sent]
# print("translate:", filtered_defn)
# # filtered_defn = filtered_defn.translate(str.maketrans('', '', string.punctuation))
# # print("translate:", filtered_defn)
# filtered_defn = [i for i in filtered_defn if i not in STOPWORDS]
# print("remove stopwords:", filtered_defn)
# #     filtered_defn = [j.split(' ') for j in filtered_defn]
# #     print(filtered_defn)
# #     filtered_defn = [item for sublist in filtered_defn for item in sublist]
# #     print(filtered_defn)
# # filtered_defn = filtered_defn.split(" ")
# # print(filtered_defn)
# filtered_defn = set(filtered_defn)
# print(filtered_defn)

In [66]:
with open("/home/daniel/programming/chinese/harry_potter/hp1.txt") as f:
    text = f.readlines()

In [68]:
text = [i.strip("\n") for i in text if i != "\n"]

In [69]:
text

['谨以此书献给',
 '杰西卡，她喜欢这故事',
 '安妮，她也喜欢这故事',
 '戴，她是故事的第一位听众',
 '主要人物表',
 '哈利·波特 本书主人公，霍格沃茨魔法学校一年级学生',
 '罗恩·韦斯莱 哈利在魔法学校的好朋友',
 '赫敏·格兰杰 哈利在魔法学校的好朋友',
 '纳威·隆巴顿 哈利在魔法学校的同学',
 '德拉科·马尔福 哈利在魔法学校的同学',
 '佩妮·德思礼 哈利的姨妈',
 '弗农·德思礼 哈利的姨父',
 '达力·德思礼 哈利的表哥，德思礼夫妇的儿子',
 '鲁伯·海格 霍格沃茨魔法学校钥匙保管员，猎场看守',
 '阿不思·邓布利多 霍格沃茨魔法学校校长',
 '米勒娃·麦格 霍格沃茨魔法学校副校长',
 '西弗勒斯·斯内普 霍格沃茨魔法学校魔药课教师',
 '奇洛 霍格沃茨魔法学校黑魔法防御术课教师',
 '伏地魔 杀死哈利父母的黑魔头，被人称为“神秘人”',
 '目次',
 '* * *',
 '第 1 章',
 '大难不死的男孩',
 '第 2 章',
 '悄悄消失的玻璃',
 '第 3 章',
 '猫头鹰传书',
 '第 4 章',
 '钥匙保管员',
 '第 5 章',
 '对角巷',
 '第 6 章',
 '从9¾站台开始的旅程',
 '第 7 章',
 '分院帽',
 '第 8 章',
 '魔药课老师',
 '第 9 章',
 '午夜决斗',
 '第 10 章',
 '万圣节前夕',
 '第 11 章',
 '魁地奇比赛',
 '第 12 章',
 '厄里斯魔镜',
 '第 13 章',
 '尼可·勒梅',
 '第 14 章',
 '挪威脊背龙——诺伯',
 '第 15 章',
 '禁林',
 '第 16 章',
 '穿越活板门',
 '第 17 章',
 '双面人',
 '第1章 大难不死的男孩',
 '家住女贞路4号的德思礼夫妇总是得意地说他们是非常规矩的人家，拜托，拜托了。他们从来跟神秘古怪的事不沾边，因为他们根本不相信那些邪门歪道。',
 '弗农·德思礼先生在一家名叫格朗宁的公司做主管，公司生产钻机。他高大魁梧，胖得几乎连脖子都没有了，却蓄着一脸大胡子。德思礼太太是一个瘦削的金发女人。她的脖子几乎比正常人长一倍。这样每当她花许多时间隔着篱墙引颈而望、窥探左邻右舍时，她的长脖子可就派上了大用场。德思礼夫妇有一

In [70]:
av = sum([len(i) for i in text])/len(text)

In [71]:
av

48.98993288590604

In [138]:
HSK

{'1': {'',
  '一',
  '一下儿',
  '一些',
  '一会儿',
  '一半',
  '一块儿',
  '一样',
  '一点儿',
  '一起',
  '一边',
  '七',
  '三',
  '上',
  '上午',
  '上学',
  '上次',
  '上班',
  '上网',
  '上课',
  '上车',
  '上边',
  '下',
  '下午',
  '下次',
  '下班',
  '下课',
  '下车',
  '下边',
  '下雨',
  '不',
  '不大',
  '不客气',
  '不对',
  '不用',
  '东',
  '东西',
  '东边',
  '两',
  '个',
  '中',
  '中午',
  '中国',
  '中学',
  '中学生',
  '中文',
  '中间',
  '九',
  '也',
  '书',
  '书包',
  '书店',
  '买',
  '了',
  '事',
  '二',
  '五',
  '人',
  '什么',
  '今天',
  '今年',
  '介绍',
  '从',
  '他',
  '他们',
  '们',
  '休息',
  '会',
  '住',
  '你',
  '你们',
  '做',
  '儿子',
  '元',
  '先',
  '先生',
  '八',
  '六',
  '关',
  '关上',
  '再',
  '再见',
  '写',
  '冷',
  '准备',
  '几',
  '出',
  '出去',
  '出来',
  '分',
  '别',
  '别人',
  '别的',
  '到',
  '前',
  '前天',
  '前边',
  '动',
  '动作',
  '包',
  '包子',
  '北',
  '北京',
  '北边',
  '医生',
  '医院',
  '十',
  '午饭',
  '半',
  '半天',
  '半年',
  '南',
  '南边',
  '去',
  '去年',
  '口',
  '叫',
  '右',
  '右边',
  '号',
  '吃',
  '吃饭',
  '同学',
  '名字',
  '后',
  '后天',
  '后边',
  '吗',
  '吧',
  '听',
  '听写',