In [1]:
import jieba
from zhon import hanzi as zh_hanzi
import srt

from cedict_utils.cedict import CedictParser

from dragonmapper import transcriptions
from dragonmapper import hanzi

from IPython.display import HTML, display, Markdown, clear_output
# display(HTML('''<style>
#     .widget-label { max-width: 5ex !important; }
# </style>'''))

import ipywidgets as widgets

from pydub import AudioSegment

import cv2

import json
import urllib.request

import matplotlib.pyplot as plt

import numpy as np
import io

from PIL import Image

from pathlib import Path

import hashlib

from sys import platform

In [2]:
class Params:
    """Notebook-wide settings.

    ``episode`` is the base file name (without extension) from which the
    ``.mp4`` and ``.srt`` file names are built. Values used for earlier
    episodes: "beauty", "stereotypes", "livestreaming".
    """

    episode = "lying_flat"

In [3]:
# Keep the punctuation characters in a set so membership tests are hash
# lookups rather than scans of the punctuation string.
PUNCTUATION = {*zh_hanzi.punctuation}

In [4]:
# Index every parsed dictionary entry by its simplified form. When several
# entries share a simplified form, the one parsed last is the one kept.
parser = CedictParser()
entries = parser.parse()

CEDICT = {entry.simplified: entry for entry in entries}

In [5]:
# Spot check: the raw dictionary line stored for 我们.
CEDICT['我们'].raw_line

'我們 我们 [wo3 men5] /we/us/ourselves/our/'

In [6]:
# Spot check: convert the entry's numbered pinyin to accented pinyin and
# remove the spaces from the result.
transcriptions.numbered_to_accented(CEDICT['我们'].pinyin).replace(" ", "")

'wǒmen'

In [7]:
", ".join(CEDICT['我们'].meanings)

'we, us, ourselves, our'

In [8]:
# The episode's .mp4 is opened twice: once for its audio (sliced into clips
# later) and once for its video frames (used for screenshots later).
video_path = Params.episode + ".mp4"
audio_file = AudioSegment.from_file(video_path)
vidcap = cv2.VideoCapture(video_path)
# cv2.VideoCapture does not raise when it cannot open the file, so fail here
# with a clear message instead of later with an obscure frame-decoding error.
if not vidcap.isOpened():
    raise IOError("Could not open video file: " + video_path)

In [9]:
# Anki connect boilerplate

def request(action, **params):
    """Build the request body for one AnkiConnect action.

    Keyword arguments are passed through unchanged as the action's ``params``;
    the ``version`` field is fixed at 6.
    """
    payload = dict(action=action, params=params, version=6)
    return payload

def invoke(action, **params):
    """Send one AnkiConnect action to http://localhost:8765 and return its result.

    Args:
        action: Name of the AnkiConnect action (e.g. ``'findNotes'``).
        **params: Parameters forwarded to the action.

    Returns:
        The ``result`` field of the decoded JSON response.

    Raises:
        Exception: If the response does not consist of exactly the ``error``
            and ``result`` fields, or if its ``error`` field is not ``None``
            (the error value is used as the exception message).
    """
    requestJson = json.dumps(request(action, **params)).encode('utf-8')
    http_request = urllib.request.Request('http://localhost:8765', requestJson)
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being left to the garbage collector.
    with urllib.request.urlopen(http_request) as http_response:
        response = json.load(http_response)
    if len(response) != 2:
        raise Exception('response has an unexpected number of fields')
    if 'error' not in response:
        raise Exception('response is missing required error field')
    if 'result' not in response:
        raise Exception('response is missing required result field')
    if response['error'] is not None:
        raise Exception(response['error'])
    return response['result']

# invoke('createDeck', deck='test1')
# result = invoke('deckNames')
# print('got list of decks: {}'.format(result))

In [10]:
# Parse the episode's subtitle file into a list of subtitle entries.
# `subs` is reset first so a failed load never leaves a previous episode's
# subtitles in place. The encoding is explicit so decoding the Chinese text
# does not depend on the platform's default locale encoding.
subs = []
with open(Params.episode + ".srt", encoding="utf-8") as f:
    subs = list(srt.parse(f))

In [23]:
# Seed the known-word set from the word list file: each line, stripped of
# surrounding whitespace, is treated as one word. The encoding is explicit so
# the Chinese text is decoded the same way on every platform.
with open('hsk1-5a.txt', encoding='utf-8') as g:
    KNOWN_WORDS = {line.strip() for line in g}

len(KNOWN_WORDS)

2148

In [24]:
# Collect the note ids from every deck whose vocabulary is treated as known.
known_words_note_ids = []
for deck_query in (
    '"deck:Bank::subs2srs::Mandarin Corner"',
    '"deck:Bank::subs2srs::Beauty Standards"',
    '"deck:Bank::subs2srs::Chinese Stereotypes"',
    '"deck:Bank::subs2srs::Livestreaming"',
    '"deck:Bank::subs2srs::Lying Flat"',
):
    known_words_note_ids += invoke('findNotes', query=deck_query)

In [25]:
# Fetch the collected notes and add each note's VocabHanzi value to the
# known-word set.
known_words_notes = invoke("notesInfo", notes=known_words_note_ids)
KNOWN_WORDS.update(
    note["fields"]["VocabHanzi"]["value"] for note in known_words_notes
)

In [26]:
# Size of the known-word set after adding the vocabulary from the Anki notes.
len(KNOWN_WORDS)

3045

In [27]:
# Maps a not-yet-known word to the WordEntry recorded for it.
NEW_WORDS = {}

In [28]:
class WordEntry:
    """A word together with the subtitle it was found in.

    Attributes:
        word: The word text.
        subtitle: The subtitle object the word came from.
        index: The number supplied by the caller for that subtitle.
    """

    def __init__(self, word, subtitle, index):
        self.word, self.subtitle, self.index = word, subtitle, index

In [29]:
# Print every occurrence of a word that is neither known nor punctuation,
# followed by its dictionary meanings when it has an entry. `no_def` counts
# the occurrences that have no dictionary entry.
no_def = 0
for sub in subs:
    text = sub.content.replace(" ", "").strip()
    for token in jieba.cut(text, cut_all=False):
        if token in KNOWN_WORDS or token in PUNCTUATION:
            continue
        entry = CEDICT.get(token)
        if entry is None:
            no_def += 1
            print(token)
        else:
            print(token, ", ".join(entry.meanings))

来到 to come, to arrive
MandarinCorner
Eileen
Sharon
段时间
家里 home
一些 some, a few, a little, (following an adjective) slightly ...er
乐器行
教教
们 plural marker for pronouns, and nouns referring to individuals
之前 before, prior to, ago, previously, beforehand
太大
这份
两样 two kinds, difference
而且 (not only ...) but also, moreover, in addition, furthermore
之前 before, prior to, ago, previously, beforehand
这份
不是 no, is not, not
但 but, yet, however, only, merely, still
那么 like that, in that way, or so, so, so very much, about, in that case
那种 that, that kind of, that sort of, that type of
这种 this, this kind of, this sort of, this type of
躺平
一种 one kind of, one type of
躺平
这个 this, this one
词 word, statement, speech, lyrics, CL:組|组[zu3],個|个[ge4], a form of lyric poetry, flourishing in the Song dynasty 宋朝|宋朝[Song4 chao2], CL:首[shou3]
真的
很常
!
不论是
现实生活
中 to hit (the mark), to be hit by, to suffer, to win (a prize, a lottery)
常常 frequently, often
听到 to hear
还有 furthermore, in addition, still, also
一期
躺平
那么 li

变成 to change into, to turn into, to become
就是 (emphasizes that sth is precisely or exactly as stated), precisely, exactly, even, if, just like, in the same way as
人们 people
常说
一个
不是 no, is not, not
更好
而是 rather
地去
各种 every kind of, all kinds of, various kinds
更好
这时候
地 earth, ground, field, place, land, CL:片[pian4]
自我调侃
他们 they
最大
就是 (emphasizes that sth is precisely or exactly as stated), precisely, exactly, even, if, just like, in the same way as
就是 (emphasizes that sth is precisely or exactly as stated), precisely, exactly, even, if, just like, in the same way as
这个 this, this one
累 rope, to bind together, to twist around
就要 will, shall, to be going to
但 but, yet, however, only, merely, still
不是 no, is not, not
真的
聊一聊
刷 to select
一刷
道高一尺
魔高一丈
说法 way of speaking, wording, formulation, one's version (of events), statement, theory, hypothesis, interpretation
不是 no, is not, not
他们 they
这种 this, this kind of, this sort of, this type of
他们 they
之前 before, prior to, ago, previously, beforeh

In [30]:
# Number of unknown-word occurrences that had no dictionary entry.
no_def

358

In [31]:
def add_to_anki(label):
    """Create an Anki note, with a screenshot and an audio clip, for a word.

    Used as a button click handler: ``label.description`` is the word, which
    must be a key of ``NEW_WORDS``. The screenshot is the video frame at the
    subtitle's start time; the audio clip spans the subtitle with 200 ms of
    padding on each side. Media files that already exist are reused.

    Args:
        label: The clicked widget; only its ``description`` attribute is read.

    Returns:
        The value returned by the AnkiConnect ``addNote`` action.

    Raises:
        KeyError: If the word is not in ``NEW_WORDS``.
        RuntimeError: If no media directory is configured for the current
            platform, or the video frame cannot be read.
        Exception: Propagated from ``invoke`` when AnkiConnect reports an error.
    """
    word = label.description
    entry = NEW_WORDS[word]
    subtitle = entry.subtitle
    index = entry.index
    seg_list = list(jieba.cut(subtitle.content, cut_all=False))
    # Convert the subtitle's start/end offsets to milliseconds, the unit used
    # for both the video seek and the audio slice below.
    start_timestamp = subtitle.start.total_seconds() * 1000
    end_timestamp = subtitle.end.total_seconds() * 1000

    UniqueIdentifier = word + "|" + subtitle.content
    SentHanzi = subtitle.content.replace(word, "<b>" + word + "</b>")
    SentPinyin = " ".join([hanzi.to_pinyin(i) for i in seg_list if i not in PUNCTUATION])
    VocabPinyin = hanzi.to_pinyin(word)
    VocabDef = ", ".join(CEDICT[word].meanings) if word in CEDICT else ""

    # Media files are named after the SHA-1 of word + sentence, so the same
    # word/sentence pair always maps to the same files.
    s = word + subtitle.content
    file_stub = hashlib.sha1(s.encode('utf-8')).hexdigest()

    # NOTE(review): these are machine-specific absolute paths; consider moving
    # them to a configurable setting.
    if platform.startswith("darwin"):
        media_dir = Path("/Users/Daniel/programming/chinese/Mandarin corner/media/")
    elif platform.startswith("linux"):
        media_dir = Path("/home/daniel/programming/chinese/mandarin_corner/media/")
    else:
        # Previously this fell through and failed later with an unbound local.
        raise RuntimeError("No media directory configured for platform: " + platform)

    img_path = media_dir.joinpath(file_stub + ".jpg")
    # The "." was missing here, which produced files named "<hash>mp3".
    audio_path = media_dir.joinpath(file_stub + ".mp3")

    # Each media file is checked on its own, so a missing audio clip is still
    # produced when the screenshot already exists.
    if not img_path.exists():
        vidcap.set(cv2.CAP_PROP_POS_MSEC, start_timestamp)
        success, image = vidcap.read()
        if not success:
            raise RuntimeError(
                "Could not read a video frame at " + str(start_timestamp) + " ms"
            )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        Image.fromarray(image).save(img_path, quality=50)

    if not audio_path.exists():
        # Clamp the padded start at 0: negative slice positions count from the
        # end, which would give the wrong clip for subtitles in the first 200 ms.
        clip_start = max(0, start_timestamp - 200)
        audio_clip = audio_file[clip_start : end_timestamp + 200]
        # export() hands back the output file handle; close it right away.
        audio_clip.export(audio_path).close()

    note = {
        "deckName": "Bank::subs2srs::Lying Flat",
        "modelName": "Chinese sentences",
        "fields": {
            "UniqueIdentifier": UniqueIdentifier,
            "SentHanzi": SentHanzi,
            "SentPinyin": SentPinyin,
            "VocabHanzi": word,
            "VocabPinyin": VocabPinyin,
            "VocabDef": VocabDef,
            "Misc": "Lying Flat - " + str(index)
        },
        "picture" : [{
            "path" : str(img_path),
            "filename": file_stub + ".jpg",
            "fields" : [
                "Image"
            ]
        }],
        "audio" : [{
            "path" : str(audio_path),
            "filename": file_stub + ".mp3",
            "fields" : [
                "SentAudio"
            ]
        }],
    }

    return invoke('addNote', note=note)

In [32]:
NEW_WORDS = dict()
v_lines = []


def _word_row(words, sub, sub_number, leading_text, **button_kwargs):
    """Build one HBox: a leading label, then one label or button per word.

    A word becomes a button wired to ``add_to_anki`` (and is recorded in
    ``NEW_WORDS``) only the first time it is seen and only if it is not blank,
    not in ``KNOWN_WORDS`` and not in ``PUNCTUATION``; every other word is
    shown as a plain label.

    Args:
        words: Segmented words to display, in order.
        sub: The subtitle the words came from.
        sub_number: 1-based number of that subtitle.
        leading_text: Text of the label placed at the start of the row.
        **button_kwargs: Extra keyword arguments for ``widgets.Button``.
    """
    row = [widgets.Label(leading_text)]
    for word in words:
        if (
            not word.strip()
            or word in KNOWN_WORDS
            or word in PUNCTUATION
            or word in NEW_WORDS
        ):
            row.append(widgets.Label(word))
            continue
        NEW_WORDS[word] = WordEntry(word, sub, sub_number)
        button = widgets.Button(description=word, **button_kwargs)
        button.on_click(add_to_anki)
        row.append(button)
    return widgets.HBox(row)


for sub_number, sub in enumerate(subs, start=1):
    line = sub.content.replace(" ", "").strip()
    seg_list1 = list(jieba.cut(line, cut_all=False))
    seg_list2 = list(jieba.cut(line, cut_all=True))

    # First row: the numbered subtitle, segmented with cut_all=False.
    v_lines.append(
        _word_row(
            seg_list1, sub, sub_number, str(sub_number) + ".",
            style=dict(font_size="20px"),
        )
    )

    # Second row (only when the two segmentations differ): the words that the
    # cut_all=True segmentation produced and the first one did not.
    if seg_list2 != seg_list1:
        first_row_words = set(seg_list1)
        extra_words = [w for w in seg_list2 if w not in first_row_words]
        v_lines.append(_word_row(extra_words, sub, sub_number, "    "))

widgets.VBox(v_lines)

VBox(children=(HBox(children=(Label(value='1.'), Label(value='大家'), Label(value='好'), Label(value='，'), Label(…

In [21]:
# Gather the VocabHanzi value of every note in the "Chinese Stereotypes" deck.
stereotypes_word_ids = invoke('findNotes', query='"deck:Bank::subs2srs::Chinese Stereotypes"')
stereotypes_words_notes = invoke("notesInfo", notes=stereotypes_word_ids)
STEREOTYPE_WORDS = {
    note["fields"]["VocabHanzi"]["value"] for note in stereotypes_words_notes
}

In [22]:
# Display the full set of words collected from the deck.
STEREOTYPE_WORDS

{'一无所知',
 '一胎政策',
 '三天不打上房揭瓦',
 '丢尽',
 '丢脸',
 '个例',
 '为啥',
 '乘以',
 '以上',
 '以下',
 '传宗接代',
 '传染',
 '传言',
 '体验',
 '假如',
 '假装',
 '假设',
 '做到',
 '僚机',
 '儿化',
 '全国',
 '全程',
 '关机',
 '兼职',
 '内脏',
 '冒犯',
 '冰棍',
 '冰淇淋',
 '凌晨',
 '凤凰',
 '出谋划策',
 '初次',
 '刻板印象',
 '剪',
 '匿名',
 '单身',
 '卖家',
 '参谋',
 '反馈',
 '发火',
 '发货',
 '向往',
 '吓人',
 '含义',
 '吹',
 '味精',
 '哦',
 '唠嗑',
 '唯一',
 '商品',
 '啥',
 '喷香',
 '嘞',
 '嘲笑',
 '回复',
 '围坐',
 '在我看来',
 '壮',
 '声明',
 '大动干戈',
 '大衣',
 '失态',
 '失调',
 '头盖骨',
 '奇形怪状',
 '女方',
 '好不容易',
 '如果说',
 '妹子',
 '妻管严',
 '娶',
 '娶媳妇',
 '媳妇',
 '嫁',
 '实力',
 '室内',
 '家乡',
 '家门',
 '对象',
 '封面',
 '小品',
 '小气',
 '市场竞争',
 '帖子',
 '带劲',
 '年纪',
 '底子',
 '弄饭',
 '引',
 '强行',
 '当上',
 '当成',
 '录',
 '彩礼',
 '快递',
 '忽悠',
 '怕老婆',
 '恐惧',
 '恐惧症',
 '惹',
 '意味着',
 '懵',
 '我操',
 '所在',
 '打倒',
 '打水漂',
 '打瞌睡',
 '打鼓',
 '拉萨',
 '拐弯抹角',
 '拖地',
 '挂钩',
 '捋',
 '捡',
 '据我所知',
 '控制',
 '搓衣板',
 '撕',
 '撮',
 '擅长',
 '攀比',
 '散场',
 '整',
 '无中生有',
 '无故',
 '晃悠',
 '晓得',
 '晕乎乎',
 '暖气',
 '最先',
 '朋友圈',
 '权利',
 '杠精',
 '柴火',
 '标签',
 '样貌',
 '棉袄',
 '欺负',
 '欺骗',

In [None]:
# TODO: unfinished cell. The bare `class` statement that was here is a
# SyntaxError and would stop "Restart & Run All"; complete the definition or
# delete this cell.

In [32]:
# Peek at the noun list: print the part before the first "/" of the second
# whitespace-separated field on line 1, then line 4 in full.
# The encoding is explicit so the Chinese text is decoded the same way on
# every platform.
with open('hsk5-nouns.txt', encoding='utf-8') as f:
    lines = f.readlines()
print(lines[0].split()[1].split('/')[0])
print(lines[3])

爱心
我被她对别人的爱心所感动。

