In [1]:
from Tokenizer import tokenize
import os
from collections import Counter
from glob import glob
from rich import print
import math

In [2]:
# Quick check of what tokenize returns for one sample sentence.
tokenize("私は日本語を勉強しています")

{'私': 1, '日本': 1, '語': 1, '勉強': 1}

In [3]:
def detect_hard_words(texts):
    """Flatten the tokens of every text into a single list.

    Each text is handed to ``tokenize`` and every item produced by iterating
    over its result is kept, in order. No filtering happens here.

    Args:
        texts: Iterable of strings to tokenize.

    Returns:
        A list of tokens, text by text.
    """
    return [token for text in texts for token in tokenize(text)]

def parse_srt(file_path):
    """Extract the subtitle text of every cue in an SRT file.

    A cue is a run of consecutive non-blank lines. Its first two lines are
    skipped (in an SRT file these are the cue index and the timing line) and
    the remaining lines are joined with single spaces. Cues with fewer than
    three lines carry no text and are ignored.

    Args:
        file_path: Path of a UTF-8 encoded ``.srt`` file.

    Returns:
        A list with one string per cue, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        srt_text = file.read()

    sentences = []
    cue_lines = []
    # The trailing '' sentinel flushes the last cue even when the file does
    # not end with a blank line.
    for raw_line in srt_text.split('\n') + ['']:
        line = raw_line.strip()
        if line:
            cue_lines.append(line)
            continue
        # Blank (or whitespace-only) line: the current cue is complete.
        # Grouping by lines instead of splitting on exactly two newlines
        # keeps extra blank lines between cues from shifting the timing
        # line into the extracted text.
        if len(cue_lines) > 2:
            sentences.append(' '.join(cue_lines[2:]))
        cue_lines = []

    return sentences

def process_srt_folder(folder="./subtitles"):
    """Collect the tokens of every ``*.srt`` file in a folder.

    Each file is parsed with ``parse_srt`` and its sentences are passed to
    ``detect_hard_words``; the results of all files are concatenated.

    Args:
        folder: Directory searched (non-recursively) for ``*.srt`` files.
            Defaults to ``"./subtitles"``, the location that used to be
            hard-coded.

    Returns:
        One flat list with the words of all files.
    """
    hard_words = []
    # glob makes no ordering promise (it depends on the file system), and the
    # order words are first seen decides how equal counts are ranked later,
    # so sort the paths to make runs reproducible.
    srt_files = sorted(glob(os.path.join(folder, '*.srt')))

    for srt_file in srt_files:
        print(f"[#6699ff]Processing file: {srt_file}[/]")
        sentences = parse_srt(srt_file)
        hard_words.extend(detect_hard_words(sentences))

    return hard_words

# Gather the words of every subtitle file found by process_srt_folder.
hard_words = process_srt_folder()
print("[#33ff99]Important Words Detected:[/]")

# (word, count) pairs ordered from most to least frequent.
word_counts = Counter(hard_words).most_common()

def sigmoid(x):
    """Logistic function ``1 / (1 + e**-x)``, safe for large magnitudes.

    Args:
        x: Any real number.

    Returns:
        A float in ``[0.0, 1.0]``.
    """
    if x >= 0:
        # exp(-x) is at most 1 here, so it cannot overflow.
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) raises OverflowError once x drops below about
    # -709. Use the equivalent form e**x / (1 + e**x), whose exponential only
    # underflows harmlessly to 0.0.
    z = math.exp(x)
    return z / (1 + z)

# Build one rich-markup string listing every word with its count, coloured by
# how far the count lies from the median count.
string_to_print = ''
if word_counts:  # with no words there is no first or middle entry to index
    # Not used in this cell; kept because it is a notebook-level name.
    highest_freq = float(word_counts[0][1])
    # word_counts is ordered by descending count, so the middle entry holds
    # the (upper) median count.
    median = float(word_counts[len(word_counts) // 2][1])

    pieces = []
    for word, freq in word_counts:
        normalized_freq = (float(freq) - median)
        # Green grows and red shrinks as the count rises above the median;
        # blue is shifted so it only fades about 5 counts above the median.
        green = sigmoid(normalized_freq)
        red = sigmoid(-normalized_freq)
        blue = sigmoid(-normalized_freq+5)
        channels = (red, green, blue)
        word_hex = ''.join(f'{int(c * 255):02x}' for c in channels)
        # Raising a value in [0, 1] to the power 0.2 moves it towards 1, so
        # the count is drawn in a lighter shade of the word's colour.
        count_hex = ''.join(f'{int((c**0.2) * 255):02x}' for c in channels)
        pieces.append(f'[b][#{word_hex}]{word}:[#{count_hex}]{freq}[/], ')

    # Join once instead of growing the string on every iteration.
    string_to_print = ''.join(pieces)

print(string_to_print)

In [None]:
import requests
import json

def check_word_in_anki(
    word: str,
    url: str = "http://localhost:8765",
    timeout: float = 10.0,
) -> bool:
    """Report whether an Anki search for ``word`` finds at least one note.

    Sends a ``findNotes`` request (API version 6) to AnkiConnect with the
    word wrapped in double quotes as the search query.

    Args:
        word: Text to search for.
        url: Address of the AnkiConnect server. Defaults to the address that
            used to be hard-coded.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive Anki cannot block the notebook forever.

    Returns:
        True if the search returned one or more note ids, else False.

    Raises:
        requests.RequestException: On connection problems, a timeout, or a
            non-success HTTP status.
        RuntimeError: If AnkiConnect answers with a non-null ``error`` field.
    """
    payload = {
        "action": "findNotes",
        "version": 6,
        "params": {
            "query": f'"{word}"'
        }
    }

    response = requests.post(url, json=payload, timeout=timeout)
    response.raise_for_status()
    result = response.json()

    # On failure AnkiConnect sets "error" and leaves "result" null; surface
    # that message instead of failing later on len(None).
    error = result.get('error')
    if error is not None:
        raise RuntimeError(f"AnkiConnect findNotes failed: {error}")

    return len(result['result']) > 0


# Demo lookup: report whether a sample word is found in Anki.
word = "悲しい"
found = check_word_in_anki(word)
message = (
    f"The word '{word}' exists in your Anki decks."
    if found
    else f"The word '{word}' does not exist in your Anki decks."
)
print(message)


The word '悲しい' exists in your Anki decks.


In [None]:
import math
def detect_hard_words(texts, hard_words_list):
    """Return the tokens of ``texts`` that are absent from ``hard_words_list``.

    Despite its name, ``hard_words_list`` works as an exclusion container:
    a token is kept only when ``token not in hard_words_list``.

    Args:
        texts: Iterable of strings, each passed to ``tokenize``.
        hard_words_list: Container of tokens to leave out.

    Returns:
        A list of the remaining tokens, in the order they were produced.
    """
    remaining = []
    for text in texts:
        remaining.extend(
            token for token in tokenize(text) if token not in hard_words_list
        )
    return remaining

def parse_srt(file_path):
    """
    Parses an SRT file and extracts the text content.

    A cue is a run of consecutive non-blank lines. Its first two lines are
    skipped (in an SRT file these are the cue index and the timing line) and
    the remaining lines are joined with single spaces. Cues with fewer than
    three lines carry no text and are ignored.

    Args:
        file_path: Path of a UTF-8 encoded ``.srt`` file.

    Returns:
        A list with one string per cue, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        srt_text = file.read()

    sentences = []
    cue_lines = []
    # The trailing '' sentinel flushes the last cue even when the file does
    # not end with a blank line.
    for raw_line in srt_text.split('\n') + ['']:
        line = raw_line.strip()
        if line:
            cue_lines.append(line)
            continue
        # Blank (or whitespace-only) line: the current cue is complete.
        # Grouping by lines instead of splitting on exactly two newlines
        # keeps extra blank lines between cues from shifting the timing
        # line into the extracted text.
        if len(cue_lines) > 2:
            sentences.append(' '.join(cue_lines[2:]))
        cue_lines = []

    return sentences

def process_srt_folder(known_words, folder="./subtitles"):
    """Collect the not-yet-known words of every ``*.srt`` file in a folder.

    Each file is parsed with ``parse_srt`` and its sentences are passed,
    together with ``known_words``, to ``detect_hard_words``; the results of
    all files are concatenated.

    Args:
        known_words: Container of words forwarded to ``detect_hard_words``
            as the words to leave out.
        folder: Directory searched (non-recursively) for ``*.srt`` files.
            Defaults to ``"./subtitles"``, the location that used to be
            hard-coded.

    Returns:
        One flat list with the words of all files.
    """
    hard_words = []
    # glob makes no ordering promise (it depends on the file system), and the
    # order words are first seen decides how equal counts are ranked later,
    # so sort the paths to make runs reproducible.
    srt_files = sorted(glob(os.path.join(folder, '*.srt')))

    for srt_file in srt_files:
        print(f"Processing file: {srt_file}")
        sentences = parse_srt(srt_file)
        hard_words.extend(detect_hard_words(sentences, known_words))

    return hard_words


# NOTE(review): `known_words` is not assigned at notebook level in any cell
# visible here (the only assignment seen is a local variable inside main()
# further down), so this call relies on it already existing in the kernel.
# TODO confirm where it is meant to come from.
hard_words = process_srt_folder(known_words)
print("[#33ff99]Important Words Detected:[/]")

# (word, count) pairs ordered from most to least frequent.
word_counts = Counter(hard_words).most_common()

def sigmoid(x):
    """Logistic function ``1 / (1 + e**-x)``, safe for large magnitudes.

    Args:
        x: Any real number.

    Returns:
        A float in ``[0.0, 1.0]``.
    """
    if x >= 0:
        # exp(-x) is at most 1 here, so it cannot overflow.
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) raises OverflowError once x drops below about
    # -709. Use the equivalent form e**x / (1 + e**x), whose exponential only
    # underflows harmlessly to 0.0.
    z = math.exp(x)
    return z / (1 + z)

# Build one rich-markup string listing every word with its count, coloured by
# how far the count lies from the median count.
string_to_print = ''
if word_counts:  # with no words there is no first or middle entry to index
    # Not used in this cell; kept because it is a notebook-level name.
    highest_freq = float(word_counts[0][1])
    # word_counts is ordered by descending count, so the middle entry holds
    # the (upper) median count.
    median = float(word_counts[len(word_counts) // 2][1])

    pieces = []
    for word, freq in word_counts:
        normalized_freq = (float(freq) - median)
        # Green grows and red shrinks as the count rises above the median;
        # blue is shifted so it only fades about 5 counts above the median.
        green = sigmoid(normalized_freq)
        red = sigmoid(-normalized_freq)
        blue = sigmoid(-normalized_freq+5)
        channels = (red, green, blue)
        word_hex = ''.join(f'{int(c * 255):02x}' for c in channels)
        # Raising a value in [0, 1] to the power 0.2 moves it towards 1, so
        # the count is drawn in a lighter shade of the word's colour.
        count_hex = ''.join(f'{int((c**0.2) * 255):02x}' for c in channels)
        pieces.append(f'[b][#{word_hex}]{word}:[#{count_hex}]{freq}[/], ')

    # Join once instead of growing the string on every iteration.
    string_to_print = ''.join(pieces)

print(string_to_print)


In [None]:
import spacy
from collections import Counter


# Load the 'ja_core_news_sm' spaCy pipeline once at cell level; the
# tokenize_and_lemmatize function below reads this `nlp` object.
nlp = spacy.load('ja_core_news_sm')

def tokenize_and_lemmatize(text):
    """Run ``text`` through the ``nlp`` pipeline and return its lemmas.

    Tokens flagged as punctuation (``is_punct``) or as stop words
    (``is_stop``) are left out.

    Args:
        text: String to analyse.

    Returns:
        A list with the ``lemma_`` of every remaining token, in order.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_punct or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas

def detect_hard_words(texts, hard_words_list):
    """Return the lemmas of ``texts`` that are absent from ``hard_words_list``.

    Despite its name, ``hard_words_list`` works as an exclusion container:
    a lemma is kept only when ``token not in hard_words_list``.

    Args:
        texts: Iterable of strings, each passed to ``tokenize_and_lemmatize``.
        hard_words_list: Container of lemmas to leave out.

    Returns:
        A list of the remaining lemmas, in the order they were produced.
    """
    return [
        token
        for text in texts
        for token in tokenize_and_lemmatize(text)
        if token not in hard_words_list
    ]

def main():
    """Demo: print the unknown words of a sample sentence, coloured by count.

    The sample sentence is run through ``detect_hard_words`` with a small
    built-in set of known words. Every remaining word is printed on its own
    line as rich markup whose green channel grows, and red channel shrinks,
    with the word's count relative to the highest count. If no word
    remains, only the header line is printed.
    """
    subtitle_strings = [
        "日本語を一番好きな言語"
    ]

    known_words = set([
        "する", "です", "ます", "行く", "見る", "分かる", "食べる",
        "天気", "映画", "レストラン", "勉強", "楽しい", "提供", "美味しい", "語"
    ])

    hard_words = detect_hard_words(subtitle_strings, known_words)
    print("[#33ff99]Hard Words Detected:[/]")

    # Named word_counts rather than `dict` so the builtin is not shadowed.
    word_counts = Counter(hard_words)
    if not word_counts:
        # Nothing to scale against and nothing to list.
        return

    # Scale against the true maximum. A Counter iterates in insertion order,
    # so its first key is merely the first word seen; dividing by that count
    # could push a channel below 0 or above 255 and yield an invalid colour.
    highest_freq = float(max(word_counts.values()))

    lines = []
    for word, freq in word_counts.items():
        green = float(freq) / highest_freq
        red = 1 - green
        lines.append(f'[#{format(int(red*255), "02x")}{format(int(green*255), "02x")}ff]{word}[/]\n')
    print(''.join(lines))

# Run the demo only when this code executes as the main module.
if __name__ == "__main__":
    main()