In [2]:
from Tokenizer import tokenize
import os
from collections import Counter
from glob import glob
from rich import print

In [None]:
# Quick manual check: run the project tokenizer on one sample Japanese
# sentence. The return value is not stored.
tokenize("私は日本語を勉強しています")

In [None]:
# Vocabulary treated as already known: tokens found in this set are not
# reported as hard words.
known_words = {
    "する", "です", "ます", "行く", "見る", "分かる", "食べる",
    "天気", "映画", "レストラン", "勉強", "楽しい", "提供", "美味しい", "語",
}

def detect_hard_words(texts, hard_words_list):
    """Collect every token of ``texts`` that is absent from ``hard_words_list``.

    Despite its name, ``hard_words_list`` holds the words to skip; the caller
    in this file passes the known-word set. Tokens come back in the order
    they are encountered and duplicates are kept.
    """
    return [
        word
        for sentence in texts
        for word in tokenize(sentence)
        if word not in hard_words_list
    ]

def parse_srt(file_path):
    """
    Parses an SRT file and extracts the text content.

    A subtitle block is a run of consecutive non-blank lines. Its first two
    lines (the counter and the timecode line) are dropped and the remaining
    lines are joined with single spaces.

    Args:
        file_path: path of a UTF-8 encoded ``.srt`` file.

    Returns:
        A list with one string per subtitle block that contains text, in
        file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        srt_text = file.read()

    sentences = []
    block = []
    # Group by blank lines instead of splitting on exactly '\n\n': with extra
    # or whitespace-only separator lines the old split left a leading empty
    # line in the block, which shifted the timecode into the extracted text.
    # The trailing '' is a sentinel that flushes the last block.
    for line in srt_text.split('\n') + ['']:
        if line.strip():
            block.append(line)
            continue
        if len(block) > 2:  # counter + timecode + at least one text line
            sentences.append(' '.join(block[2:]).strip())
        block = []

    return sentences

def process_srt_folder(known_words):
    """Scan every ``*.srt`` file in ``./subtitles`` for unknown words.

    Each file is announced on the console, parsed with ``parse_srt`` and
    filtered through ``detect_hard_words``. The tokens missing from
    ``known_words`` are returned as one flat list across all files.
    """
    pattern = os.path.join("./subtitles", '*.srt')
    collected = []
    for path in glob(pattern):
        print(f"[#6699ff]Processing file: {path}[/]")
        collected += detect_hard_words(parse_srt(path), known_words)
    return collected

# Process all SRT files in the folder
hard_words = process_srt_folder(known_words)

print("[#33ff99]Hard Words Detected:[/]")
# (word, count) pairs, most frequent first. Using a name other than `dict`
# keeps the builtin usable, and most_common() yields pairs, whereas
# iterating a Counter directly yields only the keys.
word_counts = Counter(hard_words).most_common()
if word_counts:  # nothing to colour when no hard word was found
    highest_freq = word_counts[0][1]
    for word, freq in word_counts:
        # Frequent words shade towards green, rare ones towards red.
        green = freq / highest_freq
        red = 1 - green
        # Each channel must be two hex digits; hex() rejects floats and
        # would also add a "0x" prefix.
        print(f'[#{int(red * 255):02x}{int(green * 255):02x}ff]{word}[/]')

In [22]:
import math
def detect_hard_words(texts, hard_words_list):
    """Return the tokens of ``texts`` that do not appear in ``hard_words_list``.

    ``hard_words_list`` acts as the exclusion list (the known words).
    Encounter order is preserved and repeated tokens are reported each time.
    """
    unknown = []
    for sentence in texts:
        # Keep only the tokens that are missing from the known-word collection.
        unknown.extend(
            tok for tok in tokenize(sentence) if tok not in hard_words_list
        )
    return unknown

def parse_srt(file_path):
    """
    Parses an SRT file and extracts the text content.

    Consecutive non-blank lines form one subtitle block. The first two lines
    of a block (counter and timecodes) are skipped; the rest is joined with
    spaces into a single sentence.

    Args:
        file_path: path of a UTF-8 encoded ``.srt`` file.

    Returns:
        List of subtitle texts in file order; blocks without any text line
        contribute nothing.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        srt_text = file.read()

    sentences = []
    current = []
    # Blank (or whitespace-only) lines delimit blocks. Splitting on the exact
    # string '\n\n' broke when cues were separated by more than one blank
    # line: the block then started with an empty line and the timecode ended
    # up in the text. The appended '' flushes the final block.
    for line in srt_text.split('\n') + ['']:
        if line.strip():
            current.append(line)
        else:
            if len(current) > 2:  # needs counter, timecode and some text
                sentences.append(' '.join(current[2:]).strip())
            current = []

    return sentences

def process_srt_folder(known_words):
    """Gather unknown words from all ``.srt`` files found in ``./subtitles``.

    Prints the name of each file as it is handled and returns one combined
    list of the tokens that are not contained in ``known_words``.
    """
    found = []
    for path in glob(os.path.join("./subtitles", '*.srt')):
        print(f"Processing file: {path}")
        subtitle_lines = parse_srt(path)
        found.extend(detect_hard_words(subtitle_lines, known_words))
    return found

# Example vocabulary of already-known words; a real run could load this
# from a dictionary instead.
known_words = {
    "する", "です", "ます", "行く", "見る", "分かる", "食べる",
    "天気", "映画", "レストラン", "勉強", "楽しい", "提供", "美味しい", "語",
}
# Collect every token from the subtitle files that is not in known_words.
hard_words = process_srt_folder(known_words)
print("[#33ff99]Important Words Detected:[/]")

# (word, count) pairs ordered from most to least frequent. The name
# word_counts is used so the builtin `dict` is not shadowed.
word_counts = Counter(hard_words).most_common()


def sigmoid(x, sign=1):
    """Map ``x`` onto an S-shaped curve with values strictly between 0 and 1.

    The curve is centred on ``x = 0.5`` (where it returns 0.5) and has a
    steepness factor of 10.

    Args:
        x: value to squash; the caller in this file passes a frequency
            normalised to the range 0..1.
        sign: ``1`` (default) gives the rising curve; ``-1`` gives its mirror
            image, i.e. ``1 - sigmoid(x)``.

    Returns:
        A float in the open interval (0, 1).
    """
    # The sign belongs on the exponent. Multiplying the exponential itself by
    # -1 turned the denominator into 1 - exp(...), which is zero at x = 0.5
    # and yields results outside (0, 1) everywhere else.
    return 1 / (1 + math.exp(sign * (5 - 10 * x)))

# Build one rich-markup string in which every word is coloured by its
# relative frequency: frequent words lean green, rare words lean red.
string_to_print = ''
if word_counts:  # word_counts[0] would raise IndexError on an empty result
    # word_counts is sorted by descending count, so the first entry holds the
    # maximum.
    highest_freq = float(word_counts[0][1])
    fragments = []
    for word, freq in word_counts:
        normalized_freq = (float(freq) / highest_freq)
        green = sigmoid(normalized_freq)
        # Red is the complement of green, so both channels stay within 0..1
        # and always format as exactly two hex digits.
        red = 1.0 - green
        # Format and collect the colored string for each word
        fragments.append(
            f'[#{format(int(red * 255), "02x")}{format(int(green * 255), "02x")}66]{word}[/]:{freq} color: {red, green}, '
        )
    # Join once instead of growing the string with += inside the loop.
    string_to_print = ''.join(fragments)

print(string_to_print)


In [None]:
import spacy
from collections import Counter

# Load spaCy's Japanese model once at module level; tokenize_and_lemmatize
# calls this shared pipeline object for every text it processes.
nlp = spacy.load('ja_core_news_sm')

def tokenize_and_lemmatize(text):
    """Return the lemmas of ``text``, leaving out punctuation and stop words.

    The text is run through the module-level spaCy pipeline ``nlp``.
    """
    lemmas = []
    for tok in nlp(text):
        # Punctuation and stop words are dropped from the result.
        if tok.is_punct or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

def detect_hard_words(texts, hard_words_list):
    """List the lemmas found in ``texts`` that are not in ``hard_words_list``.

    Every text is lemmatized with ``tokenize_and_lemmatize``. Lemmas present
    in ``hard_words_list`` (the known words) are dropped; the rest are
    returned in encounter order, duplicates included.
    """
    unknown = []
    for sentence in texts:
        lemmas = tokenize_and_lemmatize(sentence)
        unknown += [lemma for lemma in lemmas if lemma not in hard_words_list]
    return unknown

def main():
    """Run the hard-word detector on a sample sentence and print the result.

    Each detected word is printed on its own line using rich colour markup:
    the green channel grows and the red channel shrinks with the word's
    frequency relative to the most frequent detected word.
    """
    # Example of subtitle-like strings
    subtitle_strings = [
        "日本語を一番好きな言語"
    ]

    # Example: Load a predefined list of known words or use a dictionary
    known_words = {
        "する", "です", "ます", "行く", "見る", "分かる", "食べる",
        "天気", "映画", "レストラン", "勉強", "楽しい", "提供", "美味しい", "語",
    }  # Example for N4 level

    # Detect hard words in the sentences
    hard_words = detect_hard_words(subtitle_strings, known_words)
    print("[#33ff99]Hard Words Detected:[/]")

    # Named word_counts rather than `dict` so the builtin is not shadowed.
    word_counts = Counter(hard_words)
    # Normalise against the true maximum. Using the first key's count let
    # later, more frequent words push a channel past 255 (or below 0), which
    # produced invalid colour codes. default=1 covers the empty case.
    highest_freq = max(word_counts.values(), default=1)

    lines = []
    for word, freq in word_counts.items():
        green = freq / highest_freq
        red = 1 - green
        lines.append(
            f'[#{int(red * 255):02x}{int(green * 255):02x}ff]{word}[/]\n'
        )
    print(''.join(lines))

# Run the demo only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()