In [82]:
from cjk_decomp_full import characters_dictionary
from frequent_characters import frequent_characters_list


In [None]:
# Depth-limited recursive decomposition (three levels by default)

def decompose_character(character, max_depth=3, dictionary=None):
  """Decompose a character into its radicals, sub-radicals and so on.

  Components are listed in depth-first order: each radical is immediately
  followed by its own decomposition before the next radical is visited.
  Repeated components are kept.

  Args:
    character: The character to decompose.
    max_depth: How many levels of decomposition to expand. The default of 3
      yields radicals, sub-radicals and sub-sub-radicals. Values below 1
      yield an empty component list.
    dictionary: Mapping from a character to the list of its direct
      components. Defaults to the module-level `characters_dictionary`.

  Returns:
    A dict `{character: [components...]}`, or None when `character` has no
    entry in the dictionary.
  """
  if dictionary is None:
    dictionary = characters_dictionary
  if character not in dictionary:
    return None

  components = []

  def expand(symbol, levels_left):
    # Stop at the depth limit or at symbols that have no entry of their own;
    # the depth limit also bounds any self-referencing entries.
    if levels_left <= 0 or symbol not in dictionary:
      return
    for part in dictionary[symbol]:
      components.append(part)
      # If a component is listed in the dictionary as a character on its
      # own, decompose it further
      expand(part, levels_left - 1)

  expand(character, max_depth)
  return {character: components}

# Print the decomposition of every character in the frequent-characters list
for decomposition in map(decompose_character, frequent_characters_list):
  print(decomposition)





In [85]:
# Counting most frequent radicals / strokes
# Importing ready-to-use dictionary of fully decomposed characters
# generated using the steps above

from decomposition_dictionary import decomposition_dictionary


In [None]:
# Saving all radicals and strokes, including repeated ones, in a list.
# extend() adds every component of a decomposition; indexing with [0] would
# keep only the first component of each character.
total_list = []
for item in decomposition_dictionary.values():
  total_list.extend(item)
# Saving all radicals and strokes, without duplicates
total_set = set(total_list)

# Building final dictionary containing stroke/radical frequency.
# Tallied in a single pass instead of calling list.count() once per
# distinct item.
total_dict = {}
for item in total_list:
  total_dict[item] = total_dict.get(item, 0) + 1

# sorted_dict = {k: v for k, v in sorted(total_dict.items(), key=lambda item: item[1])}

# Next step: build a scoring system that gives each character a score equal to its own frequency multiplied by the frequency of its radicals


In [102]:
import pandas as pd

In [155]:
df = pd.read_csv("1000freq.csv", sep="\t", header=None)
df = df.drop(columns=[0, 3, 4, 5])

# Adding a column from decomposition_dictionary
# NOTE(review): values are attached by position, so the dictionary is assumed
# to list characters in the same order as the CSV rows — confirm.
df['Decomposition'] = list(decomposition_dictionary.values())

# Calculating score for each character in a score_dictionary
# Characters are rewarded for having frequent radicals, but also
# (quadratically) penalized for having too many components
score_dictionary = {}
for character, components in decomposition_dictionary.items():
  if not components:
    # Nothing to score; also avoids dividing by zero below
    score_dictionary[character] = 0.0
    continue
  # Components missing from total_dict contribute nothing
  score = sum(total_dict.get(radical, 0) for radical in components)
  score_dictionary[character] = score / len(components) ** 2

df['Radical & Stroke Score'] = list(score_dictionary.values())

# Column 2 is presumably the character's own frequency — TODO confirm.
# Dividing by one million only rescales the product.
df['Total Score'] = df[2] * df['Radical & Stroke Score'] / 1000000
# sort_values returns a new frame, so the result must be kept for the
# exported table to actually be sorted
df = df.sort_values(by=['Total Score'], ascending=False)

df.to_excel('finaltable.xlsx')