In [1]:
import os
from pyprojroot import here
project_root = here()
print(f"Changing working directory to {project_root}")
os.chdir(project_root)

# Unihan

This notebooks checks the raw data of the Unihan dataset and transforms it into a usable state.

## Downloading Raw data

In [2]:
from bigchina.data import download_unihan_zip
download_unihan_zip()
os.listdir("data/unihan")

['Unihan_OtherMappings.txt',
 'Unihan.zip',
 'Unihan_DictionaryLikeData.txt',
 'Unihan_Variants.txt',
 'Unihan_NumericValues.txt',
 'Unihan_Readings.txt',
 'Unihan_DictionaryIndices.txt',
 'Unihan_IRGSources.txt',
 'Unihan_RadicalStrokeCounts.txt']

## Reading data files

In [3]:
from bigchina.data import read_all_unihan_files
df = read_all_unihan_files()
df

Unnamed: 0,unicode,field,description
0,U+3402,kJIS0213,11403
1,U+3406,kJIS0213,20113
2,U+340C,kKPS1,3451
3,U+341C,kKPS1,345F
4,U+3425,kKPS1,346A
...,...,...,...
1356823,U+2F9D0,kRSAdobe_Japan1_6,C+14068+149.7.9
1356824,U+2F9DE,kRSAdobe_Japan1_6,C+20066+159.7.3
1356825,U+2F9DF,kRSAdobe_Japan1_6,C+14069+159.7.9
1356826,U+2F9F4,kRSAdobe_Japan1_6,C+15269+45.3.12


## Transforming the data

First of all the ```field``` column has to be spread into multiple columns with the ```description``` as Value:


In [4]:
from bigchina.transform import spread_unihan
df = spread_unihan(df)
df

field,unicode,kAccountingNumeric,kBigFive,kCCCII,kCNS1986,kCNS1992,kCangjie,kCantonese,kCheungBauer,kCheungBauerIndex,...,kTGHZ2013,kTaiwanTelegraph,kTang,kTotalStrokes,kTraditionalVariant,kUnihanCore2020,kVietnamese,kXHC1983,kXerox,kZVariant
0,U+20000,,,,,,,,,,...,,,,2,,,,,,
1,U+20001,,,,,,,cat1,,,...,,,,2,,,,,,
2,U+20002,,,,,,,,,,...,,,,2,,,,,,
3,U+20003,,,,,,,,,,...,,,,3,,,,,,
4,U+20004,,,,,,,,,,...,,,,3,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93853,U+FAD5,,,,,,,,,,...,,,,15,,,,,,
93854,U+FAD6,,,,,,,,,,...,,,,20,,,,,,
93855,U+FAD7,,,,,,,,,,...,,,,15,,,,,,
93856,U+FAD8,,,,,,,,,,...,,,,22,,,,,,


Now alot of columns are visible. Some interesting ones are for example:

In [5]:
df[
    [
        "unicode",
        "kDefinition",
        "kFrequency",
        "kMandarin",
        "kCantonese",
        "kHangul",
        "kJapaneseKun",
        "kJapaneseOn",
        "kSimplifiedVariant",
        "kTraditionalVariant",
        "kSemanticVariant",
        "kZVariant",
        "kSpoofingVariant",
        "kCompatibilityVariant",
        "kMandarin",
        "kRSKangXi",
        "kRSUnicode",
        "kTotalStrokes",
    ]
]

field,unicode,kDefinition,kFrequency,kMandarin,kCantonese,kHangul,kJapaneseKun,kJapaneseOn,kSimplifiedVariant,kTraditionalVariant,kSemanticVariant,kZVariant,kSpoofingVariant,kCompatibilityVariant,kMandarin.1,kRSKangXi,kRSUnicode,kTotalStrokes
0,U+20000,the sound made by breathing in; oh! (cf. U+311...,,hē,,,,,,,,,,,hē,1.1,1.1,2
1,U+20001,the original form for 七 U+4E03,,qī,cat1,,,,,,,,,,qī,1.1,1.1,2
2,U+20002,,,,,,,,,,,,,,,1.1,1.1,2
3,U+20003,,,qiě,,,,,,,,,,,qiě,1.2,1.2,3
4,U+20004,,,,,,,,,,,,,,,1.2,1.2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93853,U+FAD5,,,,,,,,,,,,,U+25249,,,109.10,15
93854,U+FAD6,,,,,,,,,,,,,U+25CD0,,,118.12,20
93855,U+FAD7,,,,,,,,,,,,,U+27ED3,,,156.6,15
93856,U+FAD8,,,,,,,,,,,,,U+9F43,,,209.9,22


But first lets sort the DataFrame by the frequency:

In [6]:
df = df.sort_values("kFrequency")
df[
    [
        "unicode",
        "kDefinition",
        "kFrequency",
        "kMandarin",
        "kCantonese",
        "kHangul",
        "kJapaneseKun",
        "kJapaneseOn",
        "kSimplifiedVariant",
        "kTraditionalVariant",
        "kSemanticVariant",
        "kZVariant",
        "kSpoofingVariant",
        "kCompatibilityVariant",
        "kMandarin",
        "kRSKangXi",
        "kRSUnicode",
        "kTotalStrokes",
    ]
]

field,unicode,kDefinition,kFrequency,kMandarin,kCantonese,kHangul,kJapaneseKun,kJapaneseOn,kSimplifiedVariant,kTraditionalVariant,kSemanticVariant,kZVariant,kSpoofingVariant,kCompatibilityVariant,kMandarin.1,kRSKangXi,kRSUnicode,kTotalStrokes
72397,U+4E00,"one; a, an; alone",1,yī,jat1,일:0E,HITOTSU HITOTABI HAJIME,ICHI ITSU,,,"U+5F0C<kLau,kMatthews,kMeyerWempe U+58F9<kLau,...",,,,yī,1.0,1.0,1
75338,U+597D,"good, excellent, fine; well",1,hǎo,hou2 hou3,호:0E,KONOMU SUKU YOI,KOU,,,,,,,hǎo,38.3,38.3,6
75343,U+5982,"if, supposing; as if; like, as",1,rú,jyu4,여:0E,GOTOKU SHIKU YUKU,JO NYO,,,,,,,rú,38.3,38.3,6
75805,U+5B50,"offspring, child; fruit, seed of; 1st terrestr...",1,zi,zi2,자:0E,KO MI OTOKO,SHI SU,,,U+53EA<kLau,,,,zi,39.0,39.0,3
75827,U+5B66,"learning, knowledge; school",1,xué,hok6,학:N,MANABU,GAKU,,U+5B78,,,,,xué,39.5,39.5,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93853,U+FAD5,,,,,,,,,,,,,U+25249,,,109.10,15
93854,U+FAD6,,,,,,,,,,,,,U+25CD0,,,118.12,20
93855,U+FAD7,,,,,,,,,,,,,U+27ED3,,,156.6,15
93856,U+FAD8,,,,,,,,,,,,,U+9F43,,,209.9,22


There are a lot of missing values, but most of them are not needed for this analysis.
Also, most unicode characters are written in Unicdoe notation, which means it is not readable by a human.
This can be easily fixed:

In [7]:
from bigchina.transform import extract_encode_glyph_columns
df = extract_encode_glyph_columns(df)
df[['glyph', "kDefinition", 'simplified_variant', 'traditional_variant', 'semantic_variant', 'specialized_variant', 'z_variant', 'spoofing_variant', 'compatibility_variant']].head(10)


field,glyph,kDefinition,simplified_variant,traditional_variant,semantic_variant,specialized_variant,z_variant,spoofing_variant,compatibility_variant
72397,一,"one; a, an; alone",,,弌 壹,壹,,,
75338,好,"good, excellent, fine; well",,,,,,,
75343,如,"if, supposing; as if; like, as",,,,,,,
75805,子,"offspring, child; fruit, seed of; 1st terrestr...",,,只,,,,
75827,学,"learning, knowledge; school",,學,,,,,
75845,學,"learning, knowledge; school",学,,斈,,,,
75879,定,"decide, settle, fix",,,,,,,
75907,家,"house, home, residence; family",,,,,,,
75974,对,"correct, right; facing, opposed",,對,對,對,,,
75994,對,"correct, right; facing, opposed",对,,对,对,,,


## Radicals

Every Han Character is associated with a Radical number.



In [12]:
from bigchina.transform import split_radical_additional_strokes_column
df = split_radical_additional_strokes_column(df)
df[
    [
        "glyph",
        "kDefinition",
        "kRSUnicode",
        "radical",
        "additional_strokes",
        "simplified_radical_indicator"
    ]
]

field,glyph,kDefinition,kRSUnicode,radical,additional_strokes,simplified_radical_indicator
72397,一,"one; a, an; alone",1.0,1,0,False
75338,好,"good, excellent, fine; well",38.3,38,3,False
75343,如,"if, supposing; as if; like, as",38.3,38,3,False
75805,子,"offspring, child; fruit, seed of; 1st terrestr...",39.0,39,0,False
75827,学,"learning, knowledge; school",39.5,39,5,False
...,...,...,...,...,...,...
93853,𥉉,,109.10,109,10,False
93854,𥳐,,118.12,118,12,False
93855,𧻓,,156.6,156,6,False
93856,齃,,209.9,209,9,False
