# 작업에 필요한 패키지 및 함수들

In [1]:
pip install jamo

Collecting jamo
  Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)
Installing collected packages: jamo
Successfully installed jamo-0.4.1


In [2]:
__all__ = ["split_syllable_char", "split_syllables",
           "join_jamos", "join_jamos_char",
           "CHAR_INITIALS", "CHAR_MEDIALS", "CHAR_FINALS"]

import itertools

INITIAL = 0x001
MEDIAL = 0x010
FINAL = 0x100
CHAR_LISTS = {
    INITIAL: list(map(chr, [
        0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139,
        0x3141, 0x3142, 0x3143, 0x3145, 0x3146, 0x3147,
        0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d,
        0x314e
    ])),
    MEDIAL: list(map(chr, [
        0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154,
        0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a,
        0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160,
        0x3161, 0x3162, 0x3163
    ])),
    FINAL: list(map(chr, [
        0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136,
        0x3137, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d,
        0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3144,
        0x3145, 0x3146, 0x3147, 0x3148, 0x314a, 0x314b,
        0x314c, 0x314d, 0x314e
    ]))
}
CHAR_INITIALS = CHAR_LISTS[INITIAL]
CHAR_MEDIALS = CHAR_LISTS[MEDIAL]
CHAR_FINALS = CHAR_LISTS[FINAL]
CHAR_SETS = {k: set(v) for k, v in CHAR_LISTS.items()}
CHARSET = set(itertools.chain(*CHAR_SETS.values()))
CHAR_INDICES = {k: {c: i for i, c in enumerate(v)}
                for k, v in CHAR_LISTS.items()}


def is_hangul_syllable(c):
    return 0xac00 <= ord(c) <= 0xd7a3  # Hangul Syllables


def is_hangul_jamo(c):
    return 0x1100 <= ord(c) <= 0x11ff  # Hangul Jamo


def is_hangul_compat_jamo(c):
    return 0x3130 <= ord(c) <= 0x318f  # Hangul Compatibility Jamo


def is_hangul_jamo_exta(c):
    return 0xa960 <= ord(c) <= 0xa97f  # Hangul Jamo Extended-A


def is_hangul_jamo_extb(c):
    return 0xd7b0 <= ord(c) <= 0xd7ff  # Hangul Jamo Extended-B


def is_hangul(c):
    return (is_hangul_syllable(c) or
            is_hangul_jamo(c) or
            is_hangul_compat_jamo(c) or
            is_hangul_jamo_exta(c) or
            is_hangul_jamo_extb(c))


def is_supported_hangul(c):
    return is_hangul_syllable(c) or is_hangul_compat_jamo(c)


def check_hangul(c, jamo_only=False):
    if not ((jamo_only or is_hangul_compat_jamo(c)) or is_supported_hangul(c)):
        raise ValueError(f"'{c}' is not a supported hangul character. "
                         f"'Hangul Syllables' (0xac00 ~ 0xd7a3) and "
                         f"'Hangul Compatibility Jamos' (0x3130 ~ 0x318f) are "
                         f"supported at the moment.")


def get_jamo_type(c):
    check_hangul(c)
    assert is_hangul_compat_jamo(c), f"not a jamo: {ord(c):x}"
    return sum(t for t, s in CHAR_SETS.items() if c in s)


def split_syllable_char(c):
    """
    Splits a given korean syllable into its components. Each component is
    represented by Unicode in 'Hangul Compatibility Jamo' range.
    Arguments:
        c: A Korean character.
    Returns:
        A triple (initial, medial, final) of Hangul Compatibility Jamos.
        If no jamo corresponds to a position, `None` is returned there.
    Example:
        >>> split_syllable_char("안")
        ("ㅇ", "ㅏ", "ㄴ")
        >>> split_syllable_char("고")
        ("ㄱ", "ㅗ", None)
        >>> split_syllable_char("ㅗ")
        (None, "ㅗ", None)
        >>> split_syllable_char("ㅇ")
        ("ㅇ", None, None)
    """
    check_hangul(c)
    if len(c) != 1:
        raise ValueError("Input string must have exactly one character.")

    init, med, final = None, None, None
    if is_hangul_syllable(c):
        offset = ord(c) - 0xac00
        x = (offset - offset % 28) // 28
        init, med, final = x // 21, x % 21, offset % 28
        if not final:
            final = None
        else:
            final -= 1
    else:
        pos = get_jamo_type(c)
        if pos & INITIAL == INITIAL:
            pos = INITIAL
        elif pos & MEDIAL == MEDIAL:
            pos = MEDIAL
        elif pos & FINAL == FINAL:
            pos = FINAL
        idx = CHAR_INDICES[pos][c]
        if pos == INITIAL:
            init = idx
        elif pos == MEDIAL:
            med = idx
        else:
            final = idx
    return tuple(CHAR_LISTS[pos][idx] if idx is not None else None
                 for pos, idx in
                 zip([INITIAL, MEDIAL, FINAL], [init, med, final]))


def split_syllables(s, ignore_err=True, pad=None):
    """
    Performs syllable-split on a string.
    Arguments:
        s (str): A string (possibly mixed with non-Hangul characters).
        ignore_err (bool): If set False, it ensures that all characters in
            the string are Hangul-splittable and throws a ValueError otherwise.
            (default: True)
        pad (str): Pad empty jamo positions (initial, medial, or final) with
            `pad` character. This is useful for cases where fixed-length
            strings are needed. (default: None)
    Returns:
        Hangul-split string
    Example:
        >>> split_syllables("안녕하세요")
        "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"
        >>> split_syllables("안녕하세요~~", ignore_err=False)
        ValueError: encountered an unsupported character: ~ (0x7e)
        >>> split_syllables("안녕하세요ㅛ", pad="x")
        'ㅇㅏㄴㄴㅕㅇㅎㅏxㅅㅔxㅇㅛxxㅛx'
    """

    def try_split(c):
        try:
            return split_syllable_char(c)
        except ValueError:
            if ignore_err:
                return (c,)
            raise ValueError(f"encountered an unsupported character: "
                             f"{c} (0x{ord(c):x})")

    s = map(try_split, s)
    if pad is not None:
        tuples = map(lambda x: tuple(pad if y is None else y for y in x), s)
    else:
        tuples = map(lambda x: filter(None, x), s)
    return "".join(itertools.chain(*tuples))


def join_jamos_char(init, med, final=None):
    """
    Combines jamos into a single syllable.
    Arguments:
        init (str): Initial jao.
        med (str): Medial jamo.
        final (str): Final jamo. If not supplied, the final syllable is made
            without the final. (default: None)
    Returns:
        A Korean syllable.
    """
    chars = (init, med, final)
    for c in filter(None, chars):
        check_hangul(c, jamo_only=True)

    idx = tuple(CHAR_INDICES[pos][c] if c is not None else c
                for pos, c in zip((INITIAL, MEDIAL, FINAL), chars))
    init_idx, med_idx, final_idx = idx
    # final index must be shifted once as
    # final index with 0 points to syllables without final
    final_idx = 0 if final_idx is None else final_idx + 1
    return chr(0xac00 + 28 * 21 * init_idx + 28 * med_idx + final_idx)


def join_jamos(s, ignore_err=True):
    """
    Combines a sequence of jamos to produce a sequence of syllables.
    Arguments:
        s (str): A string (possible mixed with non-jamo characters).
        ignore_err (bool): If set False, it will ensure that all characters
            will be consumed for the making of syllables. It will throw a
            ValueError when it fails to do so. (default: True)
    Returns:
        A string
    Example:
        >>> join_jamos("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안녕하세요"
        >>> join_jamos("ㅇㅏㄴㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안ㄴ녕하세요"
        >>> join_jamos()
    """
    last_t = 0
    queue = []
    new_string = ""

    def flush(n=0):
        new_queue = []
        while len(queue) > n:
            new_queue.append(queue.pop())
        if len(new_queue) == 1:
            if not ignore_err:
                raise ValueError(f"invalid jamo character: {new_queue[0]}")
            result = new_queue[0]
        elif len(new_queue) >= 2:
            try:
                result = join_jamos_char(*new_queue)
            except (ValueError, KeyError):
                # Invalid jamo combination
                if not ignore_err:
                    raise ValueError(f"invalid jamo characters: {new_queue}")
                result = "".join(new_queue)
        else:
            result = None
        return result

    for c in s:
        if c not in CHARSET:
            if queue:
                new_c = flush() + c
            else:
                new_c = c
            last_t = 0
        else:
            t = get_jamo_type(c)
            new_c = None
            if t & FINAL == FINAL:
                if not (last_t == MEDIAL):
                    new_c = flush()
            elif t == INITIAL:
                new_c = flush()
            elif t == MEDIAL:
                if last_t & INITIAL == INITIAL:
                    new_c = flush(1)
                else:
                    new_c = flush()
            last_t = t
            queue.insert(0, c)
        if new_c:
            new_string += new_c
    if queue:
        new_string += flush()
    return new_string

# 강의 내용 관련

In [3]:
# Mount Google Drive to this Notebook instance.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

fileDir = "drive/My Drive/2021/특강(한국음운론학회)/Data/CDS_orthographic.txt"

fr = open(fileDir, 'r')
contents = fr.readlines()
fr.close()

Sentences = []
for content in contents:
    Sentences.append(content.replace("\n",""))

In [5]:
print(Sentences)

['이게 뭐야 이게 뭐야 민준아 응', '뭐 또 뭐가 있어 뭐가', '민준이 뭐 줄까요', '뛰뛰빵빵도 있는데', '츄츄츄츄츄츄츄츄', '츄츄츄츄츄츄츄츄츄츄츄', '이게 뭐야', '기린 기린 아이고', '뺐어 민준이가 뺐어', '민준이 엄마가 빠방 갖다 줄까 빠방', '잡아 이게 뭐야', '타 볼까 이렇게 앉아 이렇게 그렇지', '잡고 그렇지 뿡뿡 뿡뿡', '신기한게 많아', '민준이 신기한게 많아', '뭐 이제 뭐 하고 싶은데 아니야 이거', '이제 민준이 뭐 할까요 응']


# 1.문장에서 언어적 요소 추출하기

# 단어 단위

In [9]:
# 단어의 타입

wordType = set()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    wordType.add(word)

print(wordType)

{'뭐야', '줄까', '앉아', '민준이가', '잡아', '신기한게', '많아', '이렇게', '아이고', '민준아', '또', '하고', '아니야', '있어', '있는데', '뿡뿡', '볼까', '뭐가', '뛰뛰빵빵도', '타', '응', '그렇지', '갖다', '할까요', '이거', '줄까요', '츄츄츄츄츄츄츄츄', '이제', '엄마가', '빠방', '이게', '기린', '뭐', '뺐어', '싶은데', '츄츄츄츄츄츄츄츄츄츄츄', '잡고', '민준이'}


In [10]:
# 단어의 토큰

wordToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    if wordToken.get(word) == None:
      wordToken[word] = 1
    else:
      wordToken[word] = wordToken[word] + 1

print(wordToken)

{'이게': 4, '뭐야': 4, '민준아': 1, '응': 2, '뭐': 5, '또': 1, '뭐가': 2, '있어': 1, '민준이': 4, '줄까요': 1, '뛰뛰빵빵도': 1, '있는데': 1, '츄츄츄츄츄츄츄츄': 1, '츄츄츄츄츄츄츄츄츄츄츄': 1, '기린': 2, '아이고': 1, '뺐어': 2, '민준이가': 1, '엄마가': 1, '빠방': 2, '갖다': 1, '줄까': 1, '잡아': 1, '타': 1, '볼까': 1, '이렇게': 2, '앉아': 1, '그렇지': 2, '잡고': 1, '뿡뿡': 2, '신기한게': 2, '많아': 2, '이제': 2, '하고': 1, '싶은데': 1, '아니야': 1, '이거': 1, '할까요': 1}


In [11]:
# 단어의 토큰 + 타입

wordType = set()
wordToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    wordType.add(word)
    if wordToken.get(word) == None:
      wordToken[word] = 1
    else:
      wordToken[word] = wordToken[word] + 1

print(wordToken)
print(wordType)

{'이게': 4, '뭐야': 4, '민준아': 1, '응': 2, '뭐': 5, '또': 1, '뭐가': 2, '있어': 1, '민준이': 4, '줄까요': 1, '뛰뛰빵빵도': 1, '있는데': 1, '츄츄츄츄츄츄츄츄': 1, '츄츄츄츄츄츄츄츄츄츄츄': 1, '기린': 2, '아이고': 1, '뺐어': 2, '민준이가': 1, '엄마가': 1, '빠방': 2, '갖다': 1, '줄까': 1, '잡아': 1, '타': 1, '볼까': 1, '이렇게': 2, '앉아': 1, '그렇지': 2, '잡고': 1, '뿡뿡': 2, '신기한게': 2, '많아': 2, '이제': 2, '하고': 1, '싶은데': 1, '아니야': 1, '이거': 1, '할까요': 1}
{'뭐야', '줄까', '앉아', '민준이가', '잡아', '신기한게', '많아', '이렇게', '아이고', '민준아', '또', '하고', '아니야', '있어', '있는데', '뿡뿡', '볼까', '뭐가', '뛰뛰빵빵도', '타', '응', '그렇지', '갖다', '할까요', '이거', '줄까요', '츄츄츄츄츄츄츄츄', '이제', '엄마가', '빠방', '이게', '기린', '뭐', '뺐어', '싶은데', '츄츄츄츄츄츄츄츄츄츄츄', '잡고', '민준이'}


In [16]:
# 단어의 토큰 + 타입 + 정렬

wordType = set()
wordToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    wordType.add(word)
    if wordToken.get(word) == None:
      wordToken[word] = 1
    else:
      wordToken[word] = wordToken[word] + 1

print(wordToken)
print(wordType)

wordDicSortedRT = dict(sorted(wordToken.items(), key=lambda x: x[1], reverse=True))

print(wordDicSortedRT)


{'이게': 4, '뭐야': 4, '민준아': 1, '응': 2, '뭐': 5, '또': 1, '뭐가': 2, '있어': 1, '민준이': 4, '줄까요': 1, '뛰뛰빵빵도': 1, '있는데': 1, '츄츄츄츄츄츄츄츄': 1, '츄츄츄츄츄츄츄츄츄츄츄': 1, '기린': 2, '아이고': 1, '뺐어': 2, '민준이가': 1, '엄마가': 1, '빠방': 2, '갖다': 1, '줄까': 1, '잡아': 1, '타': 1, '볼까': 1, '이렇게': 2, '앉아': 1, '그렇지': 2, '잡고': 1, '뿡뿡': 2, '신기한게': 2, '많아': 2, '이제': 2, '하고': 1, '싶은데': 1, '아니야': 1, '이거': 1, '할까요': 1}
{'뭐야', '줄까', '앉아', '민준이가', '잡아', '신기한게', '많아', '이렇게', '아이고', '민준아', '또', '하고', '아니야', '있어', '있는데', '뿡뿡', '볼까', '뭐가', '뛰뛰빵빵도', '타', '응', '그렇지', '갖다', '할까요', '이거', '줄까요', '츄츄츄츄츄츄츄츄', '이제', '엄마가', '빠방', '이게', '기린', '뭐', '뺐어', '싶은데', '츄츄츄츄츄츄츄츄츄츄츄', '잡고', '민준이'}
{'뭐': 5, '이게': 4, '뭐야': 4, '민준이': 4, '응': 2, '뭐가': 2, '기린': 2, '뺐어': 2, '빠방': 2, '이렇게': 2, '그렇지': 2, '뿡뿡': 2, '신기한게': 2, '많아': 2, '이제': 2, '민준아': 1, '또': 1, '있어': 1, '줄까요': 1, '뛰뛰빵빵도': 1, '있는데': 1, '츄츄츄츄츄츄츄츄': 1, '츄츄츄츄츄츄츄츄츄츄츄': 1, '아이고': 1, '민준이가': 1, '엄마가': 1, '갖다': 1, '줄까': 1, '잡아': 1, '타': 1, '볼까': 1, '앉아': 1, '잡고': 1, '하고': 1, '싶은데': 1, '아니야': 1, '이거': 1, '할까요': 1

In [19]:
def searchAuto (dictInput):
  UserDict = ["뭐","더","타","또"]
  wantedWords = dict()
  for key, value in dictInput.items():
    if key in UserDict:
      if wantedWords.get(key) == None:
        wantedWords[key] = value
  return wantedWords

In [20]:
print(searchAuto(wordToken))

{'뭐': 5, '또': 1, '타': 1}


In [21]:
wordType = list(wordType)
for eachType in wordType:
  if "더" == eachType:
    print("Found")
  else:
    print("No matched")

No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched
No matched


# 음절 단위

In [25]:
# 단어의 토큰 + 타입 + 정렬

syllableType = set()
syllableToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    for syllable in word:
      syllableType.add(syllable)
      if syllableToken.get(syllable) == None:
        syllableToken[syllable] = 1
      else:
        syllableToken[syllable] = syllableToken[syllable] + 1

print(syllableToken)
print(syllableType)

syllableDicSortedRT = dict(sorted(syllableToken.items(), key=lambda x: x[1], reverse=True))

print(syllableDicSortedRT)

{'이': 15, '게': 8, '뭐': 11, '야': 5, '민': 6, '준': 6, '아': 7, '응': 2, '또': 1, '가': 4, '있': 2, '어': 3, '줄': 2, '까': 4, '요': 2, '뛰': 2, '빵': 2, '도': 1, '는': 1, '데': 2, '츄': 19, '기': 4, '린': 2, '고': 3, '뺐': 2, '엄': 1, '마': 1, '빠': 2, '방': 2, '갖': 1, '다': 1, '잡': 2, '타': 1, '볼': 1, '렇': 4, '앉': 1, '그': 2, '지': 2, '뿡': 4, '신': 2, '한': 2, '많': 2, '제': 2, '하': 1, '싶': 1, '은': 1, '니': 1, '거': 1, '할': 1}
{'빵', '한', '까', '준', '지', '린', '어', '줄', '신', '츄', '할', '민', '고', '아', '렇', '또', '데', '다', '뛰', '많', '타', '하', '요', '야', '응', '그', '있', '는', '게', '마', '잡', '방', '앉', '니', '도', '싶', '볼', '엄', '뭐', '빠', '뿡', '이', '제', '기', '가', '은', '거', '뺐', '갖'}
{'츄': 19, '이': 15, '뭐': 11, '게': 8, '아': 7, '민': 6, '준': 6, '야': 5, '가': 4, '까': 4, '기': 4, '렇': 4, '뿡': 4, '어': 3, '고': 3, '응': 2, '있': 2, '줄': 2, '요': 2, '뛰': 2, '빵': 2, '데': 2, '린': 2, '뺐': 2, '빠': 2, '방': 2, '잡': 2, '그': 2, '지': 2, '신': 2, '한': 2, '많': 2, '제': 2, '또': 1, '도': 1, '는': 1, '엄': 1, '마': 1, '갖': 1, '다': 1, '타': 1, '볼': 1, '앉': 1, '하': 1, '싶

#음소 단위

In [30]:
from jamo import h2j, j2hcj

jamoType = set()
jamoToken = dict()

for sentence in Sentences:
  jamo_sentence = j2hcj(h2j(sentence)).replace(" ","")
  for eachJamo in jamo_sentence:
    jamoType.add(eachJamo)
    if jamoToken.get(eachJamo) == None:
      jamoToken[eachJamo] = 1
    else:
      jamoToken[eachJamo] = jamoToken[eachJamo] + 1

print(jamoType)
print(jamoToken)



{'ㅌ', 'ㅟ', 'ㅑ', 'ㄱ', 'ㅏ', 'ㅊ', 'ㅡ', 'ㅗ', 'ㅃ', 'ㅛ', 'ㄷ', 'ㅔ', 'ㅁ', 'ㅅ', 'ㅠ', 'ㅍ', 'ㄶ', 'ㄵ', 'ㅝ', 'ㅇ', 'ㅂ', 'ㅆ', 'ㄲ', 'ㅣ', 'ㄹ', 'ㄸ', 'ㄴ', 'ㅎ', 'ㅈ', 'ㅐ', 'ㅜ', 'ㅓ'}
{'ㅇ': 49, 'ㅣ': 35, 'ㄱ': 23, 'ㅔ': 12, 'ㅁ': 21, 'ㅝ': 11, 'ㅑ': 5, 'ㄴ': 22, 'ㅈ': 15, 'ㅜ': 12, 'ㅏ': 34, 'ㅡ': 6, 'ㄸ': 3, 'ㅗ': 6, 'ㅆ': 4, 'ㅓ': 9, 'ㄹ': 10, 'ㄲ': 4, 'ㅛ': 2, 'ㅟ': 2, 'ㅃ': 10, 'ㄷ': 4, 'ㅊ': 19, 'ㅠ': 19, 'ㅐ': 2, 'ㅂ': 5, 'ㅌ': 1, 'ㅎ': 8, 'ㄵ': 1, 'ㅅ': 3, 'ㄶ': 2, 'ㅍ': 1}


# ADS에서의 자모 분포

In [32]:
import pandas as pd

fileDir = "drive/My Drive/2021/특강(한국음운론학회)/Data/ADS_orthographic.txt"

fr = open(fileDir, 'r')
contents = fr.readlines()
fr.close()

SentencesADS = []
for content in contents:
    SentencesADS.append(content.replace("\n",""))

print(SentencesADS)

['아니 지금 보고 있는데 여기가 어색한지 울어', '경민이 바꿔줘봐 경민이', '경민이 바꿔줘봐', '어 할머니한테', '할머니한텐 대답하네', '지 엄마한텐 대답 안하고', '우리 신림역 갔다 가야 돼', '돌잔치 하는데 돈 더 내고 가야 돼', '아니 엄마 안나올거 아냐 아버지 식사 저거 때문에 금방 했어', '그럼 경민이랑 신림역으로 나올래', '지금 우리 이제 끝나고 가면은 얼추 엄마도 준비하고 나오면 맞을것 같은데', '할머니 오오일일 타면 안되지 신림역으로 와야 되니까', '오오이사 타고 서울대입구역으로 오든지 해야지', '아 오오일일을 타든 오오이사를 타든', '아니 서울대입구역', '아빠 내가 좀 이따 전화할게 나 그거 왔어']


In [33]:
from jamo import h2j, j2hcj

jamoTypeADS = set()
jamoTokenADS = dict()

for sentence in SentencesADS:
  jamo_sentence = j2hcj(h2j(sentence)).replace(" ","")
  for eachJamo in jamo_sentence:
    jamoTypeADS.add(eachJamo)
    if jamoTokenADS.get(eachJamo) == None:
      jamoTokenADS[eachJamo] = 1
    else:
      jamoTokenADS[eachJamo] = jamoTokenADS[eachJamo] + 1

print(jamoTypeADS)
print(jamoTokenADS)

{'ㅌ', 'ㅕ', 'ㄱ', 'ㅑ', 'ㅏ', 'ㅊ', 'ㅡ', 'ㅗ', 'ㅃ', 'ㅙ', 'ㄷ', 'ㅔ', 'ㅁ', 'ㅅ', 'ㅝ', 'ㅇ', 'ㅂ', 'ㅆ', 'ㅣ', 'ㄲ', 'ㄹ', 'ㄸ', 'ㄴ', 'ㅎ', 'ㅈ', 'ㅚ', 'ㅐ', 'ㅜ', 'ㅓ', 'ㅘ'}
{'ㅇ': 77, 'ㅏ': 56, 'ㄴ': 49, 'ㅣ': 46, 'ㅈ': 17, 'ㄱ': 37, 'ㅡ': 19, 'ㅁ': 26, 'ㅂ': 12, 'ㅗ': 26, 'ㅆ': 4, 'ㄷ': 21, 'ㅔ': 10, 'ㅕ': 13, 'ㅓ': 23, 'ㅅ': 11, 'ㅐ': 11, 'ㅎ': 15, 'ㅜ': 10, 'ㄹ': 30, 'ㄲ': 4, 'ㅝ': 4, 'ㅘ': 5, 'ㅌ': 9, 'ㅑ': 5, 'ㅙ': 2, 'ㅊ': 2, 'ㄸ': 2, 'ㅚ': 2, 'ㅃ': 1}


In [35]:
# CDS
# jamoToken
# ADS
# jamoTokenADS

jaList = ["ㄱ","ㄲ","ㅋ","ㄷ","ㄸ","ㅌ","ㅈ","ㅉ","ㅊ","ㅁ","ㅍ","ㅂ","ㅃ","ㅎ","ㄴ","ㄹ","ㅇ","ㅅ","ㅆ"]

for key, value in jamoToken.items():
  if key in jaList:
    # print(key,value)
    pass
  else:
    print(key,value)


ㅣ 35
ㅔ 12
ㅝ 11
ㅑ 5
ㅜ 12
ㅏ 34
ㅡ 6
ㅗ 6
ㅓ 9
ㅛ 2
ㅟ 2
ㅠ 19
ㅐ 2
ㄵ 1
ㄶ 2


# 기저형 -> 표면형

In [39]:
from jamo import h2j, j2hcj
import re

for sentence in Sentences:
  jamo_sentence = j2hcj(h2j(sentence)).split(" ")
  for eachWordJamo in jamo_sentence:
    #자음 중화
    eachWordJamo = re.sub("ㄲ(?=ㅇ)","ㄱ",eachWordJamo)
    print(eachWordJamo)
  



ㅇㅣㄱㅔ
ㅁㅝㅇㅑ
ㅇㅣㄱㅔ
ㅁㅝㅇㅑ
ㅁㅣㄴㅈㅜㄴㅇㅏ
ㅇㅡㅇ
ㅁㅝ
ㄸㅗ
ㅁㅝㄱㅏ
ㅇㅣㅆㅇㅓ
ㅁㅝㄱㅏ
ㅁㅣㄴㅈㅜㄴㅇㅣ
ㅁㅝ
ㅈㅜㄹㄲㅏㅇㅛ
ㄸㅟㄸㅟㅃㅏㅇㅃㅏㅇㄷㅗ
ㅇㅣㅆㄴㅡㄴㄷㅔ
ㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠ
ㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠㅊㅠ
ㅇㅣㄱㅔ
ㅁㅝㅇㅑ
ㄱㅣㄹㅣㄴ
ㄱㅣㄹㅣㄴ
ㅇㅏㅇㅣㄱㅗ
ㅃㅐㅆㅇㅓ
ㅁㅣㄴㅈㅜㄴㅇㅣㄱㅏ
ㅃㅐㅆㅇㅓ
ㅁㅣㄴㅈㅜㄴㅇㅣ
ㅇㅓㅁㅁㅏㄱㅏ
ㅃㅏㅂㅏㅇ
ㄱㅏㅈㄷㅏ
ㅈㅜㄹㄲㅏ
ㅃㅏㅂㅏㅇ
ㅈㅏㅂㅇㅏ
ㅇㅣㄱㅔ
ㅁㅝㅇㅑ
ㅌㅏ
ㅂㅗㄹㄲㅏ
ㅇㅣㄹㅓㅎㄱㅔ
ㅇㅏㄵㅇㅏ
ㅇㅣㄹㅓㅎㄱㅔ
ㄱㅡㄹㅓㅎㅈㅣ
ㅈㅏㅂㄲㅗ
ㄱㅡㄹㅓㅎㅈㅣ
ㅃㅜㅇㅃㅜㅇ
ㅃㅜㅇㅃㅜㅇ
ㅅㅣㄴㄱㅣㅎㅏㄴㄱㅔ
ㅁㅏㄶㅇㅏ
ㅁㅣㄴㅈㅜㄴㅇㅣ
ㅅㅣㄴㄱㅣㅎㅏㄴㄱㅔ
ㅁㅏㄶㅇㅏ
ㅁㅝ
ㅇㅣㅈㅔ
ㅁㅝ
ㅎㅏㄱㅗ
ㅅㅣㅍㅇㅡㄴㄷㅔ
ㅇㅏㄴㅣㅇㅑ
ㅇㅣㄱㅓ
ㅇㅣㅈㅔ
ㅁㅣㄴㅈㅜㄴㅇㅣ
ㅁㅝ
ㅎㅏㄹㄲㅏㅇㅛ
ㅇㅡㅇ


In [43]:

input = "닦다"
jamo_sentence = j2hcj(h2j(input)).split(" ")
for eachWordJamo in jamo_sentence:
  #자음 중화
  eachWordJamo = re.sub("ㄲ(?![ㅇ])","ㄱ",eachWordJamo)
  #경음화
  eachWordJamo = re.sub("(?<=[ㄱ])ㄷ","ㄸ",eachWordJamo)
  print(eachWordJamo)

  merge_jamo = join_jamos(eachWordJamo) 
  print(merge_jamo)
  


ㄷㅏㄱㄸㅏ
닥따


# 품사 단위

In [46]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 404 kB/s 
[?25hCollecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 95.2 MB/s 
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.4 MB/s 
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2


In [56]:
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()


for sentence in Sentences:
  kkmaDict = dict(kkma.pos(sentence))
  # print(kkmaDict)

  for key, value in kkmaDict.items():
    if "J" in value:
      print(key, value)
    else:
      pass
  

이 JKS
야 JX
가 JKS
도 JX
이 JKS
야 JX
가 JKS
가 JKS
이 JKS
야 JX
이 JKS
이 JKS
