# 특강: 음운론적 발음 변환 규칙을 사용한 발음변환기

## 작업에 필요한 패키지 및 함수들

In [1]:
!pip install hgtk

Collecting hgtk
  Downloading hgtk-0.2.1-py2.py3-none-any.whl (12 kB)
Installing collected packages: hgtk
Successfully installed hgtk-0.2.1


## 경로 연결하고 데이터 불러오기


In [44]:
# Mount Google Drive to this Notebook instance.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1.언어적 요소 추출하기 - 한국어

## 데이터 불러오기
- CHILDES: https://sla.talkbank.org/TBB/childes/EastAsian/Korean/Ryu/Jong/010328.cha

In [3]:
import pandas as pd

fileDir = "drive/My Drive/SL/v1/data/KOR_CDS_orthographic.txt"

fr = open(fileDir, 'r')
contents = fr.readlines()
fr.close()

Sentences = []
for content in contents:
    Sentences.append(content.replace("\n",""))

print(Sentences)

['이게 뭐야 이게 뭐야 민준아 응', '뭐 또 뭐가 있어 뭐가', '민준이 뭐 줄까요', '뛰뛰빵빵도 있는데', '츄츄츄츄츄츄츄츄', '츄츄츄츄츄츄츄츄츄츄츄', '이게 뭐야', '기린 기린 아이고', '뺐어 민준이가 뺐어', '민준이 엄마가 빠방 갖다 줄까 빠방', '잡아 이게 뭐야', '타 볼까 이렇게 앉아 이렇게 그렇지', '잡고 그렇지 뿡뿡 뿡뿡', '신기한게 많아', '민준이 신기한게 많아', '뭐 이제 뭐 하고 싶은데 아니야 이거', '이제 민준이 뭐 할까요 응']


## 공백을 기준으로 어절(단어) 단위 분석

In [4]:
# 단어의 타입

wordType = set()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    wordType.add(word)

print(wordType)

{'많아', '아니야', '이거', '뺐어', '뭐가', '빠방', '할까요', '잡아', '츄츄츄츄츄츄츄츄츄츄츄', '신기한게', '있어', '아이고', '그렇지', '응', '츄츄츄츄츄츄츄츄', '엄마가', '기린', '볼까', '앉아', '싶은데', '이게', '있는데', '하고', '갖다', '이렇게', '이제', '뛰뛰빵빵도', '줄까', '뭐야', '민준이가', '뭐', '타', '뿡뿡', '또', '줄까요', '민준이', '민준아', '잡고'}


In [5]:
# 단어의 토큰

wordToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    if wordToken.get(word) == None:
      wordToken[word] = 1
    else:
      wordToken[word] = wordToken[word] + 1

print(wordToken)

{'이게': 4, '뭐야': 4, '민준아': 1, '응': 2, '뭐': 5, '또': 1, '뭐가': 2, '있어': 1, '민준이': 4, '줄까요': 1, '뛰뛰빵빵도': 1, '있는데': 1, '츄츄츄츄츄츄츄츄': 1, '츄츄츄츄츄츄츄츄츄츄츄': 1, '기린': 2, '아이고': 1, '뺐어': 2, '민준이가': 1, '엄마가': 1, '빠방': 2, '갖다': 1, '줄까': 1, '잡아': 1, '타': 1, '볼까': 1, '이렇게': 2, '앉아': 1, '그렇지': 2, '잡고': 1, '뿡뿡': 2, '신기한게': 2, '많아': 2, '이제': 2, '하고': 1, '싶은데': 1, '아니야': 1, '이거': 1, '할까요': 1}


In [6]:
# 단어의 토큰 + 타입

wordType = set()
wordToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    wordType.add(word)
    if wordToken.get(word) == None:
      wordToken[word] = 1
    else:
      wordToken[word] = wordToken[word] + 1

print(wordToken)
print(wordType)

{'이게': 4, '뭐야': 4, '민준아': 1, '응': 2, '뭐': 5, '또': 1, '뭐가': 2, '있어': 1, '민준이': 4, '줄까요': 1, '뛰뛰빵빵도': 1, '있는데': 1, '츄츄츄츄츄츄츄츄': 1, '츄츄츄츄츄츄츄츄츄츄츄': 1, '기린': 2, '아이고': 1, '뺐어': 2, '민준이가': 1, '엄마가': 1, '빠방': 2, '갖다': 1, '줄까': 1, '잡아': 1, '타': 1, '볼까': 1, '이렇게': 2, '앉아': 1, '그렇지': 2, '잡고': 1, '뿡뿡': 2, '신기한게': 2, '많아': 2, '이제': 2, '하고': 1, '싶은데': 1, '아니야': 1, '이거': 1, '할까요': 1}
{'많아', '아니야', '이거', '뺐어', '뭐가', '빠방', '할까요', '잡아', '츄츄츄츄츄츄츄츄츄츄츄', '신기한게', '있어', '아이고', '그렇지', '응', '츄츄츄츄츄츄츄츄', '엄마가', '기린', '볼까', '앉아', '싶은데', '이게', '있는데', '하고', '갖다', '이렇게', '이제', '뛰뛰빵빵도', '줄까', '뭐야', '민준이가', '뭐', '타', '뿡뿡', '또', '줄까요', '민준이', '민준아', '잡고'}


In [7]:
# 단어의 토큰 + 타입 + 정렬

wordType = set()
wordToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    wordType.add(word)
    if wordToken.get(word) == None:
      wordToken[word] = 1
    else:
      wordToken[word] = wordToken[word] + 1

print(wordToken)
print(wordType)

wordDicSortedRT = dict(sorted(wordToken.items(), key=lambda x: x[1], reverse=True))

print(wordDicSortedRT)


{'이게': 4, '뭐야': 4, '민준아': 1, '응': 2, '뭐': 5, '또': 1, '뭐가': 2, '있어': 1, '민준이': 4, '줄까요': 1, '뛰뛰빵빵도': 1, '있는데': 1, '츄츄츄츄츄츄츄츄': 1, '츄츄츄츄츄츄츄츄츄츄츄': 1, '기린': 2, '아이고': 1, '뺐어': 2, '민준이가': 1, '엄마가': 1, '빠방': 2, '갖다': 1, '줄까': 1, '잡아': 1, '타': 1, '볼까': 1, '이렇게': 2, '앉아': 1, '그렇지': 2, '잡고': 1, '뿡뿡': 2, '신기한게': 2, '많아': 2, '이제': 2, '하고': 1, '싶은데': 1, '아니야': 1, '이거': 1, '할까요': 1}
{'많아', '아니야', '이거', '뺐어', '뭐가', '빠방', '할까요', '잡아', '츄츄츄츄츄츄츄츄츄츄츄', '신기한게', '있어', '아이고', '그렇지', '응', '츄츄츄츄츄츄츄츄', '엄마가', '기린', '볼까', '앉아', '싶은데', '이게', '있는데', '하고', '갖다', '이렇게', '이제', '뛰뛰빵빵도', '줄까', '뭐야', '민준이가', '뭐', '타', '뿡뿡', '또', '줄까요', '민준이', '민준아', '잡고'}
{'뭐': 5, '이게': 4, '뭐야': 4, '민준이': 4, '응': 2, '뭐가': 2, '기린': 2, '뺐어': 2, '빠방': 2, '이렇게': 2, '그렇지': 2, '뿡뿡': 2, '신기한게': 2, '많아': 2, '이제': 2, '민준아': 1, '또': 1, '있어': 1, '줄까요': 1, '뛰뛰빵빵도': 1, '있는데': 1, '츄츄츄츄츄츄츄츄': 1, '츄츄츄츄츄츄츄츄츄츄츄': 1, '아이고': 1, '민준이가': 1, '엄마가': 1, '갖다': 1, '줄까': 1, '잡아': 1, '타': 1, '볼까': 1, '앉아': 1, '잡고': 1, '하고': 1, '싶은데': 1, '아니야': 1, '이거': 1, '할까요': 1

### 분석 대상 어절 결과 확인하기

In [8]:
def searchAuto (dictInput, UserDict):
  wantedWords = dict()
  for key, value in dictInput.items():
    if key in UserDict:
      if wantedWords.get(key) == None:
        wantedWords[key] = value
  return wantedWords

UserDict = ["뭐","더","타","또","이게"]
print(searchAuto(wordToken, UserDict))

{'이게': 4, '뭐': 5, '또': 1, '타': 1}


## 음절 단위 분석

In [9]:
# 단어의 토큰 + 타입 + 정렬

syllableType = set()
syllableToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    for syllable in word:
      syllableType.add(syllable)
      if syllableToken.get(syllable) == None:
        syllableToken[syllable] = 1
      else:
        syllableToken[syllable] = syllableToken[syllable] + 1

print(syllableToken)
print(syllableType)

syllableDicSortedRT = dict(sorted(syllableToken.items(), key=lambda x: x[1], reverse=True))

print(syllableDicSortedRT)

{'이': 15, '게': 8, '뭐': 11, '야': 5, '민': 6, '준': 6, '아': 7, '응': 2, '또': 1, '가': 4, '있': 2, '어': 3, '줄': 2, '까': 4, '요': 2, '뛰': 2, '빵': 2, '도': 1, '는': 1, '데': 2, '츄': 19, '기': 4, '린': 2, '고': 3, '뺐': 2, '엄': 1, '마': 1, '빠': 2, '방': 2, '갖': 1, '다': 1, '잡': 2, '타': 1, '볼': 1, '렇': 4, '앉': 1, '그': 2, '지': 2, '뿡': 4, '신': 2, '한': 2, '많': 2, '제': 2, '하': 1, '싶': 1, '은': 1, '니': 1, '거': 1, '할': 1}
{'게', '요', '데', '다', '츄', '제', '야', '뛰', '하', '싶', '민', '가', '니', '많', '기', '방', '그', '거', '할', '갖', '고', '한', '린', '엄', '뺐', '응', '렇', '줄', '빠', '있', '도', '빵', '잡', '준', '은', '지', '앉', '볼', '까', '뿡', '아', '뭐', '마', '이', '타', '또', '는', '신', '어'}
{'츄': 19, '이': 15, '뭐': 11, '게': 8, '아': 7, '민': 6, '준': 6, '야': 5, '가': 4, '까': 4, '기': 4, '렇': 4, '뿡': 4, '어': 3, '고': 3, '응': 2, '있': 2, '줄': 2, '요': 2, '뛰': 2, '빵': 2, '데': 2, '린': 2, '뺐': 2, '빠': 2, '방': 2, '잡': 2, '그': 2, '지': 2, '신': 2, '한': 2, '많': 2, '제': 2, '또': 1, '도': 1, '는': 1, '엄': 1, '마': 1, '갖': 1, '다': 1, '타': 1, '볼': 1, '앉': 1, '하': 1, '싶

## 분절음 단위 분석
- https://github.com/bluedisk/hangul-toolkit?tab=readme-ov-file

In [10]:
import hgtk

jamoType = set()
jamoToken = dict()

for sentence in Sentences:
    jamo_sentence = list(hgtk.text.decompose(sentence).replace("ᴥ",""))
    for eachJamo in jamo_sentence:
        jamoType.add(eachJamo)
        if jamoToken.get(eachJamo) == None:
            jamoToken[eachJamo] = 1
        else:
            jamoToken[eachJamo] = jamoToken[eachJamo] + 1

#공백 삭제
del jamoToken[' ']

print(jamoType)
print(jamoToken)

jamoDicSortedRT = dict(sorted(jamoToken.items(), key=lambda x: x[1], reverse=True))

print(jamoDicSortedRT)


{'ㅔ', 'ㅃ', 'ㅅ', 'ㄱ', 'ㅛ', 'ㅗ', 'ㅑ', 'ㄶ', 'ㄴ', 'ㅌ', ' ', 'ㄹ', 'ㅜ', 'ㅎ', 'ㅡ', 'ㅐ', 'ㅣ', 'ㅈ', 'ㄲ', 'ㅠ', 'ㄸ', 'ㅂ', 'ㅏ', 'ㄷ', 'ㅆ', 'ㅊ', 'ㅍ', 'ㅓ', 'ㅁ', 'ㅇ', 'ㅟ', 'ㄵ', 'ㅝ'}
{'ㅇ': 49, 'ㅣ': 35, 'ㄱ': 23, 'ㅔ': 12, 'ㅁ': 21, 'ㅝ': 11, 'ㅑ': 5, 'ㄴ': 22, 'ㅈ': 15, 'ㅜ': 12, 'ㅏ': 34, 'ㅡ': 6, 'ㄸ': 3, 'ㅗ': 6, 'ㅆ': 4, 'ㅓ': 9, 'ㄹ': 10, 'ㄲ': 4, 'ㅛ': 2, 'ㅟ': 2, 'ㅃ': 10, 'ㄷ': 4, 'ㅊ': 19, 'ㅠ': 19, 'ㅐ': 2, 'ㅂ': 5, 'ㅌ': 1, 'ㅎ': 8, 'ㄵ': 1, 'ㅅ': 3, 'ㄶ': 2, 'ㅍ': 1}
{'ㅇ': 49, 'ㅣ': 35, 'ㅏ': 34, 'ㄱ': 23, 'ㄴ': 22, 'ㅁ': 21, 'ㅊ': 19, 'ㅠ': 19, 'ㅈ': 15, 'ㅔ': 12, 'ㅜ': 12, 'ㅝ': 11, 'ㄹ': 10, 'ㅃ': 10, 'ㅓ': 9, 'ㅎ': 8, 'ㅡ': 6, 'ㅗ': 6, 'ㅑ': 5, 'ㅂ': 5, 'ㅆ': 4, 'ㄲ': 4, 'ㄷ': 4, 'ㄸ': 3, 'ㅅ': 3, 'ㅛ': 2, 'ㅟ': 2, 'ㅐ': 2, 'ㄶ': 2, 'ㅌ': 1, 'ㄵ': 1, 'ㅍ': 1}


## 형태소(품사) 단위 분석
-kiwi: https://github.com/bab2min/kiwipiepy


### 형태소 분석기 설치하기

In [11]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [12]:
!pip install kiwipiepy

Collecting kiwipiepy
  Downloading kiwipiepy-0.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting kiwipiepy-model<0.18,>=0.17 (from kiwipiepy)
  Downloading kiwipiepy_model-0.17.0.tar.gz (34.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.7/34.7 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kiwipiepy-model
  Building wheel for kiwipiepy-model (setup.py) ... [?25l[?25hdone
  Created wheel for kiwipiepy-model: filename=kiwipiepy_model-0.17.0-py3-none-any.whl size=34845033 sha256=04162b63fa75c3380437cc4001179b3a77861fad4d31771d222fae4795cab098
  Stored in directory: /root/.cache/pip/wheels/90/bb/ae/15e1cf26d17183040bac268fbf8d1ca1f45f7f4daa0fb76b91
Successfully built kiwipiepy-model
Installing collected packa

### 형태소 분석기 기능 알아보기

#### 일반적인 형태소 분석

In [13]:
from kiwipiepy import Kiwi

# Kiwi 사용을 위한 인스턴스 선언
kiwi = Kiwi()
print(kiwi.tokenize("친구가 파이썬을 재밌게 하고 있다."))

[Token(form='친구', tag='NNG', start=0, len=2), Token(form='가', tag='JKS', start=2, len=1), Token(form='파이썬', tag='NNP', start=4, len=3), Token(form='을', tag='JKO', start=7, len=1), Token(form='재밌', tag='VA', start=9, len=2), Token(form='게', tag='EC', start=11, len=1), Token(form='하', tag='VX', start=13, len=1), Token(form='고', tag='EC', start=14, len=1), Token(form='있', tag='VX', start=16, len=1), Token(form='다', tag='EF', start=17, len=1), Token(form='.', tag='SF', start=18, len=1)]


#### 불용어 관리를 위한 Stopwords 클래스

In [14]:
from kiwipiepy.utils import Stopwords
stopwords = Stopwords()
kiwi.tokenize("분석 결과에서 불용어만 제외하고 출력할 수도 있다.", stopwords=stopwords)

[Token(form='분석', tag='NNG', start=0, len=2),
 Token(form='결과', tag='NNG', start=3, len=2),
 Token(form='불', tag='NNG', start=8, len=1),
 Token(form='용어', tag='NNG', start=9, len=2),
 Token(form='제외', tag='NNG', start=13, len=2),
 Token(form='출력', tag='NNG', start=18, len=2),
 Token(form='있', tag='VA', start=25, len=1)]

In [15]:
# add, remove 메소드를 이용해 불용어 목록에 단어를 추가하거나 삭제할 수도 있다.
stopwords.add(('결과', 'NNG'))
kiwi.tokenize("분석 결과에서 불용어만 제외하고 출력할 수도 있다.", stopwords=stopwords)

[Token(form='분석', tag='NNG', start=0, len=2),
 Token(form='불', tag='NNG', start=8, len=1),
 Token(form='용어', tag='NNG', start=9, len=2),
 Token(form='제외', tag='NNG', start=13, len=2),
 Token(form='출력', tag='NNG', start=18, len=2),
 Token(form='있', tag='VA', start=25, len=1)]

In [16]:
stopwords.remove(('결과', 'NNG'))
kiwi.tokenize("분석 결과에서 불용어만 제외하고 출력할 수도 있다.", stopwords=stopwords)

[Token(form='분석', tag='NNG', start=0, len=2),
 Token(form='결과', tag='NNG', start=3, len=2),
 Token(form='불', tag='NNG', start=8, len=1),
 Token(form='용어', tag='NNG', start=9, len=2),
 Token(form='제외', tag='NNG', start=13, len=2),
 Token(form='출력', tag='NNG', start=18, len=2),
 Token(form='있', tag='VA', start=25, len=1)]

#### 사전에 새로운 단어를 추가

In [17]:
print(kiwi.tokenize("문척척이 누구야"))
kiwi.add_user_word("문척척", "NNP") #고유명사
print(kiwi.tokenize("문척척이 누구야"))

[Token(form='문', tag='NNG', start=0, len=1), Token(form='척척', tag='MAG', start=1, len=2), Token(form='이', tag='MM', start=3, len=1), Token(form='누구', tag='NP', start=5, len=2), Token(form='이', tag='VCP', start=7, len=0), Token(form='야', tag='EF', start=7, len=1)]
[Token(form='문척척', tag='NNP', start=0, len=3), Token(form='이', tag='JKS', start=3, len=1), Token(form='누구', tag='NP', start=5, len=2), Token(form='이', tag='VCP', start=7, len=0), Token(form='야', tag='EF', start=7, len=1)]


#### 형태소 분석 규칙 추가하기

In [18]:
kiwi.tokenize('걔네 둘이 사겼대')

[Token(form='걔', tag='NP', start=0, len=1),
 Token(form='네', tag='XSN', start=1, len=1),
 Token(form='둘', tag='NR', start=3, len=1),
 Token(form='이', tag='JKS', start=4, len=1),
 Token(form='사', tag='NR', start=6, len=1),
 Token(form='기', tag='VV', start=7, len=1),
 Token(form='었', tag='EP', start=7, len=1),
 Token(form='대', tag='EF', start=8, len=1)]

In [19]:
kiwi.add_pre_analyzed_word('사겼대', [('사귀', 'VV', 0, 2), ('었', 'EP', 1, 2), ('대', 'EF', 2, 3)], -3)
kiwi.tokenize('걔네 둘이 사겼대')

[Token(form='걔', tag='NP', start=0, len=1),
 Token(form='네', tag='XSN', start=1, len=1),
 Token(form='둘', tag='NR', start=3, len=1),
 Token(form='이', tag='JKS', start=4, len=1),
 Token(form='사귀', tag='VV', start=6, len=2),
 Token(form='었', tag='EP', start=7, len=1),
 Token(form='대', tag='EF', start=8, len=1)]

#### 띄어쓰기 교정하기

In [20]:
kiwi.space('친구가파이썬이유용하다고한다.')

'친구가 파이썬이 유용하다고 한다.'

#### 문장 단위로 분할하기

In [21]:
inputText = '친구가 파이썬이 유용하다고 한다. 그래서 나도 배워보려 한다.'

# 문장 단위로 분할 한다.
sentences = kiwi.split_into_sents(inputText)
for sentence in sentences:
  print(sentence.text)

친구가 파이썬이 유용하다고 한다.
그래서 나도 배워보려 한다.


### 형태소 분석기(i.e., kiwi)를 사용하여 형태소 단위로 분석하기

In [22]:
posType = set()
posToken = dict()

for sentence in Sentences:
  posTagout = kiwi.tokenize(sentence)
  for eachOut in posTagout:
    pos = eachOut[0]+"/"+eachOut[1]
    posType.add(pos)
    if posToken.get(pos) == None:
      posToken[pos] = 1
    else:
      posToken[pos] = posToken[pos] + 1

print(posToken)
print(posType)

posDicSortedRT = dict(sorted(posToken.items(), key=lambda x: x[1], reverse=True))

print(posDicSortedRT)


{'이것/NP': 4, '이/JKS': 6, '뭐/NP': 8, '이/VCP': 5, '야/EF': 4, '야/EC': 1, '민준/NNP': 6, '아/JKV': 1, '응/IC': 2, '뭐/IC': 3, '또/MAG': 1, '가/JKS': 4, '있/VA': 1, '어/EF': 5, '이/XSN': 5, '주/VV': 2, 'ᆯ까요/EF': 2, '뛰뛰빵/NNG': 1, '빵/NNG': 1, '도/JX': 1, '있/VV': 1, '는데/EC': 1, '츄츄츄츄츄츄츄츄/NNG': 1, '츄츄츄츄츄츄츄츄츄츄츄/NNG': 1, '기린/NNG': 1, '기리/VV': 1, 'ᆫ/ETM': 3, '아이/NNG': 1, '고/EC': 3, '빼/VV': 2, '었/EP': 2, '어/EC': 3, '엄마/NNG': 1, '빠방/NNG': 2, '갖/VV': 1, '다/EC': 1, 'ᆯ까/EF': 2, '잡/VV-R': 2, '타/VV': 1, '보/VX': 1, '이렇게/MAG': 1, '앉/VV': 1, '이렇/VA-I': 1, '게/EC': 1, '그렇/VA-I': 1, '지/EF': 1, '그렇지/IC': 1, '뿡뿡/MAG': 2, '신기하/VA': 1, '것/NNB': 2, '많/VA': 2, '신기/XR': 1, '하/XSA': 1, '이제/MAG': 1, '하/VV': 2, '싶/VX': 1, '은데/EC': 1, '아니/VCN': 1, '이거/NP': 1, '이제/IC': 1}
{'잡/VV-R', '뛰뛰빵/NNG', '는데/EC', '뿡뿡/MAG', '뭐/NP', '것/NNB', '이렇/VA-I', '그렇지/IC', '이/XSN', '은데/EC', '어/EF', '타/VV', '어/EC', '민준/NNP', '츄츄츄츄츄츄츄츄츄츄츄/NNG', '갖/VV', '게/EC', '야/EF', '었/EP', '그렇/VA-I', 'ᆯ까/EF', '또/MAG', '아/JKV', '하/XSA', '기리/VV', '주/VV', '뭐/IC', 'ᆫ/ETM', '이제

# 2.언어적 요소 추출하기 - 영어

## 데이터 불러오기
- CHILDES: https://sla.talkbank.org/TBB/childes/EastAsian/Korean/Ryu/Jong/010328.cha

In [23]:
import pandas as pd

fileDir = "drive/My Drive/SL/v1/data/ENG_CDS_orthographic.txt"

fr = open(fileDir, 'r')
contents = fr.readlines()
fr.close()

Sentences = []
for content in contents:
    Sentences.append(content.replace("\n",""))

print(Sentences)

['put him in the highchair for breakfast', 'there', 'that one', 'yeah', 'can you say Morag', 'Morag', 'shall we stick her in the highchair', 'yeah', 'right', 'what shall we give Morag for breakfast', 'pie', 'pie', 'what type of pie', 'what are your favorite cereals', 'what what do you like best', 'Coco', 'Coco', 'yeah', 'shall we do some Coco Pops then', 'yeah', 'there we go', 'do you wanna put the milk in as well', 'there', 'milk', 'milk', 'yeah', 'going to pour it in there', 'good girl', 'right', 'going to take take it round there', 'what else do you need', 'what are you going to feed her with', 'do you want a spoon']


## 공백을 기준으로 어절(단어) 단위 분석

In [24]:
# 단어의 토큰 + 타입 + 정렬

wordType = set()
wordToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    wordType.add(word)
    if wordToken.get(word) == None:
      wordToken[word] = 1
    else:
      wordToken[word] = wordToken[word] + 1

print(wordToken)
print(wordType)

wordDicSortedRT = dict(sorted(wordToken.items(), key=lambda x: x[1], reverse=True))

print(wordDicSortedRT)

{'put': 2, 'him': 1, 'in': 4, 'the': 3, 'highchair': 2, 'for': 2, 'breakfast': 2, 'there': 5, 'that': 1, 'one': 1, 'yeah': 5, 'can': 1, 'you': 6, 'say': 1, 'Morag': 3, 'shall': 3, 'we': 4, 'stick': 1, 'her': 2, 'right': 2, 'what': 7, 'give': 1, 'pie': 3, 'type': 1, 'of': 1, 'are': 2, 'your': 1, 'favorite': 1, 'cereals': 1, 'do': 5, 'like': 1, 'best': 1, 'Coco': 3, 'some': 1, 'Pops': 1, 'then': 1, 'go': 1, 'wanna': 1, 'milk': 3, 'as': 1, 'well': 1, 'going': 3, 'to': 3, 'pour': 1, 'it': 2, 'good': 1, 'girl': 1, 'take': 2, 'round': 1, 'else': 1, 'need': 1, 'feed': 1, 'with': 1, 'want': 1, 'a': 1, 'spoon': 1}
{'one', 'going', 'with', 'Pops', 'the', 'pie', 'spoon', 'that', 'shall', 'of', 'to', 'milk', 'cereals', 'need', 'stick', 'in', 'pour', 'it', 'highchair', 'what', 'some', 'want', 'then', 'for', 'go', 'Coco', 'can', 'breakfast', 'your', 'type', 'as', 'well', 'else', 'we', 'yeah', 'are', 'a', 'there', 'put', 'say', 'right', 'Morag', 'wanna', 'feed', 'give', 'like', 'girl', 'round', 'him'

### 분석 대상 어절 결과 확인하기

In [25]:
def searchAuto (dictInput, UserDict):
  wantedWords = dict()
  for key, value in dictInput.items():
    if key in UserDict:
      if wantedWords.get(key) == None:
        wantedWords[key] = value
  return wantedWords

UserDict = ["what","it","you"]
print(searchAuto(wordToken, UserDict))

{'you': 6, 'what': 7, 'it': 2}


## 분절음 단위 분석

In [26]:
# 단어의 토큰 + 타입 + 정렬

segmentType = set()
segmentToken = dict()

for sentence in Sentences:
  words = sentence.split(" ")
  for word in words:
    for segment in word:
      segmentType.add(segment)
      if segmentToken.get(segment) == None:
        segmentToken[segment] = 1
      else:
        segmentToken[segment] = segmentToken[segment] + 1

print(segmentToken)
print(segmentType)

segmentDicSortedRT = dict(sorted(segmentToken.items(), key=lambda x: x[1], reverse=True))

print(segmentDicSortedRT)

{'p': 9, 'u': 11, 't': 37, 'h': 37, 'i': 28, 'm': 5, 'n': 16, 'e': 50, 'g': 17, 'c': 8, 'a': 38, 'r': 26, 'f': 7, 'o': 41, 'b': 3, 'k': 9, 's': 14, 'y': 14, 'M': 3, 'l': 15, 'w': 15, 'v': 2, 'd': 9, 'C': 3, 'P': 1}
{'i', 'e', 'c', 'C', 'v', 'P', 'w', 'l', 'n', 'r', 'b', 'd', 'M', 'o', 'm', 't', 'u', 'a', 'g', 'y', 's', 'k', 'p', 'h', 'f'}
{'e': 50, 'o': 41, 'a': 38, 't': 37, 'h': 37, 'i': 28, 'r': 26, 'g': 17, 'n': 16, 'l': 15, 'w': 15, 's': 14, 'y': 14, 'u': 11, 'p': 9, 'k': 9, 'd': 9, 'c': 8, 'f': 7, 'm': 5, 'b': 3, 'M': 3, 'C': 3, 'v': 2, 'P': 1}


## 자연어처리 툴킷을 활용한 언어 분석
-nltk: https://www.nltk.org/

### nltk 설치하기

In [27]:
!pip install nltk



In [28]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### nltk 기능 알아보기

#### 문장 단위로 분할하기
- https://www.nytimes.com/2024/06/04/travel/paris-olympics-restaurants-shops-closures.html?searchResultPosition=2

In [29]:
# 문장 단위로 분할하기
from nltk.tokenize import sent_tokenize
news = "Anyone who’s visited Paris in late July and August knows the word “fermé.”. It adorns the darkened windows of chic indie boutiques and cozy bistros whose owners, along with other locals, have fled the city on their annual vacations."

print(sent_tokenize(news))

['Anyone who’s visited Paris in late July and August knows the word “fermé.”.', 'It adorns the darkened windows of chic indie boutiques and cozy bistros whose owners, along with other locals, have fled the city on their annual vacations.']


#### 단어 단위로 분할하기

In [30]:
# 단어 단위로 분할하기
from nltk.tokenize import word_tokenize
print(word_tokenize(news))

['Anyone', 'who', '’', 's', 'visited', 'Paris', 'in', 'late', 'July', 'and', 'August', 'knows', 'the', 'word', '“', 'fermé.', '”', '.', 'It', 'adorns', 'the', 'darkened', 'windows', 'of', 'chic', 'indie', 'boutiques', 'and', 'cozy', 'bistros', 'whose', 'owners', ',', 'along', 'with', 'other', 'locals', ',', 'have', 'fled', 'the', 'city', 'on', 'their', 'annual', 'vacations', '.']


#### 음절 단위로 분할하기

In [31]:
from nltk.tokenize import SyllableTokenizer
SSP = SyllableTokenizer()
print(SSP.tokenize('boutiques'))

['bou', 'ti', 'ques']


#### 문장을 음절 단위로 분할하기

In [32]:
from nltk.tokenize import SyllableTokenizer
SSP = SyllableTokenizer()

lead = "With millions of visitors expected in the city"

for token in word_tokenize(lead):
  print(SSP.tokenize(token))

['With']
['mil', 'lions']
['of']
['vi', 'si', 'tors']
['ex', 'pec', 'ted']
['in']
['the']
['ci', 'ty']


### nltk를 사용한 음절 단위 분석

In [33]:
# 단어의 토큰 + 타입 + 정렬

syllableType = set()
syllableToken = dict()

for sentence in Sentences:
  words = word_tokenize(sentence)
  for word in words:
    syllables = SSP.tokenize(word)
    for syllable in syllables:
      syllableType.add(syllable)
      if syllableToken.get(syllable) == None:
        syllableToken[syllable] = 1
      else:
        syllableToken[syllable] = syllableToken[syllable] + 1

print(syllableToken)
print(syllableType)

syllableDicSortedRT = dict(sorted(syllableToken.items(), key=lambda x: x[1], reverse=True))

print(syllableDicSortedRT)

{'put': 2, 'him': 1, 'in': 4, 'the': 8, 'highc': 2, 'hair': 2, 'for': 2, 'brea': 2, 'kfast': 2, 're': 7, 'that': 1, 'o': 1, 'ne': 1, 'ye': 5, 'ah': 5, 'can': 1, 'yo': 7, 'u': 6, 'say': 1, 'Mo': 3, 'rag': 3, 'shall': 3, 'we': 4, 'stick': 1, 'her': 2, 'right': 2, 'what': 7, 'gi': 1, 've': 1, 'pie': 3, 'ty': 1, 'pe': 1, 'of': 1, 'a': 3, 'ur': 1, 'fa': 1, 'vo': 1, 'ri': 1, 'te': 1, 'ce': 1, 'reals': 1, 'do': 5, 'li': 1, 'ke': 3, 'best': 1, 'Co': 3, 'co': 3, 'so': 1, 'me': 1, 'Pops': 1, 'then': 1, 'go': 1, 'wan': 1, 'na': 1, 'milk': 3, 'as': 1, 'well': 1, 'going': 3, 'to': 3, 'pour': 1, 'it': 2, 'good': 1, 'girl': 1, 'ta': 2, 'round': 1, 'el': 1, 'se': 1, 'need': 1, 'feed': 1, 'with': 1, 'want': 1, 'spoon': 1}
{'ke', 'fa', 'ce', 'hair', 'going', 'ye', 'with', 'Mo', 'Pops', 'the', 'pie', 'ur', 'that', 'ah', 'shall', 'of', 'Co', 'milk', 'to', 'ty', 'gi', 'need', 'stick', 'in', 'pour', 'it', 'what', 'want', 'rag', 'reals', 'so', 'then', 'for', 'o', 'el', 'kfast', 'go', 'yo', 'can', 'spoon', 'a

### nltk를 사용한 형태소 단위 분석

#### 품사 태깅하기 (형태소 분석)

In [34]:
from nltk.tag import pos_tag
lead = "With millions of visitors expected in the city"
tagged_list = pos_tag(word_tokenize(lead))
tagged_list

[('With', 'IN'),
 ('millions', 'NNS'),
 ('of', 'IN'),
 ('visitors', 'NNS'),
 ('expected', 'VBN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('city', 'NN')]

#### nltk의 형태소 분석 기능을 사용하여 형태소 단위로 분석하기

In [35]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

posType = set()
posToken = dict()

for sentence in Sentences:
  tagged_list = pos_tag(word_tokenize(sentence))
  posTagout = ["/".join(p) for p in tagged_list]
  for pos in posTagout:
    posType.add(pos)
    if posToken.get(pos) == None:
      posToken[pos] = 1
    else:
      posToken[pos] = posToken[pos] + 1

print(posToken)
print(posType)

posDicSortedRT = dict(sorted(posToken.items(), key=lambda x: x[1], reverse=True))

print(posDicSortedRT)

{'put/VBD': 1, 'him/PRP': 1, 'in/IN': 4, 'the/DT': 3, 'highchair/NN': 2, 'for/IN': 2, 'breakfast/NN': 2, 'there/RB': 3, 'that/DT': 1, 'one/CD': 1, 'yeah/NN': 5, 'can/MD': 1, 'you/PRP': 6, 'say/VB': 1, 'Morag/NNP': 2, 'Morag/NN': 1, 'shall/MD': 3, 'we/PRP': 4, 'stick/VB': 1, 'her/PRP': 2, 'right/NN': 2, 'what/WP': 6, 'give/VB': 1, 'pie/NN': 3, 'type/NN': 1, 'of/IN': 1, 'what/WDT': 1, 'are/VBP': 2, 'your/PRP$': 1, 'favorite/JJ': 1, 'cereals/NNS': 1, 'do/VBP': 3, 'like/IN': 1, 'best/JJS': 1, 'Coco/NN': 2, 'do/VB': 2, 'some/DT': 1, 'Coco/NNP': 1, 'Pops/NNP': 1, 'then/RB': 1, 'there/EX': 2, 'go/VBP': 1, 'wan/VB': 1, 'na/TO': 1, 'put/VB': 1, 'milk/NN': 3, 'as/RB': 1, 'well/RB': 1, 'going/VBG': 3, 'to/TO': 3, 'pour/VB': 1, 'it/PRP': 2, 'good/JJ': 1, 'girl/NN': 1, 'take/VB': 2, 'round/VB': 1, 'else/RB': 1, 'need/VB': 1, 'feed/VB': 1, 'with/IN': 1, 'want/VB': 1, 'a/DT': 1, 'spoon/NN': 1}
{'it/PRP', 'then/RB', 'Pops/NNP', 'pie/NN', 'need/VB', 'Coco/NNP', 'we/PRP', 'put/VB', 'like/IN', 'well/RB',

# 3.음운론적 발음 변환 규칙을 사용한 발음 변환 - 한국어
- 논문: https://seongmin-mun.github.io/MyWebsite/Seongmin/Resources/2.Publicated%20Papers/KCI/The%20Korean%20Society%20for%20Language%20and%20Information/A%20proposal%20to%20improve%20on%20existing%20Grapheme-to-Phoneme%20conversion%20models%20informed%20by%20linguistics/KCI_FI002922160.pdf

## 기저형(written) -> 표면형(spoken)

In [36]:
input = "닦다"
jamo_sentence = hgtk.text.decompose(input).split(" ")
print(jamo_sentence)

['ㄷㅏㄲᴥㄷㅏᴥ']


In [37]:
import re

input = ["닦다", "있다", "짚신"]

for eachWord in input:
  jamo_sentence = hgtk.text.decompose(eachWord).split(" ")
  print("\n" + eachWord + "\n")
  for eachWordJamo in jamo_sentence:
    #자음 중화(Coda neutralization): 음절말에서 실현될 수 없는 자음이 기저형에 나타나고 후행하는 분절음이 자음일 때, 해당 자음의 조음 위치에 따라 불파음으로 교체되는 현상
    #(1) /ㅋ, ㄲ/ → [ㄱ] / ____{C, #}: 동녘[동녁], 밖[박], "닦다[닥따]"
    #(2) /ㅈ, ㅅ, ㅆ, ㅊ/ → [ㄷ] / ____{C, #}: 낮[낟], 낫[낟], 있다[읻따], 낯[낟]
    #(3) /ㅍ/ → [ㅂ] / ____{C, #}: 짚[집], 짚신[집씬]
    print("1. 자음 중화 적용")
    eachWordJamo = re.sub("[ㅋㄲ]ᴥ(?![ㅇ])","ㄱᴥ",eachWordJamo)
    eachWordJamo = re.sub("[ㅈㅅㅆㅊ]ᴥ(?![ㅇ])","ㄷᴥ",eachWordJamo)
    eachWordJamo = re.sub("[ㅍ]ᴥ(?![ㅇ])","ㅂᴥ",eachWordJamo)
    print(eachWordJamo)

    merge_jamo = hgtk.text.compose(eachWordJamo)
    print(merge_jamo)

    #경음화(Tensing): 평 장애음(예: ‘ㄱ’, ‘ㄷ’, ‘ㅂ’, ‘ㅅ’, ‘ㅈ’)이 경음(예: ‘ㄲ’, ‘ㄸ’, ‘ㅃ’, ‘ㅆ’, ‘ㅉ’)으로 교체되는 현상
    #(1) /ㄱ, ㄷ, ㅂ, ㅅ, ㅈ/ → [ㄲ, ㄸ, ㅃ, ㅆ, ㅉ] / {ㄱ, ㄷ, ㅂ}____: 잡기[잡끼], 각도[각또], 답변[답뼌]
    print("\n2. 경음화 적용")
    eachWordJamo = re.sub("(?<=[ㄱㄷㅂ])ᴥㄱ","ᴥㄲ",eachWordJamo)
    eachWordJamo = re.sub("(?<=[ㄱㄷㅂ])ᴥㄷ","ᴥㄸ",eachWordJamo)
    eachWordJamo = re.sub("(?<=[ㄱㄷㅂ])ᴥㅂ","ᴥㅃ",eachWordJamo)
    eachWordJamo = re.sub("(?<=[ㄱㄷㅂ])ᴥㅅ","ᴥㅆ",eachWordJamo)
    eachWordJamo = re.sub("(?<=[ㄱㄷㅂ])ᴥㅈ","ᴥㅉ",eachWordJamo)
    print(eachWordJamo)

    merge_jamo = hgtk.text.compose(eachWordJamo)
    print(merge_jamo)




닦다

1. 자음 중화 적용
ㄷㅏㄱᴥㄷㅏᴥ
닥다

2. 경음화 적용
ㄷㅏㄱᴥㄸㅏᴥ
닥따

있다

1. 자음 중화 적용
ㅇㅣㄷᴥㄷㅏᴥ
읻다

2. 경음화 적용
ㅇㅣㄷᴥㄸㅏᴥ
읻따

짚신

1. 자음 중화 적용
ㅈㅣㅂᴥㅅㅣㄴᴥ
집신

2. 경음화 적용
ㅈㅣㅂᴥㅆㅣㄴᴥ
집씬


# 4.발음 변환 사전을 사용한 발음 변환 - 영어

## CMU 사전 불러오기
- CMUdict (the Carnegie Mellon Pronouncing Dictionary): https://github.com/cmusphinx/cmudict

In [38]:
import re

fileDir = "drive/My Drive/SL/v1/data/cmudict.dict"

fr = open(fileDir, 'r')
contents = fr.readlines()
fr.close()

CMUdict = dict()

for content in contents:
  contentSplit = content.replace("\n","").split(" ")
  value = ""
  if " # " in content:
    value = content.replace("\n","").replace(contentSplit[0]+" ","").split(" # ")[0]
  else:
    value = content.replace("\n","").replace(contentSplit[0]+" ","")

  value = re.sub("\d+","",value)

  CMUdict[contentSplit[0]] = value

In [39]:
num = 0
for key, value in CMUdict.items():
  if num < 200:
    print(key,"  ",value)
  num += 1

'bout    B AW T
'cause    K AH Z
'course    K AO R S
'cuse    K Y UW Z
'em    AH M
'frisco    F R IH S K OW
'gain    G EH N
'kay    K EY
'm    AH M
'n    AH N
'round    R AW N D
's    EH S
'til    T IH L
'tis    T IH Z
'twas    T W AH Z
a    AH
a(2)    EY
a's    EY Z
a.    EY
a.'s    EY Z
a.d.    EY D IY
a.m.    EY EH M
a.s    EY Z
aaa    T R IH P AH L EY
aaberg    AA B ER G
aachen    AA K AH N
aachener    AA K AH N ER
aaker    AA K ER
aalborg    AO L B AO R G
aalborg(2)    AA L B AO R G
aalburg    AE L B ER G
aalen    AE L AH N
aalen(2)    AA L AH N
aaliyah    AA L IY AA
aalseth    AA L S EH TH
aalsmeer    AA L S M IH R
aalto    AA L T OW
aamodt    AA M AH T
aancor    AA N K AO R
aardema    AA R D EH M AH
aardvark    AA R D V AA R K
aardvarks    AA R D V AA R K S
aargh    AA R G
aarhus    AA HH UW S
aaron    EH R AH N
aaron's    EH R AH N Z
aarons    EH R AH N Z
aaronson    EH R AH N S AH N
aaronson(2)    AA R AH N S AH N
aaronson's    EH R AH N S AH N Z
aaronson's(2)    AA R AH N S A

## IPA 사전 불러오기
- eng_arpabet_to_ipa(International Phonetic Alphabet (IPA)): https://github.com/roedoejet/g2p/tree/main/g2p/mappings/langs/eng

In [45]:
import json

fileDir = "drive/My Drive/SL/v1/data/eng_arpabet_to_ipa(International Phonetic Alphabet (IPA))_re.json"
fr = open(fileDir, 'r', encoding='utf-8')
jsonString = json.load(fr)

IPAdict = dict()

for each in jsonString:
  IPAdict[each['in']] = each['out']

print(IPAdict)

{'AA': 'ɒ', 'AE': 'æ', 'AH': 'ə', 'AO': 'ɔ', 'AW': 'aʊ', 'AY': 'aɪ', 'EH': 'e', 'ER': 'ɜ', 'EY': 'eɪ', 'IH': 'ɪ', 'IX': 'ɨ', 'IY': 'i', 'OW': 'o', 'OY': 'ɔɪ', 'UH': 'ʊ', 'UW': 'u', 'B': 'b', 'CH': 'ʧ', 'D': 'd', 'DH': 'ð', 'F': 'f', 'G': 'g', 'HH': 'h', 'JH': 'ʤ', 'K': 'k', 'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ', 'T': 't', 'TH': 'ɵ', 'V': 'v', 'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ'}


## 기저형(written) -> 표면형(spoken)

In [46]:
input = "put him in the chair for breakfast"
outcome = ""

words = input.split(" ")
for word in words:
  output = ""
  if CMUdict.get(word) != None:
    if " " in CMUdict.get(word):
      CMUdictSplit = CMUdict.get(word).split(" ")
      for eachCMU in CMUdictSplit:
        if IPAdict.get(eachCMU) != None:
          output = output + " " + IPAdict.get(eachCMU)
        else:
          output = output + " " + eachCMU
    else:
      if IPAdict.get(CMUdict.get(word)) != None:
        output = output + " " + IPAdict.get(CMUdict.get(word))
      else:
        output = output + " " + CMUdict.get(word)
  else:
    output = " OOV"
  outcome = outcome + re.sub(" ","",output.strip()) + " "

print(outcome.strip())

pʊt hɪm ɪn ðə ʧer fɔr brekfəst
