<a href="https://colab.research.google.com/github/project-ccap/project-ccap.github.io/blob/master/2023notebooks/2021_0413ccap_PNT_phonology_semantics_visuallization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# フィラデルフィア絵画命名検査の音韻，意味情報の視覚化
- filename: 2021_0413ccap_PNT_phonology_semantics_visuallization.ipynb
- author: 浅川伸一
- date: 2021-0414


In [None]:
import numpy as np
np.set_printoptions(suppress=False, formatter={'float': '{:6.3f}'.format})
import os

# Local cache of the PNT stimulus table; it is downloaded only when the file is not already present.
pnt_stim_fname = 'pnt_stim.csv'
if not os.path.exists(pnt_stim_fname):
    # Fetch the stimulus-name data for each Philadelphia Naming Test picture from an external repository
    !wget https://raw.githubusercontent.com/hanayik/Philadelphia-Naming-Test/master/assets/stim.csv -O pnt_stim.csv

# Parse the stimulus CSV into
#   PNT       : {item number (int, first column): {column name: field value}}
#   PNT_words : the 'PictureName' field of every data row, in file order
# The first line is the header. As before, at most 185 lines (header + 184 rows)
# are read, but a shorter file or a blank line no longer crashes with int('').
PNT = {}
PNT_words = []
x_keys = None
with open(pnt_stim_fname, 'r') as f:
    for i, line in enumerate(f):
        if i >= 185:
            break
        # Remove every space, then split on commas (plain split: quoted commas are not handled).
        x = line.strip().replace(' ', '').split(',')
        if i == 0:
            x_keys = x
            continue
        if x == ['']:
            # Blank line: nothing to parse.
            continue
        item_no = int(x[0])
        PNT[item_no] = dict(zip(x_keys, x))
        PNT_words.append(PNT[item_no]['PictureName'])

print(PNT_words)

# フィラデルフィア絵画命名検査の音韻情報を視覚化

In [None]:
# Load the Carnegie Mellon University pronouncing dictionary (CMUdict) to obtain each word's phonemes
import nltk

try:
    arpabet = nltk.corpus.cmudict.dict()
except LookupError:
    # The corpus is not available locally yet: download it, then load it again.
    nltk.download('cmudict')
    arpabet = nltk.corpus.cmudict.dict()


In [None]:
# Convert every PNT word to its phoneme transcription(s).
# PNT_phoneme maps word -> the CMUdict entry for that word (a list of pronunciations,
# each a list of ARPAbet symbols; later cells use entry [0]).
PNT_phoneme = {}
missing_words = []
for word in PNT_words:
    if word in arpabet:
        PNT_phoneme[word] = arpabet[word]
    elif word.lower() in arpabet:
        # CMUdict keys are lower-case; keep the original spelling as the key so that
        # later PNT_phoneme[word] look-ups still work.
        PNT_phoneme[word] = arpabet[word.lower()]
    else:
        missing_words.append(word)

if missing_words:
    # Later cells index PNT_phoneme[word] for every PNT word, so report the gap here
    # instead of letting it surface as an unexplained KeyError.
    print(f'WARNING: no CMUdict pronunciation found for: {missing_words}')
#PNT_phoneme

In [None]:
import re
try:
    import Levenshtein
except ImportError:
    !pip install Levenshtein
    import Levenshtein

# The CMU (Carnegie Mellon University) pronouncing dictionary uses the 39 phonemes listed below.
# '<EOW>' at index 0 is an additional end-of-word marker (not a CMU phoneme), so the list has
# 40 entries; index 0 is also the value left in the zero-initialised cells of pnt_phon_mat.
arpabet_phoneme=['<EOW>', 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH',
                 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K',
                 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH',
                 'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH']

# Preparation for computing phonological distances between the words.
# PPP[i, j] counts how often phoneme j occurs in word i; max_len is the length of the
# longest pronunciation. Only the first pronunciation listed for a word is used.
PPP = np.zeros((len(PNT), len(arpabet_phoneme)), dtype=int)
max_len = 0
for row, word in enumerate(PNT_words):
    pron = PNT_phoneme[word][0]
    max_len = max(max_len, len(pron))
    for symbol in pron:
        # Strip the stress digits (0/1/2) from the symbol before looking up its index.
        col = arpabet_phoneme.index(re.sub('[012]','', symbol))
        PPP[row, col] += 1

# pnt_phon_mat[i] holds the phoneme indices of word i in order; cells past the end of
# the word keep their initial value 0.
pnt_phon_mat = np.zeros((len(PNT), max_len),dtype=int)
for row, word in enumerate(PNT_words):
    for pos, symbol in enumerate(PNT_phoneme[word][0]):
        pnt_phon_mat[row, pos] = arpabet_phoneme.index(re.sub('[012]','', symbol))

# Print one line per word: running number, word, phoneme indices, phoneme symbols, e.g.
#   001 word:[20 2 31 ], [K AE T ]
# pnt_phon_mat has exactly max_len columns, so the longest word(s) have no trailing 0.
# The codes are therefore collected up to the first 0 *or* the end of the row, which
# guarantees that every line gets its separator, closing bracket and newline.
for i, (word, v) in enumerate(zip(PNT_words, pnt_phon_mat), start=1):
    codes = []
    for _v in v:
        if _v == 0:  # 0 == '<EOW>': end of this word
            break
        codes.append(int(_v))
    ids_txt = ''.join(f'{c} ' for c in codes)
    phonemes_txt = ''.join(f'{arpabet_phoneme[c]} ' for c in codes)
    print(f'{i:03d} {word}:[{ids_txt}], [{phonemes_txt}]')

## 以下にカーネギーメロン大学の単語音韻の情報を貼り付けます
```
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
# Copyright (C) 2001-2021 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
ftp://ftp.cs.cmu.edu/project/speech/dict/
Copyright 1998 Carnegie Mellon University

File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription.  Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries.  Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations.  Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

Phoneme Example Translation    Phoneme Example Translation
------- ------- -----------    ------- ------- -----------
AA      odd     AA D           AE      at      AE T
AH      hut     HH AH T        AO      ought   AO T
AW      cow     K AW           AY      hide    HH AY D
B       be      B IY           CH      cheese  CH IY Z
D       dee     D IY           DH      thee    DH IY
EH      Ed      EH D           ER      hurt    HH ER T
EY      ate     EY T           F       fee     F IY
G       green   G R IY N       HH      he      HH IY
IH      it      IH T           IY      eat     IY T
JH      gee     JH IY          K       key     K IY
L       lee     L IY           M       me      M IY
N       knee    N IY           NG      ping    P IH NG
OW      oat     OW T           OY      toy     T OY
P       pee     P IY           R       read    R IY D
S       sea     S IY           SH      she     SH IY
T       tea     T IY           TH      theta   TH EY T AH
UH      hood    HH UH D        UW      two     T UW
V       vee     V IY           W       we      W IY
Y       yield   Y IY L D       Z       zee     Z IY
ZH      seizure S IY ZH ER
```

In [None]:
#視覚化のためのライブラリを読み込む
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

try:
    import japanize_matplotlib
except ImportError:
    !pip install japanize_matplotlib
    import japanize_matplotlib

In [None]:
# Levenshtein distance is defined between character strings, so the phoneme sequence
# of each word (taken from the CMU pronouncing dictionary) is first turned into a
# pseudo-string: every phoneme index is mapped to one arbitrary, meaningless katakana
# character.
ja_str = 'アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヰヱヲン'

# Encode every word once up front (previously the second word of each pair was
# re-encoded inside the inner loop). Stress digits 0/1/2 are stripped first.
pnt_pseudo_strings = [
    ''.join(ja_str[arpabet_phoneme.index(re.sub('[012]', '', p))]
            for p in PNT_phoneme[w][0])
    for w in PNT_words
]

# Pairwise Levenshtein distances between the pseudo-strings.
# NOTE: despite its name, pnt_phon_Sim holds distances (0 on the diagonal), not similarities.
pnt_phon_Sim = np.zeros((len(PNT), len(PNT)))
for i, jw1 in enumerate(pnt_pseudo_strings):
    for j, jw2 in enumerate(pnt_pseudo_strings):
        pnt_phon_Sim[i,j] = Levenshtein.distance(jw1,jw2)

# Scale to [0, 1]; skip the division when every distance is 0 to avoid NaNs.
max_dist = pnt_phon_Sim.max()
if max_dist > 0:
    pnt_phon_Sim /= max_dist
pnt_phon_Sim_df = pd.DataFrame(data=pnt_phon_Sim, index=PNT_words)

fig, ax = plt.subplots(figsize=(12,10))         # figure size in inches
sns.heatmap(pnt_phon_Sim_df, ax=ax)

In [None]:
# Visualise the phonological distances with principal component analysis (PCA)
def ax_scatter_gram(ax, pca1, pca2, wordlist, title=None,
                    marker_size=200, label_fontsize=14, title_fontsize=18):
    """Draw a labelled scatter plot of two coordinate vectors on ``ax``.

    Parameters
    ----------
    ax : object providing scatter/annotate/set_xlabel/set_ylabel/set_title
        (used here with a matplotlib Axes).
    pca1, pca2 : indexable sequences
        x and y coordinates; element ``i`` belongs to ``wordlist[i]``.
    wordlist : iterable of str
        Text label written next to each point.
    title : str or None
        Title passed to ``ax.set_title``.
    marker_size, label_fontsize, title_fontsize : numbers
        Marker area (``s``) and font sizes. The defaults reproduce the values that
        used to be hard-coded, so existing calls are unaffected.
    """
    ax.scatter(pca1, pca2, s=marker_size, color='cyan')
    for i, label in enumerate(wordlist):
        ax.annotate(label, (pca1[i], pca2[i]), fontsize=label_fontsize)
    ax.set_xlabel("第一主成分")
    ax.set_ylabel("第二主成分")
    ax.set_title(title, fontsize=title_fontsize)

def plot_pca(ax, R, wordlist, title=""):
    """Reduce ``R`` to two principal components and plot them on ``ax``.

    ``R`` is handed to ``PCA(n_components=2).fit_transform``. The explained
    variance ratio of the two components is printed, then the labelled scatter
    plot is drawn by ``ax_scatter_gram`` using ``wordlist`` as point labels.
    """
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(R)
    report = '\tExplained variation per principal component: {}'
    print(report.format(reducer.explained_variance_ratio_))
    ax_scatter_gram(ax, projected[:,0], projected[:,1], wordlist, title=title)

# Draw the PCA map of the PNT words from the normalised Levenshtein distance matrix.
fig, ax = plt.subplots(figsize=(8,8))         # figure size in inches
# Alternative, larger canvas kept for reference:
#fig, ax = plt.subplots(figsize=(12,13))         # Sample figsize in inches
plot_pca(ax, pnt_phon_Sim, PNT_words, title='音韻情報のレーベンシュタイン距離による附置 (PCA)')


In [None]:
# Visualise the same distance matrix as the PCA figure above, this time with t-SNE.
# t-SNE is stochastic: a fixed random_state makes the layout reproducible between runs.
# A single estimator is created and used (previously an unused TSNE() was built as well).
tsne = TSNE(n_components=2, random_state=0)
pnt_tsne = tsne.fit_transform(pnt_phon_Sim)
fig, ax = plt.subplots(figsize=(8,8))         # figure size in inches
ax.scatter(pnt_tsne[:,0], pnt_tsne[:,1], s=200, color='cyan')
# Write each word next to its point.
for i, label in enumerate(PNT_words):
    ax.annotate(label, (pnt_tsne[i,0], pnt_tsne[i,1]),fontsize=14)
ax.set_xlabel("tSNE 1")
ax.set_ylabel("tSNE 2")
title = '音韻情報のレーベンシュタイン距離によるフィラデルフィア絵画命名検査刺激図版の附置 （ｔＳＮＥ）'
ax.set_title(title,fontsize=18)


# フィラデルフィア絵画命名検査の意味情報の視覚化

In [None]:
import gensim.downloader as api
# Download the word-embedding model; this takes a few minutes.
# NOTE: despite the variable name `glove_en`, the model requested is 'word2vec-google-news-300'.
# Variant that passes return_path=True, kept for reference:
#glove_en = api.load('word2vec-google-news-300', return_path=True)
glove_en = api.load('word2vec-google-news-300')



In [None]:
# Number formatting used when arrays are printed
np.set_printoptions(suppress=False, formatter={'float': '{:6.3f}'.format})

# Collect the embedding vector of every PNT word into one matrix (one row per word).
# The vector length is taken from the embedding of a sample word.
embedding_dim = len(glove_en['whale'])
PNT_semantics = np.zeros((len(PNT), embedding_dim))
for row, word in enumerate(PNT_words):
    # Assigning into the pre-allocated row copies the values.
    PNT_semantics[row, :] = glove_en[word]

# Word-by-word correlation matrix of the embedding vectors, shown as the cell output
PNT_df = pd.DataFrame(PNT_semantics, index=PNT_words)
PNT_df.T.corr()

In [None]:
# Heat map of the word-by-word correlation matrix of the semantic (embedding) vectors
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12,10))         # figure size in inches
# PNT_df has one row per word, so transposing first makes corr() correlate words with words.
sns.heatmap(PNT_df.T.corr(), ax=ax)

In [None]:
# Draw a PCA map from the correlation matrix computed above
def ax_scatter_gram(ax, pca1, pca2, wordlist, title=None,
                    marker_size=100, label_fontsize=10, title_fontsize=10):
    """Draw a labelled scatter plot of two coordinate vectors on ``ax``.

    Parameters
    ----------
    ax : object providing scatter/annotate/set_xlabel/set_ylabel/set_title
        (used here with a matplotlib Axes).
    pca1, pca2 : indexable sequences
        x and y coordinates; element ``i`` belongs to ``wordlist[i]``.
    wordlist : iterable of str
        Text label written next to each point.
    title : str or None
        Title passed to ``ax.set_title``.
    marker_size, label_fontsize, title_fontsize : numbers
        Marker area (``s``) and font sizes. The defaults are the values that used
        to be hard-coded in this cell, so existing calls are unaffected.
    """
    ax.scatter(pca1, pca2, s=marker_size, color='cyan')
    for i, label in enumerate(wordlist):
        ax.annotate(label, (pca1[i], pca2[i]), fontsize=label_fontsize)
    ax.set_xlabel("第一主成分")
    ax.set_ylabel("第二主成分")
    ax.set_title(title, fontsize=title_fontsize)

def plot_pca(ax, R, wordlist, title=""):
    """Reduce ``R`` to two principal components and plot them on ``ax``.

    ``R`` is handed to ``PCA(n_components=2).fit_transform``. The explained
    variance ratio of the two components is printed, then the labelled scatter
    plot is drawn by ``ax_scatter_gram`` using ``wordlist`` as point labels.
    """
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(R)
    report = '\tExplained variation per principal component: {}'
    print(report.format(reducer.explained_variance_ratio_))
    ax_scatter_gram(ax, projected[:,0], projected[:,1], wordlist, title=title)


# Draw the PCA map of the PNT words from the word-by-word correlation matrix of their embeddings.
fig, ax = plt.subplots(figsize=(8,8))         # figure size in inches
# Alternative, larger canvas kept for reference:
#fig, ax = plt.subplots(figsize=(12,13))         # Sample figsize in inches
plot_pca(ax, PNT_df.T.corr(), PNT_words, title='単語埋め込みモデルを用いた PNT 刺激の意味附置 (PCA)')



In [None]:
# t-SNE map of the PNT words based on their embedding vectors.
# Note: this uses the raw vectors (PNT_semantics), whereas the PCA figure above was
# computed from their correlation matrix.
# t-SNE is stochastic: a fixed random_state makes the layout reproducible between runs.
# A single estimator is created and used (previously an unused TSNE() was built as well).
tsne = TSNE(n_components=2, random_state=0)
pnt_tsne = tsne.fit_transform(PNT_semantics)
fig, ax = plt.subplots(figsize=(8,8))           # figure size in inches
ax.scatter(pnt_tsne[:,0], pnt_tsne[:,1], s=100, color='cyan')
# Write each word next to its point.
for i, label in enumerate(PNT_words):
    ax.annotate(label, (pnt_tsne[i,0], pnt_tsne[i,1]),fontsize=10)
ax.set_xlabel("tSNE 1")
ax.set_ylabel("tSNE 2")
title = '単語埋め込みモデルを用いた PNT 刺激の意味附置 (tSNE)'
ax.set_title(title,fontsize=12)