In [None]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import itertools
import time
import re
import os
import glob
import jaconv
import mojimoji
from numpy.random import *

# Widen pandas' display limits so large tables are shown without truncation.
for option_name, option_value in (("display.max_rows", 250),
                                  ("display.max_columns", 100)):
    pd.set_option(option_name, option_value)

## データの読み込み

In [None]:
# Load the data
# Read the corpus CSV (Shift-JIS encoded) and drop its first column.
path = "C:/statistics/data/scenario_extract/tdb/corpus/"
kyoto_info = pd.read_csv(path + "kyoto_info.csv", encoding="shift-jis")
kyoto_info = kyoto_info.iloc[:, 1:]

# Keep only the morpheme-level columns.
morpheme_columns = ["serial_no", "flag", "genkei", "reading", "word_class",
                    "class_detail1", "class_detail2", "class_detail3"]
kyoto_morpheme = kyoto_info[morpheme_columns]

# Working array of base forms ("genkei"); np.array() makes a copy, so the
# replacements applied to `genkei` later do not touch the DataFrame.
genkei = np.array(kyoto_morpheme["genkei"])

## 形態素を置き換え

In [None]:
# Replace morphemes that can be selected from a single word / part-of-speech column
# Particles (word_class == "助詞") with flag == 1: use the reading as the base form.
is_flagged_particle = (kyoto_morpheme["flag"] == 1) & (kyoto_morpheme["word_class"] == "助詞")
index_positional = np.flatnonzero(is_flagged_particle.values).astype("int")
genkei[index_positional] = np.array(kyoto_morpheme["reading"].iloc[index_positional])

# Person names (class_detail1 == "人名"): collapse all of them onto one placeholder name.
is_person_name = kyoto_morpheme["class_detail1"] == "人名"
index_name = np.flatnonzero(is_person_name.values).astype("int")
genkei[index_name] = "安倍"

# Numerals (class_detail1 == "数詞"): collapse all of them onto the placeholder "0".
is_numeral = kyoto_morpheme["class_detail1"] == "数詞"
index_number = np.flatnonzero(is_numeral.values).astype("int")
genkei[index_number] = "0"

# Place names (class_detail1 == "地名") read as "にほん" or "にっぽん": unify to "日本".
is_place_name = kyoto_morpheme["class_detail1"] == "地名"
reads_nihon = kyoto_morpheme["reading"] == "にほん"
reads_nippon = kyoto_morpheme["reading"] == "にっぽん"
index_locale1 = np.flatnonzero((is_place_name & reads_nihon).values).astype("int")
index_locale2 = np.flatnonzero((is_place_name & reads_nippon).values).astype("int")
# Sorted union of both index sets.
index_locale = np.union1d(index_locale1, index_locale2)
genkei[index_locale] = "日本"

# Alphanumeric tokens: replace the base form with the half-width version of the reading.
# na=False makes rows with a missing reading count as "no match". Without it,
# str.contains returns NaN for those rows, np.where treats NaN as truthy, and
# mojimoji.zen_to_han is then called on a float NaN.
# NOTE(review): the full-width range "Ａ-ｚ" also covers the full-width punctuation
# between "Ｚ" and "ａ" (［＼］＾＿｀); pattern left unchanged - confirm this is intended.
is_alphanumeric = kyoto_morpheme["reading"].str.contains("^[a-zA-ZＡ-ｚ0-9０-９]+$", na=False)
index_alphabet = np.where(is_alphanumeric)[0].astype("int")
alphabet = np.array(kyoto_morpheme["reading"].iloc[index_alphabet])
# Convert full-width characters to half-width, token by token.
alphabet = np.array([mojimoji.zen_to_han(token) for token in alphabet])
genkei[index_alphabet] = alphabet

In [None]:
# Replace morphemes using combinations of part of speech and sub-class columns
# Auxiliary verbs (word_class == "助動詞")
merge_keys = ["reading", "class_detail1", "class_detail2"]
has_base_form = kyoto_morpheme["genkei"].notna()
index = np.flatnonzero(((kyoto_morpheme["word_class"] == "助動詞") & has_base_form).values)

# Lookup table: for each key combination, the base form of its first occurrence.
aux_morpheme = kyoto_morpheme.iloc[index][["genkei"] + merge_keys]
aux_morpheme = aux_morpheme.drop_duplicates(subset=merge_keys)
aux_morpheme = aux_morpheme.rename(columns={"genkei": "new_genkei"})
aux_morpheme.index = np.arange(len(aux_morpheme))

# Left-join the lookup onto every row; the keys in the lookup are unique, so the
# result keeps one row per original row and stays aligned with `genkei`.
joint_morpheme = kyoto_morpheme.merge(aux_morpheme, how="left", on=merge_keys)
joint_morpheme = joint_morpheme[["serial_no", "flag", "new_genkei", "reading", "word_class"]]

# Overwrite only flagged auxiliary-verb rows that received a replacement form.
replace_mask = (
    (joint_morpheme["word_class"] == "助動詞")
    & (joint_morpheme["flag"] == 1)
    & joint_morpheme["new_genkei"].notna()
)
index_aux = np.flatnonzero(replace_mask.values).astype("int")
genkei[index_aux] = np.array(joint_morpheme["new_genkei"].iloc[index_aux])

In [None]:
# Adjectives (word_class == "形容詞")
# NOTE: the variables below say "adverb", but the rows selected are adjectives.
merge_keys = ["reading", "class_detail1", "class_detail2"]
has_base_form = kyoto_morpheme["genkei"].notna()
index = np.flatnonzero(((kyoto_morpheme["word_class"] == "形容詞") & has_base_form).values)

# Lookup table: for each key combination, the base form of its first occurrence.
adverb_morpheme = kyoto_morpheme.iloc[index][["genkei"] + merge_keys]
adverb_morpheme = adverb_morpheme.drop_duplicates(subset=merge_keys)
adverb_morpheme = adverb_morpheme.rename(columns={"genkei": "new_genkei"})
adverb_morpheme.index = np.arange(len(adverb_morpheme))

# Left-join the lookup onto every row; the keys in the lookup are unique, so the
# result keeps one row per original row and stays aligned with `genkei`.
joint_morpheme = kyoto_morpheme.merge(adverb_morpheme, how="left", on=merge_keys)
joint_morpheme = joint_morpheme[["serial_no", "flag", "new_genkei", "reading", "word_class"]]

# Overwrite only flagged adjective rows that received a replacement form.
replace_mask = (
    (joint_morpheme["word_class"] == "形容詞")
    & (joint_morpheme["flag"] == 1)
    & joint_morpheme["new_genkei"].notna()
)
index_adverb = np.flatnonzero(replace_mask.values).astype("int")
genkei[index_adverb] = np.array(joint_morpheme["new_genkei"].iloc[index_adverb])

In [None]:
# Other parts of speech: demonstratives (指示詞), conjunctions (接続詞), adnominals (連体詞)
target_classes = ["指示詞", "接続詞", "連体詞"]
merge_keys = ["reading", "word_class", "class_detail1", "class_detail2"]
has_base_form = kyoto_morpheme["genkei"].notna()

# Row positions per part of speech (rows with a missing base form are skipped),
# then their sorted union.
index1, index2, index3 = [
    np.flatnonzero(((kyoto_morpheme["word_class"] == pos) & has_base_form).values)
    for pos in target_classes
]
index = np.unique(np.concatenate((index1, index2, index3)))

# Lookup table: for each key combination, the base form of its first occurrence.
other_morpheme = kyoto_morpheme.iloc[index][["genkei"] + merge_keys]
other_morpheme = other_morpheme.drop_duplicates(subset=merge_keys)
other_morpheme = other_morpheme.rename(columns={"genkei": "new_genkei"})
other_morpheme.index = np.arange(len(other_morpheme))

# Left-join the lookup onto every row; the keys in the lookup are unique, so the
# result keeps one row per original row and stays aligned with `genkei`.
joint_morpheme = kyoto_morpheme.merge(other_morpheme, how="left", on=merge_keys)
joint_morpheme = joint_morpheme[["serial_no", "flag", "new_genkei", "reading", "word_class"]]

# Overwrite only flagged rows of the target classes that received a replacement form.
replace_mask = (
    joint_morpheme["word_class"].isin(target_classes)
    & (joint_morpheme["flag"] == 1)
    & joint_morpheme["new_genkei"].notna()
)
index_other = np.flatnonzero(replace_mask.values).astype("int")
genkei[index_other] = np.array(joint_morpheme["new_genkei"].iloc[index_other])

In [None]:
# Number of entries in `genkei` that are still missing after the replacements.
pd.isna(genkei).sum()

In [None]:
# Display the morpheme table. The replacements in the visible cells above are
# written to the separate `genkei` array (a copy made with np.array), not to
# this DataFrame, so its "genkei" column still holds the original base forms.
kyoto_morpheme