In [1]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import itertools
import time
import re
import os
import glob
import jaconv
import mojimoji
from numpy.random import *

pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 100)

## データの前処理

In [2]:
# データの読み込み
path = "C:/statistics/data/scenario_extract/tdb/corpus/"
kyoto_text = pd.read_csv(path + "kyoto_info_original.csv")
naist_text = pd.read_csv(path + "naist_info.csv", low_memory=False)
naist_text["readings"].iloc[np.where(pd.isna(naist_text["readings"])==True)[0]] = ""

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [3]:
# 京都コーパスから単語を抽出
kyoto_symbol = np.array(kyoto_text["word"])
kyoto_readings = np.array(kyoto_text["reading"])
N1 = kyoto_readings.shape[0]

# NAISTコーパスから単語を抽出
naist_genkei = np.array(naist_text["genkei"])
naist_readings = np.array(naist_text["readings"])
N2 = kyoto_readings.shape[0]

In [4]:
# 京都コーパスの文書idを定義
sentence_id1 = np.array(kyoto_text["d_id"])
n1 = np.unique(sentence_id1).shape[0]
sentence_list1 = [i for i in range(n1)]
for i in range(n1):
    sentence_list1[i] = np.where(sentence_id1==i)[0].astype("int")
    
# NAISTコーパスの文書idを定義
sentence_id2 = np.array(naist_text["sentence_id"])
n2 = np.unique(sentence_id2).shape[0]
sentence_list2 = [i for i in range(n2)]
for i in range(n2):
    sentence_list2[i] = np.where(sentence_id2==i)[0].astype("int")

## NAISTコーパスを京都コーパスに結合

In [5]:
# 文章の一致を取得
# 文章データを作成
kyoto_sentence = np.array([pd.Series(kyoto_readings[sentence_list1[i]]).str.cat() for i in range(n1)])
naist_sentence = np.array([pd.Series(naist_readings[sentence_list2[i]]).str.cat() for i in range(n2)])
kyoto_string = pd.DataFrame({"kyoto_id": np.arange(n1), "sentence": kyoto_sentence})
naist_string = pd.DataFrame({"naist_id": np.arange(n2), "sentence": naist_sentence})
kyoto_string["sentence1"] = kyoto_string["sentence"].str[:15] + kyoto_string["sentence"].str[-15:]
kyoto_string["sentence2"] = kyoto_string["sentence"].str[:20]
kyoto_string["sentence3"] = kyoto_string["sentence"].str[-20:]
naist_string["sentence1"] = naist_string["sentence"].str[:15] + naist_string["sentence"].str[-15:]
naist_string["sentence2"] = naist_string["sentence"].str[:20]
naist_string["sentence3"] = naist_string["sentence"].str[-20:]

# 文字列の一致を取得
match_sentence = pd.merge(kyoto_string[["kyoto_id", "sentence"]], naist_string[["naist_id", "sentence"]], 
                          on="sentence", how="inner")[["kyoto_id", "naist_id", "sentence"]]
match_sentence1 = pd.merge(kyoto_string[["kyoto_id", "sentence", "sentence1"]], naist_string[["naist_id", "sentence1"]],
                           on="sentence1", how="inner")[["kyoto_id", "naist_id", "sentence", "sentence1"]]
match_sentence2 = pd.merge(kyoto_string[["kyoto_id", "sentence", "sentence2"]], naist_string[["naist_id", "sentence2"]],
                           on="sentence2", how="inner")[["kyoto_id", "naist_id", "sentence", "sentence2"]]
match_sentence3 = pd.merge(kyoto_string[["kyoto_id", "sentence", "sentence3"]], naist_string[["naist_id", "sentence3"]],
                           on="sentence3", how="inner")[["kyoto_id", "naist_id", "sentence", "sentence3"]]

In [6]:
# コーパス間の対応関係を取得
match_joint = pd.concat((match_sentence[["kyoto_id", "naist_id"]], 
                         match_sentence1[["kyoto_id", "naist_id"]],
                         match_sentence2[["kyoto_id", "naist_id"]],
                         match_sentence3[["kyoto_id", "naist_id"]]), axis=0)
unique_match = match_joint.iloc[np.where(match_joint["kyoto_id"].duplicated()==False)[0]]
m = unique_match.shape[0]

In [7]:
# 京都コーパスにNAISTコーパスの原形を加える
# 形態素のカラムを抽出
naist_morpheme = naist_text[["genkei", "readings"]]
kyoto_morpheme = kyoto_text[["serial_no", "word", "reading"]]
kyoto_morpheme.columns = ["serial_no", "word", "readings"]
naist_unique = naist_morpheme.iloc[np.where(naist_morpheme.duplicated()==False)[0]]
freq = naist_unique["readings"].value_counts()
naist_morpheme = pd.merge(naist_morpheme, pd.DataFrame({"readings": freq.index[freq > 1], "flag": 1}), on="readings", how="left")
naist_morpheme["flag"].iloc[np.where(pd.isna(naist_morpheme["flag"]))[0]] = 0
naist_morpheme["flag"] = np.array(naist_morpheme["flag"], dtype="int")

# データの設定
threshold = 5
kyoto_genkei_list = []

#センテンスごとに原形をjoinする
for i in range(m):
    if i%1000==0:
        print(i)
    
    # データを定義
    id1 = unique_match["kyoto_id"].iloc[i]
    id2 = unique_match["naist_id"].iloc[i]
    index1 = sentence_list1[id1]
    index2 = sentence_list2[id2]
    sentence1 = kyoto_sentence[id1]
    sentence2 = naist_sentence[id2]

    # 文字数がしきい値以上異なると次のセンテンスへ
    if np.abs(len(sentence1) - len(sentence2)) >= threshold:
        continue
        
    # NAISTコーパスの原形を京都コーパスに結合する
    kyoto_word = kyoto_morpheme.iloc[index1]
    naist_word = naist_morpheme.iloc[index2]
    naist_unique = naist_word.iloc[np.where(naist_word.duplicated()==False)[0]]
    freq = naist_unique["readings"].value_counts()
    index_target = np.where((np.in1d(naist_unique["readings"], freq.index[freq==1])) | (naist_unique["flag"]==0))[0].astype("int")
    naist_unique = naist_unique.iloc[index_target]
    kyoto_genkei_list.append(pd.merge(kyoto_word, naist_unique, on="readings", how="left"))

0
1000
2000
3000
4000
5000


In [8]:
# 京都コーパスに原形を結合
kyoto_genkei = pd.concat((kyoto_genkei_list), axis=0)
kyoto_genkei.index = np.arange(kyoto_genkei.shape[0])
naist_unique = naist_morpheme.iloc[np.where(naist_morpheme["flag"]==0)[0]]
naist_unique = naist_unique[["genkei", "readings"]].iloc[np.where(naist_unique.duplicated()==False)[0]]
kyoto_new1 = pd.merge(kyoto_morpheme, kyoto_genkei[["serial_no", "genkei"]], on="serial_no", how="left")
kyoto_new2 = pd.merge(kyoto_morpheme, naist_unique, on="readings", how="left")
index_genkei1 = np.where(pd.isna(kyoto_new2["genkei"])==False)[0].astype("int")
index_genkei2 = np.where(kyoto_text["genkei"]!="*")[0].astype("int")

genkei = np.array(kyoto_new1["genkei"])
genkei[index_genkei1] = np.array(kyoto_new2["genkei"].iloc[index_genkei1])
genkei[index_genkei2] = np.array(kyoto_text["genkei"].iloc[index_genkei2])
kyoto_text["new_genkei"] = genkei
kyoto_text["flag"] = np.array(pd.isna(genkei)==True, dtype="int")
del kyoto_genkei_list

## 形態素を置き換え

In [9]:
# 形態素情報を抽出
kyoto_morpheme = kyoto_text[["serial_no", "flag", "new_genkei", "reading", "word_class", 
                             "class_detail1", "class_detail2", "class_detail3"]]
kyoto_morpheme = kyoto_morpheme.rename(columns={"new_genkei": "genkei"})
genkei = np.array(kyoto_morpheme["genkei"])

In [10]:
# 単語および品詞単独での形態素を置き換え
# 助詞を置き換え
index_positional = np.where((kyoto_morpheme["word_class"]=="助詞") & (kyoto_morpheme["flag"]==1))[0].astype("int")
genkei[index_positional] = np.array(kyoto_morpheme["reading"].iloc[index_positional])

# 人名を置き換え
index_name = np.where(kyoto_morpheme["class_detail1"]=="人名")[0].astype("int")
genkei[index_name] = kyoto_morpheme["reading"].iloc[index_name]

# 数値を置き換え
index_number = np.where(kyoto_morpheme["class_detail1"]=="数詞")[0].astype("int")
genkei[index_number] = "0"

# 地名を置き換え
index_locale1 = np.where((kyoto_morpheme["class_detail1"]=="地名") & (kyoto_morpheme["reading"]=="にほん"))[0].astype("int")
index_locale2 = np.where((kyoto_morpheme["class_detail1"]=="地名") & (kyoto_morpheme["reading"]=="にっぽん"))[0].astype("int")
index_locale = np.unique(np.append(index_locale1, index_locale2))
genkei[index_locale] = "日本"

# アルファベットを置き換え
index_alphabet = np.where(kyoto_morpheme["reading"].str.contains("^[a-zA-ZＡ-ｚ0-9０-９]+$"))[0].astype("int")
alphabet = np.array(kyoto_morpheme["reading"].iloc[index_alphabet])
alphabet = np.array([mojimoji.zen_to_han(alphabet[i]) for i in range(alphabet.shape[0])])
genkei[index_alphabet] = alphabet

In [11]:
# 品詞と品詞クラスの組み合わせでの形態素の置き換え
# 助動詞を置き換え
index = np.where((kyoto_morpheme["word_class"]=="助動詞") & (pd.isna(kyoto_morpheme["genkei"])==False))[0]
aux_morpheme = kyoto_morpheme[["genkei", "reading", "class_detail1", "class_detail2"]].iloc[index]
aux_morpheme = aux_morpheme.iloc[np.where(aux_morpheme[["reading", "class_detail1", "class_detail2"]].duplicated()==False)[0]]
aux_morpheme = aux_morpheme.rename(columns={"genkei": "new_genkei"})
aux_morpheme.index = np.arange(aux_morpheme.shape[0])
joint_morpheme = pd.merge(kyoto_morpheme, aux_morpheme, on=["reading", "class_detail1", "class_detail2"], how="left")
joint_morpheme = joint_morpheme[["serial_no", "flag", "new_genkei", "reading", "word_class"]]
index_aux = np.where((joint_morpheme["flag"]==1) & (pd.isna(joint_morpheme["new_genkei"])==False) & 
                     (joint_morpheme["word_class"]=="助動詞"))[0].astype("int")
genkei[index_aux] = np.array(joint_morpheme["new_genkei"].iloc[index_aux])

In [12]:
# 形容詞を置き換え
index = np.where((kyoto_morpheme["word_class"]=="形容詞") & (pd.isna(kyoto_morpheme["genkei"])==False))[0]
adverb_morpheme = kyoto_morpheme[["genkei", "reading", "class_detail1", "class_detail2"]].iloc[index]
adverb_morpheme = adverb_morpheme.iloc[np.where(adverb_morpheme[["reading", "class_detail1", "class_detail2"]].duplicated()==False)[0]]
adverb_morpheme = adverb_morpheme.rename(columns={"genkei": "new_genkei"})
adverb_morpheme.index = np.arange(adverb_morpheme.shape[0])
joint_morpheme = pd.merge(kyoto_morpheme, adverb_morpheme, on=["reading", "class_detail1", "class_detail2"], how="left")
joint_morpheme = joint_morpheme[["serial_no", "flag", "new_genkei", "reading", "word_class"]]
index_adverb = np.where((joint_morpheme["flag"]==1) & (pd.isna(joint_morpheme["new_genkei"])==False) &
                        (joint_morpheme["word_class"]=="形容詞"))[0].astype("int")
genkei[index_adverb] = np.array(joint_morpheme["new_genkei"].iloc[index_adverb])

In [13]:
# その他の品詞を置き換え
index1 = np.where((kyoto_morpheme["word_class"]=="指示詞") & (pd.isna(kyoto_morpheme["genkei"])==False))[0]
index2 = np.where((kyoto_morpheme["word_class"]=="接続詞") & (pd.isna(kyoto_morpheme["genkei"])==False))[0]
index3 = np.where((kyoto_morpheme["word_class"]=="連体詞") & (pd.isna(kyoto_morpheme["genkei"])==False))[0]
index = np.unique(np.hstack((index1, index2, index3)))
other_morpheme = kyoto_morpheme[["genkei", "reading", "word_class", "class_detail1", "class_detail2"]].iloc[index]
other_morpheme = other_morpheme.iloc[np.where(other_morpheme[["reading", "word_class", 
                                                              "class_detail1", "class_detail2"]].duplicated()==False)[0]]
other_morpheme = other_morpheme.rename(columns={"genkei": "new_genkei"})
other_morpheme.index = np.arange(other_morpheme.shape[0])
joint_morpheme = pd.merge(kyoto_morpheme, other_morpheme, on=["reading", "word_class", "class_detail1", "class_detail2"], how="left")
joint_morpheme = joint_morpheme[["serial_no", "flag", "new_genkei", "reading", "word_class"]]
index_other = np.where((joint_morpheme["flag"]==1) & (pd.isna(joint_morpheme["new_genkei"])==False) &
                       ((joint_morpheme["word_class"]=="指示詞") | (joint_morpheme["word_class"]=="接続詞") |
                        (joint_morpheme["word_class"]=="連体詞")))[0].astype("int")
genkei[index_other] = np.array(joint_morpheme["new_genkei"].iloc[index_other])

In [14]:
# 原形が欠損している部分とその周辺を表示しておくためのflag
window = 2
flag_nan = np.repeat(0, N1)
index_nan = np.where(pd.isna(genkei))[0].astype("int")
display_list = []
for i in range(index_nan.shape[0]):
    display_list.append(np.arange(index_nan[i] - window, index_nan[i] + window))
index_nan = np.unique(np.hstack((display_list)))
index_nan = index_nan[(index_nan >= 0) & (index_nan <= N1-1)]
flag_nan[index_nan] = 1
kyoto_text["flag"] = np.array(pd.isna(genkei)==True, dtype="int")
kyoto_text["display_flag"] = flag_nan
kyoto_text["new_genkei"] = genkei

In [15]:
# データフレームを定義
kyoto_info = kyoto_text[['serial_no', 'doc_id', 'd_id', 'sentence_id', 'flag', 'display_flag', 'word', 'new_genkei', 'reading', 
                         'word_class', 'class_detail1', 'class_detail2', 'class_detail3', 'phrase_id',
                         'phrase_dependency', 'dependency_type1', 'tag_id', 'tag_dependency', 'dependency_type2', 
                         'rel', 'target', 'sid', 'tag']]
kyoto_info = kyoto_info.rename(columns={'new_genkei': 'genkei'})
kyoto_info.index = np.arange(kyoto_info.shape[0])

In [16]:
# データフレームを出力
kyoto_info.to_csv(path + "/kyoto_info.csv", index=None)
kyoto_info.to_excel(path + "/kyoto_info.xlsx")