In [None]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import itertools
import time
import re
import os
import glob
from numpy.random import *

# Raise the pandas display limits to 250 rows and 100 columns.
for option_name, limit in (("display.max_rows", 250), ("display.max_columns", 100)):
    pd.set_option(option_name, limit)

# データの処理

In [None]:
# Load the corpus data
# Collect the paths of every *.KNP file found in the sub-folders of <corpus>/rel/
path = "D:/Statistics/data/scenario_extract/"
corpus_path = path + "tdb/corpus/KWDLC-1.0/dat/"
rel_path = corpus_path + "rel/"

# os.listdir and glob.glob return entries in arbitrary order; sorting keeps the
# file order (and therefore the document numbering derived from it) reproducible.
folder = np.array([rel_path + name + "/" for name in sorted(os.listdir(rel_path))], dtype="object")
filelist = []
for folder_path in folder:
    filelist.extend(sorted(glob.glob(folder_path + "*.KNP")))

# The dtype is given explicitly so that an empty result is still a string array
# (np.hstack on an empty list would raise instead).
filelist = np.array(filelist, dtype="str")
m = len(filelist)

# Read the whole content of every collected file as UTF-8 text
kwdlc_text = []
for file_path in filelist:
    with open(file_path, encoding="utf-8") as f:
        kwdlc_text.append(f.read())
d = len(kwdlc_text)

In [None]:
# Store the morphological analysis and case-analysis annotations found in each
# corpus text as one data frame per document.

# Part-of-speech names joined as a regex alternation; a line containing one of
# them surrounded by spaces is treated as a morpheme (word) line.
hinshi = " 特殊 | 動詞 | 形容詞 | 判定詞 | 助動詞 | 名詞 | 指示詞 | 副詞 | 助詞 | 接続詞 | 連体詞 | 感動詞 | 接頭辞 | 接尾辞 | 未定義語 "

# Column names given to the space-separated fields of a morpheme line.
WORD_COLUMNS = ["word", "reading", "genkei", "word_class", "class_detail1", "class_detail2", "class_detail3"]


def _matching_lines(line_text, pattern):
    """Return the positions of the entries of `line_text` that match the regex `pattern`."""
    return np.where(line_text.str.contains(pattern))[0].astype("int")


def _words_per_header(index_word, index_header):
    """Count the word lines that belong to each header line.

    A word line belongs to a header when its position is strictly greater than
    the header position and strictly smaller than the next header position;
    the last header owns every word line after it.

    Returns an integer array with one count per entry of `index_header`.
    """
    # The last header has no successor, so its upper bound is +inf.
    upper_bounds = np.append(index_header[1:], np.inf)
    counts = [np.sum((index_word > lower) & (index_word < upper))
              for lower, upper in zip(index_header, upper_bounds)]
    return np.array(counts, dtype="int")


def _parse_dependency_headers(header_lines, strip_pattern):
    """Split header lines into unit id, dependency target and dependency type.

    `strip_pattern` is a regex whose matches are deleted from each line before
    it is split on spaces. The first token is read as the unit id; the leading
    digits of the second token are the dependency target and the remainder of
    that token is the dependency type. Tokens after the second are ignored.

    Returns three arrays (ids, targets, types) with one entry per header line.
    A line without a second token keeps target 0 and type "".
    """
    # regex=True is required explicitly: since pandas 2.0 the default is a
    # literal replacement, which would leave the line prefix in place.
    # NOTE(review): the patterns used by the caller delete every "-", so a
    # target written as "-1" is stored as 1 -- confirm this is intended.
    tokens = header_lines.str.replace(strip_pattern, "", regex=True).str.split(" ")
    count = tokens.shape[0]
    unit_id = np.repeat(0, count)
    target_id = np.repeat(0, count)
    dep_type = np.repeat("", count).astype("object")
    for j in range(count):
        fields = tokens.iloc[j]
        unit_id[j] = int(fields[0])
        if len(fields) > 1:
            digits, rest = re.match("([0-9]*)(.*)", fields[1]).groups()
            target_id[j] = int(digits)
            dep_type[j] = rest
    return unit_id, target_id, dep_type


def _rel_attribute(attribute, fragment):
    """Return the double-quoted value of `attribute` in `fragment`, or "" when it is absent or empty."""
    found = re.search(attribute + "=\"([^\"]+)\"", fragment)
    return "" if found is None else found.group(1)


def _parse_rel_lines(rel_lines):
    """Extract the rel type, target, sid and tag values from lines containing <rel ...> elements.

    Each line is split on "<", "/><" and "/>"; the first non-empty piece (the
    text before the first element) is skipped and every following piece is
    read as one rel element. The values of all elements on a line are joined
    with "; ".

    Returns four object arrays (rel, target, sid, tag) with one entry per line.
    """
    rel_split = rel_lines.str.split("<|/><|/>")
    count = rel_split.shape[0]
    rel = np.repeat("", count).astype("object")
    target = np.repeat("", count).astype("object")
    sid = np.repeat("", count).astype("object")
    tag = np.repeat("", count).astype("object")
    for j in range(count):
        fragments = [piece for piece in rel_split.iloc[j] if piece != ""][1:]
        rel[j] = "; ".join(_rel_attribute("rel type", fragment) for fragment in fragments)
        target[j] = "; ".join(_rel_attribute("target", fragment) for fragment in fragments)
        sid[j] = "; ".join(_rel_attribute("sid", fragment) for fragment in fragments)
        tag[j] = "; ".join(_rel_attribute("tag", fragment) for fragment in fragments)
    return rel, target, sid, tag


def parse_knp_document(text, doc_index):
    """Convert the text of one corpus file into a data frame with one row per word line.

    Parameters
    ----------
    text : str
        Full content of one file.
    doc_index : int
        Value stored in the "doc_id" column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns: doc_id, sentence_id, the seven WORD_COLUMNS, phrase_id,
        phrase_dependency, dependency_type1, tag_id, tag_dependency,
        dependency_type2, rel, target, sid, tag.
    """
    # Classify the lines of the document
    line_text = pd.Series(text.split("\n"))
    index_word = _matching_lines(line_text, hinshi)
    index_id = _matching_lines(line_text, "# S-ID")
    index_phrase = _matching_lines(line_text, r"^\* [0-9]")
    index_tag = _matching_lines(line_text, r"^\+ [0-9]")
    index_rel = _matching_lines(line_text, "<rel type")

    # Morpheme information: one row per word line, fields split on spaces
    word_split = line_text.iloc[index_word].str.split(" ")
    n = word_split.shape[0]
    word_info = pd.DataFrame(word_split.tolist(), columns=WORD_COLUMNS)

    # Sentence id: third token of the "# S-ID" line when split on " " or ":",
    # repeated for every word that follows that line
    id_split = line_text.iloc[index_id].str.split(" |:")
    get_id = np.array([fields[2] for fields in id_split])
    sentence_id = np.repeat(get_id, _words_per_header(index_word, index_id))
    id_info = pd.DataFrame({"doc_id": np.repeat(doc_index, n), "sentence_id": sentence_id})

    # Phrase lines ("* ...") and their dependency, repeated for the words they own
    phrase, dependency, types = _parse_dependency_headers(line_text.iloc[index_phrase], r"-|\* ")
    allocation = _words_per_header(index_word, index_phrase)
    phrase_info = pd.DataFrame({"phrase_id": np.repeat(phrase, allocation),
                                "phrase_dependency": np.repeat(dependency, allocation),
                                "dependency_type1": np.repeat(types, allocation)})

    # Tag lines ("+ ...") and their dependency, repeated for the words they own
    tag, dependency, types = _parse_dependency_headers(line_text.iloc[index_tag], r"-|\+ ")
    allocation = _words_per_header(index_word, index_tag)
    tag_info = pd.DataFrame({"tag_id": np.repeat(tag, allocation),
                             "tag_dependency": np.repeat(dependency, allocation),
                             "dependency_type2": np.repeat(types, allocation)})
    # Line position of the tag line owning each word; used as the join key below
    tag_line_of_word = np.repeat(index_tag, allocation)

    # Case-analysis results, attached to the words whose tag line carries them
    rel, target, sid, rel_tag = _parse_rel_lines(line_text.iloc[index_rel])
    word_keys = pd.DataFrame({"allocation": tag_line_of_word})
    rel_by_line = pd.DataFrame({"allocation": index_rel, "rel": rel, "target": target, "sid": sid, "tag": rel_tag})
    # Left join keeps every word; the join key column is dropped afterwards
    rel_info = pd.merge(word_keys, rel_by_line, on="allocation", how="left").iloc[:, 1:]

    # Put all pieces side by side
    return pd.concat((id_info, word_info, phrase_info, tag_info, rel_info), axis=1)


# Parse every document and keep one data frame per document
info_list = []
for i in range(d):
    # Progress report every 100 documents
    if i%100==0:
        print(i)
    info_list.append(parse_knp_document(kwdlc_text[i], i))

In [None]:
# Stack the per-document frames into a single data frame
info = pd.concat(info_list, axis=0)
row_count = info.shape[0]

# Running row number over the whole corpus
info["serial_no"] = np.arange(row_count)

# Put the serial number first and fix the order of the remaining columns
id_columns = ["serial_no", "doc_id", "sentence_id"]
word_columns = ["word", "reading", "genkei", "word_class", "class_detail1", "class_detail2", "class_detail3"]
phrase_columns = ["phrase_id", "phrase_dependency", "dependency_type1"]
tag_columns = ["tag_id", "tag_dependency", "dependency_type2"]
rel_columns = ["rel", "target", "sid", "tag"]
info = info.loc[:, id_columns + word_columns + phrase_columns + tag_columns + rel_columns]

# Replace the per-document row labels with 0..row_count-1
info.index = np.arange(row_count)

In [None]:
# An equals sign causes problems in Excel, so it is replaced in the "rel" column beforehand
rel_without_equals = info["rel"].str.replace("=", "≡")
info = info.assign(rel=rel_without_equals)

In [None]:
# Write the data frame out as CSV (without the index) and as an Excel workbook
csv_path = path + "tdb/corpus/kwdlc_info.csv"
excel_path = path + "tdb/corpus/kwdlc_info.xlsx"
info.to_csv(csv_path, index=None)
info.to_excel(excel_path)