In [None]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import itertools
import time
import re
import os
import glob
import jaconv
import mojimoji
from numpy.random import *

# Raise the notebook display limits so wide/long frames are shown in full
pd.options.display.max_rows = 250
pd.options.display.max_columns = 100

# データの処理

## データの読み込み

In [None]:
# Load the corpus
# Locate the corpus files
path = "C:/statistics/data/scenario_extract/"
corpus_path = path + "tdb/corpus/NTC_1.5/dat/"
# glob.glob returns matches in arbitrary order; sort them so that the ids
# assigned from the reading order are reproducible between runs and machines.
filelist = sorted(glob.glob(corpus_path + "num/ipa/*.dat"))
m = len(filelist)
if m == 0:
    # Fail early with a clear message instead of an opaque concatenation error.
    raise FileNotFoundError("No .dat files found under " + corpus_path + "num/ipa/")

# Read every file and cut its content at each "EOS" marker
# NOTE(review): the split is on the bare substring "EOS", so a token that
# contains "EOS" would also be cut - confirm this cannot occur in the corpus.
text_chunks = []
for file_name in filelist:
    with open(file_name, encoding="euc-jp") as f:
        text_chunks.extend(f.read().split("EOS"))

# One flat object array of chunks. An object dtype stores each chunk at its own
# length instead of padding every chunk to the width of the longest one.
naist_text = np.array(text_chunks, dtype="object")
m = naist_text.shape[0]

## フレーズ情報を処理

In [None]:
# Relate every word line to the phrase header that precedes it
# Accumulators filled across all text chunks
phrase_no = 0
phrase_list = []
head_list = []
info_list = []

# Walk through the text chunks and collect phrase / word information
for i in range(m):
    print(i)

    # Break the chunk into lines and separate phrase headers from word lines
    line_text = pd.Series(naist_text[i].split("\n"))
    is_header = line_text.str.contains("^[0-9]+/[0-9]+$")
    index_phrase = np.where(is_header)[0].astype("int")
    index_word = np.delete(np.arange(line_text.shape[0]), index_phrase)
    n1 = index_phrase.shape[0]

    # A phrase runs from its header up to the next header, or to the end of
    # the chunk for the last phrase.
    upper_bounds = np.append(index_phrase[1:], line_text.shape[0])
    for header_pos, next_pos in zip(index_phrase, upper_bounds):
        members = index_word[(index_word > header_pos) & (index_word < next_pos)]
        allocation = np.array(line_text.iloc[members])
        n2 = allocation.shape[0]

        # Store one entry per word line of this phrase
        phrase_list.append(np.repeat(phrase_no, n2))
        head_list.append(np.repeat(line_text[header_pos], n2))
        info_list.append(allocation)
        phrase_no += 1

In [None]:
# Flatten the per-phrase lists into one flat array each
phrase_no = np.concatenate(phrase_list)
head = np.concatenate(head_list)
info = np.concatenate(info_list)

## 形態素情報を処理

In [None]:
# Store the morpheme information in arrays
# Split every word line into its tab-separated fields
line_info = pd.Series(info).str.split("\t")
N = line_info.shape[0]
cl = 4  # number of part-of-speech columns kept per word

# Output arrays; rows that are not morpheme records keep these defaults.
# NOTE: the name `types` is reassigned further down the notebook to hold the
# `type="..."` attribute, so these integer codes are not part of the final frame.
types = np.repeat(0, N)
readings = np.repeat("", N).astype("object")
genkei = np.repeat("", N).astype("object")
classes = np.full((N, cl), "").astype("object")

# Fill the arrays one word at a time (a single tolist() avoids per-row .iloc lookups)
for i, fields in enumerate(line_info.tolist()):
    # Lines without any tab (e.g. empty lines) carry no morpheme information.
    if len(fields) <= 1:
        continue
    types[i] = int(fields[0])
    readings[i] = jaconv.kata2hira(fields[1])
    genkei[i] = fields[2]
    # Split into at most `cl` levels: a tag with more levels keeps the remainder
    # in the last column instead of failing on a negative padding length.
    class_split = fields[3].split("-", cl - 1)
    classes[i, :len(class_split)] = class_split

# Define the sentence id: a new sentence starts on the row after each "。"
flag = np.repeat(0, N)
sentence_start = np.where(genkei == "。")[0] + 1
# Drop the position past the end, which appears when the very last row is "。".
flag[sentence_start[sentence_start < N]] = 1
sentence_id = np.cumsum(flag)

# Build the data frames
class_info = pd.DataFrame(classes, columns=["class", "class_detail1", "class_detail2", "class_detail3"])
phrase_info = pd.DataFrame({"sentence_id": sentence_id, "phrase_id": phrase_no, "head": head})
morpheme_info = pd.concat((pd.DataFrame({"genkei": genkei, "readings": readings}), class_info), axis=1)

In [None]:
# Fill in readings that are still empty
# Rows with an empty reading whose base form contains full-width letters or kana
index_space = np.where(morpheme_info["readings"]=="")[0].astype("int")
index_replace = index_space[np.where(morpheme_info["genkei"].iloc[index_space].str.contains("[ａ-ｚＡ-Ｚあ-んア-ン]"))[0]]
target = np.array(morpheme_info["genkei"].iloc[index_replace])

# Use the base form (katakana converted to hiragana) as the reading.
# The assignment goes through the frame in a single .iloc call: the chained form
# morpheme_info["readings"].iloc[...] = ... writes to a temporary object and
# leaves the frame unchanged when pandas Copy-on-Write is active.
if index_replace.shape[0] > 0:
    readings_col = morpheme_info.columns.get_loc("readings")
    morpheme_info.iloc[index_replace, readings_col] = [jaconv.kata2hira(word) for word in target]

## 係り受け情報と格解析情報を処理

In [None]:
# Wrap the raw word lines in a pandas Series
info_series = pd.Series(info)

# Extract the frame id
# Rows where `id="` is preceded by a tab or a space
index_frame = np.where(info_series.str.contains("\tid=\"| id=\""))[0].astype("int")
frame_id = np.repeat(-1, N)  # -1 marks rows without an id
n = index_frame.shape[0]

# The value is read with the same tab/space prefix as the row filter above.
# An unanchored `id="..."` search would also match the tail of `ana_id="..."`
# or `ant_id="..."` when one of those appears earlier on the line.
id_pattern = re.compile("[\t ]id=\"([^\"]+)\"")
for row in index_frame:
    found = id_pattern.search(info_series.iloc[row])
    if found is not None:
        frame_id[row] = int(found.group(1))

In [None]:
# Extract the ga / o / ni case arguments and their types
# Rows that mention at least one of the three cases
index_gaoni = np.where(info_series.str.contains("ga=\"|o=\"|ni=\""))[0].astype("int")
n = index_gaoni.shape[0]

# One (search pattern, strip pattern) pair per output array, listed in the
# same order in which the arrays are unpacked at the end of the cell.
case_patterns = [
    ("ga=\".+?\"", "ga=|\""),
    ("o=\".+?\"", "o=|\""),
    ("ni=\".+?\"", "ni=|\""),
    ("ga_type=\".+?\"", "ga_type=|\""),
    ("o_type=\".+?\"", "o_type=|\""),
    ("ni_type=\".+?\"", "ni_type=|\""),
]

# Object arrays that keep "" wherever the attribute is absent
case_arrays = [np.repeat("", N).astype("object") for _ in case_patterns]

# For each candidate row, keep the first match of every attribute with the
# attribute name and the quotes stripped off.
for row in index_gaoni:
    line = info_series.iloc[row]
    for (find_pattern, strip_pattern), values in zip(case_patterns, case_arrays):
        hits = re.findall(find_pattern, line)
        if hits:
            values[row] = re.sub(strip_pattern, "", hits[0])

ga_id, o_id, ni_id, ga_type, o_type, ni_type = case_arrays

In [None]:
# Extract the remaining annotation attributes
def _extract_attribute(lines, name):
    """Collect the value of the attribute ``name="..."`` from every line.

    The attribute name must be preceded by a tab or a space, both when the
    rows are selected and when the value is read. This keeps a short name such
    as ``type`` from matching the tail of ``ga_type``, ``noun_type`` and so on,
    which an unanchored search for ``type="..."`` would do whenever the longer
    attribute comes first on the line.

    Parameters
    ----------
    lines : pandas.Series
        One string per word line.
    name : str
        Attribute name to look for.

    Returns
    -------
    rows : numpy.ndarray
        Integer positions of the lines that contain the attribute.
    values : numpy.ndarray
        Object array with one entry per line: the text between the quotes,
        or "" where the attribute is absent.
    """
    prefix = "[\t ]" + re.escape(name) + "=\""
    rows = np.where(lines.str.contains(prefix))[0].astype("int")
    # [^"]* stops at the closing quote, so an empty value yields "" rather
    # than running on into the next attribute.
    value_pattern = re.compile(prefix + "([^\"]*)\"")
    values = np.repeat("", lines.shape[0]).astype("object")
    for row in rows:
        found = value_pattern.search(lines.iloc[row])
        if found is not None:
            values[row] = found.group(1)
    return rows, values


# Row positions and values for each attribute
index_eq, eq = _extract_attribute(info_series, "eq")
# NOTE: this rebinds `types`, which an earlier cell used for the integer morpheme codes.
index_type, types = _extract_attribute(info_series, "type")
index_alt, alt = _extract_attribute(info_series, "alt")
index_noun_type, noun_type = _extract_attribute(info_series, "noun_type")
index_ana_id, ana_id = _extract_attribute(info_series, "ana_id")
index_ant_id, ant_id = _extract_attribute(info_series, "ant_id")
index_ana_type, ana_type = _extract_attribute(info_series, "ana_type")
index_refexp_type, refexp_type = _extract_attribute(info_series, "refexp_type")

## データフレームを出力

In [None]:
# Combine everything into a single data frame
# Annotation columns, keyed by their output column name
frame_columns = {
    "frame_id": frame_id,
    "ga_id": ga_id,
    "o_id": o_id,
    "ni_id": ni_id,
    "ga_type": ga_type,
    "o_type": o_type,
    "ni_type": ni_type,
    "eq": eq,
    "type": types,
    "alt": alt,
    "noun_type": noun_type,
    "ant_id": ant_id,
    "ana_id": ana_id,
    "ana_type": ana_type,
    "refexp": refexp_type,
}
frame_info = pd.DataFrame(frame_columns)

# Place phrase, morpheme and annotation columns side by side
combined_info = pd.concat((phrase_info, morpheme_info, frame_info), axis=1)

# Keep only the rows that hold a morpheme (non-empty base form)
has_morpheme = combined_info["genkei"] != ""
naist_info = combined_info.loc[has_morpheme].copy()

# Running row number as the first column, and a fresh 0..n-1 index
row_numbers = np.arange(naist_info.shape[0])
naist_info.insert(0, "serial_no", row_numbers)
naist_info.index = np.arange(naist_info.shape[0])

In [None]:
# Write the data frame to disk
split = 3  # number of Excel files the sentences are spread over

# Full table as CSV, without the index column
naist_info.to_csv(path + "tdb/corpus/naist_info.csv", index=False)

# Divide the sentence ids into `split` consecutive groups and write one Excel file per group
sentence_id = np.unique(naist_info["sentence_id"]).astype("int")
split_sentence = np.array_split(sentence_id, split, 0)
for j in range(split):
    # np.isin is the supported replacement for np.in1d, which NumPy 2.0 deprecated.
    index = np.where(np.isin(naist_info["sentence_id"], split_sentence[j]))[0]
    output_split = naist_info.iloc[index]
    output_split.to_excel(path + "tdb/corpus/naist_info" + str(j) + ".xlsx")