In [None]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import itertools
import time
import re
import os
import glob
import MeCab
from numpy.random import *

# Widen the pandas display limits used when frames are rendered in the notebook
for option_name, option_value in (("display.max_rows", 250), ("display.max_columns", 100)):
    pd.set_option(option_name, option_value)

# Cabochaの結果をクレンジング

In [None]:
# Load the input files
nlp_path = "D:/Statistics/data/NLP/"

# Every column is read as text first; the two id columns are then cast to integers.
tdb = pd.read_csv(nlp_path + "tdb/tdb_result_by_sentence.csv", dtype="str")
tdb["d_id"] = tdb["d_id"].astype("int")
tdb["no"] = tdb["no"].astype("int")

# Read the whole text file into a list with one element per line.
# The context manager closes the handle even when reading raises.
with open(nlp_path + "tdb/cabocha_text_neologd.txt", encoding="utf-8") as f:
    cabocha_text = f.readlines()

In [None]:
# Normalise whitespace: drop newline characters and turn every tab into a space.
# The list is updated in place so other references to it see the cleaned lines.
n = len(cabocha_text)
cabocha_text[:] = [
    raw_line.replace("\n", "").replace("\t", " ")
    for raw_line in cabocha_text
]

In [None]:
# Extract the dependency-parse results
# Output containers: one entry is appended per kept morpheme line
get_id = 0
id_list = []
res_morpheme = []
res_dependency = []

# Patterns are raw strings; the previous "\*" and "\," were invalid string escapes.
chunk_header = re.compile(r"\* [0-9]")   # a line starting with "* <digit>"
field_sep = re.compile(r"[ ,]")          # fields are separated by a space or a comma
doc_ids = np.array(tdb["d_id"])          # hoisted so the loop does a plain array lookup

# Process the text one line at a time.
# Iterating over the list itself (instead of range(n)) keeps this cell correct
# even after a later cell has rebound the notebook-wide `n`.
for i, line in enumerate(cabocha_text):
    if i % 100000 == 0:
        print(i)
    if chunk_header.match(line):
        # Header line: remember it for the morpheme lines that follow
        dependency = line
    else:
        split_result = field_sep.split(line)
        if len(split_result) == 8:
            # 8-field rows are padded by repeating the first field twice
            # so that they pass the 10-field check below
            split_result.extend([split_result[0]] * 2)
        if len(split_result) == 10:
            id_list.append(doc_ids[get_id])
            res_dependency.append(dependency.split(" "))
            res_morpheme.append(split_result)
    if line == "EOS":
        # "EOS" closes the current entry; advance to the next row of tdb
        get_id += 1

In [None]:
# Build the data frame
# Turn the three parallel lists into frames and join them column-wise
d_id = pd.DataFrame({"d_id": np.array(id_list)})

# Header fields: column 0 (the leading "*" token) is dropped
dependency = pd.DataFrame(np.array(res_dependency)).iloc[:, 1:]
dependency.columns = ["send_id", "receive_id", "head", "score"]

# Morpheme fields: only the first eight columns are kept
morpheme = pd.DataFrame(np.array(res_morpheme)).iloc[:, :8]
morpheme.columns = ["word", "class", "class_detail1", "class_detail2", "class_detail3", "inflection1", "inflection2", "genkei"]

dependency_df = pd.concat((d_id, dependency, morpheme), axis=1)

# Discard rows whose word is the empty string, then renumber the index from 0
non_empty_rows = np.flatnonzero(np.array(dependency_df["word"] != ""))
dependency_df = dependency_df.iloc[non_empty_rows]
dependency_df.index = np.arange(dependency_df.shape[0])

In [None]:
# Convert the column dtypes
# Remove the "D" marker from receive_id before the integer cast (vectorised, no Python loop)
dependency_df["receive_id"] = (
    dependency_df["receive_id"].str.replace("D", "", regex=False).astype("int")
)
dependency_df["send_id"] = dependency_df["send_id"].astype("int")
dependency_df["score"] = dependency_df["score"].astype("float")

# Define a new phrase id
# Each distinct (d_id, send_id) pair gets a consecutive integer, numbered in order of
# first appearance; factorize yields those codes directly, so no merge is needed.
joint_id = np.array(dependency_df["d_id"].astype("str") + "-" + dependency_df["send_id"].astype("str"))
phrase_codes, unique_joint_id = pd.factorize(joint_id)
phrase_mapping = pd.DataFrame({"joint_id": unique_joint_id, "phrase_id": np.arange(len(unique_joint_id))})
dependency_df["phrase_id"] = phrase_codes

# Attach the auxiliary columns from tdb
# NOTE(review): if d_id is not unique in tdb this merge duplicates rows — TODO confirm
result = pd.merge(dependency_df, tdb[["kgcd", "d_id", "no"]], on="d_id", how="inner")
result["serial_no"] = np.arange(result.shape[0])
result = result[["serial_no", "kgcd", "d_id", "no", "phrase_id", "send_id", "receive_id", "head", "score", "word", "genkei",
                 "class", "class_detail1", "class_detail2", "class_detail3", "inflection1", "inflection2"]]

# Rewrite "及び" as "および" where it is tagged as a conjunction.
# A single .loc assignment replaces the chained `result["word"].iloc[...] = ...`,
# which writes to a temporary and is a no-op under pandas Copy-on-Write.
is_oyobi = (result["word"] == "及び") & (result["class"] == "接続詞")
result.loc[is_oyobi, ["word", "genkei"]] = "および"

# Free the large intermediates
del res_dependency, res_morpheme, id_list, dependency, morpheme

In [None]:
# Write out the data frame
# index=False is the documented way to omit the row index (None only worked because it is falsy)
result.to_csv(nlp_path + "tdb/tdb_corpus.csv", index=False)

# Also export Excel workbooks: the distinct kgcd values are divided into `split`
# groups, and each group's rows go to their own file.
split = 3
kgcd = np.unique(result["kgcd"])
split_kgcd = np.array_split(kgcd, split, 0)
for j, kgcd_group in enumerate(split_kgcd):
    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0
    in_group = np.isin(result["kgcd"], kgcd_group)
    result_split = result.iloc[np.flatnonzero(in_group)]
    result_split.to_excel(nlp_path + "tdb/tdb_corpus" + str(j) + ".xlsx")

In [None]:
# Number of distinct phrase ids in the final table
len(pd.unique(result["phrase_id"]))

In [None]:
# Sorted array of the distinct phrase ids
np.unique(np.array(result["phrase_id"]))