In [1]:
import json, pprint

import pandas as pd

import matplotlib.pyplot as plt
import japanize_matplotlib
import networkx as nx
from pyvis.network import Network
%matplotlib inline

In [2]:
import os, sys
# Make the parent of the notebook's working directory importable so that the
# `classes` package used below can be found. `__file__` is not defined inside
# a notebook, so the location is derived from the current working directory.
# The path is made absolute so it keeps resolving if the working directory
# changes later in the session.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:  # do not stack duplicates when the cell is re-run
    sys.path.append(PROJECT_ROOT)

In [3]:
from classes.dependency_analysis import DependencyAnalysis



In [4]:
# Input: JSON file with the question records that this notebook loads.
# Both paths are relative and therefore resolved against the working directory.
QUESTION_INPUT_PATH = "../datas/question_sentence/dataset_01.json"
# Output: JSON file that receives the noun counts written by this notebook.
TOKEN_OUTPUT_PATH = "../datas/token/keywords.json"

In [5]:
# Load the question records. Despite its name, `question_dict` is indexed by
# position and measured with len() further down, i.e. it is a list of records.
# The encoding is passed explicitly because the file contains Japanese text
# and open() otherwise uses the platform's locale encoding (for example cp932
# on Japanese Windows), which can fail to decode a UTF-8 file.
# NOTE(review): assumes the dataset is stored as UTF-8, matching how this
# notebook writes its own JSON output - confirm.
with open(QUESTION_INPUT_PATH, "r", encoding="utf-8") as f:
    question_dict = json.load(f)

# Pretty-print the loaded records for a visual check.
pprint.pprint(question_dict, width=150)

[{'answer': ['A = (0, 0, 0)',
             'B = (40, 0, 0)',
             'C = (40, 30, 0)',
             'D = (0, 30, 0)',
             'E = (0, 0, 50)',
             'F = (40, 0, 50)',
             'G = (40, 30, 50)',
             'H = (0, 30, 50)',
             'Segment(A, B)',
             'Segment(B, C)',
             'Segment(C, D)',
             'Segment(D, A)',
             'Segment(E, F)',
             'Segment(F, G)',
             'Segment(G, H)',
             'Segment(H, E)',
             'Segment(A, E)',
             'Segment(B, F)',
             'Segment(C, G)',
             'Segment(D, H)',
             'Segment(D, F)',
             'I = PerpendicularLine(B, Segment(D, F))'],
  'id': 1,
  'impression': ['垂線がなかったけど，PerpendicularLine関数の存在を教えたらいけた．'],
  'normalized': '立体ABCD-EFGHは，AB=40cm，AD=30cm，AE=50cmの直方体である．頂点Dと頂点Fを結び，頂点Bから線分DFに引いた垂線と線分DFとの交点をIとする．線分BIの長さは何cmか．',
  'original': '立体ABCD-EFGHは，AB=40cm，AD=30cm，AE=50cmの直方体である．頂点Dと頂点Fを結び，頂点Bから線分DFに引いた垂線と線分DFとの交点をIとする．線分BIの長さは何

In [6]:
# インスタンス作成
# Build an analyzer for the normalized sentence of the first question.
first_sentence = question_dict[0]["normalized"]
q1 = DependencyAnalysis(document=first_sentence, model="ja_ginza_electra")

# Run the morphological analysis, then emit a single blank line as the
# cell's printed output.
q1.analyze()
print("")

  from .autonotebook import tqdm as notebook_tqdm





In [7]:
# Column order of the per-token table. Declared explicitly so the DataFrame
# does not depend on a variable leaked from the loop below, which would be
# undefined if the document had no tokens.
TOKEN_COLUMNS = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]

# Collect one record per token of the analyzed first question.
tokens_list = []
for token in q1.doc:
    token_data = {
        "text":     token.text,     # surface text
        "lemma":    token.lemma_,   # lemma
        "pos":      token.pos_,     # part of speech
        "tag":      token.tag_,     # detailed part-of-speech tag
        "dep":      token.dep_,     # syntactic dependency relation
        "shape":    token.shape_,   # orthographic shape (x: letter, d: digit)
        "is_alpha": token.is_alpha, # whether the token is alphabetic
        "is_stop":  token.is_stop   # whether the token is in the stop list
    }
    tokens_list.append(token_data)
df_tokens = pd.DataFrame(data=tokens_list, columns=TOKEN_COLUMNS)

# Export the token table so it can be inspected in Excel.
df_tokens.to_excel("token_list.xlsx", index=False)

In [8]:
# インスタンスのリストを作成
# Build one analyzed DependencyAnalysis instance per question.
# The loop uses its own variable name instead of `q1`, so the first-question
# analyzer created earlier is not silently replaced by the last question
# (which would change the result of re-running the token-table cell).
q_instance_list = []
for position, question in enumerate(question_dict, start=1):
    q_instance = DependencyAnalysis(
        document = question["normalized"],
        model = "ja_ginza_electra"
    )
    # Run the morphological analysis before storing the instance.
    q_instance.analyze()
    q_instance_list.append(q_instance)
    print(f"Created instance of {position}/{len(question_dict)}")

Created instance of 1/10
Created instance of 2/10
Created instance of 3/10
Created instance of 4/10
Created instance of 5/10
Created instance of 6/10
Created instance of 7/10
Created instance of 8/10
Created instance of 9/10
Created instance of 10/10


In [9]:
def AddWordDict(word_dict, word):
    """Increment the occurrence count of ``word`` in ``word_dict`` in place.

    A word that is not yet a key is inserted with a count of 1. Nothing is
    returned; the dictionary passed in is mutated.
    """
    word_dict[word] = word_dict.get(word, 0) + 1

# Frequency tables filled from every analyzed question.
nouns = {}          # all nouns
nouns_Jap = {}      # nouns containing non-ASCII characters (Japanese etc.)
nouns_symbol = {}   # ASCII-only nouns
verbs = {}          # all verbs
for instance in q_instance_list:
    for token in instance.doc:
        surface = token.text
        pos = token.pos_

        # Verbs go into their own table.
        if pos == "VERB":
            AddWordDict(verbs, surface)
            continue

        # Every other part of speech except nouns is ignored.
        if pos != "NOUN":
            continue

        # Nouns are counted overall and once more in the ASCII or the
        # non-ASCII table, depending on their text.
        AddWordDict(nouns, surface)
        bucket = nouns_symbol if surface.isascii() else nouns_Jap
        AddWordDict(bucket, surface)

# print(f"Jap: {nouns_Jap}")
# print(f"sym: {nouns_symbol}")
# print(f"vrb: {verbs}")

# pprint.pprint(nouns_Jap)
# pprint.pprint(verbs)

In [10]:
# jsonに書き込み&新規作成
# Make sure the destination directory exists so the write also succeeds on a
# fresh checkout; a bare file name has no directory part and needs none.
output_dir = os.path.dirname(TOKEN_OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the non-ASCII noun counts as UTF-8 JSON. ensure_ascii=False keeps the
# Japanese keys readable instead of escaping them as \uXXXX sequences.
with open(TOKEN_OUTPUT_PATH, 'w', encoding="utf-8") as f:
    json.dump(nouns_Jap, f, sort_keys=False, indent=4, ensure_ascii=False)