In [1]:
####WordNetによる単語名寄せ処理####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import itertools
import scipy
from datetime import time, datetime, timedelta
from scipy import sparse
from pandas.tools.plotting import scatter_matrix
from numpy.random import *
from scipy.special import psi 
import re
import MeCab
import neologdn
import sys
import sqlite3

np.random.seed(98537)

In [2]:
##データの読み込み
def data_input(iep, flag1, remove_item, remove_element):
    ##データの設定
    #ニュースデータの読み込み
    input_path = "C:/statistics/data/DJ_news_data/custom_data/DJ_fulldata_new.csv"
    read_data = pd.read_csv(input_path, index_col=0) 
    
    #iep要因のみ抽出
    if iep==1:
        index_iep = np.array(np.where(read_data["model"]=="IEP")[0], dtype="int")
        read_data = read_data.iloc[index_iep]
        read_data.index = np.arange(read_data.shape[0])

    #広範に影響を与える要因のレコードのみを除去
    if flag1==1:
        index_item = np.array(~np.in1d(read_data["item"], remove_item), dtype="int")
        index_element = np.array(~np.in1d(read_data["element"], remove_element), dtype="int")
        index = np.array(np.where((index_item+index_element)==2)[0], dtype="int")
        a = np.unique(read_data["key"].iloc[np.delete(np.arange(read_data.shape[0]), index)])
        b = np.unique(read_data["key"].iloc[index])
        ab = pd.merge(pd.DataFrame({"key": a, "no1": np.arange(a.shape[0])}), 
                      pd.DataFrame({"key": b, "no2": np.arange(b.shape[0])}), on="key", how="left")
        remove_key = np.unique(ab["key"].iloc[np.where(pd.isna(ab["no2"]))[0]])
        ab = pd.merge(read_data[["key"]], pd.DataFrame({"key": remove_key, "no": np.arange(remove_key.shape[0])}), on="key", how="left")
        read_data = read_data.iloc[np.where(pd.isna(ab["no"]))[0]]
        read_data.index = np.arange(read_data.shape[0])
    #広範に影響を与える要因を含むニュースを除去
    if flag1==2:
        index_item = np.array(np.in1d(read_data["item"], remove_item), dtype="int")
        index_element = np.array(np.in1d(read_data["element"], remove_element), dtype="int")   
        index = np.array(np.where((index_item+index_element) > 0)[0], dtype="int")
        key = np.unique(read_data["key"].iloc[index])
        delete_key = pd.merge(read_data[["key"]], pd.DataFrame({"key": key, "no": np.arange(key.shape[0])}), on="key", how="left")
        read_data = read_data.iloc[np.where(pd.isna(delete_key["no"])==True)]
        read_data.index = np.arange(read_data.shape[0])
        
    #カラムの入れ替えとインデックスの定義
    read_data = read_data[["key", "date", "headline", "text", "area", "subject", "item", "element", "predicate", "trend",
                           "tags", "complete", "model", "aiep", "identified"]]
    read_data.index = np.arange(read_data.shape[0])
    return read_data

In [3]:
##データの前処理
def data_preprocess(read_data):
    ##分析済みのデータのみを取り出す
    index_get = np.array(np.where(np.array(pd.isna(read_data[["aiep"]])==False).reshape(-1))[0], dtype="int")
    df = read_data.iloc[index_get, ]
    df.index = np.arange(df.shape[0])
    del read_data
    
    ##単語の名寄せを行う
    #データの読み込み
    area_dic = pd.read_csv("C:/statistics/data/dic/area_pattern_freq.csv", encoding="Shift-Jis")
    item_dic = pd.read_csv("C:/statistics/data/dic/item_pattern_freq.csv", encoding="Shift-Jis")

    #辞書から単語を名寄せ
    tmp_df = df.copy()
    tmp_df = pd.merge(tmp_df, area_dic, left_on="area", right_on="input", how="left")
    tmp_df = pd.merge(tmp_df, item_dic, left_on="item", right_on="input", how="left")
    df["area"] = tmp_df["output2"]; df["item"] = tmp_df["output"]
    del tmp_df
    
    #要因がエリア以外1つしか観測されていないニュースを除く
    df = df.iloc[np.where(np.sum(np.array(~pd.isna(df[["item", "element", "subject", "trend"]])), axis=1) > 1)[0]]
    df.index = np.arange(df.shape[0])
    
    #aiepがすべてnanのニュースを取り除く
    Z = np.zeros((df.shape[0], 5), dtype="int")
    Z[:, 0] = ~pd.isna(df["area"])
    Z[:, 1] = ~pd.isna(df["item"])
    Z[:, 2] = ~pd.isna(df["element"])
    Z[:, 3] = ~pd.isna(df["subject"])
    Z[:, 4] = ~pd.isna(df["trend"])
    df = df.iloc[np.where(np.sum(Z, axis=1) >= 2)[0]]
    df.index = np.arange(df.shape[0])
    
    ##データの設定
    #日付をdatetime型に変更
    df["date"] = pd.to_datetime(df["date"].str[0:19])
    date_range = np.array([np.min(df["date"][df["date"] > "2010"]), np.max(df["date"])])
    #date_range = np.array([np.min(panel_data["日付"]), np.max(panel_data["日付"])])

    #ニュースのある期間のデータのみ抽出
    index = np.array(np.where((df["date"] > date_range[0]) & (df["date"] <= date_range[1]))[0], dtype="int")
    target_df = df.iloc[index]
    target_df.index = np.arange(target_df.shape[0])
    return target_df

In [4]:
#aiepの組み合わせの個数をカウントする
def pattern_count():
    area_vec = target_df["area"]; area_vec[pd.isna(area_vec)] = "抽出なし"
    item_vec = target_df["item"]; item_vec[pd.isna(item_vec)] = "抽出なし"
    element_vec = target_df["element"]; element_vec[pd.isna(element_vec)] = "抽出なし"
    subject_vec = target_df["subject"]; subject_vec[pd.isna(subject_vec)] = "抽出なし"
    trend_vec = target_df["trend"]; trend_vec[pd.isna(trend_vec)] = "抽出なし"
    aiep_vec = area_vec + " - " + item_vec + " - " + element_vec + " - " + subject_vec + " - " + trend_vec
    res = aiep_vec.value_counts()
    freq_df = pd.DataFrame({"pattern": np.array(res.index), "freq": np.array(res, dtype="int")})
    freq_df.to_csv("C:/statistics/data/aiep_pattern_freq.csv", sep=",")
    return freq_df

In [5]:
##ニュースソースを削減する
def delete_news(target_df):
    
    #要因の個数を集計する
    area_count = pd.Series.value_counts(target_df["area"])
    item_count = pd.Series.value_counts(target_df["item"])
    subject_count = pd.Series.value_counts(target_df["subject"])
    element_count = pd.Series.value_counts(target_df["element"])

    area_count.to_csv("C:/statistics/data/area_pattern_freq.csv", sep=",")
    item_count.to_csv("C:/statistics/data/item_pattern_freq.csv", sep=",")
    subject_count.to_csv("C:/statistics/data/subject_pattern_freq.csv", sep=",")
    element_count.to_csv("C:/statistics/data/element_pattern_freq.csv", sep=",")
    
    ##aiepに数値idを設定
    #ユニークな要素を抽出
    unique_area = pd.unique(target_df["area"]); area_n = unique_area.shape[0]
    unique_item = pd.unique(target_df["item"]); item_n = unique_item.shape[0]
    unique_subject = pd.unique(target_df["subject"]); subject_n = unique_subject.shape[0]
    unique_element = pd.unique(target_df["element"]); element_n = unique_element.shape[0]
    unique_trend = pd.unique(target_df["trend"]); trend_n = unique_trend.shape[0]
    unique_predicate = pd.unique(target_df["predicate"]); predicate_n = unique_predicate.shape[0]
    unique_tags = pd.unique(target_df["tags"]); tags_n = unique_tags.shape[0]

    #マスターデータにidを設定
    area_df = pd.DataFrame({"area": unique_area, "id": np.arange(area_n)})
    area_id = np.array(pd.merge(target_df[["area"]], area_df, on="area", how="left")["id"], dtype="int")

    unique_item = np.append(unique_item[~pd.isna(pd.Series(unique_item))], np.nan)
    item_df = pd.DataFrame({"item": unique_item, "id": np.arange(item_n)})
    item_id = np.array(pd.merge(target_df[["item"]], item_df, on="item", how="left")["id"], dtype="int")

    unique_subject = np.append(unique_subject[~pd.isna(pd.Series(unique_subject))], np.nan)
    subject_df = pd.DataFrame({"subject": unique_subject, "id": np.arange(subject_n)})
    subject_id = np.array(pd.merge(target_df[["subject"]], subject_df, on="subject", how="left")["id"], dtype="int")

    unique_element = np.append(unique_element[~pd.isna(pd.Series(unique_element))], np.nan)
    element_df = pd.DataFrame({"element": unique_element, "id": np.arange(element_n)})
    element_id = np.array(pd.merge(target_df[["element"]], element_df, on="element", how="left")["id"], dtype="int")

    unique_trend = np.append(unique_trend[~pd.isna(pd.Series(unique_trend))], np.nan)
    trend_df = pd.DataFrame({"trend": unique_trend, "id": np.arange(trend_n)})
    trend_id = np.array(pd.merge(target_df[["trend"]], trend_df, on="trend", how="left")["id"], dtype="int")
    return target_df, area_df, area_id, item_df, item_id, subject_df, subject_id, element_df, element_id, trend_df, trend_id

In [6]:
##ニュースソースの重複を削除する
def correspond_data(target_df, area_id, item_id, subject_id, element_id, trend_id):
    #ニュースデータの日付を市場が開いている時間に繰り越す
    index = np.array(np.where((target_df["date"].apply(lambda x:x.time()) >= time(hour=0)) & 
                              (target_df["date"].apply(lambda x:x.time()) <= time(hour=15)))[0], dtype="int")
    index_target = np.delete(np.arange(target_df.shape[0]), index)
    new_date = target_df[["date"]].copy()
    new_date["date"].iloc[index_target] = target_df["date"].iloc[index_target] + timedelta(days=1)

    #日付のデータ型を数値型に変更
    df_date = np.array((new_date["date"].dt.date.astype("str")).str.replace("-", ""), dtype="int")
    unique_date = np.array(np.sort(np.unique(df_date)), dtype="int")  
    date_n = unique_date.shape[0]

    #重複しているニュースを特定
    tmp_df = pd.concat((pd.DataFrame(df_date), target_df[["area", "subject", "item", "element", "trend"]]), axis=1)
    tmp_df = tmp_df.rename(columns={0: "date"})
    tmp_df = tmp_df.fillna("hoge")
    index_dup = np.array(tmp_df.duplicated())
    joint_tag = tmp_df["date"].astype("U8") + "-" + tmp_df["area"] + "-" + tmp_df["subject"] +\
                    "- " + tmp_df["item"] + "-" + tmp_df["element"] + "-" + tmp_df["trend"]
    joint_count = joint_tag.value_counts()
    pd.DataFrame({"tag": joint_count.index, "freq": np.array(joint_count, dtype="int")}).to_csv("C:/statistics/data/record_dup.csv")

    #重複を削除
    target_df = target_df.iloc[~index_dup, ]
    target_df.index = np.arange(target_df.shape[0])
    area_id = area_id[~index_dup]
    item_id = item_id[~index_dup]
    subject_id = subject_id[~index_dup]
    element_id = element_id[~index_dup]
    trend_id = trend_id[~index_dup]
    df_date = df_date[~index_dup]
    
    return target_df, area_id, item_id, subject_id, element_id, trend_id, df_date

In [7]:
##インデックスを設定
def create_index(area_id, item_id, subject_id, element_id, trend_id):
    #アイテムごとのユニーク数を数える
    area_n = np.unique(area_id).shape[0]
    item_n = np.unique(item_id).shape[0]
    subject_n = np.unique(subject_id).shape[0]
    element_n = np.unique(element_id).shape[0]
    trend_n = np.unique(trend_id).shape[0]
    
    #インデックスを定義
    index_area = [i for i in range(area_n)]
    index_item = [i for i in range(item_n)]
    index_subject = [i for i in range(subject_n)]
    index_element = [i for i in range(element_n)]
    index_trend = [i for i in range(trend_n)]
    for i in range(area_n):
        index_area[i] = np.array(np.where(area_id==i)[0], dtype="int")
    for i in range(item_n):
        index_item[i] = np.array(np.where(item_id==i)[0], dtype="int")
    for i in range(subject_n):
        index_subject[i] = np.array(np.where(subject_id==i)[0], dtype="int")
    for i in range(element_n):
        index_element[i] = np.array(np.where(element_id==i)[0], dtype="int")
    for i in range(trend_n):
        index_trend[i] = np.array(np.where(trend_id==i)[0], dtype="int")
    return index_area, index_item, index_subject, index_element, index_trend

In [8]:
##文書をテキストとaiepに分離する
def df_allocation(target_df):
    #ユニークなテキストを抽出
    index = np.array(np.where(~target_df["key"].duplicated()==True)[0], dtype="int")
    key_id = pd.DataFrame(np.arange(index.shape[0])[:, np.newaxis]).rename(columns={0: "key_id"})
    text_data = target_df[["key", "date", "headline", "text"]].iloc[index]
    text_data.index = np.arange(index.shape[0])
    text_data = pd.concat((key_id, text_data), axis=1)

    #aiepのデータフレームを作成
    aiep_data = target_df[["key", "date", "area", "subject", "item", "element", "predicate", "trend", "tags", "model", "aiep"]]
    temp_id = pd.merge(aiep_data[["key"]], text_data[["key", "key_id"]], on="key", how="left")[["key_id"]]
    aiep_data = pd.concat((temp_id, aiep_data), axis=1)
    return text_data, aiep_data

In [9]:
##パラグラフ単位のテキストを結合
def paragraph_text(text_data):
    #データの読み込み
    pf2010 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2010.csv")
    pf2010 = pf2010[["key", "date_jst", "type", "p_num", "text"]]
    pf2011 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2011.csv")
    pf2011 = pf2011[["key", "date_jst", "type", "p_num", "text"]]
    pf2012 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2012.csv")
    pf2012 = pf2012[["key", "date_jst", "type", "p_num", "text"]]
    pf2013 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2013.csv")
    pf2013 = pf2013[["key", "date_jst", "type", "p_num", "text"]]
    pf2014 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2014.csv")
    pf2014 = pf2014[["key", "date_jst", "type", "p_num", "text"]]
    pf2015 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2015.csv")
    pf2015 = pf2015[["key", "date_jst", "type", "p_num", "text"]]
    pf2016 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2016.csv")
    pf2016 = pf2016[["key", "date_jst", "type", "p_num", "text"]]
    pf2017 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2017.csv")
    pf2017 = pf2017[["key", "date_jst", "type", "p_num", "text"]]
    pf2018 = pd.read_csv("C:/statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2018.csv")
    pf2018 = pf2018[["key", "date_jst", "type", "p_num", "text"]]
    pf = pd.concat((pf2010, pf2011, pf2012, pf2013, pf2014, pf2015, pf2016, pf2017, pf2018), axis=0)
    pf.index = np.arange(pf.shape[0])

    #ターゲットのテキストを抽出
    key_id = np.array(pd.merge(pf[["key"]], text_data[["key", "key_id"]], on="key", how="left")[["key_id"]]).reshape(-1)
    index_target = np.array(np.where(np.isnan(key_id)==False)[0], dtype="int")
    target_pf = pf.iloc[index_target]
    target_pf["key_id"] = np.array(key_id[index_target], dtype="int")
    target_pf = target_pf[["key_id", "key", "date_jst", "type", "p_num", "text"]]
    target_pf.index = np.arange(target_pf.shape[0])
    return target_pf

In [88]:
# 特定の単語を入力とした時に、類義語を検索する関数
def SearchSimilarWords(word):

    # 問い合わせしたい単語がWordnetに存在するか確認する
    cur = conn.execute("select wordid from word where lemma='%s'" % word)
    word_id = 99999999  #temp 
    for row in cur:
        word_id = row[0]

    # Wordnetに存在する語であるかの判定
    if word_id==99999999:
        print("「%s」は、Wordnetに存在しない単語です。" % word)
        return 
    else:
        print("【「%s」の類似語を出力します】\n" % word)

    # 入力された単語を含む概念を検索する
    cur = conn.execute("select synset from sense where wordid='%s'" % word_id)
    synsets = []
    for row in cur:
        synsets.append(row[0])

    # 概念に含まれる単語を検索して画面出力する
    no = 0
    conception = ["" for j in range(len(synsets))]
    meaning = ["" for j in range(len(synsets))]
    synonym = ["" for j in range(len(synsets))]
    for synset in synsets:
        cur1 = conn.execute("select name from synset where synset='%s'" % synset)
        conception0 = np.array([])
        for row1 in cur1:
            conception0 = np.append(conception0, np.array(row1[0]))
        cur2 = conn.execute("select def from synset_def where (synset='%s' and lang='jpn')" % synset)
        meaning0 = np.array([])
        for row2 in cur2:
            meaning0 = np.append(meaning0, np.array(row2[0]))
        cur3 = conn.execute("select wordid from sense where (synset='%s' and wordid!=%s)" % (synset,word_id))
        synonym0 = np.array([])
        for row3 in cur3:
            target_word_id = row3[0]
            cur3_1 = conn.execute("select lemma from word where wordid=%s" % target_word_id)
            for row3_1 in cur3_1:
                synonym0 = np.append(synonym0, np.array(row3_1[0]))
        conception[no] = np.array(conception0, dtype="object")
        meaning[no] = np.array(meaning0, dtype="object")
        synonym[no] = np.array(np.sort(np.append(synonym0, word)), dtype="object")
        no += 1
    return conception, meaning, synonym

In [155]:
##データ解析対象のテキストの抽出とクレンジング
#データの読み込み
remove_item = np.array(["原油", "石油", "米国債", "米ドル", "ユーロ", "日本円", "日本国債", "株式"])
remove_element = np.array(["政策金利", "金利", "株式市場", "米国株", "株価", "利回り", "経済"])
read_data = data_input(1, 1, remove_item, remove_element)

  if (yield from self.run_code(code, result)):
  mask |= (ar1 == a)


In [156]:
#データの加工とデータの絞り込み
target_df = data_preprocess(read_data)
res_delete_news = delete_news(target_df)
target_df, area_df, area_id, item_df, item_id, subject_df, subject_id, element_df, element_id, trend_df, trend_id = res_delete_news
del res_delete_news

#テキスト本文とaiepをそれぞれデータフレーム化する
res_correspond_data = correspond_data(target_df, area_id, item_id, subject_id, element_id, trend_id)
target_df, area_id, item_id, subject_id, element_id, trend_id, df_date = res_correspond_data
del res_correspond_data
index_area, index_item, index_subject, index_element, index_trend = create_index(area_id, item_id, subject_id, element_id, trend_id)
text_data, aiep_data = df_allocation(target_df)
copy_data1 = text_data.copy(); copy_data = aiep_data.copy()   #バックアップ
target_pf = paragraph_text(text_data)   #パラグラフ単位のテキストと結合

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [157]:
#単語の名寄せ処理
conn = sqlite3.connect("C:/statistics/data/dic/wnjpn.db")   #データベースに接続

In [158]:
unique_item = pd.unique(aiep_data["item"].iloc[np.where(~pd.isna(aiep_data["item"]))[0]])
unique_element = pd.unique(aiep_data["element"].iloc[np.where(~pd.isna(aiep_data["element"]))[0]])

In [159]:
# 問い合わせしたい単語がWordnetに存在するか確認する
no_list = [i for i in range(unique_item.shape[0])]
word_list = [i for i in range(unique_item.shape[0])]
synonym_list = [i for i in range(unique_item.shape[0])]
for i in range(unique_item.shape[0]):
    out = SearchSimilarWords(unique_item[i])
    if len(out[0])==0:
        no_list[i] = np.repeat(i, 1)
        word_list[i] = np.array([unique_item[i]], dtype="object")
        synonym_list[i] = np.array(["Wordnetには存在しません"])
        continue
    word = np.array(np.repeat("", len(out[2])), dtype="object")
    synonym = np.array(np.repeat("", len(out[2])), dtype="object")
    for j in range(len(out[2])):
        word[j] = unique_item[i]
        q = out[2][j]
        synonym[j] = " - ".join([str(n) for n in q])
    no_list[i] = np.repeat(i, word.shape[0])
    word_list[i] = word
    synonym_list[i] = synonym

In [174]:
# 類義語のパターンを出力
no = np.array(list(itertools.chain(*[no_list[i] for i in range(unique_item.shape[0])])))
word = np.array(list(itertools.chain(*[word_list[i] for i in range(unique_item.shape[0])])), dtype="object")
synonym = np.array(list(itertools.chain(*[synonym_list[i] for i in range(unique_item.shape[0])])), dtype="object")
out_data = pd.DataFrame({"no": no, "word": word, "synonym": synonym})
out_data = out_data.sort_values(by="synonym")
out_data.index = np.arange(out_data.shape[0])
temp_data = pd.DataFrame({"synonym": pd.unique(synonym), "pattern": np.arange(np.unique(synonym).shape[0])})
out_data = pd.merge(out_data, temp_data, on="synonym", how="left")
out_data = out_data[["no", "word", "pattern", "synonym"]]
out_data = out_data.sort_values(by="pattern")
freq = out_data["pattern"].value_counts()
freq = pd.DataFrame({"pattern": np.array(freq.index, dtype="int"), "freq" :np.array(freq, dtype="int")})
out_data = pd.merge(out_data, freq, on="pattern", how="left")
out_data.to_csv("C:/statistics/data/synonym_data.csv", index=None, encoding="Shift-Jis")