In [11]:
####トピックモデルによる単語の低ランク表現####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import itertools
import scipy
from datetime import time, datetime, timedelta
from scipy import sparse
from pandas.tools.plotting import scatter_matrix
from numpy.random import *
from scipy.special import psi 
import re
import MeCab
import neologdn
import sys
import gc

np.random.seed(98537)

In [12]:
##データの読み込み
def data_input(iep, flag1, remove_item, remove_element):
    ##データの設定
    #ニュースデータの読み込み
    input_path = "D:/Statistics/data/DJ_news_data/custom_data/DJ_fulldata_new.csv"
    read_data = pd.read_csv(input_path, index_col=0) 
    
    #iep要因のみ抽出
    if iep==1:
        index_iep = np.array(np.where(read_data["model"]=="IEP")[0], dtype="int")
        read_data = read_data.iloc[index_iep]
        read_data.index = np.arange(read_data.shape[0])

    #広範に影響を与える要因のレコードのみを除去
    if flag1==1:
        index_item = np.array(~np.in1d(read_data["item"], remove_item), dtype="int")
        index_element = np.array(~np.in1d(read_data["element"], remove_element), dtype="int")
        index = np.array(np.where((index_item+index_element)==2)[0], dtype="int")
        a = np.unique(read_data["key"].iloc[np.delete(np.arange(read_data.shape[0]), index)])
        b = np.unique(read_data["key"].iloc[index])
        ab = pd.merge(pd.DataFrame({"key": a, "no1": np.arange(a.shape[0])}), 
                      pd.DataFrame({"key": b, "no2": np.arange(b.shape[0])}), on="key", how="left")
        remove_key = np.unique(ab["key"].iloc[np.where(pd.isna(ab["no2"]))[0]])
        ab = pd.merge(read_data[["key"]], pd.DataFrame({"key": remove_key, "no": np.arange(remove_key.shape[0])}), on="key", how="left")
        read_data = read_data.iloc[np.where(pd.isna(ab["no"]))[0]]
        read_data.index = np.arange(read_data.shape[0])
        
    #広範に影響を与える要因を含むニュースを除去
    if flag1==2:
        index_item = np.array(np.in1d(read_data["item"], remove_item), dtype="int")
        index_element = np.array(np.in1d(read_data["element"], remove_element), dtype="int")   
        index = np.array(np.where((index_item+index_element) > 0)[0], dtype="int")
        key = np.unique(read_data["key"].iloc[index])
        delete_key = pd.merge(read_data[["key"]], pd.DataFrame({"key": key, "no": np.arange(key.shape[0])}), on="key", how="left")
        read_data = read_data.iloc[np.where(pd.isna(delete_key["no"])==True)]
        read_data.index = np.arange(read_data.shape[0])
        
    #カラムの入れ替えとインデックスの定義
    read_data = read_data[["key", "date", "headline", "text", "area", "subject", "item", "element", "predicate", "trend",
                           "tags", "complete", "model", "aiep", "identified"]]
    read_data.index = np.arange(read_data.shape[0])
    return read_data

In [13]:
##データの前処理
def data_preprocess(read_data):
    ##分析済みのデータのみを取り出す
    index_get = np.array(np.where(np.array(pd.isna(read_data[["aiep"]])==False).reshape(-1))[0], dtype="int")
    df = read_data.iloc[index_get, ]
    df.index = np.arange(df.shape[0])
    del read_data
    
    ##単語の名寄せを行う
    #データの読み込み
    area_dic = pd.read_csv("D:/Statistics/data/dic/area_pattern_freq.csv", encoding="Shift-Jis")
    item_dic = pd.read_csv("D:/Statistics/data/dic/item_pattern_freq.csv", encoding="Shift-Jis")

    #辞書から単語を名寄せ
    tmp_df = df.copy()
    tmp_df = pd.merge(tmp_df, area_dic, left_on="area", right_on="input", how="left")
    tmp_df = pd.merge(tmp_df, item_dic, left_on="item", right_on="input", how="left")
    df["area"] = tmp_df["output2"]; df["item"] = tmp_df["output"]
    del tmp_df
    
    #要因がエリア以外1つしか観測されていないニュースを除く
    df = df.iloc[np.where(np.sum(np.array(~pd.isna(df[["item", "element", "subject", "trend"]])), axis=1) > 1)[0]]
    df.index = np.arange(df.shape[0])
    
    #aiepがすべてnanのニュースを取り除く
    Z = np.zeros((df.shape[0], 5), dtype="int")
    Z[:, 0] = ~pd.isna(df["area"])
    Z[:, 1] = ~pd.isna(df["item"])
    Z[:, 2] = ~pd.isna(df["element"])
    Z[:, 3] = ~pd.isna(df["subject"])
    Z[:, 4] = ~pd.isna(df["trend"])
    df = df.iloc[np.where(np.sum(Z, axis=1) >= 2)[0]]
    df.index = np.arange(df.shape[0])
    
    ##データの設定
    #日付をdatetime型に変更
    df["date"] = pd.to_datetime(df["date"].str[0:19])
    date_range = np.array([np.min(df["date"][df["date"] > "2010"]), np.max(df["date"])])
    #date_range = np.array([np.min(panel_data["日付"]), np.max(panel_data["日付"])])

    #ニュースのある期間のデータのみ抽出
    index = np.array(np.where((df["date"] > date_range[0]) & (df["date"] <= date_range[1]))[0], dtype="int")
    target_df = df.iloc[index]
    target_df.index = np.arange(target_df.shape[0])
    return target_df

In [14]:
#aiepの組み合わせの個数をカウントする
def pattern_count():
    area_vec = target_df["area"]; area_vec[pd.isna(area_vec)] = "抽出なし"
    item_vec = target_df["item"]; item_vec[pd.isna(item_vec)] = "抽出なし"
    element_vec = target_df["element"]; element_vec[pd.isna(element_vec)] = "抽出なし"
    subject_vec = target_df["subject"]; subject_vec[pd.isna(subject_vec)] = "抽出なし"
    trend_vec = target_df["trend"]; trend_vec[pd.isna(trend_vec)] = "抽出なし"
    aiep_vec = area_vec + " - " + item_vec + " - " + element_vec + " - " + subject_vec + " - " + trend_vec
    res = aiep_vec.value_counts()
    freq_df = pd.DataFrame({"pattern": np.array(res.index), "freq": np.array(res, dtype="int")})
    freq_df.to_csv("D:/Statistics/data/aiep_pattern_freq.csv", sep=",")
    return freq_df

In [15]:
##ニュースソースを削減する
def delete_news(target_df):
    
    #要因の個数を集計する
    area_count = pd.Series.value_counts(target_df["area"])
    item_count = pd.Series.value_counts(target_df["item"])
    subject_count = pd.Series.value_counts(target_df["subject"])
    element_count = pd.Series.value_counts(target_df["element"])

    area_count.to_csv("D:/Statistics/data/area_pattern_freq.csv", sep=",")
    item_count.to_csv("D:/Statistics/data/item_pattern_freq.csv", sep=",")
    subject_count.to_csv("D:/Statistics/data/subject_pattern_freq.csv", sep=",")
    element_count.to_csv("D:/Statistics/data/element_pattern_freq.csv", sep=",")
    
    ##aiepに数値idを設定
    #ユニークな要素を抽出
    unique_area = pd.unique(target_df["area"]); area_n = unique_area.shape[0]
    unique_item = pd.unique(target_df["item"]); item_n = unique_item.shape[0]
    unique_subject = pd.unique(target_df["subject"]); subject_n = unique_subject.shape[0]
    unique_element = pd.unique(target_df["element"]); element_n = unique_element.shape[0]
    unique_trend = pd.unique(target_df["trend"]); trend_n = unique_trend.shape[0]
    unique_predicate = pd.unique(target_df["predicate"]); predicate_n = unique_predicate.shape[0]
    unique_tags = pd.unique(target_df["tags"]); tags_n = unique_tags.shape[0]

    #マスターデータにidを設定
    area_df = pd.DataFrame({"area": unique_area, "id": np.arange(area_n)})
    area_id = np.array(pd.merge(target_df[["area"]], area_df, on="area", how="left")["id"], dtype="int")

    unique_item = np.append(unique_item[~pd.isna(pd.Series(unique_item))], np.nan)
    item_df = pd.DataFrame({"item": unique_item, "id": np.arange(item_n)})
    item_id = np.array(pd.merge(target_df[["item"]], item_df, on="item", how="left")["id"], dtype="int")

    unique_subject = np.append(unique_subject[~pd.isna(pd.Series(unique_subject))], np.nan)
    subject_df = pd.DataFrame({"subject": unique_subject, "id": np.arange(subject_n)})
    subject_id = np.array(pd.merge(target_df[["subject"]], subject_df, on="subject", how="left")["id"], dtype="int")

    unique_element = np.append(unique_element[~pd.isna(pd.Series(unique_element))], np.nan)
    element_df = pd.DataFrame({"element": unique_element, "id": np.arange(element_n)})
    element_id = np.array(pd.merge(target_df[["element"]], element_df, on="element", how="left")["id"], dtype="int")

    unique_trend = np.append(unique_trend[~pd.isna(pd.Series(unique_trend))], np.nan)
    trend_df = pd.DataFrame({"trend": unique_trend, "id": np.arange(trend_n)})
    trend_id = np.array(pd.merge(target_df[["trend"]], trend_df, on="trend", how="left")["id"], dtype="int")
    return target_df, area_df, area_id, item_df, item_id, subject_df, subject_id, element_df, element_id, trend_df, trend_id

In [16]:
##ニュースソースの重複を削除する
def correspond_data(target_df, area_id, item_id, subject_id, element_id, trend_id):
    #ニュースデータの日付を市場が開いている時間に繰り越す
    index = np.array(np.where((target_df["date"].apply(lambda x:x.time()) >= time(hour=0)) & 
                              (target_df["date"].apply(lambda x:x.time()) <= time(hour=15)))[0], dtype="int")
    index_target = np.delete(np.arange(target_df.shape[0]), index)
    new_date = target_df[["date"]].copy()
    new_date["date"].iloc[index_target] = target_df["date"].iloc[index_target] + timedelta(days=1)

    #日付のデータ型を数値型に変更
    df_date = np.array((new_date["date"].dt.date.astype("str")).str.replace("-", ""), dtype="int")
    unique_date = np.array(np.sort(np.unique(df_date)), dtype="int")  
    date_n = unique_date.shape[0]

    #重複しているニュースを特定
    tmp_df = pd.concat((pd.DataFrame(df_date), target_df[["area", "subject", "item", "element", "trend"]]), axis=1)
    tmp_df = tmp_df.rename(columns={0: "date"})
    tmp_df = tmp_df.fillna("hoge")
    index_dup = np.array(tmp_df.duplicated())
    joint_tag = tmp_df["date"].astype("U8") + "-" + tmp_df["area"] + "-" + tmp_df["subject"] +\
                    "- " + tmp_df["item"] + "-" + tmp_df["element"] + "-" + tmp_df["trend"]
    joint_count = joint_tag.value_counts()
    pd.DataFrame({"tag": joint_count.index, "freq": np.array(joint_count, dtype="int")}).to_csv("D:/Statistics/data/record_dup.csv")

    #重複を削除
    target_df = target_df.iloc[~index_dup, ]
    target_df.index = np.arange(target_df.shape[0])
    area_id = area_id[~index_dup]
    item_id = item_id[~index_dup]
    subject_id = subject_id[~index_dup]
    element_id = element_id[~index_dup]
    trend_id = trend_id[~index_dup]
    df_date = df_date[~index_dup]
    
    return target_df, area_id, item_id, subject_id, element_id, trend_id, df_date

In [17]:
##インデックスを設定
def create_index(area_id, item_id, subject_id, element_id, trend_id):
    #アイテムごとのユニーク数を数える
    area_n = np.unique(area_id).shape[0]
    item_n = np.unique(item_id).shape[0]
    subject_n = np.unique(subject_id).shape[0]
    element_n = np.unique(element_id).shape[0]
    trend_n = np.unique(trend_id).shape[0]
    
    #インデックスを定義
    index_area = [i for i in range(area_n)]
    index_item = [i for i in range(item_n)]
    index_subject = [i for i in range(subject_n)]
    index_element = [i for i in range(element_n)]
    index_trend = [i for i in range(trend_n)]
    for i in range(area_n):
        index_area[i] = np.array(np.where(area_id==i)[0], dtype="int")
    for i in range(item_n):
        index_item[i] = np.array(np.where(item_id==i)[0], dtype="int")
    for i in range(subject_n):
        index_subject[i] = np.array(np.where(subject_id==i)[0], dtype="int")
    for i in range(element_n):
        index_element[i] = np.array(np.where(element_id==i)[0], dtype="int")
    for i in range(trend_n):
        index_trend[i] = np.array(np.where(trend_id==i)[0], dtype="int")
    return index_area, index_item, index_subject, index_element, index_trend

In [18]:
##文書をテキストとaiepに分離する
def df_allocation(target_df):
    #ユニークなテキストを抽出
    index = np.array(np.where(~target_df["key"].duplicated()==True)[0], dtype="int")
    key_id = pd.DataFrame(np.arange(index.shape[0])[:, np.newaxis]).rename(columns={0: "key_id"})
    text_data = target_df[["key", "date", "headline", "text"]].iloc[index]
    text_data.index = np.arange(index.shape[0])
    text_data = pd.concat((key_id, text_data), axis=1)

    #aiepのデータフレームを作成
    aiep_data = target_df[["key", "date", "area", "subject", "item", "element", "predicate", "trend", "tags", "model", "aiep"]]
    temp_id = pd.merge(aiep_data[["key"]], text_data[["key", "key_id"]], on="key", how="left")[["key_id"]]
    aiep_data = pd.concat((temp_id, aiep_data), axis=1)
    return text_data, aiep_data

In [19]:
##パラグラフ単位のテキストを結合
def paragraph_text(text_data):
    #データの読み込み
    pf2010 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2010.csv")
    pf2010 = pf2010[["key", "date_jst", "type", "p_num", "text"]]
    pf2011 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2011.csv")
    pf2011 = pf2011[["key", "date_jst", "type", "p_num", "text"]]
    pf2012 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2012.csv")
    pf2012 = pf2012[["key", "date_jst", "type", "p_num", "text"]]
    pf2013 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2013.csv")
    pf2013 = pf2013[["key", "date_jst", "type", "p_num", "text"]]
    pf2014 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2014.csv")
    pf2014 = pf2014[["key", "date_jst", "type", "p_num", "text"]]
    pf2015 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2015.csv")
    pf2015 = pf2015[["key", "date_jst", "type", "p_num", "text"]]
    pf2016 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2016.csv")
    pf2016 = pf2016[["key", "date_jst", "type", "p_num", "text"]]
    pf2017 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2017.csv")
    pf2017 = pf2017[["key", "date_jst", "type", "p_num", "text"]]
    pf2018 = pd.read_csv("D:/Statistics/data/DJ_news_data/DJNML/csv_paragraph/DJNWS_2018.csv")
    pf2018 = pf2018[["key", "date_jst", "type", "p_num", "text"]]
    pf = pd.concat((pf2010, pf2011, pf2012, pf2013, pf2014, pf2015, pf2016, pf2017, pf2018), axis=0)
    pf.index = np.arange(pf.shape[0])

    #ターゲットのテキストを抽出
    key_id = np.array(pd.merge(pf[["key"]], text_data[["key", "key_id"]], on="key", how="left")[["key_id"]]).reshape(-1)
    index_target = np.array(np.where(np.isnan(key_id)==False)[0], dtype="int")
    target_pf = pf.iloc[index_target]
    target_pf["key_id"] = np.array(key_id[index_target], dtype="int")
    target_pf = target_pf[["key_id", "key", "date_jst", "type", "p_num", "text"]]
    target_pf.index = np.arange(target_pf.shape[0])
    return target_pf

In [20]:
##データ解析対象のテキストの抽出とクレンジング
#データの読み込み
remove_item = np.array(["原油", "石油", "米国債", "米ドル", "ユーロ", "日本円", "日本国債", "株式"])
remove_element = np.array(["政策金利", "金利", "株式市場", "米国株", "株価", "利回り", "経済"])
read_data = data_input(1, 2, remove_item, remove_element)

  if self.run_code(code, result):
  mask |= (ar1 == a)


In [21]:
#データの加工とデータの絞り込み
target_df = data_preprocess(read_data)
res_delete_news = delete_news(target_df)
target_df, area_df, area_id, item_df, item_id, subject_df, subject_id, element_df, element_id, trend_df, trend_id = res_delete_news
del res_delete_news

#テキスト本文とaiepをそれぞれデータフレーム化する
res_correspond_data = correspond_data(target_df, area_id, item_id, subject_id, element_id, trend_id)
target_df, area_id, item_id, subject_id, element_id, trend_id, df_date = res_correspond_data
del res_correspond_data
index_area, index_item, index_subject, index_element, index_trend = create_index(area_id, item_id, subject_id, element_id, trend_id)
text_data, aiep_data = df_allocation(target_df)
copy_data1 = text_data.copy(); copy_data = aiep_data.copy()   #バックアップ
target_pf = paragraph_text(text_data)   #パラグラフ単位のテキストと結合

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
####テキストの形態素解析の実行####
##keyのインデックスとidを作成する関数
def make_index(df):
    #データの準備
    key = np.array(df[["key_id"]]).reshape(-1)
    unique_key = np.unique(key)
    N = key.shape[0]
    n = unique_key.shape[0]
    a = pd.DataFrame({"key": key})
    b = pd.DataFrame({"no": np.arange(unique_key.shape[0]), "key": unique_key})
    temporal_data = pd.merge(a, b , on="key", how="left")
    temporal_id = np.array(temporal_data[["no"]]).reshape(-1)

    #オブジェクトの格納用配列
    index_key = [i for i in range(n)]
    n_key = np.repeat(0, n)
    first_key = np.repeat(0, n)

    #keyのインデックスを作成
    for i in range(n):
        index_key[i] = np.array(np.where(temporal_id==i)[0], dtype="int")
        n_key[i] = index_key[i].shape[0]
        first_key[i] = index_key[i][0]
    return key, unique_key, temporal_id, N, n, index_key, n_key, first_key

In [23]:
##textの形態素解析の実行と結果の整形する関数
def mecab_text(df_part, get_class):
    #Mecabの設定
    m = MeCab.Tagger()

    #データの設定
    n = df_part.shape[0]
    text_vec = np.array(df_part[["text"]]).reshape(-1)
    text_list = [i for i in range(n)]
    id_list = [i for i in range(n)]
    index_list = [i for i in range(n)]
    w = np.repeat(0, n)
    max_n = 0

    #文書ごとに形態素解析結果をリストに格納
    for i in range(n):
        if i%10000==0:
            print(i)
        #形態素解析を実行
        n_text = neologdn.normalize(text_vec[i])
        res = m.parse(n_text)

        #結果を整形
        parsed_text = res.split('\n')
        parsed_results = pd.Series(parsed_text).str.split('\t|,').tolist()
        temp_df = pd.DataFrame.from_records(parsed_results[0:len(parsed_results)-2])
        temp_text = np.array(temp_df)[:, np.array([0, 7, 1, 2])]
        index_get = np.where(np.in1d(temp_text[:, 2], get_class))[0]
        if index_get.shape[0]==0:
            continue
        text_list[i] = temp_text[index_get, ]

        #IDを格納
        w[i] = text_list[i].shape[0]
        id_list[i] = np.repeat(i, w[i])
        index_list[i] = np.arange(w[i]) + max_n
        max_n = np.max(index_list[i]) + 1
    return text_list, id_list, index_list, w

In [24]:
##形態素結果データフレームを作成
def create_word_df(text_list, id_list, index_list, w, target_pf):
    #データの設定
    d = len(index_list)
    f = np.max(index_list[d-1]) + 1
    text = np.zeros((f, 4), dtype="object")
    d_id = np.repeat(0, f)

    #単語配列と単語IDを作成
    for i in range(d):
        text[index_list[i], ] = text_list[i]
        d_id[index_list[i], ] = id_list[i]
    index_genkei = np.array(np.where((text[:, 0]!=text[:, 1]) & (text[:, 1]!="*"))[0], dtype="int")
    text[index_genkei, 0] = text[index_genkei, 1]
    text = text[:, np.array([0, 2, 3])]

    #key_idを単語単位に拡張する
    index_w = np.where(w > 0)[0]
    key_id = np.repeat(np.array(target_pf["key_id"].iloc[index_w]), w[index_w])

    #データフレームを作成
    word_df = pd.DataFrame({"key": key_id, "id": d_id, "word": text[:, 0], "word_class1": text[:, 1], "word_class2": text[:, 2]})
    return word_df

In [25]:
##単語の名寄せを行う
def replace_word(word_df):
    #データの読み込み
    area_dic = pd.read_csv("D:/Statistics/data/dic/area_pattern_freq.csv", encoding="Shift-Jis")
    item_dic = pd.read_csv("D:/Statistics/data/dic/item_pattern_freq.csv", encoding="Shift-Jis")

    #単語の置き換え
    temp_df = word_df.copy()
    replace_word = pd.merge(temp_df, item_dic[["input", "output"]], left_on="word", right_on="input", how="left")["output"]
    index_target = np.array(np.where(pd.isna(replace_word)==False)[0], dtype="int")
    temp_df["word"].iloc[index_target] = replace_word.iloc[index_target]
    return temp_df

In [26]:
##不要な単語を削除
def delete_word(word_df, trunc_word):
    ##低頻度語を削除
    #単語頻度を集計
    word_freq = word_df["word"].value_counts()
    word_freq = pd.DataFrame({"word": np.array(word_freq.index), "freq": word_freq})
    word_freq.index = np.arange(word_freq.shape[0])
    
    #低頻度語を特定し、データフレームから削除
    index_target = np.array(np.where(word_freq["freq"] >= trunc_word)[0], dtype="int")
    target_word = np.array(word_freq["word"].iloc[index_target])
    j = np.array(pd.merge(word_df, pd.DataFrame({"word": target_word, "no": 1}), on="word", how="left")["no"])
    word_df = word_df.iloc[np.where(~np.isnan(j))[0]]
    word_df.index = np.arange(word_df.shape[0])

    ##ストップワードを削除
    #辞書の読み込み
    stopword = pd.read_table("D:/Statistics/data/dic/stopword_jp.txt", header=None)
    stopword = stopword.rename(columns={0: "word"})
    stopword["flag"] = 1

    #データフレームから単語を削除
    j = np.array(pd.merge(word_df, stopword, on="word", how="left")["flag"])
    index_target = np.array(np.where(np.isnan(j))[0], dtype="int")
    word_df = word_df.iloc[index_target]
    word_df.index = np.arange(word_df.shape[0])
    
    ##隣接する重複を削除する
    index1 = np.arange(word_df.shape[0])[:word_df.shape[0]-1]
    index2 = np.arange(word_df.shape[0])[np.arange(1, word_df.shape[0])]
    index_dup1 = np.array(word_df["word"].iloc[index1])==np.array(word_df["word"].iloc[index2])
    index_dup1 = np.array(index_dup1, dtype="int")
    index_dup2 = np.array(word_df["id"].iloc[index1])==np.array(word_df["id"].iloc[index2])
    index_dup2 = np.array(index_dup2, dtype="int")
    index_dup = np.array(np.where((index_dup1+index_dup2) < 2)[0], dtype="int")
    word_df = word_df.iloc[index_dup]
    word_df.index = np.arange(word_df.shape[0])
    
    #数値を削除
    index = np.array(np.where(word_df["word_class2"]!="数")[0])
    word_df = word_df.iloc[index]
    word_df.index = np.arange(word_df.shape[0])
    
    ##単語頻度の集計と低頻度語の処理
    #単語頻度を集計
    word_freq = word_df["word"].value_counts()
    word_freq = pd.DataFrame({"word": np.array(word_freq.index), "freq": word_freq})
    word_freq.index = np.arange(word_freq.shape[0])
    word_freq.to_csv("D:/Statistics/data/word_freq.csv", index=None)
        
    #一定上の頻度ごとの語彙数と総単語数を集計
    freq = np.array([1, 3, 5, 10, 20, 30, 50, 75, 100])
    w_agg = np.zeros((len(freq), 3), dtype="int")
    for j in range(len(freq)):
        index = np.where(word_freq["freq"] >= freq[j])[0]
        target = word_freq["freq"].iloc[index]
        w_agg[j, ] = np.array([freq[j], target.shape[0], np.sum(target)], dtype="int")
    return word_df, word_freq, w_agg

In [27]:
##IDを割り当て直す
def setting_id(word_df):
    #sentence idを設定
    sentence_id = np.unique(word_df["id"])
    s = sentence_id.shape[0]
    word_df = word_df.rename(columns={"id": "original_id"})
    new_id = pd.DataFrame({"original_id": sentence_id, "new_id": np.arange(s)})
    word_df = pd.merge(word_df, new_id, on="original_id", how="left")

    #word idを設定
    word = pd.unique(word_df["word"])
    v = word.shape[0]
    word_id = pd.DataFrame({"word": word, "word_id": np.arange(v)})
    word_df = pd.merge(word_df, word_id, on="word", how="left")
    word_df = word_df[["key", "original_id", "new_id", "word", "word_id", "word_class1", "word_class2"]]
    
    #idをベクトル化
    key = np.array(word_df["key"])
    sentence = np.array(word_df["new_id"])
    wd = np.array(word_df["word_id"])
    return word_df, key, sentence, wd, new_id, word_id, s, v

In [28]:
##インデックスの作成
def setting_index(key, sentence, wd, n, s, v):
    #頻度を集計
    sentence_freq = pd.Series(word_df["new_id"]).value_counts()
    sentence_freq = np.array(sentence_freq.iloc[np.argsort(np.array(sentence_freq.index))], dtype="int")
    key_freq = pd.Series(word_df["key"]).value_counts()
    key_freq = np.array(key_freq.iloc[np.argsort(np.array(key_freq.index))], dtype="int")

    #インデックスの格納用
    key_list = [i for i in range(n)]
    sentence_list = [i for i in range(s)]
    wd_list = [i for i in range(v)]

    #インデックスをリストに格納
    max_n = 0
    for i in range(n):
        key_list[i] = np.arange(key_freq[i]) + max_n
        max_n = np.max(key_list[i]) + 1
    max_n = 0
    for i in range(s):
        sentence_list[i] = np.arange(sentence_freq[i]) + max_n
        max_n = np.max(sentence_list[i]) + 1
    for i in range(v):
        wd_list[i] = np.array(np.where(wd==i)[0], dtype="int")
    return key_list, sentence_list, wd_list

In [29]:
##テキストのクレンジング
#文字列の正規化や記号の削除を行う
target_pf[["text"]] = target_pf["text"].str.lower()
target_pf[["text"]] = target_pf["text"].str.normalize("NFKC")
target_pf[["text"]] = target_pf["text"].str.replace(",", "")
target_pf[["text"]] = target_pf["text"].str.replace("*", "")
target_pf[["text"]] = target_pf["text"].str.replace("[0-9]+", "0")

#データのkeyのインデックスとidを作成
res = make_index(target_pf)
key = res[0]
unique_key = res[1]
temporal_id = res[2]
N = res[3]
n = res[4]
index_key = res[5]
n_key = res[6]
first_key = res[7]

In [30]:
##形態素解析を実行
#文書ごとに形態素解析
get_class = np.array(["名詞", "動詞", "形容詞"])
res = mecab_text(target_pf, get_class)
text_list = res[0]
id_list = res[1]
index_list = res[2]
w = res[3]

#形態素結果のデータフレームを作成
word_df = create_word_df(text_list, id_list, index_list, w, target_pf)
word_df.to_csv("D:/Statistics/data/word_df_backup.csv", index=None)
del res, text_list, id_list, index_list, w

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000


In [31]:
##不要な単語の削除とidの設定
#単語の名寄せ
word_df = pd.read_csv("D:/Statistics/data/word_df_backup.csv", index_col=None)
word_df = replace_word(word_df)   

#低頻度語とストップワード削除
res = delete_word(word_df, 10)
word_df = res[0]
del res

#センテンスと単語のidを設定
res = setting_id(word_df)
word_df = res[0]
key = res[1]
sentence = res[2]
wd = res[3]
co_sentence = res[4]
co_word = res[5]
s = res[6]
v = res[7]
del res

#インデックスの作成
res = setting_index(key, sentence, wd, n, s, v)
key_list = res[0]
sentence_list = res[1]
wd_list = res[2]
del res

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [32]:
####Joint Hierarchical Structured Topic Modelを推定####
##パラメータを推定するための関数
#多項分布の乱数を生成する関数
def rmnom(pr, n, k, no, pattern):
    z_id = np.argmax((np.cumsum(pr, axis=1) >= np.random.uniform(0, 1, n)[:, np.newaxis]), axis=1)
    if pattern==1:
        Z = sparse.coo_matrix((np.repeat(1, n), (no, np.array(z_id))), shape=(n, k))   #スパース行列の設定
        return z_id, Z
    return z_id

#トピック尤度と負担率を計算する関数
def LLho(theta, phi, d_id, wd, f, k):
    Lho = theta[d_id, ] * (phi.T)[wd, ]
    topic_rate = Lho / np.sum(Lho, axis=1)[:, np.newaxis]
    return Lho, topic_rate

In [33]:
##アルゴリズムの設定
k = 50
R = 500
keep = 1
burnin = int(250/keep)
iter = 0
disp = 10
e1 = 0.005
e2 = 0.025
e3 = 0.01
L1 = 3
L2 = 1
k_vec = np.repeat(1, k)
f = word_df.shape[0] 

In [34]:
##名詞と動詞を分離する
#新しい単語idを作成
index_noun = np.array(np.where(word_df["word_class1"]=="名詞")[0], dtype="int")
index_verb = np.array(np.where(word_df["word_class1"]!="名詞")[0], dtype="int")
wd_df1 = wd[index_noun]
wd_df2 = wd[index_verb]
unique_wd1 = np.unique(wd_df1); v1 = unique_wd1.shape[0]
unique_wd2 = np.unique(wd_df2); v2 = unique_wd2.shape[0]
wd_df1 = pd.merge(pd.DataFrame({"wd": wd_df1}), pd.DataFrame({"wd": unique_wd1, "wd1": np.arange(v1)}), on="wd", how="left")
wd_df2 = pd.merge(pd.DataFrame({"wd": wd_df2}), pd.DataFrame({"wd": unique_wd2, "wd2": np.arange(v2)}), on="wd", how="left")
wd_df1["word"] = np.array(word_df["word"].iloc[index_noun])
wd_df2["word"] = np.array(word_df["word"].iloc[index_verb])
wd1 = np.array(wd_df1["wd1"], dtype="int")
wd2 = np.array(wd_df2["wd2"], dtype="int")

#sentence idを分離する
sentence1 = sentence[index_noun]
sentence2 = sentence[index_verb]

#インデックスを作成
wd_list1 = [i for i in range(v1)]; wd_dt1 = [i for i in range(v1)]
wd_list2 = [i for i in range(v2)]; wd_dt2 = [i for i in range(v2)]
for i in range(v1):
    wd_list1[i] = np.array(np.where(wd1==i)[0], dtype="int")
    wd_dt1[i] = np.repeat(1, wd_list1[i].shape[0])
for i in range(v2):
    wd_list2[i] = np.array(np.where(wd2==i)[0], dtype="int")
    wd_dt2[i] = np.repeat(1, wd_list2[i].shape[0])

In [35]:
##文書とタグを対応付ける(itemとelementを統合)
#aiep idを作成
index_item = np.array(np.where(pd.isna(aiep_data["item"])==False)[0], dtype="int")
index_element = np.array(np.where(pd.isna(aiep_data["element"])==False)[0], dtype="int")
joint_id = np.array(aiep_data["key_id"].iloc[np.append(index_item, index_element)], dtype="int")
sortlist = np.array(np.argsort(joint_id), dtype="int")
joint_id = joint_id[sortlist]
aiep = np.append(aiep_data["item"].iloc[index_item], aiep_data["element"].iloc[index_element])[sortlist]
unique_aiep = pd.unique(aiep); aiep_n = unique_aiep.shape[0]
aiep_df = pd.merge(pd.DataFrame({"aiep": aiep}), pd.DataFrame({"aiep": unique_aiep, "id": np.arange(aiep_n)}), on="aiep", how="left")
aiep_id = np.array(aiep_df["id"])

#インデックスを作成
aiep_list = [i for i in range(aiep_n)]
aiep_dt = [i for i in range(aiep_n)]
for i in range(aiep_n):
    aiep_list[i] = np.array(np.where(aiep_id==i)[0], dtype="int")
    aiep_dt[i] = np.repeat(1, aiep_list[i].shape[0])
g = aiep_id.shape[0]

In [37]:
##文書とタグを対応付ける(itemとelementは別々)
#aiep idを作成
index_item = np.array(np.where(pd.isna(aiep_data["item"])==False)[0], dtype="int")
index_element = np.array(np.where(pd.isna(aiep_data["element"])==False)[0], dtype="int")
joint_id1 = np.array(aiep_data["key_id"].iloc[index_item], dtype="int")
joint_id2 = np.array(aiep_data["key_id"].iloc[index_element], dtype="int")
aiep_item = np.array(aiep_data["item"].iloc[index_item])
aiep_element = np.array(aiep_data["element"].iloc[index_element])
unique_item = pd.unique(aiep_item); unique_element = pd.unique(aiep_element)
item_n = unique_item.shape[0]; element_n = unique_element.shape[0]
aiep_df1 = pd.merge(pd.DataFrame({"aiep": aiep_item}), 
                    pd.DataFrame({"aiep": unique_item, "id": np.arange(item_n)}), on="aiep", how="left")
aiep_df2 = pd.merge(pd.DataFrame({"aiep": aiep_element}), 
                    pd.DataFrame({"aiep": unique_element, "id": np.arange(element_n)}), on="aiep", how="left")
aiep_id1 = np.array(aiep_df1["id"]); aiep_id2 = np.array(aiep_df2["id"])

#インデックスを作成
aiep_list1 = [i for i in range(item_n)]; aiep_list2 = [i for i in range(element_n)]
aiep_dt1 = [i for i in range(item_n)]; aiep_dt2 = [i for i in range(element_n)]
for i in range(item_n):
    aiep_list1[i] = np.array(np.where(aiep_id1==i)[0], dtype="int")
    aiep_dt1[i] = np.repeat(1, aiep_list1[i].shape[0])
for i in range(element_n):
    aiep_list2[i] = np.array(np.where(aiep_id2==i)[0], dtype="int")
    aiep_dt2[i] = np.repeat(1, aiep_list2[i].shape[0])  
g1 = aiep_id1.shape[0]
g2 = aiep_id2.shape[0]
g = g1 + g2

In [38]:
#和を計算するためのnベクトルを定義
noun_vec = np.array(word_df["word_class1"]=="名詞", dtype="int")
key_dt = [i for i in range(n)]
noun_dt = [i for i in range(n)]
sentence_dt = [i for i in range(s)]
noun_n = np.repeat(0, n)
sentence_n = np.repeat(0, s)
for i in range(n):
    key_dt[i] = np.repeat(1, key_list[i].shape[0])
    noun_dt[i] = noun_vec[key_list[i]]
    noun_n[i] = np.sum(noun_dt[i])
for i in range(s):
    sentence_dt[i] = np.repeat(1, sentence_list[i].shape[0])
    sentence_n[i] = sentence_dt[i].shape[0]    
noun_n = noun_n[:, np.newaxis]

In [39]:
#文書とセンテンスのidを対応付ける
index_dup = np.array(np.where(word_df[["key", "new_id"]].duplicated()==False)[0], dtype="int")
correspond_id = np.array(word_df["key"].iloc[index_dup])

In [40]:
##事前分布の設定
alpha01 = 0.25
beta01 = 0.1
beta02 = 0.1

In [41]:
##初期値の設定
#ハイパーパラメータの初期値
delta = np.array(np.full((s, k), 0.25), dtype="float32")
delta_n = sentence_n.copy(); delta_n[delta_n < 5] = 5
er = 0.001

#トピック分布の初期値
theta_d = np.array(np.random.dirichlet(np.repeat(5.0, k), n), dtype="float32")
theta_s = np.array(np.random.dirichlet(np.repeat(5.0, k), s), dtype="float32")

#単語分布の初期値
phi = np.array(np.random.dirichlet(np.repeat(5.0, v), k), dtype="float32")
phi1 = np.array(np.random.dirichlet(np.repeat(5.0, v1), k), dtype="float32")
phi2 = np.array(np.random.dirichlet(np.repeat(5.0, v2), k), dtype="float32")
omega1 = np.array(np.random.dirichlet(np.repeat(5.0, item_n), k), dtype="float32")
omega2 = np.array(np.random.dirichlet(np.repeat(5.0, element_n), k), dtype="float32")

#アルゴリズム推定用配列
Lho1 = np.zeros((f, k), dtype="float32")
Lho21 = np.zeros((g1, k), dtype="float32")
Lho22 = np.zeros((g2, k), dtype="float32")

In [42]:
##パラメータの格納用配列
#トピックの格納用配列
SEG1 = np.zeros((f, k), dtype="int16")
SEG21 = np.zeros((g1, k), dtype="int16")
SEG22 = np.zeros((g2, k), dtype="int16")

#パラメータの格納用配列
THETA_D = np.zeros((n, k), dtype="float32")
THETA_S = np.zeros((s, k), dtype="float32")
PHI1 = np.zeros((k, v1), dtype="float32")
PHI2 = np.zeros((k, v2), dtype="float32")
OMEGA1 = np.zeros((k, item_n), dtype="float32")
OMEGA2 = np.zeros((k, element_n), dtype="float32")

In [None]:
####パラメータをサンプリング####
for rp in range(R):

    ##単語トピックを生成
    #トピック選択確率を定義 
    Lho1[index_noun, ] = theta_s[sentence1, ] * (phi1.T)[wd1, ]
    Lho1[index_verb, ] = theta_s[sentence2, ] * (phi2.T)[wd2, ]
    topic_prob = Lho1 / np.dot(Lho1, k_vec)[:, np.newaxis]

    #多項分布からトピックを生成
    Zi = np.array(rmnom(topic_prob, f, k, np.arange(f), 1)[1].todense(), dtype="int8")

    ##ディリクレ分布からパラメータをサンプリング
    #グローバルトピック分布をサンプリング
    y = np.zeros((n, k), dtype="int")
    for i in range(n):
        z = Zi[key_list[i], ]
        y[i, ] = np.dot(noun_dt[i], z)
        x = np.dot(key_dt[i], z) + alpha01
        theta_d[i, ] = np.random.dirichlet(x, 1).reshape(-1)

    #ハイパーパラメータを更新
    delta_vec = np.dot(delta, k_vec)
    delta1 = psi(theta_d[correspond_id, ] + delta) - psi(delta)
    delta2 = (psi(delta_n + delta_vec) - psi(delta_vec))[:, np.newaxis]
    new_delta = delta * (delta1 / delta2) + er

    #ローカルトピック分布をサンプリング
    for i in range(s):
        x = np.dot(sentence_dt[i], Zi[sentence_list[i], ]) + new_delta[i, ]
        theta_s[i, ] = np.random.dirichlet(x, 1)
    delta = new_delta.copy()

    #単語分布サンプリング
    Zi1 = Zi[index_noun, ]; Zi2 = Zi[index_verb, ]
    x1 = np.zeros((k, v1)); x2 = np.zeros((k, v2))
    for i in range(v1):
        x1[:, i] = np.dot(wd_dt1[i], Zi1[wd_list1[i], ]) 
        if i < v2:
            x2[:, i] = np.dot(wd_dt2[i], Zi2[wd_list2[i], ]) 
    for j in range(k):
        phi1[j, ] = np.random.dirichlet(x1[j, ] + beta01, 1)
        phi2[j, ] = np.random.dirichlet(x2[j, ] + beta01, 1)
    del Zi1, Zi2

    ##タグトピックを生成
    #トピック選択確率を定義
    theta_mu = y / noun_n   #グローバルトピックの経験分布
    Lho21 = theta_mu[joint_id1, ] * (omega1.T)[aiep_id1, ]
    Lho22 = theta_mu[joint_id2, ] * (omega2.T)[aiep_id2, ]
    topic_prob1 = Lho21 / np.dot(Lho21, k_vec)[:, np.newaxis]
    topic_prob2 = Lho22 / np.dot(Lho22, k_vec)[:, np.newaxis]

    #多項分布からトピックを生成
    Si1 = np.array(rmnom(topic_prob1, g1, k, np.arange(g1), 1)[1].todense(), dtype="int8")
    Si2 = np.array(rmnom(topic_prob2, g2, k, np.arange(g2), 1)[1].todense(), dtype="int8")

    ##ディリクレ分布からパラメータをサンプリング
    #タグ分布をサンプリング
    x1 = np.zeros((k, item_n))
    x2 = np.zeros((k, element_n))
    for i in range(item_n):
        x1[:, i] = np.dot(aiep_dt1[i], Si1[aiep_list1[i], ])
        if i < element_n:
            x2[:, i] = np.dot(aiep_dt2[i], Si2[aiep_list2[i], ]) 
    for j in range(k):
        omega1[j, ] = np.random.dirichlet(x1[j, ] + beta02, 1)
        omega2[j, ] = np.random.dirichlet(x2[j, ] + beta02, 1)


    ##サンプリング結果の格納用と表示
    if (rp >= burnin) & (rp%keep==0):
        SEG1 += Zi
        SEG21 += Si1
        SEG22 += Si2
        THETA_D += theta_d
        THETA_S += theta_s
        PHI1 += phi1
        PHI2 += phi2
        OMEGA1 += omega1
        OMEGA2 += omega2
        
    if rp%disp==0: 
        #対数尤度を更新
        LLho1 = np.sum(np.log(np.sum(Lho1, axis=1)))
        LLho21 = np.sum(np.log(np.sum(Lho21, axis=1)))
        LLho22 = np.sum(np.log(np.sum(Lho22, axis=1)))
        LLho = LLho1 + LLho21 + LLho22

        #サンプリング結果を表示
        print(rp)
        print([np.round(LLho, 1), np.round(LLho1, 1), np.round(LLho21, 1), np.round(LLho22, 1)])

0
[-58485881.8, -57435252.0, -434232.9, -616400.9]
10
[-58474526.2, -57423904.0, -434687.6, -615934.6]
20
[-58464869.5, -57413856.0, -434962.4, -616051.2]
30
[-58462197.7, -57410696.0, -435187.1, -616318.6]
40
[-58451161.2, -57399830.0, -435379.9, -615949.3]
50
[-58445364.0, -57393184.0, -435904.1, -616275.9]
60
[-58436630.2, -57384650.0, -436300.1, -615682.2]
70
[-58430367.5, -57378376.0, -436417.5, -615574.0]
80
[-58423310.0, -57370732.0, -436879.9, -615698.1]
90
[-58411978.7, -57359604.0, -437023.4, -615355.3]
100
[-58412000.0, -57359320.0, -437146.2, -615529.9]
110
[-58409343.8, -57357068.0, -436947.0, -615328.8]
120
[-58403195.6, -57350296.0, -437282.8, -615616.9]
130
[-58406197.6, -57353250.0, -437188.4, -615761.3]
140
[-58401722.2, -57349184.0, -437118.3, -615419.9]
150
[-58401240.6, -57348344.0, -437419.8, -615472.8]
160
[-58397962.9, -57344972.0, -437357.1, -615633.7]
170
[-58396375.1, -57343884.0, -436998.7, -615488.5]
180
[-58397113.8, -57344504.0, -437203.7, -615402.1]
190


In [None]:
##結果の確認
r = np.arange(burnin, R).shape[0]

In [None]:
##タグの類似度を出力
#phiのサンプリング結果
new_word = wd_df1.sort_values(by="wd1")
new_word.index = np.arange(new_word.shape[0])
word = np.array(new_word["word"].iloc[np.where(new_word["wd1"].duplicated()==False)[0]])
word_freq = np.array([np.sum(wd_dt1[i]) for i in range(v1)])
phi_sum = word_freq[:, np.newaxis] * (PHI1.T)
topic_prob = phi_sum / np.sum(phi_sum, axis=1)[:, np.newaxis]

#omegaのサンプリング結果
tag_freq = np.array([aiep_list[i].shape[0] for i in range(aiep_n)])
omega_sum = tag_freq[:, np.newaxis] * (OMEGA.T)

#文書とタグの両方に出てくる単語を抽出
j = pd.merge(pd.DataFrame({"word": word}), pd.DataFrame({"word": unique_aiep, "flag": 1}), on="word", how="left")
index_target = np.array(np.where(pd.isna(j["flag"])==False)[0], dtype="int")
j_target = j.iloc[index_target]; j_target.index = np.arange(j_target.shape[0])
phi_target = phi_sum[index_target, ]
freq_target = word_freq[index_target, ]

#文書全体のトピックをタグに結合
j_freq = tag_freq.copy()
j_index = np.array(pd.merge(pd.DataFrame({"word": j_target["word"]}), 
                   pd.DataFrame({"word": unique_aiep, "no": np.arange(unique_aiep.shape[0])}), on="word", how="left")["no"])
omega_sum[j_index, ] = phi_target + omega_sum[j_index, ]
j_freq[j_index] = freq_target + j_freq[j_index]

#結合分布を出力
aux_prob = omega_sum / np.sum(omega_sum, axis=1)[:, np.newaxis]
res_omega = pd.concat((pd.DataFrame({"id": np.arange(aiep_n), "aiep": unique_aiep}), pd.DataFrame(aux_prob)), axis=1)
res_omega.to_csv("D:/Statistics/data/res_omega.csv", index=None, encoding="Shift-Jis")

#コサイン類似度を計算
q = np.sqrt(np.sum(np.power(aux_prob, 2), axis=1))[:, np.newaxis]
cos = np.dot(aux_prob, aux_prob.T) / np.dot(q, q.T)
cos = pd.DataFrame(cos)
cos.columns = unique_aiep; cos.index = unique_aiep
cos.to_csv("D:/Statistics/data/omega_cos.csv", encoding="Shift-Jis")

#タグごとに類似している上位10単語を出力
aiep_freq = np.repeat(0, unique_aiep.shape[0])
similar_word = np.full((unique_aiep.shape[0], 10), "", dtype="object")
similar_score = np.zeros((unique_aiep.shape[0], 10), dtype="float32")
for i in range(unique_aiep.shape[0]):
    aiep_freq[i] = aiep_list[i].shape[0]
    a = cos[[unique_aiep[i]]].sort_values(by=unique_aiep[i], ascending=False)
    similar_word[i, ] = np.array(a.iloc[1:11].index)
    similar_score[i, ] = np.array(a.iloc[1:11]).reshape(-1)
similar_out = pd.concat((pd.DataFrame({"aiep": unique_aiep, "freq1": aiep_freq, "freq2": j_freq}),
                         pd.DataFrame(similar_word), pd.DataFrame(similar_score)), axis=1)
similar_out.to_csv("D:/Statistics/data/similar_out.csv", index=None, encoding="Shift-Jis")