# 基礎集計

1. 形態素解析  
1. 係り受け解析  
1. N-gram解析  
上記を行い、カテゴリ別に出現する単語の傾向を把握するとともに、全カテゴリに対するリフト値からカテゴリ間の差分を確認する。

# 入出力ディレクトリの指定

In [1]:
# Edit these paths to match your environment.
# input_dir : directory searched for the input "*.csv" files.
# output_dir: directory that receives every CSV written by this notebook.
# Both are used as plain string prefixes, so keep the trailing slash.
input_dir = "../data/preprocessing/"
output_dir = "../data/basic_agg/"

In [2]:
import os

# Create the output directory on first use and report which case applied.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
    print("出力ディレクトリを作成しました")
else:
    print("存在するディレクトリです")

存在するディレクトリです


In [3]:
import glob

# glob.glob returns matches in arbitrary order, so sort them: the cell below
# picks input_paths[0], which must not change from one run to the next.
input_paths = sorted(glob.glob(f"{input_dir}*.csv"))
print("データ数: ", len(input_paths))
print(input_paths)

データ数:  1
['../data/preprocessing/all_category_df.csv']


In [4]:
# Path of the CSV loaded as the DataFrame for the basic aggregation
# (the first entry of the list printed above).
input_path = input_paths[0]

----

In [5]:
import itertools
from tqdm import tqdm
import re

import pandas as pd
import MeCab

# Module-level MeCab tagger, constructed with no arguments and shared by every
# parsing helper defined below.
mc = MeCab.Tagger()

In [6]:
# The first CSV column is an unnamed row index (it would otherwise be loaded as
# a data column called "Unnamed: 0"); read it back as the index. Only the
# "category" and "text" columns are used by the cells below.
all_category_df = pd.read_csv(input_path, index_col=0)
all_category_df.head(2)

Unnamed: 0,category,text
0,dokujo-tsushin,友人代表のスピーチ、独女はどうこなしている?\n もうすぐジューン・ブライドと呼ばれる0月。...
1,dokujo-tsushin,ネットで断ち切れない元カレとの縁\n 携帯電話が普及する以前、恋人への連絡ツールは一般電話が...


# 形態素解析

In [40]:
def extract_noun(text):
    """Count the general nouns found in ``text``.

    The text is parsed with the module-level MeCab tagger ``mc``. Tokens whose
    part of speech is 名詞 with sub-category 一般 are tallied under their base
    form, or under their surface form when the base form is "*".

    Args:
        text(str): text to analyse.

    Returns:
        list: (noun, count) tuples in descending order of count.
    """
    tally = {}

    for line in mc.parse(text).split("\n"):
        # Skip the end-of-sentence marker and blank lines.
        if line == "EOS" or not line:
            continue

        surface, feature = line.split("\t")
        # Feature layout: POS, POS sub-category 1, _, _, _, _, base form, ...
        major, minor, _, _, _, _, lemma, *_ = feature.split(",")

        if major != "名詞" or minor != "一般":
            continue

        key = surface if lemma == "*" else lemma
        tally[key] = tally.get(key, 0) + 1

    return sorted(tally.items(), key=lambda item: item[1], reverse=True)

def extract_verb(text):
    """Count the verbs found in ``text``.

    The text is parsed with the module-level MeCab tagger ``mc``. Two kinds of
    token are tallied:

    * 動詞 tokens, under their base form (surface form when the base is "*");
    * 名詞 / サ変接続 tokens, with "する" appended so that they read as verbs.

    A token is ignored when its surface form contains a symbol from the
    stop-mark character class, or when its base form is exactly one of the
    stop words (する, いる, れる, ...).

    Args:
        text(str): text to analyse.

    Returns:
        list: (verb, count) tuples in descending order of count.
    """

    # Workaround: the unknown-word dictionary could not be updated in the
    # Dockerfile, so symbols are filtered by hand to keep them from being
    # picked up by the [名詞 サ変接続] branch below.
    # "\\{" keeps the regex identical to before while avoiding the invalid
    # "\{" string escape, which newer Python versions warn about.
    mark = "[!#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`"\
           "\\{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％ 　]"
    stop_mark = re.compile(mark)
    word = "(する|いる|れる|なる|ある|できる|られる|せる|おる|てる)"
    stop_word = re.compile(word)

    counter = {}  # verb -> number of occurrences
    for one_pos in mc.parse(text).split("\n"):
        # Skip the end-of-sentence marker and blank lines.
        if (one_pos == "EOS") or (len(one_pos) == 0):
            continue

        surf, mc_result = one_pos.split("\t")
        # Feature layout: POS, POS sub-category 1, _, _, _, _, base form, ...
        pos0, pos1, _, _, _, _, base, *_ = mc_result.split(",")

        # fullmatch, not search: a substring test would also discard unrelated
        # verbs that merely contain a stop word (e.g. 生まれる contains れる,
        # 見せる contains せる).
        if stop_mark.search(surf) or stop_word.fullmatch(base):
            continue

        # Fall back to the surface form when MeCab reports no base form.
        stem = surf if base == "*" else base

        if pos0 == "動詞":
            verb = stem
        elif (pos0 == "名詞") and (pos1 == "サ変接続"):
            verb = f"{stem}する"
        else:
            continue

        counter[verb] = counter.get(verb, 0) + 1

    counter = sorted(counter.items(),
                     key=lambda x: x[1],
                     reverse=True)
    return counter

def extract_adjective(text):
    """Count the adjectives and adjectival-noun stems found in ``text``.

    The text is parsed with the module-level MeCab tagger ``mc``. A token is
    tallied when it is either 形容詞 / 自立 or 名詞 / 形容動詞語幹, under its
    base form, or under its surface form when the base form is "*".

    Args:
        text(str): text to analyse.

    Returns:
        list: (word, count) tuples in descending order of count.
    """
    wanted = (("形容詞", "自立"), ("名詞", "形容動詞語幹"))
    tally = {}

    for line in mc.parse(text).split("\n"):
        # Skip the end-of-sentence marker and blank lines.
        if line == "EOS" or not line:
            continue

        surface, feature = line.split("\t")
        # Feature layout: POS, POS sub-category 1, _, _, _, _, base form, ...
        major, minor, _, _, _, _, lemma, *_ = feature.split(",")

        if (major, minor) not in wanted:
            continue

        key = surface if lemma == "*" else lemma
        tally[key] = tally.get(key, 0) + 1

    return sorted(tally.items(), key=lambda item: item[1], reverse=True)

In [58]:
def create_pos_counter_df(texts, pos=""):
    """Build a per-word count DataFrame for one part of speech.

    Each text is passed to the extractor matching ``pos``. A word's "count"
    is the number of texts in which it appears at least once: the per-text
    occurrence counts returned by the extractor are not summed.
    NOTE(review): confirm that a document frequency (rather than a total
    occurrence count) is what the lift calculation is meant to use.

    Args:
        texts(list): one DataFrame column converted to a list of strings.
        pos(str): "noun", "verb" or "adjective".

    Returns:
        pandas.DataFrame: columns [pos, "count"], sorted by descending count.
        An empty DataFrame (no columns) is returned, after printing a
        message, when ``pos`` is not one of the supported values.
    """
    # Validate up front: previously the check ran inside the loop, so an
    # invalid ``pos`` went unreported whenever ``texts`` was empty and the
    # progress bar was started before the problem was detected.
    if pos not in ("noun", "verb", "adjective"):
        print("posを指定してください。空DFを返します")
        return pd.DataFrame()

    extractors = {
        "noun": extract_noun,
        "verb": extract_verb,
        "adjective": extract_adjective,
    }
    extract = extractors[pos]

    counter = {}
    for text in tqdm(texts):
        # Each word is listed once per text, so this adds 1 per text.
        for key, _ in extract(text):
            counter[key] = counter.get(key, 0) + 1

    counter = sorted(counter.items(),
                     key=lambda x: x[1],
                     reverse=True)

    result = pd.DataFrame(counter, columns=[pos, "count"])

    return result

## 全体

In [59]:
texts = all_category_df["text"].tolist()

for pos in ("noun", "verb", "adjective"):
    result = create_pos_counter_df(texts, pos=pos)

    # Share of each word in the overall total; the per-category cell divides
    # its own ratio by this one to obtain the lift.
    count_sum = result["count"].sum()
    result["ratio"] = result["count"] / count_sum

    all_pos_path = f"{output_dir}all_{pos}.csv"
    result.to_csv(all_pos_path, encoding="utf-8-sig", index=False)

100%|██████████| 7367/7367 [00:15<00:00, 470.09it/s]
100%|██████████| 7367/7367 [00:16<00:00, 446.51it/s]
100%|██████████| 7367/7367 [00:14<00:00, 519.84it/s]


## カテゴリ別

In [60]:
for pos in ["noun", "verb", "adjective"]:
    print(pos)
    # na_filter=False: with the default NA parsing, words such as "NA", "null"
    # or "nan" are read back as missing values, never match in the merge below
    # and end up with a NaN lift. The file was written as utf-8-sig, so it is
    # read with the same encoding.
    all_df = pd.read_csv(f"{output_dir}all_{pos}.csv",
                         encoding="utf-8-sig", na_filter=False)
    all_df = all_df.rename(columns={"count": "all_count",
                                    "ratio": "all_ratio"})

    for category in all_category_df["category"].unique():
        texts = all_category_df.query("category == @category")["text"].tolist()

        result = create_pos_counter_df(texts, pos=pos)
        # Share of each word within this category.
        count_sum = result["count"].sum()
        result["ratio"] = result["count"] / count_sum

        # Lift = ratio within the category / ratio over all categories.
        result = pd.merge(result, all_df, on=pos, how="left")
        result["lift"] = result["ratio"] / result["all_ratio"]

        result.to_csv(f"{output_dir}{category}_{pos}.csv",
                      encoding="utf-8-sig", index=False)

noun


100%|██████████| 870/870 [00:02<00:00, 384.97it/s]
100%|██████████| 870/870 [00:01<00:00, 483.31it/s]
100%|██████████| 864/864 [00:01<00:00, 712.45it/s]
100%|██████████| 511/511 [00:01<00:00, 366.96it/s]
100%|██████████| 870/870 [00:02<00:00, 394.36it/s]
100%|██████████| 842/842 [00:01<00:00, 449.56it/s]
100%|██████████| 870/870 [00:02<00:00, 400.69it/s]
100%|██████████| 900/900 [00:01<00:00, 847.60it/s]
100%|██████████| 770/770 [00:00<00:00, 825.37it/s]


verb


100%|██████████| 870/870 [00:02<00:00, 331.38it/s]
100%|██████████| 870/870 [00:02<00:00, 431.31it/s]
100%|██████████| 864/864 [00:01<00:00, 660.87it/s]
100%|██████████| 511/511 [00:01<00:00, 345.73it/s]
100%|██████████| 870/870 [00:02<00:00, 375.46it/s]
100%|██████████| 842/842 [00:02<00:00, 404.75it/s]
100%|██████████| 870/870 [00:02<00:00, 363.48it/s]
100%|██████████| 900/900 [00:01<00:00, 712.52it/s]
100%|██████████| 770/770 [00:01<00:00, 719.61it/s]


adjective


100%|██████████| 870/870 [00:02<00:00, 384.71it/s]
100%|██████████| 870/870 [00:01<00:00, 495.64it/s]
100%|██████████| 864/864 [00:01<00:00, 757.87it/s]
100%|██████████| 511/511 [00:01<00:00, 389.70it/s]
100%|██████████| 870/870 [00:02<00:00, 417.24it/s]
100%|██████████| 842/842 [00:01<00:00, 464.34it/s]
100%|██████████| 870/870 [00:02<00:00, 412.39it/s]
100%|██████████| 900/900 [00:01<00:00, 788.88it/s]
100%|██████████| 770/770 [00:00<00:00, 843.40it/s]


# N-gram

In [45]:
def n_gram(text_list, n):
    """Return every contiguous window of exactly ``n`` items of ``text_list``.

    Args:
        text_list(list): sequence of tokens.
        n(int): window size, at least 1.

    Returns:
        list: windows in order of appearance; empty when ``text_list`` holds
        fewer than ``n`` items.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be 1 or greater")
    # Stop at len - n + 1: iterating over every index also produced truncated
    # windows at the tail (e.g. a lone last token counted as a "2-gram").
    return [text_list[idx:idx + n] for idx in range(len(text_list) - n + 1)]

def create_n_gram_counter_df(texts, n):
    """Build a DataFrame counting the n-grams of adjacent content words.

    Each text is parsed with the module-level MeCab tagger ``mc``. Tokens that
    are 名詞 / 動詞 / 形容詞 with sub-category 一般, 固有名詞, 自立, サ変接続 or
    形容動詞語幹 are kept, under their base form (surface form when the base
    is "*"); 名詞 / サ変接続 tokens get "する" appended. Consecutive kept
    tokens form a run, and n-grams are taken inside each run only, so words
    separated by any other token are never joined.

        e.g. runs ["スマート", "フォン"] and ["関連する", "記事"] with n=2
             give "スマート-フォン" and "関連する-記事",
             but not "フォン-関連する".

    A run is ended by any token of another part of speech, by a token whose
    surface contains a stop-mark symbol, or by a token whose base form is
    exactly one of the stop words.

    Args:
        texts(list): one DataFrame column converted to a list of strings.
        n(int): size of the n-gram, at least 1.

    Returns:
        pandas.DataFrame: columns [f"{n}gram", "count"], sorted by descending
        count; n-gram members are joined with "-".

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be 1 or greater")

    # Workaround: the unknown-word dictionary could not be updated in the
    # Dockerfile, so symbols are filtered by hand to keep them from being
    # picked up as [名詞 サ変接続].
    # "\\{" keeps the regex identical to before while avoiding the invalid
    # "\{" string escape, which newer Python versions warn about.
    mark = "[!#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`"\
           "\\{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％ 　]"
    stop_mark = re.compile(mark)
    word = "(する|いる|れる|なる|ある|できる|られる|せる|おる|てる)"
    stop_word = re.compile(word)

    target_pos0 = ("名詞", "動詞", "形容詞")
    target_pos1 = ("一般", "固有名詞", "自立", "サ変接続", "形容動詞語幹")

    counter = {}
    for text in tqdm(texts):

        # Collect the runs of directly adjacent target tokens of this text.
        runs = []
        current_run = []
        for one_pos in mc.parse(text).split("\n"):
            term = None  # stays None when the token ends the current run

            if (one_pos != "EOS") and (len(one_pos) != 0):
                surf, mc_result = one_pos.split("\t")
                # Feature layout: POS, POS sub-category 1, _, _, _, _, base form, ...
                pos0, pos1, _, _, _, _, base, *_ = mc_result.split(",")

                # fullmatch, not search: a substring test would also discard
                # unrelated words that merely contain a stop word
                # (e.g. 生まれる contains れる).
                is_stop = stop_mark.search(surf) or stop_word.fullmatch(base)
                is_target = (pos0 in target_pos0) and (pos1 in target_pos1)

                if is_target and not is_stop:
                    term = surf if base == "*" else base
                    if (pos0 == "名詞") and (pos1 == "サ変接続"):
                        term = f"{term}する"

            if term is None:
                if current_run:
                    runs.append(current_run)
                    current_run = []
            else:
                current_run.append(term)

        if current_run:
            runs.append(current_run)

        # Count the n-grams inside each run.
        for run in runs:
            for one_n_gram in n_gram(run, n):
                # Ignore windows shorter than n at the tail of a run.
                if len(one_n_gram) != n:
                    continue
                one_n_gram_str = "-".join(one_n_gram)
                counter[one_n_gram_str] = counter.get(one_n_gram_str, 0) + 1

    counter = sorted(counter.items(),
                     key=lambda x: x[1],
                     reverse=True)
    result = pd.DataFrame(counter, columns=[f"{n}gram", "count"])
    return result

In [51]:
n = 2
texts = all_category_df["text"].tolist()
result = create_n_gram_counter_df(texts, n=n)

# Share of each n-gram in the overall total, kept for computing the
# per-category lift.
count_sum = result["count"].sum()
result["ratio"] = result["count"] / count_sum

all_ngram_path = f"{output_dir}all_{n}gram.csv"
result.to_csv(all_ngram_path, encoding="utf-8-sig", index=False)

100%|██████████| 7367/7367 [00:18<00:00, 408.54it/s]


In [52]:
n = 2
for category in all_category_df["category"].unique():

    # Texts belonging to the current category only.
    in_category = all_category_df["category"] == category
    texts = all_category_df.loc[in_category, "text"].tolist()
    result = create_n_gram_counter_df(texts, n=n)

    # Share of each n-gram within this category, kept for computing the lift.
    count_sum = result["count"].sum()
    result["ratio"] = result["count"] / count_sum

    category_ngram_path = f"{output_dir}{category}_{n}gram.csv"
    result.to_csv(category_ngram_path, encoding="utf-8-sig", index=False)

100%|██████████| 870/870 [00:02<00:00, 329.74it/s]
100%|██████████| 870/870 [00:02<00:00, 425.16it/s]
100%|██████████| 864/864 [00:01<00:00, 564.20it/s]
100%|██████████| 511/511 [00:01<00:00, 298.06it/s]
100%|██████████| 870/870 [00:02<00:00, 366.78it/s]
100%|██████████| 842/842 [00:02<00:00, 404.63it/s]
100%|██████████| 870/870 [00:02<00:00, 299.28it/s]
100%|██████████| 900/900 [00:01<00:00, 726.57it/s]
100%|██████████| 770/770 [00:01<00:00, 732.61it/s]
