In [1]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import os
import glob
import re

# Show up to 500 rows and 50 columns when a DataFrame is displayed
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50

## 関数を定義

In [2]:
# Store the sections of every article of one file as rows of a DataFrame
def text_corpus(article_file, file_name, hr_columns, max_hr):
    """Build the section-level corpus for the raw articles of one file.

    Every parsed article contributes one "header" row (the lead text found
    before the first heading) followed by one row per ``<hN>`` heading.

    Parameters
    ----------
    article_file : sequence of str
        Raw article chunks. A chunk without a ``<doc id="..."`` or
        ``title="..."`` attribute, or without any heading, is skipped.
    file_name : str
        Value written to the ``file`` column of every row.
    hr_columns : numpy.ndarray
        Names of the heading-hierarchy columns. ``max_hr + 1`` names are
        required: one for the <h2> level and one for each deeper level.
    max_hr : int
        Number of heading levels below <h2> that are tracked
        (<h3> .. <h(max_hr + 2)>).

    Returns
    -------
    pandas.DataFrame
        Columns ``no``, ``file``, ``doc_id``, ``title``, ``table``,
        the names in ``hr_columns`` and ``text``. When no article can be
        parsed an empty frame with these columns is returned.
    """
    columns = ["doc_id", "title", "table"] + hr_columns.tolist() + ["text"]
    article_list = []

    # Patterns removed from the lead text / heading / section body
    replace_str = "(\n.*\n\n|\n<h[0-9]>|&lt.*?&gt;|href=\".+?\")"
    replace_str1 = "(&lt.*?&gt;|\n)"
    replace_str2 = "(<ul>|</ul>|<li>|</li>|<dl>|</dl>|<ls[0-9]></ls[0-9]>|</(?![a-zA-Z0-9]+>)|&lt;a[ ]?|a&gt;|href=\".+?\"|^(\n)+)"

    for raw_article in article_file:

        # Extract the document id and the title; skip chunks lacking either
        try:
            doc_id0 = re.findall("<doc id=\"[0-9].+?\"", raw_article)[0]
            doc_id0 = int(re.findall("[0-9]+", doc_id0)[0])
            title0 = re.findall("title=\".*\"", raw_article)[0]
        except IndexError:
            continue
        title0 = re.sub("title=\"|\"", "", title0)

        # Extract the lead text: the first piece carrying the <doc ...> tag,
        # falling back to the first piece when none carries it
        candidate_header0 = re.split("\n<h[0-9]>.+?</h[0-9]>", raw_article)
        header0 = candidate_header0[0]
        for candidate in candidate_header0:
            if re.search("<doc id=.+>", candidate) is not None:
                header0 = candidate
                break
        header0 = re.sub("<doc id=.+>{,1}", "", header0)
        header0 = re.sub("[ ]", "", header0)  # every space is removed from the lead text
        header0 = re.sub(replace_str, "", header0)

        # Extract the heading and the body of every section
        text_table0 = []
        text_list0 = []
        for candidate in re.split("<h[0-9]>", raw_article):
            heading = re.match(".+?</h[0-9]>", candidate)
            if heading is None:
                # Piece that does not start with "title</hN>" (e.g. the lead text)
                continue
            table0 = re.sub(replace_str1, "", heading.group(0))
            text0 = candidate[heading.end():]
            # Drop a leading line that ends with "." (presumably the heading
            # repeated as plain text -- confirm against the extractor output)
            text0 = re.sub(r"^\n.+\.\n", "", text0)
            text0 = re.sub(r"^.+\.\n", "", text0)
            # Rewrite <main>/<also> references and remaining links as [[...]]
            text0 = re.sub("&lt;main&gt;(.+?)&lt;/main&gt;", "<main>[[\\1]]</main>", text0)
            text0 = re.sub("&lt;also&gt;(.+?)&lt;/also&gt;", "<also>[[\\1]]</also> ", text0)
            text0 = re.sub("&lt;main&gt;", "", text0)
            text0 = re.sub("&lt;also&gt;", "", text0)
            text0 = re.sub(replace_str2, "", text0)
            text0 = re.sub("&gt;(.+?)&lt;/", "[[\\1]]", text0)
            if len(table0) > 0:
                text_table0.append(table0)
                text_list0.append(text0)

        # Skip articles without any section
        if len(text_list0) == 0:
            continue

        # Heading titles (closing tag stripped) and heading levels
        n_sections = len(text_table0)
        headings = np.array(text_table0)
        heading_titles = np.array([re.sub("</h[0-9]>", "", t) for t in headings])
        heading_levels = np.array([int(re.findall("</h[0-9]>", t)[0][3]) for t in headings])

        # <h2> level: every row from an <h2> up to the next <h2> gets its title.
        # Articles without any <h2> keep an empty column instead of raising.
        h2_rows = np.where(heading_levels == 2)[0].astype("int")
        hr1 = np.repeat("", n_sections).astype("object")
        if h2_rows.size > 0:
            span = np.append(h2_rows[1:], n_sections) - h2_rows
            hr1[h2_rows[0]:] = np.repeat(heading_titles[h2_rows], span)
        hr1 = np.append("header", hr1)

        # Deeper levels: a heading covers the rows up to the next heading of the
        # same or a higher level. When no such heading follows, it covers the
        # rows up to the end of the article (the original looked the boundary
        # up with argmax, which wrapped to the first <h2> and left the last
        # section's sub-headings empty).
        hr2 = np.full((n_sections + 1, max_hr), "").astype("object")
        parent_rows = h2_rows
        for depth in range(max_hr):
            level_rows = np.where(heading_levels == depth + 3)[0].astype("int")
            if level_rows.size == 0:
                continue
            for k, start in enumerate(level_rows):
                bounds = [n_sections]
                later_parents = parent_rows[parent_rows > start]
                if later_parents.size > 0:
                    bounds.append(later_parents[0])
                if k + 1 < len(level_rows):
                    bounds.append(level_rows[k + 1])
                end = min(bounds)
                # +1 because row 0 of the output is the "header" row
                hr2[start + 1:end + 1, depth] = str(heading_titles[start])
            parent_rows = np.unique(np.append(parent_rows, level_rows))

        # "table" column: the heading with its opening tag restored
        tagged_headings = ["header"] + [
            "<h{}>".format(level) + heading
            for level, heading in zip(heading_levels, headings)
        ]
        tagged_headings = np.array(tagged_headings, dtype="object")

        # Assemble the rows of this article
        titles = np.repeat(title0, n_sections + 1).astype("U")
        doc_ids = np.repeat(doc_id0, n_sections + 1)
        tables = np.hstack((tagged_headings[:, np.newaxis], hr1[:, np.newaxis], hr2)).astype("U")
        texts = np.array([header0] + text_list0).astype("U")
        article0 = pd.concat((pd.DataFrame(doc_ids), pd.DataFrame(titles), pd.DataFrame(tables), pd.DataFrame(texts)), axis=1)
        article0.columns = columns
        article_list.append(article0)

    # Nothing could be parsed: return an empty frame instead of letting
    # pd.concat raise on an empty list
    if len(article_list) == 0:
        return pd.DataFrame(columns=["no", "file"] + columns)

    # Stack the articles and prepend the running number and the file name
    articles = pd.concat(article_list, axis=0)
    M = articles.shape[0]
    articles.index = np.arange(M)
    articles = pd.concat((pd.DataFrame({"no": np.arange(M), "file": np.repeat(file_name, M)}), articles), axis=1)
    return articles


# Build the paragraph-level corpus from the section-level corpus
def paragraph_corpus(articles, hr_columns, link_columns, max_link):
    """Split every section text into lines and classify each line.

    Each line of ``articles["text"]`` is written to exactly one kind of
    column depending on how it starts:

    * ``<main>`` / ``<also>`` lines go to ``main`` / ``also``;
    * ``<dt>`` lines go to ``item`` unless the next line is a ``<dd>``;
    * ``<dd>`` lines go to ``item_detail``; the preceding ``<dt>`` is copied
      to ``item`` when the previous item line was a ``<dt>`` or ``<dd>``;
    * lines containing ``<lsN>`` go to link column ``N - 1``;
    * lines between ``<ol>`` and ``</ol>`` go to ``itemization`` (the last
      line of a text with an unclosed ``<ol>`` goes to ``text``);
    * any other non-empty line goes to ``text``.

    The ``<ol>`` / ``</ol>`` marker lines and rows without any information
    are dropped.

    Parameters
    ----------
    articles : pandas.DataFrame
        Section-level corpus with the columns ``file``, ``doc_id``,
        ``title``, ``table``, the names in ``hr_columns`` and ``text``.
    hr_columns : numpy.ndarray
        Names of the heading-hierarchy columns copied to every row.
    link_columns : numpy.ndarray
        Names of the ``max_link`` link columns.
    max_link : int
        Number of link columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``no``, the auxiliary columns listed above, ``main``,
        ``also``, ``item``, ``item_detail``, the names in ``link_columns``,
        ``itemization`` and ``text``. Empty (columns only) when no row is
        produced from an empty ``articles``.
    """
    # Column definitions
    aux_columns = ["file", "doc_id", "title", "table"] + hr_columns.tolist()
    ref_columns = ["main", "also"]
    item_columns = ["item", "item_detail"]
    info_columns = ref_columns + item_columns + link_columns.tolist() + ["itemization", "text"]

    article0 = articles[aux_columns]
    text0 = np.array(articles["text"])
    article_info_list = []

    # Store every text line by line
    for i, text in enumerate(text0):

        # str.split always returns at least one element, so no separate
        # "empty split" case is needed
        split_text = text.split("\n")
        n_lines = len(split_text)

        text_list0 = np.repeat("", n_lines).astype("object")
        ol_list0 = np.repeat("", n_lines).astype("object")
        main_list0 = np.repeat("", n_lines).astype("object")
        also_list0 = np.repeat("", n_lines).astype("object")
        item_list0 = np.repeat("", n_lines).astype("object")
        detail_list0 = np.repeat("", n_lines).astype("object")
        link_list0 = np.full((n_lines, max_link), "").astype("object")
        ol_flag = 0  # reset per text so an unclosed <ol> cannot leak into the next text
        item_type = "dt"
        item0 = ""

        for j, line in enumerate(split_text):

            if line == "<ol>":
                ol_flag = 1
                ol_list0[j] = line

            elif ol_flag == 1:
                if line == "</ol>":
                    ol_flag = 0
                    ol_list0[j] = line
                elif j == n_lines - 1:
                    # Unclosed list: the final line is treated as plain text
                    ol_flag = 0
                    text_list0[j] = line
                else:
                    ol_list0[j] = line

            elif line.startswith("<main>"):
                main_list0[j] = line

            elif line.startswith("<also>"):
                also_list0[j] = line

            elif line.startswith("<dt>"):
                item0 = line
                item_type = "dt"
                # A <dt> followed by a <dd> is reported on the <dd> row instead
                if j + 1 >= n_lines or not split_text[j + 1].startswith("<dd>"):
                    item_list0[j] = item0

            elif line.startswith("<dd>"):
                detail_list0[j] = line
                # Check the PREVIOUS item type before overwriting it; the
                # original assigned "dd" first, which made this test always
                # true and attached stale <dt> items to unrelated <dd> lines
                if item_type in ("dt", "dd"):
                    item_list0[j] = item0
                item_type = "dd"

            else:
                link_tag = re.search("<ls([0-9])>", line)
                if link_tag is not None:
                    # Take the level from the matched tag rather than from a
                    # fixed character position of the line
                    link_list0[j, int(link_tag.group(1)) - 1] = line
                elif len(line) > 0:
                    text_list0[j] = line
                    item_type = ""

        # Fill every link down over the following rows that only hold deeper links.
        # NOTE(review): new_link_list0 is never used below -- the output is built
        # from link_list0, exactly as in the original. Confirm whether the
        # filled-down version was meant to be written to the link columns.
        new_link_list0 = np.full((n_lines, max_link), "").astype("object")
        concat = np.hstack((text_list0[:, np.newaxis], item_list0[:, np.newaxis], detail_list0[:, np.newaxis], link_list0))
        for j1 in range(max_link):
            index1 = np.where(link_list0[:, j1])[0].astype("int")
            index2 = np.append(np.where(np.sum(concat[:, :j1+4] != "", axis=1) > 0)[0], n_lines).astype("int")
            for start in index1:
                stop = np.min(index2[index2 > start])
                new_link_list0[start:stop, j1] = link_list0[start, j1]

        # Combine the per-line information
        text_info = pd.DataFrame(np.hstack((main_list0[:, np.newaxis], also_list0[:, np.newaxis], item_list0[:, np.newaxis],
                                            detail_list0[:, np.newaxis], link_list0, ol_list0[:, np.newaxis],
                                            text_list0[:, np.newaxis])))
        text_info.columns = info_columns

        # Drop the <ol>/</ol> marker rows, then the rows without any information
        ol_tag = ["<ol>", "</ol>"]
        ol_itemization = np.array(text_info["itemization"])
        ol_text = np.array(text_info["text"])
        index_ol = (ol_itemization != ol_tag[0]) & (ol_itemization != ol_tag[1]) & (ol_text != ol_tag[0]) & (ol_text != ol_tag[1])
        text_info = text_info.iloc[np.where(index_ol)[0]]
        text_info = text_info.iloc[np.where(np.sum(text_info != "", axis=1) > 0)[0]]
        text_info.index = np.arange(text_info.shape[0])

        # Repeat the section's auxiliary columns for every remaining line
        rep_id = np.repeat(i, text_info.shape[0])
        aux_info = article0.iloc[rep_id]
        aux_info.index = np.arange(len(rep_id))
        info = pd.concat((aux_info, text_info), axis=1)
        article_info_list.append(info)

    # Empty input: return an empty frame instead of letting pd.concat raise
    if len(article_info_list) == 0:
        return pd.DataFrame(columns=["no"] + aux_columns + info_columns)

    # Stack all texts and prepend the running number
    article_info = pd.concat(article_info_list, axis=0)
    N = article_info.shape[0]
    article_info.index = np.arange(N)
    article_info = pd.concat((pd.DataFrame({"no": np.arange(N)}), article_info), axis=1)
    return article_info

## コーパスを作成

In [3]:
# Load the data
# Root directory of the data set; the extracted folders are listed from "extract/"
path = "E:/Statistics/data/wikipedia/"
folder = os.listdir("{}extract/".format(path))
# One list of file paths per extracted folder, in the same order as `folder`
filelist = [glob.glob("{}extract/{}/*".format(path, name)) for name in folder]

In [None]:
# Build the corpora
# Column definitions
max_hr = 6
max_link = 9
hr_columns = "table" + np.arange(max_hr+1).astype("U").astype("object")
link_columns = "link" + np.arange(max_link).astype("U").astype("object")

# Build one corpus per folder
for rp in range(len(filelist)):
    print(folder[rp])
    m = len(filelist[rp])
    folder_name = folder[rp]
    data = []
    article_file = []
    file_name = []

    # Read the text files of this folder
    for j in range(m):
        # The context manager closes the file even when reading fails.
        # NOTE(review): no encoding is passed, so the platform default is
        # used -- confirm that it matches the encoding of the extract files.
        with open(filelist[rp][j], "r") as f:
            data.append(f.read())

        # File name relative to "extract/" (with forward slashes) and the
        # file content split into one chunk per article
        file_name.append(re.sub("\\\\", "/", re.split("extract/", filelist[rp][j])[1]))
        article_file.append(data[j].split("\n</doc>\n"))

    # Build the section-level corpus, one frame per file
    article_list = []
    print("------------テキスト単位のコーパスを作成------------")
    for files in range(len(article_file)):
        print(files)
        article_list.append(text_corpus(article_file[files], file_name[files], hr_columns, max_hr))

    # Build the paragraph-level corpus, one frame per file
    article_info_list = []
    print("------------パラグラフ単位のコーパスを作成------------")
    for files in range(len(article_list)):
        print(files)
        articles = article_list[files]
        article_info_list.append(paragraph_corpus(articles, hr_columns, link_columns, max_link))

    # Concatenate the per-file frames and renumber the rows
    articles = pd.concat(article_list, axis=0)
    articles["no"] = np.arange(articles.shape[0])
    articles.index = np.arange(articles.shape[0])
    article_info = pd.concat(article_info_list, axis=0)
    article_info["no"] = np.arange(article_info.shape[0])
    article_info.index = np.arange(article_info.shape[0])

    # Write the folder-level corpora as csv
    articles.to_csv(path + "corpus/wikipedia_corpus_{}.csv".format(folder_name), index=False)
    article_info.to_csv(path + "corpus/wikipedia_detail_corpus_{}.csv".format(folder_name), index=False)

    # Write the corpora as Excel files, split into groups of source files
    split_size = 10
    output_path = path + "corpus/{}/".format(folder_name)
    try:
        os.mkdir(path + "corpus/{}".format(folder_name))
    except FileExistsError:
        # Only an already existing folder is expected here; any other error
        # (missing parent directory, permissions) is allowed to surface
        print("{}のフォルダは存在します。".format(folder_name))
    split_index = np.array_split(np.arange(len(file_name)), split_size)

    for i in range(len(split_index)):
        target_file = np.array(file_name)[split_index[i]]
        # Series.isin replaces the deprecated np.in1d
        articles0 = articles[articles["file"].isin(target_file)]
        articles0.index = np.arange(articles0.shape[0])
        article_info0 = article_info[article_info["file"].isin(target_file)]
        article_info0.index = np.arange(article_info0.shape[0])
        articles0.to_excel(output_path + "wikipedia_corpus_{}{}.xlsx".format(folder_name, str(i)), index=False)
        article_info0.to_excel(output_path + "wikipedia_detail_corpus_{}{}.xlsx".format(folder_name, str(i)), index=False)

AA
------------テキスト単位のコーパスを作成------------
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
------------パラグラフ単位のコーパスを作成------------
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
AAのフォルダは存在します。
AB
------------テキスト単位のコーパスを作成------------
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
