In [1]:
# ライブラリのインポート
import glob
import os
import io
import string
import re
import torchtext
import random
from torchtext.vocab import Vectors

In [16]:
random.seed(1234)

## IMDbデータセットをtsv形式に変換

In [15]:
# 訓練データのtsvファイルを作成
path = "D:/Statistics/data/deep_leraning/nlp/"

f = open(path + "IMDb_train.tsv", "w", encoding="utf-8")

positive_path = path + "aclImdb/train/pos/"
for fname in glob.glob(os.path.join(positive_path, "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as ff:
        text = ff.readline()

        # タブがあれば消去
        text = text.replace("\t", " ")

        text = text+"\t"+"1"+"\t"+"\n"
        f.write(text)

negative_path = path + "aclImdb/train/neg/"
for fname in glob.glob(os.path.join(negative_path, "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as ff:
        text = ff.readline()

        # タブがあれば消去
        text = text.replace("\t", " ")

        text = text+"\t"+"0"+"\t"+"\n"
        f.write(text)

f.close()

In [3]:
# テストデータのtsvファイルを作成
f = open(path + "IMDb_test.tsv", "w", encoding="utf-8")

positive_path = path + "aclImdb/test/pos/"
for fname in glob.glob(os.path.join(positive_path, "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as ff:
        text = ff.readline()

        # タブがあれば消去
        text = text.replace("\t", " ")

        text = text+"\t"+"1"+"\t"+"\n"
        f.write(text)

negative_path = path + "aclImdb/test/neg/"
for fname in glob.glob(os.path.join(negative_path, "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as ff:
        text = ff.readline()

        # タブがあれば消去
        text = text.replace("\t", " ")

        text = text+"\t"+"0"+"\t"+"\n"
        f.write(text)

f.close()

## 前処理と単語分割の関数を定義

In [4]:
print("区切り文字：", string.punctuation)

# 前処理


def preprocessing_text(text):
    # 改行コードを消去
    text = re.sub('<br />', '', text)

    # カンマ、ピリオド以外の記号をスペースに置換
    for p in string.punctuation:
        if (p == ".") or (p == ","):
            continue
        else:
            text = text.replace(p, " ")

    # ピリオドなどの前後にはスペースを入れておく
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    return text

# 分かち書き（今回はデータが英語で、簡易的にスペースで区切る）
def tokenizer_punctuation(text):
    return text.strip().split()


# 前処理と分かち書きをまとめた関数を定義
def tokenizer_with_preprocessing(text):
    text = preprocessing_text(text)
    ret = tokenizer_punctuation(text)
    return ret


# 動作を確認します
print(tokenizer_with_preprocessing('I like cats.'))


区切り文字： !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['I', 'like', 'cats', '.']


## DataLoaderの作成

In [5]:
# 文章とラベルの両方を用意
max_length = 256
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True, fix_length=max_length, 
                            init_token="<cls>", eos_token="<eos>")
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

In [6]:
# フォルダ「data」からtsvファイルを読み込み
train_val_ds, test_ds = torchtext.data.TabularDataset.splits(path=path, train="IMDb_train.tsv", test="IMDb_test.tsv", format="tsv",
                                                             fields=[("Text", TEXT), ("Label", LABEL)])

# 動作確認
print('訓練および検証のデータ数', len(train_val_ds))
print('1つ目の訓練および検証のデータ', vars(train_val_ds[0]))

訓練および検証のデータ数 25000
1つ目の訓練および検証のデータ {'Text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', 'teachers', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'high', 's', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.',

In [7]:
# torchtext.data.Datasetのsplit関数で訓練データと検証データに分割
train_ds, val_ds = train_val_ds.split(split_ratio=0.8, random_state=random.seed(1234))

# 動作確認
print('訓練データの数', len(train_ds))
print('検証データの数', len(val_ds))
print('1つ目の訓練データ', vars(train_ds[0]))

訓練データの数 20000
検証データの数 5000
1つ目の訓練データ {'Text': ['this', 'movie', 'has', 'got', 'to', 'be', 'one', 'of', 'the', 'worst', 'i', 'have', 'ever', 'seen', 'make', 'it', 'to', 'dvd', 'the', 'story', 'line', 'might', 'have', 'clicked', 'if', 'the', 'film', 'had', 'more', 'funding', 'and', 'writers', 'that', 'would', 'have', 'cut', 'the', 'nonsense', 'and', 'sickly', 'scenes', 'that', 'i', 'highly', 'caution', 'parents', 'on', '.', '.', '.', '.', 'but', 'the', 'story', 'line', 'is', 'like', 'a', 'loose', 'cannon', '.', 'if', 'there', 'was', 'such', 'a', 'thing', 'as', 'a', 'drive', 'thru', 'movie', 'maker', 'this', 'one', 'would', 'have', 'sprung', 'from', 'that', '.', 'it', 'reminded', 'me', 'a', 'lot', 'of', 'the', 'quickie', 'films', 'that', 'were', 'put', 'out', 'in', 'the', '1960', 's', ',', 'poor', 'script', 'writing', 'and', 'filming', '.', 'the', 'only', 'sensible', 'characters', 'in', 'the', 'whole', 'movie', 'was', 'the', 'bartender', 'and', 'beaver', '.', 'the', 'rest', 'of', 'the', '

## ボキャブラリーを作成

In [8]:
# torchtextで単語ベクトルとして英語学習済みモデルを読み込み
load_path = path + "wiki-news-300d-1M.vec" 
english_fasttext_vectors = Vectors(name=load_path)

# 単語ベクトルの中身を確認
print("1単語を表現する次元数：", english_fasttext_vectors.dim)
print("単語数：", len(english_fasttext_vectors.itos))

1単語を表現する次元数： 300
単語数： 999994


In [9]:
# ベクトル化したバージョンのボキャブラリーを作成
TEXT.build_vocab(train_ds, vectors=english_fasttext_vectors, min_freq=10)

# ボキャブラリーのベクトルを確認
print(TEXT.vocab.vectors.shape)
print(TEXT.vocab.vectors)

# ボキャブラリーの単語の順番を確認
print(TEXT.vocab.stoi)

torch.Size([17915, 300])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0802,  0.1372, -0.0504,  ...,  0.1850, -0.0759, -0.0204],
        [ 0.0344, -0.0601, -0.0251,  ...,  0.2494,  0.3044,  0.0519],
        [ 0.0553, -0.0843, -0.1171,  ..., -0.0713,  0.0784, -0.0163]])


In [10]:
# DataLoaderを作成
train_dl = torchtext.data.Iterator(train_ds, batch_size=24, train=True, sort=False)
val_dl = torchtext.data.Iterator(test_ds, batch_size=24, train=False, sort=False)

# 動作確認
batch = next(iter(val_dl))
print(batch.Text)
print(batch.Label)

(tensor([[   2,   14,  438,  ...,    1,    1,    1],
        [   2,  277,  670,  ...,    5, 9657,    3],
        [   2,   19,    8,  ..., 4545,    6,    3],
        ...,
        [   2,  239,   15,  ...,    1,    1,    1],
        [   2,  474,    4,  ...,    1,    1,    1],
        [   2, 3621,   13,  ...,    1,    1,    1]]), tensor([169, 256, 256, 150, 219, 256, 256, 160, 184, 256, 256, 158, 238, 188,
        120, 256, 230, 256, 214, 150, 256, 218, 224,  55]))
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
