In [3]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import torch
import torch.nn as nn
import torch.optim as optimizers
import torchtext
import glob
import os
import io
import string
import re
import random
from torchtext.vocab import Vectors
from utils.dataloader import get_IMDb_DataLoaders_and_TEXT

# Fix the NumPy and PyTorch random seeds so reruns are repeatable
SEED = 9837
np.random.seed(SEED)
torch.manual_seed(SEED)

# Widen the pandas display limits for rows and columns
for option_name, limit in (("display.max_rows", 250), ("display.max_columns", 100)):
    pd.set_option(option_name, limit)

# データの前処理

## データファイルの作成

In [4]:
# Text-cleaning helper
def preprocessing_text(text):
    """Normalize a raw review string so it can be tokenized on whitespace.

    Steps:
      1. HTML line breaks (``<br />``) are replaced by a space.
      2. Every character in ``string.punctuation`` except "." and "," is
         replaced by a space.
      3. "." and "," are padded with spaces so they become separate tokens.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text. Runs of spaces are left in place; they disappear
        when the result is split on whitespace.
    """
    # Use a space rather than an empty string: removing the tag outright would
    # glue the words on either side of the line break into one token.
    text = re.sub('<br />', ' ', text)

    # Map all punctuation except "." and "," to a space in a single pass.
    to_space = str.maketrans({p: " " for p in string.punctuation if p not in ".,"})
    text = text.translate(to_space)

    # Surround periods and commas with spaces so they are tokenized on their own.
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    return text

# Word segmentation (the data is English, so a plain whitespace split is used)
def tokenizer_punctuation(text):
    """Split ``text`` into a list of tokens on runs of whitespace.

    Leading and trailing whitespace never yields empty tokens.
    """
    tokens = text.split()
    return tokens


# Cleaning and word segmentation combined into a single tokenizer
def tokenizer_with_preprocessing(text):
    """Clean ``text`` with ``preprocessing_text`` and split it into tokens.

    Returns the token list produced by ``tokenizer_punctuation``.
    """
    return tokenizer_punctuation(preprocessing_text(text))

In [5]:
# Build the training and test TSV files from the raw aclImdb review files
path = "D:/Statistics/data/deep_leraning/nlp/"


def _write_imdb_tsv(tsv_path, labelled_dirs):
    """Write one TSV row per review file found in the given directories.

    Each row is: review text, tab, label, tab, newline.

    Args:
        tsv_path: Path of the TSV file to create (overwritten if it exists).
        labelled_dirs: Iterable of ``(directory, label)`` pairs. Every
            ``*.txt`` file in ``directory`` is written with ``label``.
    """
    # The context manager closes the output file even if reading a review fails.
    with open(tsv_path, "w", encoding="utf-8") as out:
        for review_dir, label in labelled_dirs:
            # sorted() makes the row order independent of the filesystem listing order.
            for fname in sorted(glob.glob(os.path.join(review_dir, "*.txt"))):
                with io.open(fname, "r", encoding="utf-8") as ff:
                    # Only the first line of each review file is used.
                    text = ff.readline()

                # Tabs would be parsed as column separators, and a trailing line
                # break would split the row in two, so both are removed.
                text = text.replace("\t", " ").rstrip("\r\n")
                out.write(text + "\t" + label + "\t" + "\n")


# Positive reviews (label "1") are written first, then negative ones (label "0").
for split_name in ("train", "test"):
    _write_imdb_tsv(
        path + "IMDb_" + split_name + ".tsv",
        [
            (path + "aclImdb/" + split_name + "/pos/", "1"),
            (path + "aclImdb/" + split_name + "/neg/", "0"),
        ],
    )

## DataLoaderの作成

In [7]:
# Define the fields for the review text and its label
max_length = 256
# TEXT: tokenized with tokenizer_with_preprocessing, lower-cased, wrapped in
# <cls>/<eos> tokens, fixed to max_length tokens, and returned together with lengths.
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True, fix_length=max_length,
                            init_token="<cls>", eos_token="<eos>")
# LABEL: a single value per example that is not passed through a vocabulary.
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

# Load the TSV files located under `path`
train_val_ds, test_ds = torchtext.data.TabularDataset.splits(path=path, train="IMDb_train.tsv", test="IMDb_test.tsv", format="tsv",
                                                             fields=[("Text", TEXT), ("Label", LABEL)])

# Split into training and validation data.
# random.seed() returns None, so seed first and hand the resulting generator
# state to split() explicitly instead of passing the call's return value.
random.seed(1234)
train_ds, val_ds = train_val_ds.split(split_ratio=0.8, random_state=random.getstate())


# Build the vocabulary
# Load the pretrained English fastText word vectors through torchtext
load_path = path + "wiki-news-300d-1M.vec"
english_fasttext_vectors = Vectors(name=load_path)

# Build the vocabulary from the training split only and attach the word vectors
TEXT.build_vocab(train_ds, vectors=english_fasttext_vectors, min_freq=10)

# Create the DataLoaders
batch_size = 24
train_dl = torchtext.data.Iterator(train_ds, batch_size=batch_size, train=True)
# The validation loader must iterate over val_ds; using test_ds here would
# make validation metrics identical to test metrics and leak the test set.
val_dl = torchtext.data.Iterator(val_ds, batch_size=batch_size, train=False, sort=False)
test_dl = torchtext.data.Iterator(test_ds, batch_size=batch_size, train=False, sort=False)

# Transformerのblockを定義

In [None]:
# Embedding layer
class Embedder(nn.Module):
    """Convert token ids into their pretrained word vectors.

    Args:
        text_embedding_vectors: 2-D float tensor of pretrained vectors, one
            row per vocabulary id, passed to ``nn.Embedding.from_pretrained``.
    """

    def __init__(self, text_embedding_vectors):
        super(Embedder, self).__init__()

        # Load the pretrained word vectors; freeze=True keeps them fixed during training.
        self.embeddings = nn.Embedding.from_pretrained(embeddings=text_embedding_vectors, freeze=True)

    def forward(self, x):
        """Look up the vector for every id in ``x``.

        Returns a tensor shaped like ``x`` with one extra trailing dimension
        holding the embedding vector.
        """
        # Must use the attribute name assigned in __init__ (`embeddings`).
        x_vec = self.embeddings(x)
        return x_vec
    
# 動作を確認



In [None]:
# Bare reference to the helper imported from utils.dataloader; as the only
# expression in the cell it just displays the function object, it does not call it.
get_IMDb_DataLoaders_and_TEXT

In [None]:
import glob
import os
import io
import string
import re
import torchtext
import random
from torchtext.vocab import Vectors

In [None]:
# Build the training-data TSV file
# NOTE(review): `f` is not closed within the visible part of this cell — confirm a later f.close() exists.
f = open(path + "IMDb_train.tsv", "w", encoding="utf-8")

positive_path = path + "aclImdb/train/pos/"
for fname in glob.glob(os.path.join(positive_path, "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as ff:
        # Only the first line of each review file is read
        text = ff.readline()

        # Replace any tabs with a space
        text = text.replace("\t", " ")

        # Row layout: text, tab, label "1", tab, newline
        text = text+"\t"+"1"+"\t"+"\n"
        f.write(text)