In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
from torchtext.legacy import datasets

import numpy as np
import time
import os
import string
import random

In [2]:
# Field for the review text: tokens are lowercased and every example uses a
# fixed length of 200; batch_first=False keeps the default tensor layout.
TEXT = torchtext.legacy.data.Field(
    batch_first=False,
    fix_length=200,
    lower=True,
)
# Field for the sentiment label: a single, non-sequential value per example.
LABEL = torchtext.legacy.data.Field(sequential=False)

- train/val/test data 분리

In [4]:
# Load the IMDB train and test splits, parsed with the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [5]:
# Inspect the attributes stored on the first training example.
print(vars(train_data[0]))

{'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'immediately', 'recalled', '.........', 'at', '..........', 'high.', 'a', 'classic', 'line:', 'inspector:', "i'm", 'here', 'to', 'sack', 'one', 'of', '

In [7]:
# Token clean-up applied to BOTH the training and the test reviews.  Cleaning
# only train_data would leave punctuation and '<br' fragments glued to the test
# tokens, so they would not match the vocabulary built from the cleaned text.
# The validation set is split off train_data afterwards, so it is covered too.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_tokens(tokens):
    """Return `tokens` lowercased, with '<br' fragments and punctuation removed.

    Args:
        tokens: iterable of string tokens from one review.

    Returns:
        A new list of cleaned tokens; tokens that end up empty are dropped.
    """
    cleaned = []
    for token in tokens:
        # Strip the '<br' remnant first: once punctuation is removed the '<'
        # is gone and a stray 'br' would stay attached to the word.
        token = token.lower().replace('<br', '')
        token = token.translate(_PUNCT_TABLE)
        if token:
            cleaned.append(token)
    return cleaned


for dataset in (train_data, test_data):
    for exam in dataset.examples:
        exam.text = clean_tokens(exam.text)

In [12]:
# Hold out 20% of the training reviews as a validation set.
# random.seed() returns None, so it must not be passed as `random_state`
# (reproducibility then hinges on an argument-evaluation side effect).
# Seed the global RNG explicitly and hand over its state instead.
random.seed(0)
train_data, val_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [13]:
# Report how many examples ended up in each split.
for split_name, split_data in (('train', train_data), ('val', val_data), ('test', test_data)):
    print(f'Number of {split_name} examples: {len(split_data)}')

Number of train examples: 20000
Number of val examples: 5000
Number of test examples: 25000


- 단어 집합 만들기 : build_vocab
    - max_size : 단어 집합의 크기로 단어 집합에 포함되는 어휘 수
    - min_freq : 특정 단어의 최소 등장 횟수
    - vectors : 사용할 사전 학습 임베딩 벡터 지정 (None이면 사용하지 않음)

In [14]:
# Build both vocabularies from the training split only.
# The text vocabulary is capped at 10000 entries and skips tokens seen fewer
# than 10 times; the recorded size of 10002 presumably includes the special
# <unk>/<pad> tokens -- TODO confirm.
TEXT.build_vocab(train_data, vectors=None, min_freq=10, max_size=10000)
LABEL.build_vocab(train_data)

for field_name, field in (('TEXT', TEXT), ('LABEL', LABEL)):
    print(f'Unique tokens in {field_name} vocabulary: {len(field.vocab)}')

Unique tokens in TEXT vocabulary: 10002
Unique tokens in LABEL vocabulary: 3


In [16]:
# Show the label -> index mapping. The recorded output lists
# '<unk>': 0, 'pos': 1, 'neg': 2, i.e. the real classes start at index 1.
print(LABEL.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x00000189B6169F10>>, {'<unk>': 0, 'pos': 1, 'neg': 2})


In [17]:
# Mini-batch size for the iterators, and the device to run on
# (CUDA when torch reports it as available, otherwise the CPU).
BATCH_SIZE = 100
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

In [19]:
# One bucketing iterator per split, all with the same batch size and device.
train_dataloader, val_dataloader, test_dataloader = torchtext.legacy.data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=BATCH_SIZE,
    device=DEVICE,
)

In [20]:
# Model hyper-parameters derived from the data.
# NOTE(review): LABEL.vocab maps 'pos' -> 1 and 'neg' -> 2 (index 0 is '<unk>'),
# so targets presumably need shifting to 0/1 before a 2-class loss -- confirm
# this is handled where the batches are consumed.
n_classes = 2  # positive / negative
vocab_size = len(TEXT.vocab)

In [21]:
from model import BasicRNN