In [31]:
import os
import urllib.request

import pandas as pd
from torchtext.legacy import data
from torchtext.legacy.data import TabularDataset, Iterator

In [2]:
# Download the raw IMDb review CSV into the dataset directory that the later
# cells read from and write to. The directory is created first so that the
# notebook runs top to bottom on a fresh checkout.
os.makedirs("../dataset/imdb_reviews", exist_ok=True)
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="../dataset/imdb_reviews/IMDb_Reviews.csv")

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x11115c490>)

In [3]:
# Load the full review table; the file is decoded as latin1.
df = pd.read_csv(
    '../dataset/imdb_reviews/IMDb_Reviews.csv',
    encoding='latin1',
)

In [4]:
# Preview the first five rows of the review table.
df.head(n=5)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [5]:
# Report how many rows were loaded.
print(f'# of total samples: {len(df)}')

# of total samples: 50000


In [6]:
# The first 25,000 rows form the training split; the remaining rows form the
# test split.
train_df, test_df = df.iloc[:25000], df.iloc[25000:]

In [7]:
# Persist both splits as CSV files, leaving out the DataFrame index.
train_df.to_csv(
    path_or_buf="../dataset/imdb_reviews/train_data.csv",
    index=False,
)
test_df.to_csv(
    path_or_buf="../dataset/imdb_reviews/test_data.csv",
    index=False,
)

In [19]:
# Review text: split on whitespace, lower-cased, mapped through a vocabulary,
# with fix_length=20 and the batch dimension first.
TEXT = data.Field(
    sequential=True,
    tokenize=str.split,
    lower=True,
    use_vocab=True,
    fix_length=20,
    batch_first=True,
)

# Sentiment label: a single non-sequential target value, used without a
# vocabulary.
LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
    batch_first=False,
)

## Create dataset

In [22]:
# Build the train/test datasets from the saved CSV splits. The header row is
# skipped and the columns are bound, in order, to the `text` and `label` fields.
train_data, test_data = TabularDataset.splits(
    path='.',
    format='csv',
    train='../dataset/imdb_reviews/train_data.csv',
    test='../dataset/imdb_reviews/test_data.csv',
    skip_header=True,
    fields=[('text', TEXT), ('label', LABEL)],
)

In [23]:
# Report the number of examples in each dataset.
print(f'# of Training samples : {len(train_data)}')
print(f'# of Test samples : {len(test_data)}')

# of Training samples : 25000
# of Test samples : 25000


In [24]:
# Show the attributes (tokenised text and label) stored on the first example.
print(train_data[0].__dict__)

{'text': ['my', 'family', 'and', 'i', 'normally', 'do', 'not', 'watch', 'local', 'movies', 'for', 'the', 'simple', 'reason', 'that', 'they', 'are', 'poorly', 'made,', 'they', 'lack', 'the', 'depth,', 'and', 'just', 'not', 'worth', 'our', 'time.<br', '/><br', '/>the', 'trailer', 'of', '"nasaan', 'ka', 'man"', 'caught', 'my', 'attention,', 'my', 'daughter', 'in', "law's", 'and', "daughter's", 'so', 'we', 'took', 'time', 'out', 'to', 'watch', 'it', 'this', 'afternoon.', 'the', 'movie', 'exceeded', 'our', 'expectations.', 'the', 'cinematography', 'was', 'very', 'good,', 'the', 'story', 'beautiful', 'and', 'the', 'acting', 'awesome.', 'jericho', 'rosales', 'was', 'really', 'very', 'good,', "so's", 'claudine', 'barretto.', 'the', 'fact', 'that', 'i', 'despised', 'diether', 'ocampo', 'proves', 'he', 'was', 'effective', 'at', 'his', 'role.', 'i', 'have', 'never', 'been', 'this', 'touched,', 'moved', 'and', 'affected', 'by', 'a', 'local', 'movie', 'before.', 'imagine', 'a', 'cynic', 'like', 'me

In [25]:
# Show the (name, Field) pairs attached to the training dataset.
print(train_data.fields.items())

dict_items([('text', <torchtext.legacy.data.field.Field object at 0x1249f1410>), ('label', <torchtext.legacy.data.field.Field object at 0x1024ab090>)])


## Create vocab index

In [26]:
# Build the vocabulary from the training split only, capped at 10,000 tokens
# and requiring a token frequency of at least 10.
TEXT.build_vocab(
    train_data,
    max_size=10000,
    min_freq=10,
)

In [29]:
# Print the number of entries in the vocabulary.
print(f'Index of each vocab  : {len(TEXT.vocab)}')

Index of each vocab  : 10002


In [30]:
# print(TEXT.vocab.stoi)

## Create dataloader using torchtext

In [32]:
# Number of examples per mini-batch, used by the iterators below.
batch_size = 5

In [33]:
# Training batches: the Iterator default train=True shuffles the examples.
train_loader = Iterator(dataset=train_data, batch_size=batch_size)
# Evaluation batches: keep the examples in dataset order instead of shuffling.
# sort=False is passed explicitly because train=False would otherwise turn
# sorting on, and no sort_key is defined for this dataset.
test_loader = Iterator(dataset=test_data, batch_size=batch_size,
                       train=False, shuffle=False, sort=False)

In [35]:
# Report how many mini-batches each iterator yields.
print(f'# of mini batch for training data : {len(train_loader)}')
print(f'# of mini batch for test data : {len(test_loader)}')

# of mini batch for training data : 5000
# of mini batch for test data : 5000


In [36]:
# Start a fresh pass over the training iterator and take its first batch.
batch = next(iter(train_loader)) # First mini batch

In [37]:
# Show the class of the object the iterator yields.
print(type(batch))

<class 'torchtext.legacy.data.batch.Batch'>


In [38]:
# Show the numericalised text tensor of the batch (one row per example here,
# since TEXT was created with batch_first=True).
print(batch.text)

tensor([[   2, 2923,  706,   31, 1859, 9836,   14,    2,   59,   87,  169,   42,
           10,  118,  878,   12,    7,   36,    2, 1169],
        [   4,  189,   79,    2,  754,  274,    5,  269,    0,   11,   41,   71,
         2631,  182,  223,    2,   49,   82,  505,   18],
        [   0, 1315,    0,   13,  159, 1969,  141,    9,  380,  190,   18,    9,
         1969,  141,    9,    0,   49,    0,   11,   41],
        [   9,  109,   96, 3959,   13,  254,   20,    7,   22,   56,  221,    6,
           28,    3,   96,  118,   22,   32,    0,    5],
        [  10,    7,  711,   30,    5,   55,  487, 4991,    5,   36,  289, 4908,
            0,  244,   15,  572,    6,  151, 1611,   19]])


In [40]:
# Show the dimensions of the text tensor.
print(batch.text.size())

torch.Size([5, 20])


## See the difference when (batch_first = False)

In [42]:
# Same text field as before, but with the sequence dimension first.
# batch_first=False is the value used when the argument is omitted; it is
# spelled out here because it is the point of this section.
TEXT = data.Field(
    sequential=True,
    tokenize=str.split,
    lower=True,
    use_vocab=True,
    fix_length=20,
    batch_first=False,
)

# Sentiment label: a single non-sequential target value, used without a
# vocabulary.
LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
    batch_first=False,
)

In [46]:
# Re-create the datasets so they are bound to the newly defined fields.
train_data, test_data = TabularDataset.splits(
    path='.',
    format='csv',
    train='../dataset/imdb_reviews/train_data.csv',
    test='../dataset/imdb_reviews/test_data.csv',
    skip_header=True,
    fields=[('text', TEXT), ('label', LABEL)],
)

In [47]:
# Rebuild the vocabulary on the new TEXT field with the same limits as before.
TEXT.build_vocab(
    train_data,
    max_size=10000,
    min_freq=10,
)

In [48]:
# Rebuild the training iterator over the re-created dataset and fetch its
# first mini-batch.
batch_size = 5
train_loader = Iterator(train_data, batch_size)
batch = next(iter(train_loader))

In [49]:
# Show the text tensor again; with batch_first left at its default, each
# column now holds one example.
print(batch.text)

tensor([[   2,    4,    0,    9,   10],
        [2923,  189, 1315,  109,    7],
        [ 706,   79,    0,   96,  711],
        [  31,    2,   13, 3959,   30],
        [1859,  754,  159,   13,    5],
        [9836,  274, 1969,  254,   55],
        [  14,    5,  141,   20,  487],
        [   2,  269,    9,    7, 4991],
        [  59,    0,  380,   22,    5],
        [  87,   11,  190,   56,   36],
        [ 169,   41,   18,  221,  289],
        [  42,   71,    9,    6, 4908],
        [  10, 2631, 1969,   28,    0],
        [ 118,  182,  141,    3,  244],
        [ 878,  223,    9,   96,   15],
        [  12,    2,    0,  118,  572],
        [   7,   49,   49,   22,    6],
        [  36,   82,    0,   32,  151],
        [   2,  505,   11,    0, 1611],
        [1169,   18,   41,    5,   19]])


In [50]:
# Show the dimensions of the sequence-first text tensor.
print(batch.text.size())

torch.Size([20, 5])
