In [None]:
# https://pytorch.org/get-started/previous-versions/

In [1]:
# CUDA 11.8
#conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=11.8 -c pytorch -c nvidia
# CUDA 12.1
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
#conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 cpuonly -c pytorch

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install torchtext torchdata portalocker

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import urllib.request
import pandas as pd
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import to_map_style_dataset
from torch.utils.data import DataLoader



In [4]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

In [5]:
!curl -o IMDb_Reviews.csv https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 62.8M  100 62.8M    0     0  15.8M      0  0:00:03  0:00:03 --:--:-- 15.8M


In [6]:
# Load the downloaded reviews and print the total number of rows read.
# NOTE(review): the df.tail() output further down shows "noÂbudget", which hints that some
# bytes in the file are UTF-8 being decoded as latin1 — confirm the file's real encoding.
df = pd.read_csv(
    'IMDb_Reviews.csv',
    encoding='latin1',
)
print(f'전체 샘플의 개수 : {len(df)}')

전체 샘플의 개수 : 50000


In [7]:
# Preview the first five rows of the loaded reviews.
df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [8]:
# Preview the last five rows of the loaded reviews.
df.tail()

Unnamed: 0,review,sentiment
49995,the people who came up with this are SICK AND ...,0
49996,"The script is so so laughable... this in turn,...",0
49997,"""So there's this bride, you see, and she gets ...",0
49998,Your mind will not be satisfied by this noÂbu...,0
49999,The chaser's war on everything is a weekly sho...,1


In [9]:
# The first 25,000 rows are used for training; every remaining row is held out for testing.
train_df = df.iloc[:25000]
test_df = df.iloc[25000:]

In [10]:
# Write both splits to CSV files in the working directory, leaving the index out.
train_df.to_csv(path_or_buf="train_data.csv", index=False)
test_df.to_csv(path_or_buf="test_data.csv", index=False)

In [11]:
# Tokenizer obtained from torchtext using its "basic_english" option.
tokenizer = get_tokenizer("basic_english")


def text_pipeline(x):
    """Tokenize ``x`` and look every token up in ``vocab``, returning the list of results."""
    # ``vocab`` is resolved at call time, so it only has to exist before the first call.
    return [vocab[token] for token in tokenizer(x)]


def label_pipeline(x):
    """Return the label unchanged."""
    return x

In [12]:
def yield_tokens(data_iter):
    """Yield the tokenized text of every ``(_, text)`` pair in ``data_iter``.

    The first element of each pair is ignored; the second is passed to the
    module-level ``tokenizer``.
    """
    yield from (tokenizer(text) for _, text in data_iter)

In [13]:
# Build the vocabulary from the training split only, as (sentiment, review) rows.
train_iter = iter(train_df[['sentiment', 'review']].to_numpy())
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>", "<pad>"],
)
# Tokens that are not in the vocabulary resolve to the index of "<unk>".
vocab.set_default_index(vocab["<unk>"])

In [14]:
# Re-create the iterator (the previous one was consumed while building the vocabulary)
# and show the first (label, text) pair, one value per line.
train_iter = iter(train_df[['sentiment', 'review']].to_numpy())
label, text = next(train_iter)
print(label, text, sep="\n")

1
My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of "Nasaan ka man" caught my attention, my daughter in law's and daughter's so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so's Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!


In [15]:
from torch.utils.data import DataLoader, Dataset

# Dataset that runs the text/label pipelines on a DataFrame row whenever it is indexed.
class IMDbDataset(Dataset):
    """Map-style dataset over a DataFrame with ``review`` and ``sentiment`` columns.

    Args:
        df: DataFrame holding one sample per row.
        text_pipeline: Callable applied to a row's ``review`` value; its result is
            converted to an int64 tensor.
        label_pipeline: Callable applied to a row's ``sentiment`` value; its result
            is converted to an int64 tensor.
    """

    def __init__(self, df, text_pipeline, label_pipeline):
        self.df = df
        self.text_pipeline = text_pipeline
        self.label_pipeline = label_pipeline

    def __len__(self):
        """Return the number of rows in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` int64 tensors for the row at position ``idx``."""
        # Fetch the row once: every ``iloc`` call builds a fresh Series for the row.
        row = self.df.iloc[idx]
        text = torch.tensor(self.text_pipeline(row['review']), dtype=torch.int64)
        label = torch.tensor(self.label_pipeline(row['sentiment']), dtype=torch.int64)
        return text, label

In [16]:
# Each text arriving here is already a tensor: it has gone through text_pipeline.
def collate_batch(batch):
    """Pad the texts of a batch to a common length and gather the labels.

    Args:
        batch: Sequence of ``(text, label)`` pairs.

    Returns:
        ``(texts, labels)``, both moved to ``device``: texts padded batch-first
        with the index of ``"<pad>"``, labels as an int64 tensor.
    """
    samples = list(batch)
    texts = [text for text, _ in samples]
    labels = [label for _, label in samples]

    padded = torch.nn.utils.rnn.pad_sequence(
        texts,
        batch_first=True,
        padding_value=vocab['<pad>'],
    )
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    return padded.to(device), label_tensor.to(device)


In [17]:
# Wrap each split in an IMDbDataset; both share the same text and label pipelines.
train_data = IMDbDataset(
    df=train_df,
    text_pipeline=text_pipeline,
    label_pipeline=label_pipeline,
)
test_data = IMDbDataset(
    df=test_df,
    text_pipeline=text_pipeline,
    label_pipeline=label_pipeline,
)

In [18]:
# Mini-batch size shared by both loaders.
batch_size = 5
# Training batches are drawn in shuffled order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
# Evaluation gains nothing from shuffling; a fixed order keeps test batches reproducible
# and aligned with the row order of test_df.
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [19]:
# Report how many mini-batches each loader yields (training first, then test).
print(f'훈련 데이터의 미니 배치 수 : {len(train_loader)}')
print(f'테스트 데이터의 미니 배치 수 : {len(test_loader)}')

훈련 데이터의 미니 배치 수 : 5000
테스트 데이터의 미니 배치 수 : 5000


In [20]:
# Peek at one mini-batch: print the first (text, label) pair the loader yields, then stop.
for text, label in train_loader:
    print(text, label)
    break

tensor([[   2,  612,  577,  ...,    1,    1,    1],
        [   5,   13,  120,  ...,  252, 6555,    3],
        [ 576,  684,    6,  ...,    1,    1,    1],
        [  12,   41,  602,  ...,    1,    1,    1],
        [  82,   11,    9,  ...,    1,    1,    1]], device='cuda:0') tensor([0, 1, 0, 0, 0], device='cuda:0')
