# 1 CNN

#1 CNN (MNIST)

https://colab.research.google.com/github/tensorflow/docs-l10n/blob/master/site/ko/tutorials/images/cnn.ipynb?hl=ko#scrollTo=jKgyC5K_4O0d

### 1) MNIST 데이터셋 다운로드하고 준비하기

In [1]:
try:
    import torch
    print(torch.__version__)
except:
    pass

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

2.0.1+cu118


In [2]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5), (0.5)),
])

In [3]:
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True)

test_dataset = datasets.MNIST('./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=256, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 106936989.26it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 129142530.73it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 26445587.60it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 18676988.99it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



### 2) CNN 만들기

In [4]:
class MyCNN(nn.Module):
    def __init__(self):
        super(MyCNN, self).__init__()

        # self.name
        self.model = nn.Sequential(
            nn.Conv2d(1, 32, 3),
            nn.ReLU(True),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3),
            nn.ReLU(True),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 64, 3),
            nn.ReLU(True),
            nn.Flatten(),

            nn.Linear(576, 64),
            nn.ReLU(True),
            nn.Linear(64, 10),
        )

    def forward(self, x):
        output = self.model(x)
        return output


model = MyCNN()

In [5]:
model

MyCNN(
  (model): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU(inplace=True)
    (8): Flatten(start_dim=1, end_dim=-1)
    (9): Linear(in_features=576, out_features=64, bias=True)
    (10): ReLU(inplace=True)
    (11): Linear(in_features=64, out_features=10, bias=True)
  )
)

### 3) 손실함수, 옵티마이저, 스케줄러 설정하고 모델 훈련하기

In [6]:
# 모델 학습을 위한 옵티마이저와 스케줄러 정의
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# loss를 multiclass 분류에 적합한 loss인 cross entropy loss를 사용
criterion = nn.CrossEntropyLoss().cuda()

optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = StepLR(optimizer, step_size=2, gamma=0.7)

In [7]:
# 모델 학습 (tensorflow는 fit 하나로 아래의 과정을 처리한다.)
epochs = 5
dry_run = False # 1 배치만 훈련

for epoch in range(1, epochs+1):
    # 학습
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if dry_run:
                break

    # 테스트
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)

        with torch.no_grad():
            output = model(data)
        test_loss += criterion(output, target).detach().sum()  # sum up batch loss
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    scheduler.step()


Test set: Average loss: 0.0003, Accuracy: 9733/10000 (97%)


Test set: Average loss: 0.0002, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0002, Accuracy: 9858/10000 (99%)


Test set: Average loss: 0.0002, Accuracy: 9843/10000 (98%)


Test set: Average loss: 0.0001, Accuracy: 9882/10000 (99%)



### 4) 모델 평가

In [8]:
model.eval()
correct = 0
for data, target in test_loader:
    data, target = data.to(device), target.to(device)
    with torch.no_grad():
        output = model(data)
    pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
    correct += pred.eq(target.view_as(pred)).sum().item()

test_acc = correct/len(test_loader.dataset)

In [9]:
test_acc

0.9882

---

# 2 CNN for Sentence Classification

https://www.aclweb.org/anthology/D14-1181/

<img src="http://www.wildml.com/wp-content/uploads/2015/11/Screen-Shot-2015-11-06-at-8.03.47-AM-1024x413.png" />

## 1) 네이버 영화 리뷰 다운로드

In [10]:
# 네이버 영화 리뷰 다운로드
!wget https://github.com/e9t/nsmc/raw/master/ratings.txt

import pandas as pd
import numpy as np
df = pd.read_csv("./ratings.txt",sep='\t').dropna()
df.head(5)

--2023-08-17 05:13:44--  https://github.com/e9t/nsmc/raw/master/ratings.txt
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt [following]
--2023-08-17 05:13:44--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19515078 (19M) [text/plain]
Saving to: ‘ratings.txt’


2023-08-17 05:13:44 (147 MB/s) - ‘ratings.txt’ saved [19515078/19515078]



Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


## 2) 전처리

In [11]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [12]:
import json
# from tensorflow.keras import preprocessing
from konlpy.tag import Okt
tokenizer = Okt()

# 불용어 정의
stopwords=['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized=[]
for sentence in df['document'][len(tokenized):]:
    # 한국어 형태소로 분리
    temp = tokenizer.morphs(sentence) # 토큰화
    temp = [word for word in temp if not word in stopwords] # 불용어 제거
    tokenized.append(temp)
print(tokenized[:10])

[['어릴', '때', '보고', '지금', '다시', '봐도', '재밌어요', 'ㅋㅋ'], ['디자인', '을', '배우는', '학생', ',', '외국', '디자이너', '그', '일군', '전통', '을', '통해', '발전', '해가는', '문화', '산업', '부러웠는데', '.', '사실', '우리나라', '에서도', '그', '어려운', '시절', '끝', '까지', '열정', '을', '지킨', '노라노', '같은', '전통', '있어', '저', '같은', '사람', '꿈', '을', '꾸고', '이뤄', '나갈', '수', '있다는', '것', '감사합니다', '.'], ['폴리스스토리', '시리즈', '1', '부터', '뉴', '까지', '버릴께', '하나', '없음', '..', '최고', '.'], ['..', '연기', '진짜', '개', '쩔구나', '..', '지루할거라고', '생각', '했는데', '몰입', '해서', '봤다', '..', '그래', '이런게', '진짜', '영화', '지'], ['안개', '자욱한', '밤하늘', '떠', '있는', '초승달', '같은', '영화', '.'], ['사랑', '을', '해본', '사람', '라면', '처음', '부터', '끝', '까지', '웃을수', '있는', '영화'], ['완전', '감동', '입니다', '다시', '봐도', '감동'], ['개', '전쟁', '2', '나오나요', '?', '나오면', '1', '빠', '로', '보고', '싶음'], ['굿'], ['바보', '아니라', '병', '쉰', '인듯']]


In [13]:
from nltk import FreqDist
vocab = FreqDist(np.hstack(tokenized))
print('단어 집합의 크기 : {}'.format(len(vocab)))

vocab_size = 510
# 상위 vocab_size개의 단어만 보존 -> 510 + 2(pad, unk)
vocab = vocab.most_common(vocab_size)
print('단어 집합의 크기 : {}'.format(len(vocab)))

word_to_index = {word[0] : index + 2 for index, word in enumerate(vocab)}
word_to_index['<PAD>'] = 1
word_to_index['<UNK>'] = 0

단어 집합의 크기 : 122810
단어 집합의 크기 : 510


In [14]:
encoded = []
for line in tokenized: #입력 데이터에서 1줄씩 문장을 읽음
    temp = []
    for w in line: #각 줄에서 1개씩 글자를 읽음
      try:
        temp.append(word_to_index[w]) # 글자를 해당되는 정수로 변환
      except KeyError: # 단어 집합에 없는 단어일 경우 unk로 대체된다.
        temp.append(word_to_index['<UNK>']) # unk의 인덱스로 변환

    encoded.append(temp)

print(encoded[:10])

[[471, 47, 42, 101, 76, 110, 343, 49], [0, 5, 0, 0, 7, 0, 0, 26, 0, 0, 5, 0, 0, 0, 0, 0, 0, 2, 406, 386, 0, 26, 0, 0, 114, 59, 0, 5, 0, 0, 83, 0, 0, 82, 83, 39, 0, 5, 0, 0, 0, 66, 0, 21, 0, 2], [0, 245, 40, 139, 0, 59, 0, 77, 277, 4, 22, 2], [4, 19, 14, 100, 0, 4, 0, 30, 290, 287, 142, 210, 4, 0, 0, 14, 3, 67], [0, 0, 0, 0, 88, 0, 83, 3, 2], [70, 5, 0, 39, 453, 90, 139, 114, 59, 0, 88, 3], [87, 38, 86, 76, 110, 38], [100, 490, 85, 0, 9, 0, 40, 0, 17, 42, 0], [241], [0, 301, 0, 0, 0]]


In [15]:
max_len = max(len(l) for l in encoded)
print('리뷰의 최대 길이 : %d' % max_len)
print('리뷰의 최소 길이 : %d' % min(len(l) for l in encoded))
print('리뷰의 평균 길이 : %f' % (sum(map(len, encoded))/len(encoded)))

max_len = 128
for line in encoded:
    if len(line) < max_len: # 현재 샘플이 정해준 길이보다 짧으면
        line += [word_to_index['<PAD>']] * (max_len - len(line)) # 나머지는 전부 'pad' 토큰으로 채운다.
print('리뷰의 최대 길이 : %d' % max(len(l) for l in encoded))
print('리뷰의 최소 길이 : %d' % min(len(l) for l in encoded))
print('리뷰의 평균 길이 : %f' % (sum(map(len, encoded))/len(encoded)))

##########################################################################################
# 전처리 함수 만들기



리뷰의 최대 길이 : 95
리뷰의 최소 길이 : 0
리뷰의 평균 길이 : 12.552127
리뷰의 최대 길이 : 128
리뷰의 최소 길이 : 128
리뷰의 평균 길이 : 128.000000


In [16]:
with open('./encoded', 'w') as f:
    json.dump(encoded, f)

In [17]:
with open('./encoded', 'r') as f:
    encoded = json.load(f)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
train_ratio = 0.7
# index = int(len(encoded) * train_ratio)

x_train, x_test, y_train, y_test = train_test_split(encoded, df['label'], shuffle=True, stratify=df['label'], train_size=train_ratio)
y_train = y_train.values
y_test = y_test.values

In [20]:
len(x_train[0])

128

In [21]:
# 데이터셋 만들기
from torch.utils.data import Dataset
class TextDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float)

    def __getitem__(self, index):
        # x = torch.from_numpy(self.x[index]).float()
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)

train_dataset = TextDataset(x_train, y_train)
test_dataset = TextDataset(x_test, y_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64)

## 3) 모델 정의

<img src="https://miro.medium.com/max/770/0*wigQtmJiv0bddwPI." width="500" />

In [27]:
class CNNforText(nn.Module):
    def __init__(self,
                 embed_size=128, # 단어 임베딩 사이즈
                 vocab_size=512, # 단어 vocab size
                 sequence_length=128 # 문장 길이 (128)
                 ):
        super(CNNforText, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                               embedding_dim=embed_size,
                               padding_idx=1)

        # self.name
        self.cnn1 = nn.Sequential(
            nn.Conv2d(1, 128, (2, embed_size)),
            nn.ReLU(True),
            nn.MaxPool2d((embed_size-2, 1))
        )
        self.cnn2 = nn.Sequential(
            nn.Conv2d(1, 128, (3, embed_size)),
            nn.ReLU(True),
            nn.MaxPool2d((embed_size-3, 1))
        )
        self.cnn3 = nn.Sequential(
            nn.Conv2d(1, 128, (4, embed_size)),
            nn.ReLU(True),
            nn.MaxPool2d((embed_size-4, 1))
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.5),
            nn.Linear(128*3, 1), # 0.5 > 긍정, 0.5 < 부정
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embedding(x)
        x = x.reshape(-1, 1, 128, 128)
        output1 = self.cnn1(x)
        output2 = self.cnn2(x)
        output3 = self.cnn3(x)
        output = torch.cat([output1, output2, output3], dim=1)
        output.to(device)
        output = self.classifier(output)
        return output

model = CNNforText()

## 4) 학습

- 상당한 RAM을 소모하므로, size를 전체적으로 줄여서 학습시킬 것

In [33]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

criterion = nn.BCELoss().cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

In [34]:
# 모델 학습시키기
epochs = 5
from tqdm.auto import tqdm

for epoch in range(1, epochs+1):
    # 학습
    model.train()
    train_loss = []
    for batch_idx, (data, label) in enumerate(tqdm(iter(train_loader))):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        target = model(data).reshape(-1)
        loss = criterion(target, label)

        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())

    _train_loss = np.mean(train_loss)

    # 테스트
    model.eval()
    val_loss = []
    correct = 0
    for data, label in tqdm(iter(test_loader)):
        data, label = data.to(device), label.to(device)
        with torch.no_grad():
            target = model(data).reshape(-1).detach()
        loss = criterion(target, label)
        val_loss.append(loss.item())
        correct += (target>0.5).type(torch.int).eq(label.view_as(target)).sum().item()

    _val_loss = np.mean(val_loss)

    if epoch % 1 == 0:
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val Accuracy : [{correct/len(test_loader.dataset):.4f}]')

    scheduler.step()

  0%|          | 0/2188 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.56115] Val Loss : [0.49860] Val Accuracy : [0.7443]


  0%|          | 0/2188 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.54394] Val Loss : [0.48960] Val Accuracy : [0.7516]


  0%|          | 0/2188 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.50912] Val Loss : [0.48528] Val Accuracy : [0.7584]


  0%|          | 0/2188 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.49902] Val Loss : [0.48710] Val Accuracy : [0.7589]


  0%|          | 0/2188 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.49121] Val Loss : [0.47949] Val Accuracy : [0.7574]


## 5) 평가

In [35]:
# 예측값 뽑기
model.eval()
y_pred = torch.Tensor([])
y_real = torch.Tensor([])
for data, label in test_loader:
    data, label = data.to(device), label
    with torch.no_grad():
        output = model(data).detach().cpu()
    y_pred = torch.cat((y_pred, output))
    y_real = torch.cat((y_real, label))

In [36]:
accuracy = (y_pred>0.5).type(torch.int).eq(y_real.view_as(y_pred)).sum().item() / len(y_pred)

In [37]:
accuracy

0.7573919130637687